author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-17 12:02:58 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-17 12:02:58 +0000
commit     698f8c2f01ea549d77d7dc3338a12e04c11057b9 (patch)
tree       173a775858bd501c378080a10dca74132f05bc50 /vendor/ucd-parse
parent     Initial commit. (diff)
Adding upstream version 1.64.0+dfsg1. (tag: upstream/1.64.0+dfsg1)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/ucd-parse')
-rw-r--r--  vendor/ucd-parse/.cargo-checksum.json              1
-rw-r--r--  vendor/ucd-parse/Cargo.toml                       31
-rw-r--r--  vendor/ucd-parse/LICENSE-APACHE                  201
-rw-r--r--  vendor/ucd-parse/LICENSE-MIT                      21
-rw-r--r--  vendor/ucd-parse/README.md                        22
-rw-r--r--  vendor/ucd-parse/src/age.rs                       59
-rw-r--r--  vendor/ucd-parse/src/arabic_shaping.rs           184
-rw-r--r--  vendor/ucd-parse/src/bidi_mirroring_glyph.rs     107
-rw-r--r--  vendor/ucd-parse/src/case_folding.rs             161
-rw-r--r--  vendor/ucd-parse/src/common.rs                   594
-rw-r--r--  vendor/ucd-parse/src/core_properties.rs           60
-rw-r--r--  vendor/ucd-parse/src/emoji_properties.rs          86
-rw-r--r--  vendor/ucd-parse/src/error.rs                     86
-rw-r--r--  vendor/ucd-parse/src/grapheme_cluster_break.rs    98
-rw-r--r--  vendor/ucd-parse/src/jamo_short_name.rs           80
-rw-r--r--  vendor/ucd-parse/src/lib.rs                       66
-rw-r--r--  vendor/ucd-parse/src/line_break.rs                49
-rw-r--r--  vendor/ucd-parse/src/name_aliases.rs             145
-rw-r--r--  vendor/ucd-parse/src/prop_list.rs                 63
-rw-r--r--  vendor/ucd-parse/src/property_aliases.rs         113
-rw-r--r--  vendor/ucd-parse/src/property_value_aliases.rs   185
-rw-r--r--  vendor/ucd-parse/src/script_extensions.rs         68
-rw-r--r--  vendor/ucd-parse/src/scripts.rs                   59
-rw-r--r--  vendor/ucd-parse/src/sentence_break.rs           101
-rw-r--r--  vendor/ucd-parse/src/special_casing.rs           112
-rw-r--r--  vendor/ucd-parse/src/unicode_data.rs             787
-rw-r--r--  vendor/ucd-parse/src/word_break.rs               103
27 files changed, 3642 insertions, 0 deletions
diff --git a/vendor/ucd-parse/.cargo-checksum.json b/vendor/ucd-parse/.cargo-checksum.json
new file mode 100644
index 000000000..34cd1d5b5
--- /dev/null
+++ b/vendor/ucd-parse/.cargo-checksum.json
@@ -0,0 +1 @@
+{"files":{"Cargo.toml":"3a23e75f3807a38f86e8564a139135970f38c9ebc448749682b75fd4096f6d4a","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"0f96a83840e146e43c0ec96a22ec1f392e0680e6c1226e6f3ba87e0740af850f","README.md":"5af803e482641f01332bde35cc8137211714b6f100122ec548c9712a09aead55","src/age.rs":"13a9a01b2373e9eff06b547543479394843cb9103c200b3e666ca5e408369bc9","src/arabic_shaping.rs":"31075e05b33248540f10ae5a3bb14715965e109b2be40cd9c0735810903ce29b","src/bidi_mirroring_glyph.rs":"945a110e0f54eabc2f48719893da300c11b4fd1f28265ab8f7b32ce2e5e3f6e5","src/case_folding.rs":"1ec85e0fa8e8cb0315974b843d168d9cddecad40efcf8ce78de677c0f0417f34","src/common.rs":"40782238affb569c9bd89a7ce19202677ba3e1da0bb5c8f8c4439adaa375858b","src/core_properties.rs":"24b261ed0bc4b7443734d843cda58433c7727914524ac4c3cc46fc153463e8cd","src/emoji_properties.rs":"bdb24a301661592d0956db2ad945a86778e0ad8f86cd82077835bb0d2a4f144c","src/error.rs":"6df32d4c5cc9819832083f465aa4ce11d26d3b44e37a9d4274a45fd8e1314903","src/grapheme_cluster_break.rs":"f63f75f1a5a82b698d4a840b063bc650f2b2f64429830dc338c9723bf1368e0b","src/jamo_short_name.rs":"02dc272c1a7d01de5e22737a76327b94ae2d132703dbc0657e3e887ceb1d1d91","src/lib.rs":"894ecd08e4588e14de69e8b9d25e9a38e9e2f73e260855c99df13c2ee1d825d3","src/line_break.rs":"1def7f73d44c1703fd18dbd9c9fc8dd76edabed27a5061564d6521d59335a95c","src/name_aliases.rs":"497629a0499d048b0f1615c619975f149c6a1d97361b7ff16850a8291796c90d","src/prop_list.rs":"856f00f51e7e6b9b0386a9b3941582eba63eb96896c86e58a791384a1235fdec","src/property_aliases.rs":"7b6da97e45a898499f29e30346f1b7aa6b7d758184a3bfa4f0b816d20edc9851","src/property_value_aliases.rs":"4e9fbad2b32ad636e5f8dfefa082352e444e4a68822a7786ea7d4217e7afd2fb","src/script_extensions.rs":"d967e213122702df642c975765fec28811ae8351f6f5307ca67989bf0b456fba","src/scripts.rs":"04740c080bb48e99d84622e4708215b40abdd387c70347d6b264b9c7fcbbac37","src/sentence_break.rs":"ac54a7f09f75694582904509d979c61784fa1ec647e4d531ea1b283bc3082635","src/special_casing.rs":"de7ed50ec34a222c73e8ad6d82a2a658b4475ce312301c5110d07fa13e51cb0b","src/unicode_data.rs":"cad99e17c6d56c9029416a0f3ec1b469786864eace2a20f212f2b4a1c96b59f1","src/word_break.rs":"eea514f238dc9dea82f52efc3154fde3f215b068dd201b22c31ef1c0acf1fba3"},"package":"5269f8d35df6b8b60758343a6d742ecf09e4bca13faee32af5503aebd1e11b7c"} \ No newline at end of file
diff --git a/vendor/ucd-parse/Cargo.toml b/vendor/ucd-parse/Cargo.toml
new file mode 100644
index 000000000..f7efa0970
--- /dev/null
+++ b/vendor/ucd-parse/Cargo.toml
@@ -0,0 +1,31 @@
+# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
+#
+# When uploading crates to the registry Cargo will automatically
+# "normalize" Cargo.toml files for maximal compatibility
+# with all versions of Cargo and also rewrite `path` dependencies
+# to registry (e.g., crates.io) dependencies
+#
+# If you believe there's an error in this file please file an
+# issue against the rust-lang/cargo repository. If you're
+# editing this file be aware that the upstream Cargo.toml
+# will likely look very different (and much more reasonable)
+
+[package]
+edition = "2018"
+name = "ucd-parse"
+version = "0.1.8"
+authors = ["Andrew Gallant <jamslam@gmail.com>"]
+description = "A library for parsing data files in the Unicode character database.\n"
+homepage = "https://github.com/BurntSushi/ucd-generate"
+documentation = "https://docs.rs/ucd-parse"
+readme = "README.md"
+keywords = ["unicode", "database", "character", "property"]
+license = "MIT/Apache-2.0"
+repository = "https://github.com/BurntSushi/ucd-generate"
+[dependencies.lazy_static]
+version = "1"
+
+[dependencies.regex]
+version = "1"
+features = ["std", "unicode"]
+default-features = false
diff --git a/vendor/ucd-parse/LICENSE-APACHE b/vendor/ucd-parse/LICENSE-APACHE
new file mode 100644
index 000000000..16fe87b06
--- /dev/null
+++ b/vendor/ucd-parse/LICENSE-APACHE
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+END OF TERMS AND CONDITIONS
+
+APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+Copyright [yyyy] [name of copyright owner]
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/vendor/ucd-parse/LICENSE-MIT b/vendor/ucd-parse/LICENSE-MIT
new file mode 100644
index 000000000..3b0a5dc09
--- /dev/null
+++ b/vendor/ucd-parse/LICENSE-MIT
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2015 Andrew Gallant
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/vendor/ucd-parse/README.md b/vendor/ucd-parse/README.md
new file mode 100644
index 000000000..dc3f78dc6
--- /dev/null
+++ b/vendor/ucd-parse/README.md
@@ -0,0 +1,22 @@
+ucd-parse
+=========
+A library for parsing Unicode Character Database (UCD) files into structured
+data.
+
+[![Linux build status](https://api.travis-ci.org/BurntSushi/ucd-generate.png)](https://travis-ci.org/BurntSushi/ucd-generate)
+[![](http://meritbadge.herokuapp.com/ucd-generate)](https://crates.io/crates/ucd-parse)
+
+
+### Documentation
+
+https://docs.rs/ucd-parse
+
+
+### License
+
+This project is licensed under either of
+ * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
+ http://www.apache.org/licenses/LICENSE-2.0)
+ * MIT license ([LICENSE-MIT](LICENSE-MIT) or
+ http://opensource.org/licenses/MIT)
+at your option.
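
A minimal usage sketch (not part of the vendored files): given a local copy of the UCD data files, the crate-level `parse` function defined later in `src/common.rs` returns every record of the requested type. The `/path/to/ucd` directory is a placeholder for wherever the UCD snapshot was downloaded.

use ucd_parse::{Age, Error};

fn main() -> Result<(), Error> {
    // Parse DerivedAge.txt from a local UCD snapshot (path is an assumption).
    let ages: Vec<Age> = ucd_parse::parse("/path/to/ucd")?;
    for row in ages.iter().take(5) {
        println!("{} => {}", row.codepoints, row.age);
    }
    Ok(())
}
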
diff --git a/vendor/ucd-parse/src/age.rs b/vendor/ucd-parse/src/age.rs
new file mode 100644
index 000000000..3c93f0707
--- /dev/null
+++ b/vendor/ucd-parse/src/age.rs
@@ -0,0 +1,59 @@
+use std::path::Path;
+use std::str::FromStr;
+
+use crate::common::{
+ parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
+ UcdFileByCodepoint,
+};
+use crate::error::Error;
+
+/// A single row in the `DerivedAge.txt` file.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct Age {
+ /// The codepoint or codepoint range for this entry.
+ pub codepoints: Codepoints,
+ /// The age assigned to the codepoints in this entry.
+ pub age: String,
+}
+
+impl UcdFile for Age {
+ fn relative_file_path() -> &'static Path {
+ Path::new("DerivedAge.txt")
+ }
+}
+
+impl UcdFileByCodepoint for Age {
+ fn codepoints(&self) -> CodepointIter {
+ self.codepoints.into_iter()
+ }
+}
+
+impl FromStr for Age {
+ type Err = Error;
+
+ fn from_str(line: &str) -> Result<Age, Error> {
+ let (codepoints, script) = parse_codepoint_association(line)?;
+ Ok(Age { codepoints, age: script.to_string() })
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::Age;
+
+ #[test]
+ fn parse_single() {
+ let line = "2BD2 ; 10.0 # GROUP MARK\n";
+ let row: Age = line.parse().unwrap();
+ assert_eq!(row.codepoints, 0x2BD2);
+ assert_eq!(row.age, "10.0");
+ }
+
+ #[test]
+ fn parse_range() {
+ let line = "11D0B..11D36 ; 10.0 # [44] MASARAM GONDI LETTER AU..MASARAM GONDI VOWEL SIGN VOCALIC R\n";
+ let row: Age = line.parse().unwrap();
+ assert_eq!(row.codepoints, (0x11D0B, 0x11D36));
+ assert_eq!(row.age, "10.0");
+ }
+}
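
As a sketch of the per-codepoint view (again assuming a local UCD directory): because `Age` implements `UcdFileByCodepoint`, `parse_by_codepoint` expands each row's range into a `BTreeMap` keyed by individual codepoints.

use ucd_parse::{parse_by_codepoint, Age, Codepoint, Error};

fn age_of(ucd_dir: &str, cp: u32) -> Result<Option<String>, Error> {
    // Note: this expands every range in DerivedAge.txt, so the map is large.
    let by_cp = parse_by_codepoint::<_, Age>(ucd_dir)?;
    let key = Codepoint::from_u32(cp)?;
    Ok(by_cp.get(&key).map(|row| row.age.clone()))
}
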
diff --git a/vendor/ucd-parse/src/arabic_shaping.rs b/vendor/ucd-parse/src/arabic_shaping.rs
new file mode 100644
index 000000000..d1d942a82
--- /dev/null
+++ b/vendor/ucd-parse/src/arabic_shaping.rs
@@ -0,0 +1,184 @@
+use std::path::Path;
+use std::str::FromStr;
+
+use lazy_static::lazy_static;
+use regex::Regex;
+
+use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint};
+use crate::error::Error;
+
+/// Represents a single row in the `ArabicShaping.txt` file.
+///
+/// The field names were taken from the header of ArabicShaping.txt.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct ArabicShaping {
+ /// The codepoint corresponding to this row.
+ pub codepoint: Codepoint,
+ /// A short schematic name for the codepoint.
+ ///
+ /// The schematic name is descriptive of the shape, based as consistently as
+ /// possible on a name for the skeleton and then the diacritic marks applied
+ /// to the skeleton, if any. Note that this schematic name is considered a
+ /// comment, and does not constitute a formal property value.
+ pub schematic_name: String,
+ /// The "joining type" of this codepoint.
+ pub joining_type: JoiningType,
+ /// The "joining group" of this codepoint.
+ pub joining_group: String,
+}
+
+/// The Joining_Type field read from ArabicShaping.txt
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum JoiningType {
+ RightJoining,
+ LeftJoining,
+ DualJoining,
+ JoinCausing,
+ NonJoining,
+ Transparent,
+}
+
+impl JoiningType {
+ pub fn as_str(&self) -> &str {
+ match self {
+ JoiningType::RightJoining => "R",
+ JoiningType::LeftJoining => "L",
+ JoiningType::DualJoining => "D",
+ JoiningType::JoinCausing => "C",
+ JoiningType::NonJoining => "U",
+ JoiningType::Transparent => "T",
+ }
+ }
+}
+
+impl Default for JoiningType {
+ fn default() -> JoiningType {
+ JoiningType::NonJoining
+ }
+}
+
+impl FromStr for JoiningType {
+ type Err = Error;
+
+ fn from_str(s: &str) -> Result<JoiningType, Error> {
+ match s {
+ "R" => Ok(JoiningType::RightJoining),
+ "L" => Ok(JoiningType::LeftJoining),
+ "D" => Ok(JoiningType::DualJoining),
+ "C" => Ok(JoiningType::JoinCausing),
+ "U" => Ok(JoiningType::NonJoining),
+ "T" => Ok(JoiningType::Transparent),
+ _ => err!(
+ "unrecognized joining type: '{}' \
+ (must be one of R, L, D, C, U or T)",
+ s
+ ),
+ }
+ }
+}
+
+impl UcdFile for ArabicShaping {
+ fn relative_file_path() -> &'static Path {
+ Path::new("ArabicShaping.txt")
+ }
+}
+
+impl UcdFileByCodepoint for ArabicShaping {
+ fn codepoints(&self) -> CodepointIter {
+ self.codepoint.into_iter()
+ }
+}
+
+impl FromStr for ArabicShaping {
+ type Err = Error;
+
+ fn from_str(line: &str) -> Result<ArabicShaping, Error> {
+ lazy_static! {
+ static ref PARTS: Regex = Regex::new(
+ r"(?x)
+ ^
+ \s*(?P<codepoint>[A-F0-9]+)\s*;
+ \s*(?P<name>[^;]+)\s*;
+ \s*(?P<joining_type>[^;]+)\s*;
+ \s*(?P<joining_group>[^;]+)
+ $
+ "
+ )
+ .unwrap();
+ };
+ let caps = match PARTS.captures(line.trim()) {
+ Some(caps) => caps,
+ None => return err!("invalid ArabicShaping line"),
+ };
+
+ Ok(ArabicShaping {
+ codepoint: caps["codepoint"].parse()?,
+ schematic_name: caps["name"].to_string(),
+ joining_type: caps["joining_type"].parse()?,
+ joining_group: caps["joining_group"].to_string(),
+ })
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::common::Codepoint;
+
+ use super::{ArabicShaping, JoiningType};
+
+ fn codepoint(n: u32) -> Codepoint {
+ Codepoint::from_u32(n).unwrap()
+ }
+
+ fn s(string: &str) -> String {
+ string.to_string()
+ }
+
+ #[test]
+ fn parse1() {
+ let line = "0600; ARABIC NUMBER SIGN; U; No_Joining_Group\n";
+ let data: ArabicShaping = line.parse().unwrap();
+ assert_eq!(
+ data,
+ ArabicShaping {
+ codepoint: codepoint(0x0600),
+ schematic_name: s("ARABIC NUMBER SIGN"),
+ joining_type: JoiningType::NonJoining,
+ joining_group: s("No_Joining_Group")
+ }
+ );
+ }
+
+ #[test]
+ fn parse2() {
+ let line = "063D; FARSI YEH WITH INVERTED V ABOVE; D; FARSI YEH\n";
+ let data: ArabicShaping = line.parse().unwrap();
+ assert_eq!(
+ data,
+ ArabicShaping {
+ codepoint: codepoint(0x063D),
+ schematic_name: s("FARSI YEH WITH INVERTED V ABOVE"),
+ joining_type: JoiningType::DualJoining,
+ joining_group: s("FARSI YEH")
+ }
+ );
+ }
+
+ #[test]
+ fn parse3() {
+ let line =
+ "10D23; HANIFI ROHINGYA DOTLESS KINNA YA WITH DOT ABOVE; D; HANIFI ROHINGYA KINNA YA\n";
+ let data: ArabicShaping = line.parse().unwrap();
+ assert_eq!(
+ data,
+ ArabicShaping {
+ codepoint: codepoint(0x10D23),
+ schematic_name: s(
+ "HANIFI ROHINGYA DOTLESS KINNA YA WITH DOT ABOVE"
+ ),
+ joining_type: JoiningType::DualJoining,
+ joining_group: s("HANIFI ROHINGYA KINNA YA")
+ }
+ );
+ }
+}
diff --git a/vendor/ucd-parse/src/bidi_mirroring_glyph.rs b/vendor/ucd-parse/src/bidi_mirroring_glyph.rs
new file mode 100644
index 000000000..fcfefffcb
--- /dev/null
+++ b/vendor/ucd-parse/src/bidi_mirroring_glyph.rs
@@ -0,0 +1,107 @@
+use std::fmt;
+use std::path::Path;
+use std::str::FromStr;
+
+use lazy_static::lazy_static;
+use regex::Regex;
+
+use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint};
+use crate::error::Error;
+
+/// Represents a single row in the `BidiMirroring.txt` file.
+///
+/// The field names were taken from the header of BidiMirroring.txt.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct BidiMirroring {
+ /// The codepoint corresponding to this row.
+ pub codepoint: Codepoint,
+    /// The codepoint that typically has a glyph that is the mirror image
+ /// of `codepoint`.
+ pub bidi_mirroring_glyph: Codepoint,
+}
+
+impl UcdFile for BidiMirroring {
+ fn relative_file_path() -> &'static Path {
+ Path::new("BidiMirroring.txt")
+ }
+}
+
+impl UcdFileByCodepoint for BidiMirroring {
+ fn codepoints(&self) -> CodepointIter {
+ self.codepoint.into_iter()
+ }
+}
+
+impl FromStr for BidiMirroring {
+ type Err = Error;
+
+ fn from_str(line: &str) -> Result<BidiMirroring, Error> {
+ lazy_static! {
+ static ref PARTS: Regex = Regex::new(
+ r"(?x)
+ ^
+ \s*(?P<codepoint>[A-F0-9]+)\s*;
+ \s*(?P<substitute_codepoint>[A-F0-9]+)
+ \s+
+ \#(?:.+)
+ $
+ "
+ )
+ .unwrap();
+ };
+ let caps = match PARTS.captures(line.trim()) {
+ Some(caps) => caps,
+ None => return err!("invalid BidiMirroring line"),
+ };
+
+ Ok(BidiMirroring {
+ codepoint: caps["codepoint"].parse()?,
+ bidi_mirroring_glyph: caps["substitute_codepoint"].parse()?,
+ })
+ }
+}
+
+impl fmt::Display for BidiMirroring {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ write!(f, "{};", self.codepoint)?;
+ write!(f, "{};", self.bidi_mirroring_glyph)?;
+ Ok(())
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::common::Codepoint;
+
+ use super::BidiMirroring;
+
+ fn codepoint(n: u32) -> Codepoint {
+ Codepoint::from_u32(n).unwrap()
+ }
+
+ #[test]
+ fn parse() {
+ let line = "0028; 0029 # LEFT PARENTHESIS\n";
+ let data: BidiMirroring = line.parse().unwrap();
+ assert_eq!(
+ data,
+ BidiMirroring {
+ codepoint: codepoint(0x0028),
+ bidi_mirroring_glyph: codepoint(0x0029),
+ }
+ );
+ }
+
+ #[test]
+ fn parse_best_fit() {
+ let line = "228A; 228B # [BEST FIT] SUBSET OF WITH NOT EQUAL TO\n";
+ let data: BidiMirroring = line.parse().unwrap();
+ assert_eq!(
+ data,
+ BidiMirroring {
+ codepoint: codepoint(0x228A),
+ bidi_mirroring_glyph: codepoint(0x228B),
+ }
+ );
+ }
+}
diff --git a/vendor/ucd-parse/src/case_folding.rs b/vendor/ucd-parse/src/case_folding.rs
new file mode 100644
index 000000000..813fc81a1
--- /dev/null
+++ b/vendor/ucd-parse/src/case_folding.rs
@@ -0,0 +1,161 @@
+use std::path::Path;
+use std::str::FromStr;
+
+use lazy_static::lazy_static;
+use regex::Regex;
+
+use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint};
+use crate::error::Error;
+
+/// A single row in the `CaseFolding.txt` file.
+///
+/// The contents of `CaseFolding.txt` are a convenience derived from both
+/// `UnicodeData.txt` and `SpecialCasing.txt`.
+///
+/// Note that a single codepoint may be mapped multiple times. In particular,
+/// a single codepoint might have distinct `CaseStatus::Simple` and
+/// `CaseStatus::Full` mappings.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct CaseFold {
+ /// The codepoint that is being mapped.
+ pub codepoint: Codepoint,
+ /// The case status of this mapping.
+ pub status: CaseStatus,
+ /// The actual case mapping, which is more than one codepoint if this is
+ /// a "full" mapping.
+ pub mapping: Vec<Codepoint>,
+}
+
+impl UcdFile for CaseFold {
+ fn relative_file_path() -> &'static Path {
+ Path::new("CaseFolding.txt")
+ }
+}
+
+impl UcdFileByCodepoint for CaseFold {
+ fn codepoints(&self) -> CodepointIter {
+ self.codepoint.into_iter()
+ }
+}
+
+impl FromStr for CaseFold {
+ type Err = Error;
+
+ fn from_str(line: &str) -> Result<CaseFold, Error> {
+ lazy_static! {
+ static ref PARTS: Regex = Regex::new(
+ r"(?x)
+ ^
+ \s*(?P<codepoint>[^\s;]+)\s*;
+ \s*(?P<status>[^\s;]+)\s*;
+ \s*(?P<mapping>[^;]+)\s*;
+ "
+ )
+ .unwrap();
+ };
+
+ let caps = match PARTS.captures(line.trim()) {
+ Some(caps) => caps,
+ None => return err!("invalid CaseFolding line: '{}'", line),
+ };
+ let mut mapping = vec![];
+ for cp in caps["mapping"].split_whitespace() {
+ mapping.push(cp.parse()?);
+ }
+ Ok(CaseFold {
+ codepoint: caps["codepoint"].parse()?,
+ status: caps["status"].parse()?,
+ mapping,
+ })
+ }
+}
+
+/// The status of a particular case mapping.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum CaseStatus {
+ /// Case mappings shared by both "simple" and "full" mappings.
+ Common,
+ /// A case mapping that changes the number of codepoints.
+ Full,
+ /// A case mapping that doesn't change the number of codepoints, when it
+ /// differs from `Full`.
+ Simple,
+ /// Special cases (currently only for Turkic mappings) that are typically
+ /// excluded by default. Special cases don't change the number of
+    /// codepoints, but may change the encoding (e.g., UTF-8) length in bytes.
+ Special,
+}
+
+impl Default for CaseStatus {
+ fn default() -> CaseStatus {
+ CaseStatus::Common
+ }
+}
+
+impl CaseStatus {
+ /// Returns true if and only if this status indicates a case mapping that
+ /// won't change the number of codepoints.
+ pub fn is_fixed(&self) -> bool {
+ *self != CaseStatus::Full
+ }
+}
+
+impl FromStr for CaseStatus {
+ type Err = Error;
+
+ fn from_str(s: &str) -> Result<CaseStatus, Error> {
+ match s {
+ "C" => Ok(CaseStatus::Common),
+ "F" => Ok(CaseStatus::Full),
+ "S" => Ok(CaseStatus::Simple),
+ "T" => Ok(CaseStatus::Special),
+ _ => err!(
+ "unrecognized case status: '{}' \
+ (must be one of C, F, S or T)",
+ s
+ ),
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::{CaseFold, CaseStatus};
+
+ #[test]
+ fn parse_common() {
+ let line =
+ "0150; C; 0151; # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE\n";
+ let row: CaseFold = line.parse().unwrap();
+ assert_eq!(row.codepoint, 0x0150);
+ assert_eq!(row.status, CaseStatus::Common);
+ assert_eq!(row.mapping, vec![0x0151]);
+ }
+
+ #[test]
+ fn parse_full() {
+ let line = "03B0; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS\n";
+ let row: CaseFold = line.parse().unwrap();
+ assert_eq!(row.codepoint, 0x03B0);
+ assert_eq!(row.status, CaseStatus::Full);
+ assert_eq!(row.mapping, vec![0x03C5, 0x0308, 0x0301]);
+ }
+
+ #[test]
+ fn parse_simple() {
+ let line = "1F8F; S; 1F87; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI\n";
+ let row: CaseFold = line.parse().unwrap();
+ assert_eq!(row.codepoint, 0x1F8F);
+ assert_eq!(row.status, CaseStatus::Simple);
+ assert_eq!(row.mapping, vec![0x1F87]);
+ }
+
+ #[test]
+ fn parse_special() {
+ let line = "0049; T; 0131; # LATIN CAPITAL LETTER I\n";
+ let row: CaseFold = line.parse().unwrap();
+ assert_eq!(row.codepoint, 0x0049);
+ assert_eq!(row.status, CaseStatus::Special);
+ assert_eq!(row.mapping, vec![0x0131]);
+ }
+}
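
A hedged sketch of how these rows might be consumed: since a single codepoint can carry several mappings (e.g. both a `Common`/`Simple` and a `Full` one), `parse_many_by_codepoint` collects all of them, and a one-to-one fold table can be built by keeping only the single-codepoint statuses. The UCD path is a placeholder, and `CaseFold`/`CaseStatus` are assumed to be re-exported at the crate root like the other record types listed in `src/lib.rs`.

use std::collections::BTreeMap;

use ucd_parse::{parse_many_by_codepoint, CaseFold, CaseStatus, Codepoint, Error};

fn simple_fold_map(ucd_dir: &str) -> Result<BTreeMap<Codepoint, Codepoint>, Error> {
    let rows: BTreeMap<Codepoint, Vec<CaseFold>> = parse_many_by_codepoint(ucd_dir)?;
    let mut map = BTreeMap::new();
    for (cp, folds) in rows {
        for fold in folds {
            // Common and Simple mappings always map to exactly one codepoint.
            if matches!(fold.status, CaseStatus::Common | CaseStatus::Simple) {
                map.insert(cp, fold.mapping[0]);
                break;
            }
        }
    }
    Ok(map)
}
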
diff --git a/vendor/ucd-parse/src/common.rs b/vendor/ucd-parse/src/common.rs
new file mode 100644
index 000000000..c18be668e
--- /dev/null
+++ b/vendor/ucd-parse/src/common.rs
@@ -0,0 +1,594 @@
+use std::char;
+use std::collections::BTreeMap;
+use std::fmt;
+use std::fs::File;
+use std::io::{self, BufRead};
+use std::marker::PhantomData;
+use std::path::{Path, PathBuf};
+use std::str::FromStr;
+
+use lazy_static::lazy_static;
+use regex::Regex;
+
+use crate::error::{Error, ErrorKind};
+
+/// Parse a particular file in the UCD into a sequence of rows.
+///
+/// The given directory should be the directory to the UCD.
+pub fn parse<P, D>(ucd_dir: P) -> Result<Vec<D>, Error>
+where
+ P: AsRef<Path>,
+ D: UcdFile,
+{
+ let mut xs = vec![];
+ for result in D::from_dir(ucd_dir)? {
+ let x = result?;
+ xs.push(x);
+ }
+ Ok(xs)
+}
+
+/// Parse a particular file in the UCD into a map from codepoint to the record.
+///
+/// The given directory should be the directory to the UCD.
+pub fn parse_by_codepoint<P, D>(
+ ucd_dir: P,
+) -> Result<BTreeMap<Codepoint, D>, Error>
+where
+ P: AsRef<Path>,
+ D: UcdFileByCodepoint,
+{
+ let mut map = BTreeMap::new();
+ for result in D::from_dir(ucd_dir)? {
+ let x = result?;
+ for cp in x.codepoints() {
+ map.insert(cp, x.clone());
+ }
+ }
+ Ok(map)
+}
+
+/// Parse a particular file in the UCD into a map from codepoint to all
+/// records associated with that codepoint.
+///
+/// This is useful for files that have multiple records for each codepoint.
+/// For example, the `NameAliases.txt` file lists multiple aliases for some
+/// codepoints.
+///
+/// The given directory should be the directory to the UCD.
+pub fn parse_many_by_codepoint<P, D>(
+ ucd_dir: P,
+) -> Result<BTreeMap<Codepoint, Vec<D>>, Error>
+where
+ P: AsRef<Path>,
+ D: UcdFileByCodepoint,
+{
+ let mut map = BTreeMap::new();
+ for result in D::from_dir(ucd_dir)? {
+ let x = result?;
+ for cp in x.codepoints() {
+ map.entry(cp).or_insert(vec![]).push(x.clone());
+ }
+ }
+ Ok(map)
+}
+
+/// Given a path pointing at the root of the `ucd_dir`, attempts to determine
+/// its Unicode version.
+///
+/// This just checks the very first line of `PropList.txt` -- in practice
+/// this works for all versions of the UCD since 4.1.0.
+pub fn ucd_directory_version<D: ?Sized + AsRef<Path>>(
+ ucd_dir: &D,
+) -> Result<(u64, u64, u64), Error> {
+ // Avoid duplication from generic path parameter.
+ fn ucd_directory_version_inner(
+ ucd_dir: &Path,
+ ) -> Result<(u64, u64, u64), Error> {
+ lazy_static::lazy_static! {
+ static ref VERSION_RX: Regex =
+ Regex::new(r"-([0-9]+).([0-9]+).([0-9]+).txt").unwrap();
+ }
+
+ let proplist = ucd_dir.join("PropList.txt");
+ let contents = first_line(&proplist)?;
+ let caps = match VERSION_RX.captures(&contents) {
+ Some(c) => c,
+ None => {
+ return err!("Failed to find version in line {:?}", contents)
+ }
+ };
+
+ let capture_to_num = |n| {
+ caps.get(n).unwrap().as_str().parse::<u64>().map_err(|e| Error {
+ kind: ErrorKind::Parse(format!(
+ "Failed to parse version from {:?} in PropList.txt: {}",
+ contents, e
+ )),
+ line: Some(0),
+ path: Some(proplist.clone()),
+ })
+ };
+ let major = capture_to_num(1)?;
+ let minor = capture_to_num(2)?;
+ let patch = capture_to_num(3)?;
+
+ Ok((major, minor, patch))
+ }
+ ucd_directory_version_inner(ucd_dir.as_ref())
+}
+
+fn first_line(path: &Path) -> Result<String, Error> {
+ let file = std::fs::File::open(path).map_err(|e| Error {
+ kind: ErrorKind::Io(e),
+ line: None,
+ path: Some(path.into()),
+ })?;
+
+ let mut reader = std::io::BufReader::new(file);
+ let mut line_contents = String::new();
+ reader.read_line(&mut line_contents).map_err(|e| Error {
+ kind: ErrorKind::Io(e),
+ line: None,
+ path: Some(path.into()),
+ })?;
+ Ok(line_contents)
+}
+
+/// A helper function for parsing a common record format that associates one
+/// or more codepoints with a string value.
+pub fn parse_codepoint_association<'a>(
+ line: &'a str,
+) -> Result<(Codepoints, &'a str), Error> {
+ lazy_static! {
+ static ref PARTS: Regex = Regex::new(
+ r"(?x)
+ ^
+ \s*(?P<codepoints>[^\s;]+)\s*;
+ \s*(?P<property>[^;\x23]+)\s*
+ "
+ )
+ .unwrap();
+ };
+
+ let caps = match PARTS.captures(line.trim()) {
+ Some(caps) => caps,
+ None => return err!("invalid PropList line: '{}'", line),
+ };
+ let property = match caps.name("property") {
+ Some(property) => property.as_str().trim(),
+ None => {
+ return err!(
+ "could not find property name in PropList line: '{}'",
+ line
+ )
+ }
+ };
+ Ok((caps["codepoints"].parse()?, property))
+}
+
+/// A helper function for parsing a sequence of space separated codepoints.
+/// The sequence is permitted to be empty.
+pub fn parse_codepoint_sequence(s: &str) -> Result<Vec<Codepoint>, Error> {
+ let mut cps = vec![];
+ for cp in s.trim().split_whitespace() {
+ cps.push(cp.parse()?);
+ }
+ Ok(cps)
+}
+
+/// A helper function for parsing a single test for the various break
+/// algorithms.
+///
+/// Upon success, this returns the UTF-8 encoded groups of codepoints along
+/// with the comment associated with the test. The comment is a human readable
+/// description of the test that may prove useful for debugging.
+pub fn parse_break_test(line: &str) -> Result<(Vec<String>, String), Error> {
+ lazy_static! {
+ static ref PARTS: Regex = Regex::new(
+ r"(?x)
+ ^
+ (?:÷|×)
+ (?P<groups>(?:\s[0-9A-Fa-f]{4,5}\s(?:÷|×))+)
+ \s+
+ \#(?P<comment>.+)
+ $
+ "
+ )
+ .unwrap();
+ static ref GROUP: Regex = Regex::new(
+ r"(?x)
+ (?P<codepoint>[0-9A-Fa-f]{4,5})\s(?P<kind>÷|×)
+ "
+ )
+ .unwrap();
+ }
+
+ let caps = match PARTS.captures(line.trim()) {
+ Some(caps) => caps,
+ None => return err!("invalid break test line: '{}'", line),
+ };
+ let comment = caps["comment"].trim().to_string();
+
+ let mut groups = vec![];
+ let mut cur = String::new();
+ for cap in GROUP.captures_iter(&caps["groups"]) {
+ let cp: Codepoint = cap["codepoint"].parse()?;
+ let ch = match cp.scalar() {
+ Some(ch) => ch,
+ None => {
+ return err!(
+ "invalid codepoint '{:X}' in line: '{}'",
+ cp.value(),
+ line
+ )
+ }
+ };
+ cur.push(ch);
+ if &cap["kind"] == "÷" {
+ groups.push(cur);
+ cur = String::new();
+ }
+ }
+ Ok((groups, comment))
+}
+
+/// Describes a single UCD file.
+pub trait UcdFile:
+ Clone + fmt::Debug + Default + Eq + FromStr<Err = Error> + PartialEq
+{
+ /// The file path corresponding to this file, relative to the UCD
+ /// directory.
+ fn relative_file_path() -> &'static Path;
+
+ /// The full file path corresponding to this file given the UCD directory
+ /// path.
+ fn file_path<P: AsRef<Path>>(ucd_dir: P) -> PathBuf {
+ ucd_dir.as_ref().join(Self::relative_file_path())
+ }
+
+ /// Create an iterator over each record in this UCD file.
+ ///
+ /// The parameter should correspond to the directory containing the UCD.
+ fn from_dir<P: AsRef<Path>>(
+ ucd_dir: P,
+ ) -> Result<UcdLineParser<File, Self>, Error> {
+ UcdLineParser::from_path(Self::file_path(ucd_dir))
+ }
+}
+
+/// Describes a single UCD file where every record in the file is associated
+/// with one or more codepoints.
+pub trait UcdFileByCodepoint: UcdFile {
+ /// Returns the codepoints associated with this record.
+ fn codepoints(&self) -> CodepointIter;
+}
+
+/// A line oriented parser for a particular UCD file.
+///
+/// Callers can build a line parser via the
+/// [`UcdFile::from_dir`](trait.UcdFile.html) method.
+///
+/// The `R` type parameter refers to the underlying `io::Read` implementation
+/// from which the UCD data is read.
+///
+/// The `D` type parameter refers to the type of the record parsed out of each
+/// line.
+#[derive(Debug)]
+pub struct UcdLineParser<R, D> {
+ path: Option<PathBuf>,
+ rdr: io::BufReader<R>,
+ line: String,
+ line_number: u64,
+ _data: PhantomData<D>,
+}
+
+impl<D> UcdLineParser<File, D> {
+ /// Create a new parser from the given file path.
+ pub(crate) fn from_path<P: AsRef<Path>>(
+ path: P,
+ ) -> Result<UcdLineParser<File, D>, Error> {
+ let path = path.as_ref();
+ let file = File::open(path).map_err(|e| Error {
+ kind: ErrorKind::Io(e),
+ line: None,
+ path: Some(path.to_path_buf()),
+ })?;
+ Ok(UcdLineParser::new(Some(path.to_path_buf()), file))
+ }
+}
+
+impl<R: io::Read, D> UcdLineParser<R, D> {
+ /// Create a new parser that parses the reader given.
+ ///
+    /// The type of data parsed is determined by the record type requested
+    /// when the parser is iterated.
+ ///
+ /// Note that the reader is buffered internally, so the caller does not
+ /// need to provide their own buffering.
+ pub(crate) fn new(path: Option<PathBuf>, rdr: R) -> UcdLineParser<R, D> {
+ UcdLineParser {
+ path,
+ rdr: io::BufReader::new(rdr),
+ line: String::new(),
+ line_number: 0,
+ _data: PhantomData,
+ }
+ }
+}
+
+impl<R: io::Read, D: FromStr<Err = Error>> Iterator for UcdLineParser<R, D> {
+ type Item = Result<D, Error>;
+
+ fn next(&mut self) -> Option<Result<D, Error>> {
+ loop {
+ self.line_number += 1;
+ self.line.clear();
+ let n = match self.rdr.read_line(&mut self.line) {
+ Err(err) => {
+ return Some(Err(Error {
+ kind: ErrorKind::Io(err),
+ line: None,
+ path: self.path.clone(),
+ }))
+ }
+ Ok(n) => n,
+ };
+ if n == 0 {
+ return None;
+ }
+ if !self.line.starts_with('#') && !self.line.trim().is_empty() {
+ break;
+ }
+ }
+ let line_number = self.line_number;
+ Some(self.line.parse().map_err(|mut err: Error| {
+ err.line = Some(line_number);
+ err
+ }))
+ }
+}
+
+/// A representation of either a single codepoint or a range of codepoints.
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, PartialOrd, Ord)]
+pub enum Codepoints {
+ /// A single codepoint.
+ Single(Codepoint),
+ /// A range of codepoints.
+ Range(CodepointRange),
+}
+
+impl Default for Codepoints {
+ fn default() -> Codepoints {
+ Codepoints::Single(Codepoint::default())
+ }
+}
+
+impl IntoIterator for Codepoints {
+ type IntoIter = CodepointIter;
+ type Item = Codepoint;
+
+ fn into_iter(self) -> CodepointIter {
+ match self {
+ Codepoints::Single(x) => x.into_iter(),
+ Codepoints::Range(x) => x.into_iter(),
+ }
+ }
+}
+
+impl FromStr for Codepoints {
+ type Err = Error;
+
+ fn from_str(s: &str) -> Result<Codepoints, Error> {
+ if s.contains("..") {
+ CodepointRange::from_str(s).map(Codepoints::Range)
+ } else {
+ Codepoint::from_str(s).map(Codepoints::Single)
+ }
+ }
+}
+
+impl fmt::Display for Codepoints {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ match *self {
+ Codepoints::Single(ref x) => x.fmt(f),
+ Codepoints::Range(ref x) => x.fmt(f),
+ }
+ }
+}
+
+impl PartialEq<u32> for Codepoints {
+ fn eq(&self, other: &u32) -> bool {
+ match *self {
+ Codepoints::Single(ref x) => x == other,
+ Codepoints::Range(ref x) => x == &(*other, *other),
+ }
+ }
+}
+
+impl PartialEq<Codepoint> for Codepoints {
+ fn eq(&self, other: &Codepoint) -> bool {
+ match *self {
+ Codepoints::Single(ref x) => x == other,
+ Codepoints::Range(ref x) => x == &(*other, *other),
+ }
+ }
+}
+
+impl PartialEq<(u32, u32)> for Codepoints {
+ fn eq(&self, other: &(u32, u32)) -> bool {
+ match *self {
+ Codepoints::Single(ref x) => &(x.value(), x.value()) == other,
+ Codepoints::Range(ref x) => x == other,
+ }
+ }
+}
+
+impl PartialEq<(Codepoint, Codepoint)> for Codepoints {
+ fn eq(&self, other: &(Codepoint, Codepoint)) -> bool {
+ match *self {
+ Codepoints::Single(ref x) => &(*x, *x) == other,
+ Codepoints::Range(ref x) => x == other,
+ }
+ }
+}
+
+/// A range of Unicode codepoints. The range is inclusive; both ends of the
+/// range are guaranteed to be valid codepoints.
+#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord)]
+pub struct CodepointRange {
+ /// The start of the codepoint range.
+ pub start: Codepoint,
+ /// The end of the codepoint range.
+ pub end: Codepoint,
+}
+
+impl IntoIterator for CodepointRange {
+ type IntoIter = CodepointIter;
+ type Item = Codepoint;
+
+ fn into_iter(self) -> CodepointIter {
+ CodepointIter { next: self.start.value(), range: self }
+ }
+}
+
+impl FromStr for CodepointRange {
+ type Err = Error;
+
+ fn from_str(s: &str) -> Result<CodepointRange, Error> {
+ lazy_static! {
+ static ref PARTS: Regex =
+ Regex::new(r"^(?P<start>[A-Z0-9]+)\.\.(?P<end>[A-Z0-9]+)$")
+ .unwrap();
+ }
+ let caps = match PARTS.captures(s) {
+ Some(caps) => caps,
+ None => return err!("invalid codepoint range: '{}'", s),
+ };
+ let start = caps["start"].parse().or_else(|err| {
+ err!("failed to parse '{}' as a codepoint range: {}", s, err)
+ })?;
+ let end = caps["end"].parse().or_else(|err| {
+ err!("failed to parse '{}' as a codepoint range: {}", s, err)
+ })?;
+ Ok(CodepointRange { start, end })
+ }
+}
+
+impl fmt::Display for CodepointRange {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ write!(f, "{}..{}", self.start, self.end)
+ }
+}
+
+impl PartialEq<(u32, u32)> for CodepointRange {
+ fn eq(&self, other: &(u32, u32)) -> bool {
+ &(self.start.value(), self.end.value()) == other
+ }
+}
+
+impl PartialEq<(Codepoint, Codepoint)> for CodepointRange {
+ fn eq(&self, other: &(Codepoint, Codepoint)) -> bool {
+ &(self.start, self.end) == other
+ }
+}
+
+/// A single Unicode codepoint.
+///
+/// This type's string representation is a hexadecimal number. It is guaranteed
+/// to be in the range `[0, 10FFFF]`.
+///
+/// Note that unlike Rust's `char` type, this may be a surrogate codepoint.
+#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord)]
+pub struct Codepoint(u32);
+
+impl Codepoint {
+ /// Create a new codepoint from a `u32`.
+ ///
+ /// If the given number is not a valid codepoint, then this returns an
+ /// error.
+ pub fn from_u32(n: u32) -> Result<Codepoint, Error> {
+ if n > 0x10FFFF {
+ err!("{:x} is not a valid Unicode codepoint", n)
+ } else {
+ Ok(Codepoint(n))
+ }
+ }
+
+ /// Return the underlying `u32` codepoint value.
+ pub fn value(self) -> u32 {
+ self.0
+ }
+
+ /// Attempt to convert this codepoint to a Unicode scalar value.
+ ///
+ /// If this is a surrogate codepoint, then this returns `None`.
+ pub fn scalar(self) -> Option<char> {
+ char::from_u32(self.0)
+ }
+}
+
+impl IntoIterator for Codepoint {
+ type IntoIter = CodepointIter;
+ type Item = Codepoint;
+
+ fn into_iter(self) -> CodepointIter {
+ let range = CodepointRange { start: self, end: self };
+ CodepointIter { next: self.value(), range }
+ }
+}
+
+impl FromStr for Codepoint {
+ type Err = Error;
+
+ fn from_str(s: &str) -> Result<Codepoint, Error> {
+ match u32::from_str_radix(s, 16) {
+ Ok(n) => Codepoint::from_u32(n),
+ Err(err) => {
+ return err!(
+ "failed to parse '{}' as a hexadecimal codepoint: {}",
+ s,
+ err
+ );
+ }
+ }
+ }
+}
+
+impl fmt::Display for Codepoint {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ write!(f, "{:04X}", self.0)
+ }
+}
+
+impl PartialEq<u32> for Codepoint {
+ fn eq(&self, other: &u32) -> bool {
+ self.0 == *other
+ }
+}
+
+impl PartialEq<Codepoint> for u32 {
+ fn eq(&self, other: &Codepoint) -> bool {
+ *self == other.0
+ }
+}
+
+/// An iterator over a range of Unicode codepoints.
+#[derive(Debug)]
+pub struct CodepointIter {
+ next: u32,
+ range: CodepointRange,
+}
+
+impl Iterator for CodepointIter {
+ type Item = Codepoint;
+
+ fn next(&mut self) -> Option<Codepoint> {
+ if self.next > self.range.end.value() {
+ return None;
+ }
+ let current = self.next;
+ self.next += 1;
+ Some(Codepoint::from_u32(current).unwrap())
+ }
+}
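
A small sketch of the codepoint types defined above (independent of any UCD directory): both single codepoints and ranges parse from their UCD textual forms, and either form iterates over individual `Codepoint`s.

use ucd_parse::{Codepoint, Codepoints, Error};

fn main() -> Result<(), Error> {
    // Codepoints are written in hexadecimal, exactly as in the data files.
    let cp: Codepoint = "10FFFF".parse()?;
    assert_eq!(cp.value(), 0x10FFFF);

    // A range such as `0041..005A` parses to `Codepoints::Range` and can be
    // iterated; `scalar()` filters out surrogate codepoints.
    let cps: Codepoints = "0041..005A".parse()?;
    let letters: String = cps.into_iter().filter_map(|c| c.scalar()).collect();
    assert_eq!(letters, "ABCDEFGHIJKLMNOPQRSTUVWXYZ");
    Ok(())
}
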
diff --git a/vendor/ucd-parse/src/core_properties.rs b/vendor/ucd-parse/src/core_properties.rs
new file mode 100644
index 000000000..9a7682b43
--- /dev/null
+++ b/vendor/ucd-parse/src/core_properties.rs
@@ -0,0 +1,60 @@
+use std::path::Path;
+use std::str::FromStr;
+
+use crate::common::{
+ parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
+ UcdFileByCodepoint,
+};
+use crate::error::Error;
+
+/// A single row in the `DerivedCoreProperties.txt` file.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct CoreProperty {
+ /// The codepoint or codepoint range for this entry.
+ pub codepoints: Codepoints,
+ /// The property name assigned to the codepoints in this entry.
+ pub property: String,
+}
+
+impl UcdFile for CoreProperty {
+ fn relative_file_path() -> &'static Path {
+ Path::new("DerivedCoreProperties.txt")
+ }
+}
+
+impl UcdFileByCodepoint for CoreProperty {
+ fn codepoints(&self) -> CodepointIter {
+ self.codepoints.into_iter()
+ }
+}
+
+impl FromStr for CoreProperty {
+ type Err = Error;
+
+ fn from_str(line: &str) -> Result<CoreProperty, Error> {
+ let (codepoints, property) = parse_codepoint_association(line)?;
+ Ok(CoreProperty { codepoints, property: property.to_string() })
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::CoreProperty;
+
+ #[test]
+ fn parse_single() {
+ let line =
+ "1163D ; Case_Ignorable # Mn MODI SIGN ANUSVARA\n";
+ let row: CoreProperty = line.parse().unwrap();
+ assert_eq!(row.codepoints, 0x1163D);
+ assert_eq!(row.property, "Case_Ignorable");
+ }
+
+ #[test]
+ fn parse_range() {
+ let line = "11133..11134 ; Grapheme_Link # Mn [2] CHAKMA VIRAMA..CHAKMA MAAYYAA\n";
+ let row: CoreProperty = line.parse().unwrap();
+ assert_eq!(row.codepoints, (0x11133, 0x11134));
+ assert_eq!(row.property, "Grapheme_Link");
+ }
+}
diff --git a/vendor/ucd-parse/src/emoji_properties.rs b/vendor/ucd-parse/src/emoji_properties.rs
new file mode 100644
index 000000000..dc5c0c884
--- /dev/null
+++ b/vendor/ucd-parse/src/emoji_properties.rs
@@ -0,0 +1,86 @@
+use std::path::{Path, PathBuf};
+use std::str::FromStr;
+
+use crate::common::{
+ parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
+ UcdFileByCodepoint,
+};
+use crate::error::Error;
+
+/// A single row in the `emoji-data.txt` file.
+///
+/// The `emoji-data.txt` file is the source of truth on several Emoji-related
+/// Unicode properties.
+///
+/// Note that `emoji-data.txt` is not formally part of the Unicode Character
+/// Database. You can download the Emoji data files separately here:
+/// https://unicode.org/Public/emoji/
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct EmojiProperty {
+ /// The codepoint or codepoint range for this entry.
+ pub codepoints: Codepoints,
+ /// The property name assigned to the codepoints in this entry.
+ pub property: String,
+}
+
+impl UcdFile for EmojiProperty {
+ fn relative_file_path() -> &'static Path {
+ Path::new("emoji/emoji-data.txt")
+ }
+
+ fn file_path<P: AsRef<Path>>(ucd_dir: P) -> PathBuf {
+ let ucd_dir = ucd_dir.as_ref();
+ // The standard location, but only on UCDs from 13.0.0 and up.
+ let std = ucd_dir.join(Self::relative_file_path());
+ if std.exists() {
+ std
+ } else {
+ // If the old location does exist, use it.
+ let legacy = ucd_dir.join("emoji-data.txt");
+ if legacy.exists() {
+ legacy
+ } else {
+ // This might end up in an error message, so use the standard
+ // one if forced to choose. Arguably we could do something like
+ // peek
+ std
+ }
+ }
+ }
+}
+
+impl UcdFileByCodepoint for EmojiProperty {
+ fn codepoints(&self) -> CodepointIter {
+ self.codepoints.into_iter()
+ }
+}
+
+impl FromStr for EmojiProperty {
+ type Err = Error;
+
+ fn from_str(line: &str) -> Result<EmojiProperty, Error> {
+ let (codepoints, property) = parse_codepoint_association(line)?;
+ Ok(EmojiProperty { codepoints, property: property.to_string() })
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::EmojiProperty;
+
+ #[test]
+ fn parse_single() {
+ let line = "24C2 ; Emoji # 1.1 [1] (Ⓜ️) circled M\n";
+ let row: EmojiProperty = line.parse().unwrap();
+ assert_eq!(row.codepoints, 0x24C2);
+ assert_eq!(row.property, "Emoji");
+ }
+
+ #[test]
+ fn parse_range() {
+ let line = "1FA6E..1FFFD ; Extended_Pictographic# NA[1424] (🩮️..🿽️) <reserved-1FA6E>..<reserved-1FFFD>\n";
+ let row: EmojiProperty = line.parse().unwrap();
+ assert_eq!(row.codepoints, (0x1FA6E, 0x1FFFD));
+ assert_eq!(row.property, "Extended_Pictographic");
+ }
+}
diff --git a/vendor/ucd-parse/src/error.rs b/vendor/ucd-parse/src/error.rs
new file mode 100644
index 000000000..9dafc4b33
--- /dev/null
+++ b/vendor/ucd-parse/src/error.rs
@@ -0,0 +1,86 @@
+use std::error;
+use std::fmt;
+use std::io;
+use std::path::{Path, PathBuf};
+
+/// Represents any kind of error that can occur while parsing the UCD.
+#[derive(Debug)]
+pub struct Error {
+ pub(crate) kind: ErrorKind,
+ pub(crate) line: Option<u64>,
+ pub(crate) path: Option<PathBuf>,
+}
+
+/// The kind of error that occurred while parsing the UCD.
+#[derive(Debug)]
+pub enum ErrorKind {
+ /// An I/O error.
+ Io(io::Error),
+ /// A generic parse error.
+ Parse(String),
+}
+
+impl Error {
+ /// Create a new parse error from the given message.
+ pub(crate) fn parse(msg: String) -> Error {
+ Error { kind: ErrorKind::Parse(msg), line: None, path: None }
+ }
+
+ /// Return the specific kind of this error.
+ pub fn kind(&self) -> &ErrorKind {
+ &self.kind
+ }
+
+ /// Return the line number at which this error occurred, if available.
+ pub fn line(&self) -> Option<u64> {
+ self.line
+ }
+
+ /// Return the file path associated with this error, if one exists.
+ pub fn path(&self) -> Option<&Path> {
+ self.path.as_ref().map(|p| &**p)
+ }
+
+ /// Unwrap this error into its underlying kind.
+ pub fn into_kind(self) -> ErrorKind {
+ self.kind
+ }
+
+ /// Returns true if and only if this is an I/O error.
+ ///
+ /// If this returns true, the underlying `ErrorKind` is guaranteed to be
+ /// `ErrorKind::Io`.
+ pub fn is_io_error(&self) -> bool {
+ match self.kind {
+ ErrorKind::Io(_) => true,
+ _ => false,
+ }
+ }
+}
+
+impl error::Error for Error {
+ fn cause(&self) -> Option<&dyn error::Error> {
+ match self.kind {
+ ErrorKind::Io(ref err) => Some(err),
+ _ => None,
+ }
+ }
+}
+
+impl fmt::Display for Error {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ if let Some(ref path) = self.path {
+ if let Some(line) = self.line {
+ write!(f, "{}:{}: ", path.display(), line)?;
+ } else {
+ write!(f, "{}: ", path.display())?;
+ }
+ } else if let Some(line) = self.line {
+ write!(f, "error on line {}: ", line)?;
+ }
+ match self.kind {
+ ErrorKind::Io(ref err) => write!(f, "{}", err),
+ ErrorKind::Parse(ref msg) => write!(f, "{}", msg),
+ }
+ }
+}
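
A sketch of how this error type surfaces context in practice: parse errors carry the offending line number (attached by `UcdLineParser`), I/O errors carry the file path, and `Display` prefixes the message accordingly. The record type and UCD path below are arbitrary placeholders.

use ucd_parse::Age;

fn load_ages(ucd_dir: &str) -> Vec<Age> {
    match ucd_parse::parse(ucd_dir) {
        Ok(rows) => rows,
        Err(err) => {
            // e.g. "error on line 42: invalid PropList line: '...'"
            eprintln!("failed to load DerivedAge.txt: {}", err);
            if err.is_io_error() {
                eprintln!("(is the UCD directory path correct?)");
            }
            Vec::new()
        }
    }
}
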
diff --git a/vendor/ucd-parse/src/grapheme_cluster_break.rs b/vendor/ucd-parse/src/grapheme_cluster_break.rs
new file mode 100644
index 000000000..9dbf32f41
--- /dev/null
+++ b/vendor/ucd-parse/src/grapheme_cluster_break.rs
@@ -0,0 +1,98 @@
+use std::path::Path;
+use std::str::FromStr;
+
+use crate::common::{
+ parse_break_test, parse_codepoint_association, CodepointIter, Codepoints,
+ UcdFile, UcdFileByCodepoint,
+};
+use crate::error::Error;
+
+/// A single row in the `auxiliary/GraphemeBreakProperty.txt` file.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct GraphemeClusterBreak {
+ /// The codepoint or codepoint range for this entry.
+ pub codepoints: Codepoints,
+ /// The property value assigned to the codepoints in this entry.
+ pub value: String,
+}
+
+impl UcdFile for GraphemeClusterBreak {
+ fn relative_file_path() -> &'static Path {
+ Path::new("auxiliary/GraphemeBreakProperty.txt")
+ }
+}
+
+impl UcdFileByCodepoint for GraphemeClusterBreak {
+ fn codepoints(&self) -> CodepointIter {
+ self.codepoints.into_iter()
+ }
+}
+
+impl FromStr for GraphemeClusterBreak {
+ type Err = Error;
+
+ fn from_str(line: &str) -> Result<GraphemeClusterBreak, Error> {
+ let (codepoints, value) = parse_codepoint_association(line)?;
+ Ok(GraphemeClusterBreak { codepoints, value: value.to_string() })
+ }
+}
+
+/// A single row in the `auxiliary/GraphemeBreakTest.txt` file.
+///
+/// This file defines tests for the grapheme cluster break algorithm.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct GraphemeClusterBreakTest {
+ /// Each string is a UTF-8 encoded group of codepoints that make up a
+ /// single grapheme cluster.
+ pub grapheme_clusters: Vec<String>,
+ /// A human readable description of this test.
+ pub comment: String,
+}
+
+impl UcdFile for GraphemeClusterBreakTest {
+ fn relative_file_path() -> &'static Path {
+ Path::new("auxiliary/GraphemeBreakTest.txt")
+ }
+}
+
+impl FromStr for GraphemeClusterBreakTest {
+ type Err = Error;
+
+ fn from_str(line: &str) -> Result<GraphemeClusterBreakTest, Error> {
+ let (groups, comment) = parse_break_test(line)?;
+ Ok(GraphemeClusterBreakTest { grapheme_clusters: groups, comment })
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::{GraphemeClusterBreak, GraphemeClusterBreakTest};
+
+ #[test]
+ fn parse_single() {
+ let line = "093B ; SpacingMark # Mc DEVANAGARI VOWEL SIGN OOE\n";
+ let row: GraphemeClusterBreak = line.parse().unwrap();
+ assert_eq!(row.codepoints, 0x093B);
+ assert_eq!(row.value, "SpacingMark");
+ }
+
+ #[test]
+ fn parse_range() {
+ let line = "1F1E6..1F1FF ; Regional_Indicator # So [26] REGIONAL INDICATOR SYMBOL LETTER A..REGIONAL INDICATOR SYMBOL LETTER Z\n";
+ let row: GraphemeClusterBreak = line.parse().unwrap();
+ assert_eq!(row.codepoints, (0x1F1E6, 0x1F1FF));
+ assert_eq!(row.value, "Regional_Indicator");
+ }
+
+ #[test]
+ fn parse_test() {
+ let line = "÷ 0061 × 1F3FF ÷ 1F476 × 200D × 1F6D1 ÷ # ÷ [0.2] LATIN SMALL LETTER A (Other) × [9.0] EMOJI MODIFIER FITZPATRICK TYPE-6 (Extend) ÷ [999.0] BABY (ExtPict) × [9.0] ZERO WIDTH JOINER (ZWJ_ExtCccZwj) × [11.0] OCTAGONAL SIGN (ExtPict) ÷ [0.3]\n";
+
+ let row: GraphemeClusterBreakTest = line.parse().unwrap();
+ assert_eq!(
+ row.grapheme_clusters,
+ vec!["\u{0061}\u{1F3FF}", "\u{1F476}\u{200D}\u{1F6D1}",]
+ );
+ assert!(row.comment.starts_with("÷ [0.2] LATIN SMALL LETTER A"));
+ }
+}
diff --git a/vendor/ucd-parse/src/jamo_short_name.rs b/vendor/ucd-parse/src/jamo_short_name.rs
new file mode 100644
index 000000000..4103dd7ee
--- /dev/null
+++ b/vendor/ucd-parse/src/jamo_short_name.rs
@@ -0,0 +1,80 @@
+use std::path::Path;
+use std::str::FromStr;
+
+use lazy_static::lazy_static;
+use regex::Regex;
+
+use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint};
+use crate::error::Error;
+
+/// A single row in the `Jamo.txt` file.
+///
+/// The `Jamo.txt` file defines the `Jamo_Short_Name` property.
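+///
+/// As a rough, uncompiled sketch, the whole file can be loaded into a
+/// codepoint-keyed map with the re-exported `parse_by_codepoint` helper
+/// (here `/path/to/ucd` stands in for a local UCD directory):
+///
+/// ```ignore
+/// use std::collections::BTreeMap;
+///
+/// use ucd_parse::{Codepoint, JamoShortName};
+///
+/// let jamo: BTreeMap<Codepoint, JamoShortName> =
+///     ucd_parse::parse_by_codepoint("/path/to/ucd").unwrap();
+/// assert!(jamo.values().all(|row| row.name.len() <= 3));
+/// ```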
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct JamoShortName {
+ /// The codepoint corresponding to this row.
+ pub codepoint: Codepoint,
+ /// The actual "Jamo Short Name." This string contains at most 3 bytes and
+ /// may be empty.
+ pub name: String,
+}
+
+impl UcdFile for JamoShortName {
+ fn relative_file_path() -> &'static Path {
+ Path::new("Jamo.txt")
+ }
+}
+
+impl UcdFileByCodepoint for JamoShortName {
+ fn codepoints(&self) -> CodepointIter {
+ self.codepoint.into_iter()
+ }
+}
+
+impl FromStr for JamoShortName {
+ type Err = Error;
+
+ fn from_str(line: &str) -> Result<JamoShortName, Error> {
+ lazy_static! {
+ static ref PARTS: Regex = Regex::new(
+ r"(?x)
+ ^
+ (?P<codepoint>[A-Z0-9]+);
+ \s*
+ (?P<name>[A-Z]*)
+ "
+ )
+ .unwrap();
+ };
+
+ let caps = match PARTS.captures(line.trim()) {
+ Some(caps) => caps,
+            None => return err!("invalid Jamo_Short_Name line"),
+ };
+ Ok(JamoShortName {
+ codepoint: caps["codepoint"].parse()?,
+ name: caps.name("name").unwrap().as_str().to_string(),
+ })
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::JamoShortName;
+
+ #[test]
+ fn parse1() {
+ let line = "1164; YAE # HANGUL JUNGSEONG YAE\n";
+ let row: JamoShortName = line.parse().unwrap();
+ assert_eq!(row.codepoint, 0x1164);
+ assert_eq!(row.name, "YAE");
+ }
+
+ #[test]
+ fn parse2() {
+ let line = "110B; # HANGUL CHOSEONG IEUNG\n";
+ let row: JamoShortName = line.parse().unwrap();
+ assert_eq!(row.codepoint, 0x110B);
+ assert_eq!(row.name, "");
+ }
+}
diff --git a/vendor/ucd-parse/src/lib.rs b/vendor/ucd-parse/src/lib.rs
new file mode 100644
index 000000000..f6654658a
--- /dev/null
+++ b/vendor/ucd-parse/src/lib.rs
@@ -0,0 +1,66 @@
+/*!
+A library for parsing the Unicode character database.
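+
+As a rough, uncompiled usage sketch (here `/path/to/ucd` stands in for a local
+copy of the Unicode Character Database):
+
+```ignore
+use ucd_parse::UnicodeData;
+
+let rows: Vec<UnicodeData> = ucd_parse::parse("/path/to/ucd").unwrap();
+println!("parsed {} rows from UnicodeData.txt", rows.len());
+```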
+*/
+
+#![deny(missing_docs)]
+
+pub use crate::common::{
+ parse, parse_by_codepoint, parse_many_by_codepoint, ucd_directory_version,
+ Codepoint, CodepointIter, CodepointRange, Codepoints, UcdFile,
+ UcdFileByCodepoint, UcdLineParser,
+};
+pub use crate::error::{Error, ErrorKind};
+
+pub use crate::age::Age;
+pub use crate::arabic_shaping::ArabicShaping;
+pub use crate::bidi_mirroring_glyph::BidiMirroring;
+pub use crate::case_folding::{CaseFold, CaseStatus};
+pub use crate::core_properties::CoreProperty;
+pub use crate::emoji_properties::EmojiProperty;
+pub use crate::grapheme_cluster_break::{
+ GraphemeClusterBreak, GraphemeClusterBreakTest,
+};
+pub use crate::jamo_short_name::JamoShortName;
+pub use crate::line_break::LineBreakTest;
+pub use crate::name_aliases::{NameAlias, NameAliasLabel};
+pub use crate::prop_list::Property;
+pub use crate::property_aliases::PropertyAlias;
+pub use crate::property_value_aliases::PropertyValueAlias;
+pub use crate::script_extensions::ScriptExtension;
+pub use crate::scripts::Script;
+pub use crate::sentence_break::{SentenceBreak, SentenceBreakTest};
+pub use crate::special_casing::SpecialCaseMapping;
+pub use crate::unicode_data::{
+ UnicodeData, UnicodeDataDecomposition, UnicodeDataDecompositionTag,
+ UnicodeDataExpander, UnicodeDataNumeric,
+};
+pub use crate::word_break::{WordBreak, WordBreakTest};
+
+macro_rules! err {
+ ($($tt:tt)*) => {
+ Err(crate::error::Error::parse(format!($($tt)*)))
+ }
+}
+
+mod common;
+mod error;
+
+mod age;
+mod arabic_shaping;
+mod bidi_mirroring_glyph;
+mod case_folding;
+mod core_properties;
+mod emoji_properties;
+mod grapheme_cluster_break;
+mod jamo_short_name;
+mod line_break;
+mod name_aliases;
+mod prop_list;
+mod property_aliases;
+mod property_value_aliases;
+mod script_extensions;
+mod scripts;
+mod sentence_break;
+mod special_casing;
+mod unicode_data;
+mod word_break;
diff --git a/vendor/ucd-parse/src/line_break.rs b/vendor/ucd-parse/src/line_break.rs
new file mode 100644
index 000000000..aa62fcb9e
--- /dev/null
+++ b/vendor/ucd-parse/src/line_break.rs
@@ -0,0 +1,49 @@
+use std::path::Path;
+use std::str::FromStr;
+
+use crate::common::{parse_break_test, UcdFile};
+use crate::error::Error;
+
+/// A single row in the `auxiliary/LineBreakTest.txt` file.
+///
+/// This file defines tests for the line break algorithm.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct LineBreakTest {
+ /// Each string is a UTF-8 encoded group of codepoints that make up a
+ /// single line.
+ pub lines: Vec<String>,
+ /// A human readable description of this test.
+ pub comment: String,
+}
+
+impl UcdFile for LineBreakTest {
+ fn relative_file_path() -> &'static Path {
+ Path::new("auxiliary/LineBreakTest.txt")
+ }
+}
+
+impl FromStr for LineBreakTest {
+ type Err = Error;
+
+ fn from_str(line: &str) -> Result<LineBreakTest, Error> {
+ let (groups, comment) = parse_break_test(line)?;
+ Ok(LineBreakTest { lines: groups, comment })
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::LineBreakTest;
+
+ #[test]
+ fn parse_test() {
+ let line = "× 1F1F7 × 1F1FA ÷ 1F1F8 × 1F1EA ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER R (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER U (RI) ÷ [30.13] REGIONAL INDICATOR SYMBOL LETTER S (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER E (RI) ÷ [0.3]";
+
+ let row: LineBreakTest = line.parse().unwrap();
+ assert_eq!(
+ row.lines,
+ vec!["\u{1F1F7}\u{1F1FA}", "\u{1F1F8}\u{1F1EA}",]
+ );
+ assert!(row.comment.ends_with("(RI) ÷ [0.3]"));
+ }
+}
diff --git a/vendor/ucd-parse/src/name_aliases.rs b/vendor/ucd-parse/src/name_aliases.rs
new file mode 100644
index 000000000..36c9c4b01
--- /dev/null
+++ b/vendor/ucd-parse/src/name_aliases.rs
@@ -0,0 +1,145 @@
+use std::path::Path;
+use std::str::FromStr;
+
+use lazy_static::lazy_static;
+use regex::Regex;
+
+use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint};
+use crate::error::Error;
+
+/// A single row in the `NameAliases.txt` file.
+///
+/// Note that there are multiple rows for some codepoint. Each row provides a
+/// new alias.
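+///
+/// As a rough, uncompiled sketch, every alias can be collected per codepoint
+/// with the re-exported `parse_many_by_codepoint` helper (here `/path/to/ucd`
+/// stands in for a local UCD directory):
+///
+/// ```ignore
+/// use std::collections::BTreeMap;
+///
+/// use ucd_parse::{Codepoint, NameAlias};
+///
+/// let aliases: BTreeMap<Codepoint, Vec<NameAlias>> =
+///     ucd_parse::parse_many_by_codepoint("/path/to/ucd").unwrap();
+/// for (cp, rows) in &aliases {
+///     println!("{}: {} alias(es)", cp, rows.len());
+/// }
+/// ```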
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct NameAlias {
+ /// The codepoint corresponding to this row.
+ pub codepoint: Codepoint,
+ /// The alias.
+ pub alias: String,
+ /// The label of this alias.
+ pub label: NameAliasLabel,
+}
+
+impl UcdFile for NameAlias {
+ fn relative_file_path() -> &'static Path {
+ Path::new("NameAliases.txt")
+ }
+}
+
+impl UcdFileByCodepoint for NameAlias {
+ fn codepoints(&self) -> CodepointIter {
+ self.codepoint.into_iter()
+ }
+}
+
+impl FromStr for NameAlias {
+ type Err = Error;
+
+ fn from_str(line: &str) -> Result<NameAlias, Error> {
+ lazy_static! {
+ static ref PARTS: Regex = Regex::new(
+ r"(?x)
+ ^
+ (?P<codepoint>[A-Z0-9]+);
+ \s*
+ (?P<alias>[^;]+);
+ \s*
+ (?P<label>\S+)
+ "
+ )
+ .unwrap();
+ };
+
+ let caps = match PARTS.captures(line.trim()) {
+ Some(caps) => caps,
+ None => return err!("invalid NameAliases line"),
+ };
+ Ok(NameAlias {
+ codepoint: caps["codepoint"].parse()?,
+ alias: caps.name("alias").unwrap().as_str().to_string(),
+ label: caps["label"].parse()?,
+ })
+ }
+}
+
+/// The label of a name alias.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum NameAliasLabel {
+ /// Corrections for serious problems in a character name.
+ Correction,
+ /// ISO 6429 names for C0 and C1 control functions and other commonly
+ /// occurring names for control codes.
+ Control,
+ /// A few widely used alternate names for format characters.
+ Alternate,
+ /// Several documented labels for C1 control code points which were
+ /// never actually approved in any standard.
+ Figment,
+ /// Commonly occurring abbreviations (or acronyms) for control codes,
+ /// format characters, spaces and variation selectors.
+ Abbreviation,
+}
+
+impl Default for NameAliasLabel {
+ fn default() -> NameAliasLabel {
+ // This is arbitrary, but the Default impl is convenient.
+ NameAliasLabel::Correction
+ }
+}
+
+impl FromStr for NameAliasLabel {
+ type Err = Error;
+
+ fn from_str(s: &str) -> Result<NameAliasLabel, Error> {
+ match s {
+ "correction" => Ok(NameAliasLabel::Correction),
+ "control" => Ok(NameAliasLabel::Control),
+ "alternate" => Ok(NameAliasLabel::Alternate),
+ "figment" => Ok(NameAliasLabel::Figment),
+ "abbreviation" => Ok(NameAliasLabel::Abbreviation),
+ unknown => err!("unknown name alias label: '{}'", unknown),
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::{NameAlias, NameAliasLabel};
+
+ #[test]
+ fn parse1() {
+ let line = "0000;NULL;control\n";
+ let row: NameAlias = line.parse().unwrap();
+ assert_eq!(row.codepoint, 0x0);
+ assert_eq!(row.alias, "NULL");
+ assert_eq!(row.label, NameAliasLabel::Control);
+ }
+
+ #[test]
+ fn parse2() {
+ let line = "000B;VERTICAL TABULATION;control\n";
+ let row: NameAlias = line.parse().unwrap();
+ assert_eq!(row.codepoint, 0xB);
+ assert_eq!(row.alias, "VERTICAL TABULATION");
+ assert_eq!(row.label, NameAliasLabel::Control);
+ }
+
+ #[test]
+ fn parse3() {
+ let line = "0081;HIGH OCTET PRESET;figment\n";
+ let row: NameAlias = line.parse().unwrap();
+ assert_eq!(row.codepoint, 0x81);
+ assert_eq!(row.alias, "HIGH OCTET PRESET");
+ assert_eq!(row.label, NameAliasLabel::Figment);
+ }
+
+ #[test]
+ fn parse4() {
+ let line = "E01EF;VS256;abbreviation\n";
+ let row: NameAlias = line.parse().unwrap();
+ assert_eq!(row.codepoint, 0xE01EF);
+ assert_eq!(row.alias, "VS256");
+ assert_eq!(row.label, NameAliasLabel::Abbreviation);
+ }
+}
diff --git a/vendor/ucd-parse/src/prop_list.rs b/vendor/ucd-parse/src/prop_list.rs
new file mode 100644
index 000000000..db830c57a
--- /dev/null
+++ b/vendor/ucd-parse/src/prop_list.rs
@@ -0,0 +1,63 @@
+use std::path::Path;
+use std::str::FromStr;
+
+use crate::common::{
+ parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
+ UcdFileByCodepoint,
+};
+use crate::error::Error;
+
+/// A single row in the `PropList.txt` file.
+///
+/// The `PropList.txt` file is the source of truth on several Unicode
+/// properties.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct Property {
+ /// The codepoint or codepoint range for this entry.
+ pub codepoints: Codepoints,
+ /// The property name assigned to the codepoints in this entry.
+ pub property: String,
+}
+
+impl UcdFile for Property {
+ fn relative_file_path() -> &'static Path {
+ Path::new("PropList.txt")
+ }
+}
+
+impl UcdFileByCodepoint for Property {
+ fn codepoints(&self) -> CodepointIter {
+ self.codepoints.into_iter()
+ }
+}
+
+impl FromStr for Property {
+ type Err = Error;
+
+ fn from_str(line: &str) -> Result<Property, Error> {
+ let (codepoints, property) = parse_codepoint_association(line)?;
+ Ok(Property { codepoints, property: property.to_string() })
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::Property;
+
+ #[test]
+ fn parse_single() {
+ let line =
+ "061C ; Bidi_Control # Cf ARABIC LETTER MARK\n";
+ let row: Property = line.parse().unwrap();
+ assert_eq!(row.codepoints, 0x061C);
+ assert_eq!(row.property, "Bidi_Control");
+ }
+
+ #[test]
+ fn parse_range() {
+ let line = "0009..000D ; White_Space # Cc [5] <control-0009>..<control-000D>\n";
+ let row: Property = line.parse().unwrap();
+ assert_eq!(row.codepoints, (0x0009, 0x000D));
+ assert_eq!(row.property, "White_Space");
+ }
+}
diff --git a/vendor/ucd-parse/src/property_aliases.rs b/vendor/ucd-parse/src/property_aliases.rs
new file mode 100644
index 000000000..f94a116e6
--- /dev/null
+++ b/vendor/ucd-parse/src/property_aliases.rs
@@ -0,0 +1,113 @@
+use std::path::Path;
+use std::str::FromStr;
+
+use lazy_static::lazy_static;
+use regex::Regex;
+
+use crate::common::UcdFile;
+use crate::error::Error;
+
+/// A single row in the `PropertyAliases.txt` file.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct PropertyAlias {
+ /// An abbreviation for this property.
+ pub abbreviation: String,
+ /// The "long" name of this property.
+ pub long: String,
+ /// Additional aliases (if present).
+ pub aliases: Vec<String>,
+}
+
+impl UcdFile for PropertyAlias {
+ fn relative_file_path() -> &'static Path {
+ Path::new("PropertyAliases.txt")
+ }
+}
+
+impl FromStr for PropertyAlias {
+ type Err = Error;
+
+ fn from_str(line: &str) -> Result<PropertyAlias, Error> {
+ lazy_static! {
+ static ref PARTS: Regex = Regex::new(
+ r"(?x)
+ ^
+ \s*(?P<abbrev>[^\s;]+)\s*;
+ \s*(?P<long>[^\s;]+)\s*
+ (?:;(?P<aliases>.*))?
+ "
+ )
+ .unwrap();
+ static ref ALIASES: Regex =
+ Regex::new(r"\s*(?P<alias>[^\s;]+)\s*;?\s*").unwrap();
+ };
+
+ let caps = match PARTS.captures(line.trim()) {
+ Some(caps) => caps,
+ None => return err!("invalid PropertyAliases line: '{}'", line),
+ };
+ let mut aliases = vec![];
+ if let Some(m) = caps.name("aliases") {
+ for acaps in ALIASES.captures_iter(m.as_str()) {
+ let alias = acaps.name("alias").unwrap().as_str();
+ aliases.push(alias.to_string());
+ }
+ }
+ Ok(PropertyAlias {
+ abbreviation: caps.name("abbrev").unwrap().as_str().to_string(),
+ long: caps.name("long").unwrap().as_str().to_string(),
+ aliases,
+ })
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::PropertyAlias;
+
+ #[test]
+ fn parse1() {
+ let line = "cjkAccountingNumeric ; kAccountingNumeric\n";
+ let row: PropertyAlias = line.parse().unwrap();
+ assert_eq!(row.abbreviation, "cjkAccountingNumeric");
+ assert_eq!(row.long, "kAccountingNumeric");
+ assert!(row.aliases.is_empty());
+ }
+
+ #[test]
+ fn parse2() {
+ let line = "nv ; Numeric_Value\n";
+ let row: PropertyAlias = line.parse().unwrap();
+ assert_eq!(row.abbreviation, "nv");
+ assert_eq!(row.long, "Numeric_Value");
+ assert!(row.aliases.is_empty());
+ }
+
+ #[test]
+ fn parse3() {
+ let line =
+ "scf ; Simple_Case_Folding ; sfc\n";
+ let row: PropertyAlias = line.parse().unwrap();
+ assert_eq!(row.abbreviation, "scf");
+ assert_eq!(row.long, "Simple_Case_Folding");
+ assert_eq!(row.aliases, vec!["sfc"]);
+ }
+
+ #[test]
+ fn parse4() {
+ let line = "cjkRSUnicode ; kRSUnicode ; Unicode_Radical_Stroke; URS\n";
+ let row: PropertyAlias = line.parse().unwrap();
+ assert_eq!(row.abbreviation, "cjkRSUnicode");
+ assert_eq!(row.long, "kRSUnicode");
+ assert_eq!(row.aliases, vec!["Unicode_Radical_Stroke", "URS"]);
+ }
+
+ #[test]
+ fn parse5() {
+ let line = "isc ; ISO_Comment";
+ let row: PropertyAlias = line.parse().unwrap();
+ assert_eq!(row.abbreviation, "isc");
+ assert_eq!(row.long, "ISO_Comment");
+ assert!(row.aliases.is_empty());
+ }
+}
diff --git a/vendor/ucd-parse/src/property_value_aliases.rs b/vendor/ucd-parse/src/property_value_aliases.rs
new file mode 100644
index 000000000..7e8a3c890
--- /dev/null
+++ b/vendor/ucd-parse/src/property_value_aliases.rs
@@ -0,0 +1,185 @@
+use std::path::Path;
+use std::str::FromStr;
+
+use lazy_static::lazy_static;
+use regex::Regex;
+
+use crate::common::UcdFile;
+use crate::error::Error;
+
+/// A single row in the `PropertyValueAliases.txt` file.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct PropertyValueAlias {
+ /// The property name for which this value alias applies.
+ pub property: String,
+ /// A numeric abbreviation for this property value, if present. (This is
+ /// seemingly only present for the `ccc`/`Canonical_Combining_Class`
+ /// property.)
+ pub numeric: Option<u8>,
+ /// An abbreviation for this property value.
+ pub abbreviation: String,
+ /// The "long" form of this property value.
+ pub long: String,
+ /// Additional value aliases (if present).
+ pub aliases: Vec<String>,
+}
+
+impl UcdFile for PropertyValueAlias {
+ fn relative_file_path() -> &'static Path {
+ Path::new("PropertyValueAliases.txt")
+ }
+}
+
+impl FromStr for PropertyValueAlias {
+ type Err = Error;
+
+ fn from_str(line: &str) -> Result<PropertyValueAlias, Error> {
+ lazy_static! {
+ static ref PARTS: Regex = Regex::new(
+ r"(?x)
+ ^
+ \s*(?P<prop>[^\s;]+)\s*;
+ \s*(?P<abbrev>[^\s;]+)\s*;
+ \s*(?P<long>[^\s;]+)\s*
+ (?:;(?P<aliases>.*))?
+ "
+ )
+ .unwrap();
+ static ref PARTS_CCC: Regex = Regex::new(
+ r"(?x)
+ ^
+ ccc;
+ \s*(?P<num_class>[0-9]+)\s*;
+ \s*(?P<abbrev>[^\s;]+)\s*;
+ \s*(?P<long>[^\s;]+)
+ "
+ )
+ .unwrap();
+ static ref ALIASES: Regex =
+ Regex::new(r"\s*(?P<alias>[^\s;]+)\s*;?\s*").unwrap();
+ };
+
+ if line.starts_with("ccc;") {
+ let caps = match PARTS_CCC.captures(line.trim()) {
+ Some(caps) => caps,
+ None => {
+ return err!("invalid PropertyValueAliases (ccc) line")
+ }
+ };
+ let n = match caps["num_class"].parse() {
+ Ok(n) => n,
+ Err(err) => {
+ return err!(
+ "failed to parse ccc number '{}': {}",
+ &caps["num_class"],
+ err
+ )
+ }
+ };
+ let abbrev = caps.name("abbrev").unwrap().as_str();
+ let long = caps.name("long").unwrap().as_str();
+ return Ok(PropertyValueAlias {
+ property: line[0..3].to_string(),
+ numeric: Some(n),
+ abbreviation: abbrev.to_string(),
+ long: long.to_string(),
+ aliases: vec![],
+ });
+ }
+
+ let caps = match PARTS.captures(line.trim()) {
+ Some(caps) => caps,
+ None => return err!("invalid PropertyValueAliases line"),
+ };
+ let mut aliases = vec![];
+ if let Some(m) = caps.name("aliases") {
+ for acaps in ALIASES.captures_iter(m.as_str()) {
+ let alias = acaps.name("alias").unwrap().as_str();
+ if alias == "#" {
+ // This starts a comment, so stop reading.
+ break;
+ }
+ aliases.push(alias.to_string());
+ }
+ }
+ Ok(PropertyValueAlias {
+ property: caps.name("prop").unwrap().as_str().to_string(),
+ numeric: None,
+ abbreviation: caps.name("abbrev").unwrap().as_str().to_string(),
+ long: caps.name("long").unwrap().as_str().to_string(),
+ aliases,
+ })
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::PropertyValueAlias;
+
+ #[test]
+ fn parse1() {
+ let line = "blk; Arabic_PF_A ; Arabic_Presentation_Forms_A ; Arabic_Presentation_Forms-A\n";
+ let row: PropertyValueAlias = line.parse().unwrap();
+ assert_eq!(row.property, "blk");
+ assert_eq!(row.numeric, None);
+ assert_eq!(row.abbreviation, "Arabic_PF_A");
+ assert_eq!(row.long, "Arabic_Presentation_Forms_A");
+ assert_eq!(row.aliases, vec!["Arabic_Presentation_Forms-A"]);
+ }
+
+ #[test]
+ fn parse2() {
+ let line = "AHex; N ; No ; F ; False\n";
+ let row: PropertyValueAlias = line.parse().unwrap();
+ assert_eq!(row.property, "AHex");
+ assert_eq!(row.numeric, None);
+ assert_eq!(row.abbreviation, "N");
+ assert_eq!(row.long, "No");
+ assert_eq!(row.aliases, vec!["F", "False"]);
+ }
+
+ #[test]
+ fn parse3() {
+ let line = "age; 1.1 ; V1_1\n";
+ let row: PropertyValueAlias = line.parse().unwrap();
+ assert_eq!(row.property, "age");
+ assert_eq!(row.numeric, None);
+ assert_eq!(row.abbreviation, "1.1");
+ assert_eq!(row.long, "V1_1");
+ assert!(row.aliases.is_empty());
+ }
+
+ #[test]
+ fn parse4() {
+ let line = "ccc; 0; NR ; Not_Reordered\n";
+ let row: PropertyValueAlias = line.parse().unwrap();
+ assert_eq!(row.property, "ccc");
+ assert_eq!(row.numeric, Some(0));
+ assert_eq!(row.abbreviation, "NR");
+ assert_eq!(row.long, "Not_Reordered");
+ assert!(row.aliases.is_empty());
+ }
+
+ #[test]
+ fn parse5() {
+ let line =
+ "ccc; 133; CCC133 ; CCC133 # RESERVED\n";
+ let row: PropertyValueAlias = line.parse().unwrap();
+ assert_eq!(row.property, "ccc");
+ assert_eq!(row.numeric, Some(133));
+ assert_eq!(row.abbreviation, "CCC133");
+ assert_eq!(row.long, "CCC133");
+ assert!(row.aliases.is_empty());
+ }
+
+ #[test]
+ fn parse6() {
+ let line = "gc ; P ; Punctuation ; punct # Pc | Pd | Pe | Pf | Pi | Po | Ps\n";
+ let row: PropertyValueAlias = line.parse().unwrap();
+ assert_eq!(row.property, "gc");
+ assert_eq!(row.numeric, None);
+ assert_eq!(row.abbreviation, "P");
+ assert_eq!(row.long, "Punctuation");
+ assert_eq!(row.aliases, vec!["punct"]);
+ }
+}
diff --git a/vendor/ucd-parse/src/script_extensions.rs b/vendor/ucd-parse/src/script_extensions.rs
new file mode 100644
index 000000000..050e1f039
--- /dev/null
+++ b/vendor/ucd-parse/src/script_extensions.rs
@@ -0,0 +1,68 @@
+use std::path::Path;
+use std::str::FromStr;
+
+use crate::common::{
+ parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
+ UcdFileByCodepoint,
+};
+use crate::error::Error;
+
+/// A single row in the `ScriptExtensions.txt` file.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct ScriptExtension {
+ /// The codepoint or codepoint range for this entry.
+ pub codepoints: Codepoints,
+ /// The script extension names assigned to the codepoints in this entry.
+ pub scripts: Vec<String>,
+}
+
+impl UcdFile for ScriptExtension {
+ fn relative_file_path() -> &'static Path {
+ Path::new("ScriptExtensions.txt")
+ }
+}
+
+impl UcdFileByCodepoint for ScriptExtension {
+ fn codepoints(&self) -> CodepointIter {
+ self.codepoints.into_iter()
+ }
+}
+
+impl FromStr for ScriptExtension {
+ type Err = Error;
+
+ fn from_str(line: &str) -> Result<ScriptExtension, Error> {
+ let (codepoints, scripts) = parse_codepoint_association(line)?;
+ Ok(ScriptExtension {
+ codepoints,
+ scripts: scripts.split_whitespace().map(str::to_string).collect(),
+ })
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::ScriptExtension;
+
+ #[test]
+ fn parse_single() {
+ let line = "060C ; Arab Syrc Thaa # Po ARABIC COMMA\n";
+ let row: ScriptExtension = line.parse().unwrap();
+ assert_eq!(row.codepoints, 0x060C);
+ assert_eq!(row.scripts, vec!["Arab", "Syrc", "Thaa"]);
+ }
+
+ #[test]
+ fn parse_range() {
+ let line = "A836..A837 ; Deva Gujr Guru Kthi Mahj Modi Sind Takr Tirh # So [2] NORTH INDIC QUARTER MARK..NORTH INDIC PLACEHOLDER MARK\n";
+ let row: ScriptExtension = line.parse().unwrap();
+ assert_eq!(row.codepoints, (0xA836, 0xA837));
+ assert_eq!(
+ row.scripts,
+ vec![
+ "Deva", "Gujr", "Guru", "Kthi", "Mahj", "Modi", "Sind",
+ "Takr", "Tirh",
+ ]
+ );
+ }
+}
diff --git a/vendor/ucd-parse/src/scripts.rs b/vendor/ucd-parse/src/scripts.rs
new file mode 100644
index 000000000..6021912c4
--- /dev/null
+++ b/vendor/ucd-parse/src/scripts.rs
@@ -0,0 +1,59 @@
+use std::path::Path;
+use std::str::FromStr;
+
+use crate::common::{
+ parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
+ UcdFileByCodepoint,
+};
+use crate::error::Error;
+
+/// A single row in the `Scripts.txt` file.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct Script {
+ /// The codepoint or codepoint range for this entry.
+ pub codepoints: Codepoints,
+ /// The script name assigned to the codepoints in this entry.
+ pub script: String,
+}
+
+impl UcdFile for Script {
+ fn relative_file_path() -> &'static Path {
+ Path::new("Scripts.txt")
+ }
+}
+
+impl UcdFileByCodepoint for Script {
+ fn codepoints(&self) -> CodepointIter {
+ self.codepoints.into_iter()
+ }
+}
+
+impl FromStr for Script {
+ type Err = Error;
+
+ fn from_str(line: &str) -> Result<Script, Error> {
+ let (codepoints, script) = parse_codepoint_association(line)?;
+ Ok(Script { codepoints, script: script.to_string() })
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::Script;
+
+ #[test]
+ fn parse_single() {
+ let line = "10A7F ; Old_South_Arabian # Po OLD SOUTH ARABIAN NUMERIC INDICATOR\n";
+ let row: Script = line.parse().unwrap();
+ assert_eq!(row.codepoints, 0x10A7F);
+ assert_eq!(row.script, "Old_South_Arabian");
+ }
+
+ #[test]
+ fn parse_range() {
+ let line = "1200..1248 ; Ethiopic # Lo [73] ETHIOPIC SYLLABLE HA..ETHIOPIC SYLLABLE QWA\n";
+ let row: Script = line.parse().unwrap();
+ assert_eq!(row.codepoints, (0x1200, 0x1248));
+ assert_eq!(row.script, "Ethiopic");
+ }
+}
diff --git a/vendor/ucd-parse/src/sentence_break.rs b/vendor/ucd-parse/src/sentence_break.rs
new file mode 100644
index 000000000..74a6e8a08
--- /dev/null
+++ b/vendor/ucd-parse/src/sentence_break.rs
@@ -0,0 +1,101 @@
+use std::path::Path;
+use std::str::FromStr;
+
+use crate::common::{
+ parse_break_test, parse_codepoint_association, CodepointIter, Codepoints,
+ UcdFile, UcdFileByCodepoint,
+};
+use crate::error::Error;
+
+/// A single row in the `auxiliary/SentenceBreakProperty.txt` file.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct SentenceBreak {
+ /// The codepoint or codepoint range for this entry.
+ pub codepoints: Codepoints,
+ /// The property value assigned to the codepoints in this entry.
+ pub value: String,
+}
+
+impl UcdFile for SentenceBreak {
+ fn relative_file_path() -> &'static Path {
+ Path::new("auxiliary/SentenceBreakProperty.txt")
+ }
+}
+
+impl UcdFileByCodepoint for SentenceBreak {
+ fn codepoints(&self) -> CodepointIter {
+ self.codepoints.into_iter()
+ }
+}
+
+impl FromStr for SentenceBreak {
+ type Err = Error;
+
+ fn from_str(line: &str) -> Result<SentenceBreak, Error> {
+ let (codepoints, value) = parse_codepoint_association(line)?;
+ Ok(SentenceBreak { codepoints, value: value.to_string() })
+ }
+}
+
+/// A single row in the `auxiliary/SentenceBreakTest.txt` file.
+///
+/// This file defines tests for the sentence break algorithm.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct SentenceBreakTest {
+ /// Each string is a UTF-8 encoded group of codepoints that make up a
+ /// single sentence.
+ pub sentences: Vec<String>,
+ /// A human readable description of this test.
+ pub comment: String,
+}
+
+impl UcdFile for SentenceBreakTest {
+ fn relative_file_path() -> &'static Path {
+ Path::new("auxiliary/SentenceBreakTest.txt")
+ }
+}
+
+impl FromStr for SentenceBreakTest {
+ type Err = Error;
+
+ fn from_str(line: &str) -> Result<SentenceBreakTest, Error> {
+ let (groups, comment) = parse_break_test(line)?;
+ Ok(SentenceBreakTest { sentences: groups, comment })
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::{SentenceBreak, SentenceBreakTest};
+
+ #[test]
+ fn parse_single() {
+ let line = "11445 ; Extend # Mc NEWA SIGN VISARGA\n";
+ let row: SentenceBreak = line.parse().unwrap();
+ assert_eq!(row.codepoints, 0x11445);
+ assert_eq!(row.value, "Extend");
+ }
+
+ #[test]
+ fn parse_range() {
+ let line = "FE31..FE32 ; SContinue # Pd [2] PRESENTATION FORM FOR VERTICAL EM DASH..PRESENTATION FORM FOR VERTICAL EN DASH\n";
+ let row: SentenceBreak = line.parse().unwrap();
+ assert_eq!(row.codepoints, (0xFE31, 0xFE32));
+ assert_eq!(row.value, "SContinue");
+ }
+
+ #[test]
+ fn parse_test() {
+ let line = "÷ 2060 × 5B57 × 2060 × 002E × 2060 ÷ 5B57 × 2060 × 2060 ÷ # ÷ [0.2] WORD JOINER (Format_FE) × [998.0] CJK UNIFIED IDEOGRAPH-5B57 (OLetter) × [5.0] WORD JOINER (Format_FE) × [998.0] FULL STOP (ATerm) × [5.0] WORD JOINER (Format_FE) ÷ [11.0] CJK UNIFIED IDEOGRAPH-5B57 (OLetter) × [5.0] WORD JOINER (Format_FE) × [5.0] WORD JOINER (Format_FE) ÷ [0.3]";
+
+ let row: SentenceBreakTest = line.parse().unwrap();
+ assert_eq!(
+ row.sentences,
+ vec![
+ "\u{2060}\u{5B57}\u{2060}\u{002E}\u{2060}",
+ "\u{5B57}\u{2060}\u{2060}",
+ ]
+ );
+ assert!(row.comment.contains("[5.0] WORD JOINER (Format_FE)"));
+ }
+}
diff --git a/vendor/ucd-parse/src/special_casing.rs b/vendor/ucd-parse/src/special_casing.rs
new file mode 100644
index 000000000..a8fc61ddb
--- /dev/null
+++ b/vendor/ucd-parse/src/special_casing.rs
@@ -0,0 +1,112 @@
+use std::path::Path;
+use std::str::FromStr;
+
+use lazy_static::lazy_static;
+use regex::Regex;
+
+use crate::common::{
+ parse_codepoint_sequence, Codepoint, CodepointIter, UcdFile,
+ UcdFileByCodepoint,
+};
+use crate::error::Error;
+
+/// A single row in the `SpecialCasing.txt` file.
+///
+/// Note that a single codepoint may be mapped multiple times. In particular,
+/// a single codepoint might have mappings based on distinct language sensitive
+/// conditions (e.g., `U+0307`).
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct SpecialCaseMapping {
+ /// The codepoint that is being mapped.
+ pub codepoint: Codepoint,
+ /// The lowercase mapping, which may be empty.
+ pub lowercase: Vec<Codepoint>,
+ /// The titlecase mapping, which may be empty.
+ pub titlecase: Vec<Codepoint>,
+ /// The uppercase mapping, which may be empty.
+ pub uppercase: Vec<Codepoint>,
+ /// A list of language specific conditions, see `SpecialCasing.txt` for
+ /// more details.
+ pub conditions: Vec<String>,
+}
+
+impl UcdFile for SpecialCaseMapping {
+ fn relative_file_path() -> &'static Path {
+ Path::new("SpecialCasing.txt")
+ }
+}
+
+impl UcdFileByCodepoint for SpecialCaseMapping {
+ fn codepoints(&self) -> CodepointIter {
+ self.codepoint.into_iter()
+ }
+}
+
+impl FromStr for SpecialCaseMapping {
+ type Err = Error;
+
+ fn from_str(line: &str) -> Result<SpecialCaseMapping, Error> {
+ lazy_static! {
+ static ref PARTS: Regex = Regex::new(
+ r"(?x)
+ ^
+ \s*(?P<codepoint>[^\s;]+)\s*;
+ \s*(?P<lower>[^;]+)\s*;
+ \s*(?P<title>[^;]+)\s*;
+ \s*(?P<upper>[^;]+)\s*;
+ \s*(?P<conditions>[^;\x23]+)?
+ "
+ )
+ .unwrap();
+ };
+
+ let caps = match PARTS.captures(line.trim()) {
+ Some(caps) => caps,
+ None => return err!("invalid SpecialCasing line: '{}'", line),
+ };
+ let conditions = caps
+ .name("conditions")
+ .map(|x| {
+ x.as_str()
+ .trim()
+ .split_whitespace()
+ .map(|c| c.to_string())
+ .collect()
+ })
+ .unwrap_or(vec![]);
+ Ok(SpecialCaseMapping {
+ codepoint: caps["codepoint"].parse()?,
+ lowercase: parse_codepoint_sequence(&caps["lower"])?,
+ titlecase: parse_codepoint_sequence(&caps["title"])?,
+ uppercase: parse_codepoint_sequence(&caps["upper"])?,
+ conditions,
+ })
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::SpecialCaseMapping;
+
+ #[test]
+ fn parse_no_conds() {
+ let line = "1F52; 1F52; 03A5 0313 0300; 03A5 0313 0300; # GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA\n";
+ let row: SpecialCaseMapping = line.parse().unwrap();
+ assert_eq!(row.codepoint, 0x1F52);
+ assert_eq!(row.lowercase, vec![0x1F52]);
+ assert_eq!(row.titlecase, vec![0x03A5, 0x0313, 0x0300]);
+ assert_eq!(row.uppercase, vec![0x03A5, 0x0313, 0x0300]);
+ assert!(row.conditions.is_empty());
+ }
+
+ #[test]
+ fn parse_conds() {
+ let line = "0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE\n";
+ let row: SpecialCaseMapping = line.parse().unwrap();
+ assert_eq!(row.codepoint, 0x0307);
+ assert!(row.lowercase.is_empty());
+ assert_eq!(row.titlecase, vec![0x0307]);
+ assert_eq!(row.uppercase, vec![0x0307]);
+ assert_eq!(row.conditions, vec!["tr", "After_I"]);
+ }
+}
diff --git a/vendor/ucd-parse/src/unicode_data.rs b/vendor/ucd-parse/src/unicode_data.rs
new file mode 100644
index 000000000..87910cc1d
--- /dev/null
+++ b/vendor/ucd-parse/src/unicode_data.rs
@@ -0,0 +1,787 @@
+use std::fmt;
+use std::iter;
+use std::ops::Range;
+use std::path::Path;
+use std::str::FromStr;
+
+use lazy_static::lazy_static;
+use regex::Regex;
+
+use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint};
+use crate::error::Error;
+
+/// Represents a single row in the `UnicodeData.txt` file.
+///
+/// These fields were taken from UAX44, Table 9, as part of the documentation
+/// for the
+/// [`UnicodeData.txt` file](http://www.unicode.org/reports/tr44/#UnicodeData.txt).
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct UnicodeData {
+ /// The codepoint corresponding to this row.
+ pub codepoint: Codepoint,
+ /// The name of this codepoint.
+ pub name: String,
+ /// The "general category" of this codepoint.
+ pub general_category: String,
+ /// The class of this codepoint used in the Canonical Ordering Algorithm.
+ ///
+ /// Note that some classes map to a particular symbol. See
+ /// [UAX44, Table 15](http://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values).
+ pub canonical_combining_class: u8,
+ /// The bidirectional class of this codepoint.
+ ///
+ /// Possible values are listed in
+ /// [UAX44, Table 13](http://www.unicode.org/reports/tr44/#Bidi_Class_Values).
+ pub bidi_class: String,
+ /// The decomposition mapping for this codepoint. This includes its
+ /// formatting tag (if present).
+ pub decomposition: UnicodeDataDecomposition,
+ /// A decimal numeric representation of this codepoint, if it has the
+ /// property `Numeric_Type=Decimal`.
+ pub numeric_type_decimal: Option<u8>,
+ /// A decimal numeric representation of this codepoint, if it has the
+ /// property `Numeric_Type=Digit`. Note that while this field is still
+ /// populated for existing codepoints, no new codepoints will have this
+ /// field populated.
+ pub numeric_type_digit: Option<u8>,
+ /// A decimal or rational numeric representation of this codepoint, if it
+ /// has the property `Numeric_Type=Numeric`.
+ pub numeric_type_numeric: Option<UnicodeDataNumeric>,
+ /// A boolean indicating whether this codepoint is "mirrored" in
+ /// bidirectional text.
+ pub bidi_mirrored: bool,
+ /// The "old" Unicode 1.0 or ISO 6429 name of this codepoint. Note that
+ /// this field is empty unless it is significantly different from
+ /// the `name` field.
+ pub unicode1_name: String,
+    /// The ISO 10646 comment field. This no longer contains any non-NULL
+ /// values.
+ pub iso_comment: String,
+ /// This codepoint's simple uppercase mapping, if it exists.
+ pub simple_uppercase_mapping: Option<Codepoint>,
+ /// This codepoint's simple lowercase mapping, if it exists.
+ pub simple_lowercase_mapping: Option<Codepoint>,
+ /// This codepoint's simple titlecase mapping, if it exists.
+ pub simple_titlecase_mapping: Option<Codepoint>,
+}
+
+impl UcdFile for UnicodeData {
+ fn relative_file_path() -> &'static Path {
+ Path::new("UnicodeData.txt")
+ }
+}
+
+impl UcdFileByCodepoint for UnicodeData {
+ fn codepoints(&self) -> CodepointIter {
+ self.codepoint.into_iter()
+ }
+}
+
+impl UnicodeData {
+ /// Returns true if and only if this record corresponds to the start of a
+ /// range.
+ pub fn is_range_start(&self) -> bool {
+ self.name.starts_with('<')
+ && self.name.ends_with('>')
+ && self.name.contains("First")
+ }
+
+ /// Returns true if and only if this record corresponds to the end of a
+ /// range.
+ pub fn is_range_end(&self) -> bool {
+ self.name.starts_with('<')
+ && self.name.ends_with('>')
+ && self.name.contains("Last")
+ }
+}
+
+impl FromStr for UnicodeData {
+ type Err = Error;
+
+ fn from_str(line: &str) -> Result<UnicodeData, Error> {
+ lazy_static! {
+ static ref PARTS: Regex = Regex::new(
+ r"(?x)
+ ^
+ ([A-Z0-9]+); # 1; codepoint
+ ([^;]+); # 2; name
+ ([^;]+); # 3; general category
+ ([0-9]+); # 4; canonical combining class
+ ([^;]+); # 5; bidi class
+ ([^;]*); # 6; decomposition
+ ([0-9]*); # 7; numeric type decimal
+ ([0-9]*); # 8; numeric type digit
+ ([-0-9/]*); # 9; numeric type numeric
+ ([YN]); # 10; bidi mirrored
+ ([^;]*); # 11; unicode1 name
+ ([^;]*); # 12; ISO comment
+ ([^;]*); # 13; simple uppercase mapping
+ ([^;]*); # 14; simple lowercase mapping
+ ([^;]*) # 15; simple titlecase mapping
+ $
+ "
+ )
+ .unwrap();
+ };
+ let caps = match PARTS.captures(line.trim()) {
+ Some(caps) => caps,
+ None => return err!("invalid UnicodeData line"),
+ };
+ let capget = |n| caps.get(n).unwrap().as_str();
+ let mut data = UnicodeData::default();
+
+ data.codepoint = capget(1).parse()?;
+ data.name = capget(2).to_string();
+ data.general_category = capget(3).to_string();
+ data.canonical_combining_class = match capget(4).parse() {
+ Ok(n) => n,
+ Err(err) => {
+ return err!(
+ "failed to parse canonical combining class '{}': {}",
+ capget(4),
+ err
+ )
+ }
+ };
+ data.bidi_class = capget(5).to_string();
+ if !caps[6].is_empty() {
+ data.decomposition = caps[6].parse()?;
+ } else {
+ data.decomposition.push(data.codepoint)?;
+ }
+ if !capget(7).is_empty() {
+ data.numeric_type_decimal = Some(match capget(7).parse() {
+ Ok(n) => n,
+ Err(err) => {
+ return err!(
+ "failed to parse numeric type decimal '{}': {}",
+ capget(7),
+ err
+ )
+ }
+ });
+ }
+ if !capget(8).is_empty() {
+ data.numeric_type_digit = Some(match capget(8).parse() {
+ Ok(n) => n,
+ Err(err) => {
+ return err!(
+ "failed to parse numeric type digit '{}': {}",
+ capget(8),
+ err
+ )
+ }
+ });
+ }
+ if !capget(9).is_empty() {
+ data.numeric_type_numeric = Some(capget(9).parse()?);
+ }
+ data.bidi_mirrored = capget(10) == "Y";
+ data.unicode1_name = capget(11).to_string();
+ data.iso_comment = capget(12).to_string();
+ if !capget(13).is_empty() {
+ data.simple_uppercase_mapping = Some(capget(13).parse()?);
+ }
+ if !capget(14).is_empty() {
+ data.simple_lowercase_mapping = Some(capget(14).parse()?);
+ }
+ if !capget(15).is_empty() {
+ data.simple_titlecase_mapping = Some(capget(15).parse()?);
+ }
+ Ok(data)
+ }
+}
+
+impl fmt::Display for UnicodeData {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ write!(f, "{};", self.codepoint)?;
+ write!(f, "{};", self.name)?;
+ write!(f, "{};", self.general_category)?;
+ write!(f, "{};", self.canonical_combining_class)?;
+ write!(f, "{};", self.bidi_class)?;
+ if self.decomposition.is_canonical()
+ && self.decomposition.mapping() == &[self.codepoint]
+ {
+ write!(f, ";")?;
+ } else {
+ write!(f, "{};", self.decomposition)?;
+ }
+ if let Some(n) = self.numeric_type_decimal {
+ write!(f, "{};", n)?;
+ } else {
+ write!(f, ";")?;
+ }
+ if let Some(n) = self.numeric_type_digit {
+ write!(f, "{};", n)?;
+ } else {
+ write!(f, ";")?;
+ }
+ if let Some(n) = self.numeric_type_numeric {
+ write!(f, "{};", n)?;
+ } else {
+ write!(f, ";")?;
+ }
+ write!(f, "{};", if self.bidi_mirrored { "Y" } else { "N" })?;
+ write!(f, "{};", self.unicode1_name)?;
+ write!(f, "{};", self.iso_comment)?;
+ if let Some(cp) = self.simple_uppercase_mapping {
+ write!(f, "{};", cp)?;
+ } else {
+ write!(f, ";")?;
+ }
+ if let Some(cp) = self.simple_lowercase_mapping {
+ write!(f, "{};", cp)?;
+ } else {
+ write!(f, ";")?;
+ }
+ if let Some(cp) = self.simple_titlecase_mapping {
+ write!(f, "{}", cp)?;
+ }
+ Ok(())
+ }
+}
+
+/// Represents a decomposition mapping of a single row in the
+/// `UnicodeData.txt` file.
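+///
+/// As a rough, uncompiled sketch, a decomposition field can be parsed
+/// directly from its textual form:
+///
+/// ```ignore
+/// use ucd_parse::UnicodeDataDecomposition;
+///
+/// // The decomposition field of U+249D, as written in UnicodeData.txt.
+/// let decomp: UnicodeDataDecomposition =
+///     "<compat> 0028 0062 0029".parse().unwrap();
+/// assert_eq!(decomp.mapping().len(), 3);
+/// assert!(!decomp.is_canonical());
+/// ```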
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct UnicodeDataDecomposition {
+ /// The formatting tag associated with this mapping, if present.
+ pub tag: Option<UnicodeDataDecompositionTag>,
+ /// The number of codepoints in this mapping.
+ pub len: usize,
+ /// The codepoints in the mapping. Entries beyond `len` in the mapping
+ /// are always U+0000. If no mapping was present, then this always contains
+ /// a single codepoint corresponding to this row's character.
+ pub mapping: [Codepoint; 18],
+}
+
+impl UnicodeDataDecomposition {
+ /// Create a new decomposition mapping with the given tag and codepoints.
+ ///
+ /// If there are too many codepoints, then an error is returned.
+ pub fn new(
+ tag: Option<UnicodeDataDecompositionTag>,
+ mapping: &[Codepoint],
+ ) -> Result<UnicodeDataDecomposition, Error> {
+ let mut x = UnicodeDataDecomposition::default();
+ x.tag = tag;
+ for &cp in mapping {
+ x.push(cp)?;
+ }
+ Ok(x)
+ }
+
+ /// Add a new codepoint to this decomposition's mapping.
+ ///
+ /// If the mapping is already full, then this returns an error.
+ pub fn push(&mut self, cp: Codepoint) -> Result<(), Error> {
+ if self.len >= self.mapping.len() {
+ return err!(
+ "invalid decomposition mapping (too many codepoints)"
+ );
+ }
+ self.mapping[self.len] = cp;
+ self.len += 1;
+ Ok(())
+ }
+
+ /// Return the mapping as a slice of codepoints. The slice returned
+ /// has length equivalent to the number of codepoints in this mapping.
+ pub fn mapping(&self) -> &[Codepoint] {
+ &self.mapping[..self.len]
+ }
+
+ /// Returns true if and only if this decomposition mapping is canonical.
+ pub fn is_canonical(&self) -> bool {
+ self.tag.is_none()
+ }
+}
+
+impl FromStr for UnicodeDataDecomposition {
+ type Err = Error;
+
+ fn from_str(s: &str) -> Result<UnicodeDataDecomposition, Error> {
+ lazy_static! {
+ static ref WITH_TAG: Regex = Regex::new(
+ r"^(?:<(?P<tag>[^>]+)>)?\s*(?P<chars>[\s0-9A-F]+)$"
+ )
+ .unwrap();
+ static ref CHARS: Regex = Regex::new(r"[0-9A-F]+").unwrap();
+ };
+ if s.is_empty() {
+ return err!(
+ "expected non-empty string for \
+ UnicodeDataDecomposition value"
+ );
+ }
+ let caps = match WITH_TAG.captures(s) {
+ Some(caps) => caps,
+ None => return err!("invalid decomposition value"),
+ };
+ let mut decomp = UnicodeDataDecomposition::default();
+ let mut codepoints = s;
+ if let Some(m) = caps.name("tag") {
+ decomp.tag = Some(m.as_str().parse()?);
+ codepoints = &caps["chars"];
+ }
+ for m in CHARS.find_iter(codepoints) {
+ let cp = m.as_str().parse()?;
+ decomp.push(cp)?;
+ }
+ Ok(decomp)
+ }
+}
+
+impl fmt::Display for UnicodeDataDecomposition {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ if let Some(ref tag) = self.tag {
+ write!(f, "<{}> ", tag)?;
+ }
+ let mut first = true;
+ for cp in self.mapping() {
+ if !first {
+ write!(f, " ")?;
+ }
+ first = false;
+ write!(f, "{}", cp)?;
+ }
+ Ok(())
+ }
+}
+
+/// The formatting tag on a decomposition mapping.
+///
+/// This is taken from
+/// [UAX44, Table 14](http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings).
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum UnicodeDataDecompositionTag {
+ /// <font>
+ Font,
+ /// <noBreak>
+ NoBreak,
+ /// <initial>
+ Initial,
+ /// <medial>
+ Medial,
+ /// <final>
+ Final,
+ /// <isolated>
+ Isolated,
+ /// <circle>
+ Circle,
+ /// <super>
+ Super,
+ /// <sub>
+ Sub,
+ /// <vertical>
+ Vertical,
+ /// <wide>
+ Wide,
+ /// <narrow>
+ Narrow,
+ /// <small>
+ Small,
+ /// <square>
+ Square,
+ /// <fraction>
+ Fraction,
+ /// <compat>
+ Compat,
+}
+
+impl FromStr for UnicodeDataDecompositionTag {
+ type Err = Error;
+
+ fn from_str(s: &str) -> Result<UnicodeDataDecompositionTag, Error> {
+ use self::UnicodeDataDecompositionTag::*;
+ Ok(match s {
+ "font" => Font,
+ "noBreak" => NoBreak,
+ "initial" => Initial,
+ "medial" => Medial,
+ "final" => Final,
+ "isolated" => Isolated,
+ "circle" => Circle,
+ "super" => Super,
+ "sub" => Sub,
+ "vertical" => Vertical,
+ "wide" => Wide,
+ "narrow" => Narrow,
+ "small" => Small,
+ "square" => Square,
+ "fraction" => Fraction,
+ "compat" => Compat,
+ _ => return err!("invalid decomposition formatting tag: {}", s),
+ })
+ }
+}
+
+impl fmt::Display for UnicodeDataDecompositionTag {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ use self::UnicodeDataDecompositionTag::*;
+ let s = match *self {
+ Font => "font",
+ NoBreak => "noBreak",
+ Initial => "initial",
+ Medial => "medial",
+ Final => "final",
+ Isolated => "isolated",
+ Circle => "circle",
+ Super => "super",
+ Sub => "sub",
+ Vertical => "vertical",
+ Wide => "wide",
+ Narrow => "narrow",
+ Small => "small",
+ Square => "square",
+ Fraction => "fraction",
+ Compat => "compat",
+ };
+ write!(f, "{}", s)
+ }
+}
+
+/// A numeric value corresponding to characters with `Numeric_Type=Numeric`.
+///
+/// A numeric value can either be a signed integer or a rational number.
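+///
+/// As a rough, uncompiled sketch of the round trip between `FromStr` and
+/// `Display`:
+///
+/// ```ignore
+/// use ucd_parse::UnicodeDataNumeric;
+///
+/// let n: UnicodeDataNumeric = "1/4".parse().unwrap();
+/// assert_eq!(n, UnicodeDataNumeric::Rational(1, 4));
+/// assert_eq!(n.to_string(), "1/4");
+/// ```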
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum UnicodeDataNumeric {
+ /// An integer.
+ Integer(i64),
+ /// A rational number. The first is the numerator and the latter is the
+ /// denominator.
+ Rational(i64, i64),
+}
+
+impl FromStr for UnicodeDataNumeric {
+ type Err = Error;
+
+ fn from_str(s: &str) -> Result<UnicodeDataNumeric, Error> {
+ if s.is_empty() {
+ return err!(
+ "expected non-empty string for UnicodeDataNumeric value"
+ );
+ }
+ if let Some(pos) = s.find('/') {
+ let (snum, sden) = (&s[..pos], &s[pos + 1..]);
+ let num = match snum.parse() {
+ Ok(num) => num,
+ Err(err) => {
+ return err!(
+ "invalid integer numerator '{}': {}",
+ snum,
+ err
+ );
+ }
+ };
+ let den = match sden.parse() {
+ Ok(den) => den,
+ Err(err) => {
+ return err!(
+ "invalid integer denominator '{}': {}",
+ sden,
+ err
+ );
+ }
+ };
+ Ok(UnicodeDataNumeric::Rational(num, den))
+ } else {
+ match s.parse() {
+                Ok(n) => Ok(UnicodeDataNumeric::Integer(n)),
+                Err(err) => {
+                    return err!(
+                        "invalid integer value '{}': {}",
+ s,
+ err
+ );
+ }
+ }
+ }
+ }
+}
+
+impl fmt::Display for UnicodeDataNumeric {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ match *self {
+ UnicodeDataNumeric::Integer(n) => write!(f, "{}", n),
+ UnicodeDataNumeric::Rational(n, d) => write!(f, "{}/{}", n, d),
+ }
+ }
+}
+
+/// An iterator adapter that expands rows in `UnicodeData.txt`.
+///
+/// Throughout `UnicodeData.txt`, some assigned codepoints are not explicitly
+/// represented. Instead, they are represented by a pair of rows, indicating
+/// a range of codepoints with the same properties. For example, the Hangul
+/// syllable codepoints are represented by these two rows:
+///
+/// ```ignore
+/// AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
+/// D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
+/// ```
+///
+/// This iterator will wrap any iterator of `UnicodeData` and, when a range of
+/// Unicode codepoints is found, it will be expanded to the appropriate
+/// sequence of `UnicodeData` values. Note that all such expanded records will
+/// have an empty name.
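+///
+/// As a rough, uncompiled sketch (here `/path/to/ucd` stands in for a local
+/// UCD directory):
+///
+/// ```ignore
+/// use ucd_parse::{UnicodeData, UnicodeDataExpander};
+///
+/// let rows: Vec<UnicodeData> = ucd_parse::parse("/path/to/ucd").unwrap();
+/// // After expansion, every codepoint covered by a First/Last pair has its
+/// // own record (with an empty name).
+/// let expanded: Vec<UnicodeData> = UnicodeDataExpander::new(rows).collect();
+/// println!("{} assigned codepoints", expanded.len());
+/// ```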
+pub struct UnicodeDataExpander<I: Iterator> {
+ /// The underlying iterator.
+ it: iter::Peekable<I>,
+ /// A range of codepoints to emit when we've found a pair. Otherwise,
+ /// `None`.
+ range: CodepointRange,
+}
+
+struct CodepointRange {
+ /// The codepoint range.
+ range: Range<u32>,
+ /// The start record. All subsequent records in this range are generated
+ /// by cloning this and updating the codepoint/name.
+ start_record: UnicodeData,
+}
+
+impl<I: Iterator<Item = UnicodeData>> UnicodeDataExpander<I> {
+ /// Create a new iterator that expands pairs of `UnicodeData` range
+ /// records. All other records are passed through as-is.
+ pub fn new<T>(it: T) -> UnicodeDataExpander<I>
+ where
+ T: IntoIterator<IntoIter = I, Item = I::Item>,
+ {
+ UnicodeDataExpander {
+ it: it.into_iter().peekable(),
+ range: CodepointRange {
+ range: 0..0,
+ start_record: UnicodeData::default(),
+ },
+ }
+ }
+}
+
+impl<I: Iterator<Item = UnicodeData>> Iterator for UnicodeDataExpander<I> {
+ type Item = UnicodeData;
+
+ fn next(&mut self) -> Option<UnicodeData> {
+ if let Some(udata) = self.range.next() {
+ return Some(udata);
+ }
+ let row1 = match self.it.next() {
+ None => return None,
+ Some(row1) => row1,
+ };
+ if !row1.is_range_start()
+ || !self.it.peek().map_or(false, |row2| row2.is_range_end())
+ {
+ return Some(row1);
+ }
+ let row2 = self.it.next().unwrap();
+ self.range = CodepointRange {
+ range: row1.codepoint.value()..(row2.codepoint.value() + 1),
+ start_record: row1,
+ };
+ self.next()
+ }
+}
+
+impl Iterator for CodepointRange {
+ type Item = UnicodeData;
+
+ fn next(&mut self) -> Option<UnicodeData> {
+ let cp = match self.range.next() {
+ None => return None,
+ Some(cp) => cp,
+ };
+ Some(UnicodeData {
+ codepoint: Codepoint::from_u32(cp).unwrap(),
+ name: "".to_string(),
+ ..self.start_record.clone()
+ })
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::common::Codepoint;
+
+ use super::{
+ UnicodeData, UnicodeDataDecomposition, UnicodeDataDecompositionTag,
+ UnicodeDataNumeric,
+ };
+
+ fn codepoint(n: u32) -> Codepoint {
+ Codepoint::from_u32(n).unwrap()
+ }
+
+ fn s(string: &str) -> String {
+ string.to_string()
+ }
+
+ #[test]
+ fn parse1() {
+ let line = "249D;PARENTHESIZED LATIN SMALL LETTER B;So;0;L;<compat> 0028 0062 0029;;;;N;;;;;\n";
+ let data: UnicodeData = line.parse().unwrap();
+ assert_eq!(
+ data,
+ UnicodeData {
+ codepoint: codepoint(0x249d),
+ name: s("PARENTHESIZED LATIN SMALL LETTER B"),
+ general_category: s("So"),
+ canonical_combining_class: 0,
+ bidi_class: s("L"),
+ decomposition: UnicodeDataDecomposition::new(
+ Some(UnicodeDataDecompositionTag::Compat),
+ &[codepoint(0x28), codepoint(0x62), codepoint(0x29)],
+ )
+ .unwrap(),
+ numeric_type_decimal: None,
+ numeric_type_digit: None,
+ numeric_type_numeric: None,
+ bidi_mirrored: false,
+ unicode1_name: s(""),
+ iso_comment: s(""),
+ simple_uppercase_mapping: None,
+ simple_lowercase_mapping: None,
+ simple_titlecase_mapping: None,
+ }
+ );
+ }
+
+ #[test]
+ fn parse2() {
+ let line = "000D;<control>;Cc;0;B;;;;;N;CARRIAGE RETURN (CR);;;;\n";
+ let data: UnicodeData = line.parse().unwrap();
+ assert_eq!(
+ data,
+ UnicodeData {
+ codepoint: codepoint(0x000D),
+ name: s("<control>"),
+ general_category: s("Cc"),
+ canonical_combining_class: 0,
+ bidi_class: s("B"),
+ decomposition: UnicodeDataDecomposition::new(
+ None,
+ &[codepoint(0x000D)]
+ )
+ .unwrap(),
+ numeric_type_decimal: None,
+ numeric_type_digit: None,
+ numeric_type_numeric: None,
+ bidi_mirrored: false,
+ unicode1_name: s("CARRIAGE RETURN (CR)"),
+ iso_comment: s(""),
+ simple_uppercase_mapping: None,
+ simple_lowercase_mapping: None,
+ simple_titlecase_mapping: None,
+ }
+ );
+ }
+
+ #[test]
+ fn parse3() {
+ let line = "00BC;VULGAR FRACTION ONE QUARTER;No;0;ON;<fraction> 0031 2044 0034;;;1/4;N;FRACTION ONE QUARTER;;;;\n";
+ let data: UnicodeData = line.parse().unwrap();
+ assert_eq!(
+ data,
+ UnicodeData {
+ codepoint: codepoint(0x00BC),
+ name: s("VULGAR FRACTION ONE QUARTER"),
+ general_category: s("No"),
+ canonical_combining_class: 0,
+ bidi_class: s("ON"),
+ decomposition: UnicodeDataDecomposition::new(
+ Some(UnicodeDataDecompositionTag::Fraction),
+ &[codepoint(0x31), codepoint(0x2044), codepoint(0x34)],
+ )
+ .unwrap(),
+ numeric_type_decimal: None,
+ numeric_type_digit: None,
+ numeric_type_numeric: Some(UnicodeDataNumeric::Rational(1, 4)),
+ bidi_mirrored: false,
+ unicode1_name: s("FRACTION ONE QUARTER"),
+ iso_comment: s(""),
+ simple_uppercase_mapping: None,
+ simple_lowercase_mapping: None,
+ simple_titlecase_mapping: None,
+ }
+ );
+ }
+
+ #[test]
+ fn parse4() {
+ let line = "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;\n";
+ let data: UnicodeData = line.parse().unwrap();
+ assert_eq!(
+ data,
+ UnicodeData {
+ codepoint: codepoint(0x0041),
+ name: s("LATIN CAPITAL LETTER A"),
+ general_category: s("Lu"),
+ canonical_combining_class: 0,
+ bidi_class: s("L"),
+ decomposition: UnicodeDataDecomposition::new(
+ None,
+ &[codepoint(0x0041)]
+ )
+ .unwrap(),
+ numeric_type_decimal: None,
+ numeric_type_digit: None,
+ numeric_type_numeric: None,
+ bidi_mirrored: false,
+ unicode1_name: s(""),
+ iso_comment: s(""),
+ simple_uppercase_mapping: None,
+ simple_lowercase_mapping: Some(codepoint(0x0061)),
+ simple_titlecase_mapping: None,
+ }
+ );
+ }
+
+ #[test]
+ fn parse5() {
+ let line = "0F33;TIBETAN DIGIT HALF ZERO;No;0;L;;;;-1/2;N;;;;;\n";
+ let data: UnicodeData = line.parse().unwrap();
+ assert_eq!(
+ data,
+ UnicodeData {
+ codepoint: codepoint(0x0F33),
+ name: s("TIBETAN DIGIT HALF ZERO"),
+ general_category: s("No"),
+ canonical_combining_class: 0,
+ bidi_class: s("L"),
+ decomposition: UnicodeDataDecomposition::new(
+ None,
+ &[codepoint(0x0F33)]
+ )
+ .unwrap(),
+ numeric_type_decimal: None,
+ numeric_type_digit: None,
+ numeric_type_numeric: Some(UnicodeDataNumeric::Rational(
+ -1, 2
+ )),
+ bidi_mirrored: false,
+ unicode1_name: s(""),
+ iso_comment: s(""),
+ simple_uppercase_mapping: None,
+ simple_lowercase_mapping: None,
+ simple_titlecase_mapping: None,
+ }
+ );
+ }
+
+ #[test]
+ fn expander() {
+ use super::UnicodeDataExpander;
+ use crate::common::UcdLineParser;
+
+ let data = "\
+ABF9;MEETEI MAYEK DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
+AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
+D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
+D7B0;HANGUL JUNGSEONG O-YEO;Lo;0;L;;;;;N;;;;;
+";
+ let records = UcdLineParser::new(None, data.as_bytes())
+ .collect::<Result<Vec<_>, _>>()
+ .unwrap();
+ assert_eq!(UnicodeDataExpander::new(records).count(), 11174);
+ }
+}
diff --git a/vendor/ucd-parse/src/word_break.rs b/vendor/ucd-parse/src/word_break.rs
new file mode 100644
index 000000000..57d512667
--- /dev/null
+++ b/vendor/ucd-parse/src/word_break.rs
@@ -0,0 +1,103 @@
+use std::path::Path;
+use std::str::FromStr;
+
+use crate::common::{
+ parse_break_test, parse_codepoint_association, CodepointIter, Codepoints,
+ UcdFile, UcdFileByCodepoint,
+};
+use crate::error::Error;
+
+/// A single row in the `auxiliary/WordBreakProperty.txt` file.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct WordBreak {
+ /// The codepoint or codepoint range for this entry.
+ pub codepoints: Codepoints,
+ /// The property value assigned to the codepoints in this entry.
+ pub value: String,
+}
+
+impl UcdFile for WordBreak {
+ fn relative_file_path() -> &'static Path {
+ Path::new("auxiliary/WordBreakProperty.txt")
+ }
+}
+
+impl UcdFileByCodepoint for WordBreak {
+ fn codepoints(&self) -> CodepointIter {
+ self.codepoints.into_iter()
+ }
+}
+
+impl FromStr for WordBreak {
+ type Err = Error;
+
+ fn from_str(line: &str) -> Result<WordBreak, Error> {
+ let (codepoints, value) = parse_codepoint_association(line)?;
+ Ok(WordBreak { codepoints, value: value.to_string() })
+ }
+}
+
+/// A single row in the `auxiliary/WordBreakTest.txt` file.
+///
+/// This file defines tests for the word break algorithm.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct WordBreakTest {
+ /// Each string is a UTF-8 encoded group of codepoints that make up a
+ /// single word.
+ pub words: Vec<String>,
+ /// A human readable description of this test.
+ pub comment: String,
+}
+
+impl UcdFile for WordBreakTest {
+ fn relative_file_path() -> &'static Path {
+ Path::new("auxiliary/WordBreakTest.txt")
+ }
+}
+
+impl FromStr for WordBreakTest {
+ type Err = Error;
+
+ fn from_str(line: &str) -> Result<WordBreakTest, Error> {
+ let (groups, comment) = parse_break_test(line)?;
+ Ok(WordBreakTest { words: groups, comment })
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::{WordBreak, WordBreakTest};
+
+ #[test]
+ fn parse_single() {
+ let line = "0A83 ; Extend # Mc GUJARATI SIGN VISARGA\n";
+ let row: WordBreak = line.parse().unwrap();
+ assert_eq!(row.codepoints, 0x0A83);
+ assert_eq!(row.value, "Extend");
+ }
+
+ #[test]
+ fn parse_range() {
+ let line = "104A0..104A9 ; Numeric # Nd [10] OSMANYA DIGIT ZERO..OSMANYA DIGIT NINE\n";
+ let row: WordBreak = line.parse().unwrap();
+ assert_eq!(row.codepoints, (0x104A0, 0x104A9));
+ assert_eq!(row.value, "Numeric");
+ }
+
+ #[test]
+ fn parse_test() {
+ let line = "÷ 0031 ÷ 0027 × 0308 ÷ 0061 ÷ 0027 × 2060 ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] APOSTROPHE (Single_Quote) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [999.0] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (Single_Quote) × [4.0] WORD JOINER (Format_FE) ÷ [0.3]";
+
+ let row: WordBreakTest = line.parse().unwrap();
+ assert_eq!(
+ row.words,
+ vec![
+ "\u{0031}",
+ "\u{0027}\u{0308}",
+ "\u{0061}",
+ "\u{0027}\u{2060}",
+ ]
+ );
+ assert!(row.comment.contains("[4.0] COMBINING DIAERESIS (Extend_FE)"));
+ }
+}