diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 12:02:58 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 12:02:58 +0000 |
commit | 698f8c2f01ea549d77d7dc3338a12e04c11057b9 (patch) | |
tree | 173a775858bd501c378080a10dca74132f05bc50 /vendor/ucd-parse | |
parent | Initial commit. (diff) | |
download | rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.tar.xz rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.zip |
Adding upstream version 1.64.0+dfsg1.upstream/1.64.0+dfsg1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/ucd-parse')
27 files changed, 3642 insertions, 0 deletions
diff --git a/vendor/ucd-parse/.cargo-checksum.json b/vendor/ucd-parse/.cargo-checksum.json new file mode 100644 index 000000000..34cd1d5b5 --- /dev/null +++ b/vendor/ucd-parse/.cargo-checksum.json @@ -0,0 +1 @@ +{"files":{"Cargo.toml":"3a23e75f3807a38f86e8564a139135970f38c9ebc448749682b75fd4096f6d4a","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"0f96a83840e146e43c0ec96a22ec1f392e0680e6c1226e6f3ba87e0740af850f","README.md":"5af803e482641f01332bde35cc8137211714b6f100122ec548c9712a09aead55","src/age.rs":"13a9a01b2373e9eff06b547543479394843cb9103c200b3e666ca5e408369bc9","src/arabic_shaping.rs":"31075e05b33248540f10ae5a3bb14715965e109b2be40cd9c0735810903ce29b","src/bidi_mirroring_glyph.rs":"945a110e0f54eabc2f48719893da300c11b4fd1f28265ab8f7b32ce2e5e3f6e5","src/case_folding.rs":"1ec85e0fa8e8cb0315974b843d168d9cddecad40efcf8ce78de677c0f0417f34","src/common.rs":"40782238affb569c9bd89a7ce19202677ba3e1da0bb5c8f8c4439adaa375858b","src/core_properties.rs":"24b261ed0bc4b7443734d843cda58433c7727914524ac4c3cc46fc153463e8cd","src/emoji_properties.rs":"bdb24a301661592d0956db2ad945a86778e0ad8f86cd82077835bb0d2a4f144c","src/error.rs":"6df32d4c5cc9819832083f465aa4ce11d26d3b44e37a9d4274a45fd8e1314903","src/grapheme_cluster_break.rs":"f63f75f1a5a82b698d4a840b063bc650f2b2f64429830dc338c9723bf1368e0b","src/jamo_short_name.rs":"02dc272c1a7d01de5e22737a76327b94ae2d132703dbc0657e3e887ceb1d1d91","src/lib.rs":"894ecd08e4588e14de69e8b9d25e9a38e9e2f73e260855c99df13c2ee1d825d3","src/line_break.rs":"1def7f73d44c1703fd18dbd9c9fc8dd76edabed27a5061564d6521d59335a95c","src/name_aliases.rs":"497629a0499d048b0f1615c619975f149c6a1d97361b7ff16850a8291796c90d","src/prop_list.rs":"856f00f51e7e6b9b0386a9b3941582eba63eb96896c86e58a791384a1235fdec","src/property_aliases.rs":"7b6da97e45a898499f29e30346f1b7aa6b7d758184a3bfa4f0b816d20edc9851","src/property_value_aliases.rs":"4e9fbad2b32ad636e5f8dfefa082352e444e4a68822a7786ea7d4217e7afd2fb","src/script_extensions
.rs":"d967e213122702df642c975765fec28811ae8351f6f5307ca67989bf0b456fba","src/scripts.rs":"04740c080bb48e99d84622e4708215b40abdd387c70347d6b264b9c7fcbbac37","src/sentence_break.rs":"ac54a7f09f75694582904509d979c61784fa1ec647e4d531ea1b283bc3082635","src/special_casing.rs":"de7ed50ec34a222c73e8ad6d82a2a658b4475ce312301c5110d07fa13e51cb0b","src/unicode_data.rs":"cad99e17c6d56c9029416a0f3ec1b469786864eace2a20f212f2b4a1c96b59f1","src/word_break.rs":"eea514f238dc9dea82f52efc3154fde3f215b068dd201b22c31ef1c0acf1fba3"},"package":"5269f8d35df6b8b60758343a6d742ecf09e4bca13faee32af5503aebd1e11b7c"}
\ No newline at end of file diff --git a/vendor/ucd-parse/Cargo.toml b/vendor/ucd-parse/Cargo.toml new file mode 100644 index 000000000..f7efa0970 --- /dev/null +++ b/vendor/ucd-parse/Cargo.toml @@ -0,0 +1,31 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies +# +# If you believe there's an error in this file please file an +# issue against the rust-lang/cargo repository. If you're +# editing this file be aware that the upstream Cargo.toml +# will likely look very different (and much more reasonable) + +[package] +edition = "2018" +name = "ucd-parse" +version = "0.1.8" +authors = ["Andrew Gallant <jamslam@gmail.com>"] +description = "A library for parsing data files in the Unicode character database.\n" +homepage = "https://github.com/BurntSushi/ucd-generate" +documentation = "https://docs.rs/ucd-parse" +readme = "README.md" +keywords = ["unicode", "database", "character", "property"] +license = "MIT/Apache-2.0" +repository = "https://github.com/BurntSushi/ucd-generate" +[dependencies.lazy_static] +version = "1" + +[dependencies.regex] +version = "1" +features = ["std", "unicode"] +default-features = false diff --git a/vendor/ucd-parse/LICENSE-APACHE b/vendor/ucd-parse/LICENSE-APACHE new file mode 100644 index 000000000..16fe87b06 --- /dev/null +++ b/vendor/ucd-parse/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the 
following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/vendor/ucd-parse/LICENSE-MIT b/vendor/ucd-parse/LICENSE-MIT new file mode 100644 index 000000000..3b0a5dc09 --- /dev/null +++ b/vendor/ucd-parse/LICENSE-MIT @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2015 Andrew Gallant + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/vendor/ucd-parse/README.md b/vendor/ucd-parse/README.md new file mode 100644 index 000000000..dc3f78dc6 --- /dev/null +++ b/vendor/ucd-parse/README.md @@ -0,0 +1,22 @@ +ucd-parse +========= +A library for parsing Unicode Character Database (UCD) files into structured +data. + +[![Linux build status](https://api.travis-ci.org/BurntSushi/ucd-generate.png)](https://travis-ci.org/BurntSushi/ucd-generate) +[![](http://meritbadge.herokuapp.com/ucd-generate)](https://crates.io/crates/ucd-parse) + + +### Documentation + +https://docs.rs/ucd-parse + + +### License + +This project is licensed under either of + * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or + http://www.apache.org/licenses/LICENSE-2.0) + * MIT license ([LICENSE-MIT](LICENSE-MIT) or + http://opensource.org/licenses/MIT) +at your option. diff --git a/vendor/ucd-parse/src/age.rs b/vendor/ucd-parse/src/age.rs new file mode 100644 index 000000000..3c93f0707 --- /dev/null +++ b/vendor/ucd-parse/src/age.rs @@ -0,0 +1,59 @@ +use std::path::Path; +use std::str::FromStr; + +use crate::common::{ + parse_codepoint_association, CodepointIter, Codepoints, UcdFile, + UcdFileByCodepoint, +}; +use crate::error::Error; + +/// A single row in the `DerivedAge.txt` file. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct Age { + /// The codepoint or codepoint range for this entry. + pub codepoints: Codepoints, + /// The age assigned to the codepoints in this entry. 
+ pub age: String, +} + +impl UcdFile for Age { + fn relative_file_path() -> &'static Path { + Path::new("DerivedAge.txt") + } +} + +impl UcdFileByCodepoint for Age { + fn codepoints(&self) -> CodepointIter { + self.codepoints.into_iter() + } +} + +impl FromStr for Age { + type Err = Error; + + fn from_str(line: &str) -> Result<Age, Error> { + let (codepoints, script) = parse_codepoint_association(line)?; + Ok(Age { codepoints, age: script.to_string() }) + } +} + +#[cfg(test)] +mod tests { + use super::Age; + + #[test] + fn parse_single() { + let line = "2BD2 ; 10.0 # GROUP MARK\n"; + let row: Age = line.parse().unwrap(); + assert_eq!(row.codepoints, 0x2BD2); + assert_eq!(row.age, "10.0"); + } + + #[test] + fn parse_range() { + let line = "11D0B..11D36 ; 10.0 # [44] MASARAM GONDI LETTER AU..MASARAM GONDI VOWEL SIGN VOCALIC R\n"; + let row: Age = line.parse().unwrap(); + assert_eq!(row.codepoints, (0x11D0B, 0x11D36)); + assert_eq!(row.age, "10.0"); + } +} diff --git a/vendor/ucd-parse/src/arabic_shaping.rs b/vendor/ucd-parse/src/arabic_shaping.rs new file mode 100644 index 000000000..d1d942a82 --- /dev/null +++ b/vendor/ucd-parse/src/arabic_shaping.rs @@ -0,0 +1,184 @@ +use std::path::Path; +use std::str::FromStr; + +use lazy_static::lazy_static; +use regex::Regex; + +use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint}; +use crate::error::Error; + +/// Represents a single row in the `ArabicShaping.txt` file. +/// +/// The field names were taken from the header of ArabicShaping.txt. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct ArabicShaping { + /// The codepoint corresponding to this row. + pub codepoint: Codepoint, + /// A short schematic name for the codepoint. + /// + /// The schematic name is descriptive of the shape, based as consistently as + /// possible on a name for the skeleton and then the diacritic marks applied + /// to the skeleton, if any. 
Note that this schematic name is considered a + /// comment, and does not constitute a formal property value. + pub schematic_name: String, + /// The "joining type" of this codepoint. + pub joining_type: JoiningType, + /// The "joining group" of this codepoint. + pub joining_group: String, +} + +/// The Joining_Type field read from ArabicShaping.txt +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum JoiningType { + RightJoining, + LeftJoining, + DualJoining, + JoinCausing, + NonJoining, + Transparent, +} + +impl JoiningType { + pub fn as_str(&self) -> &str { + match self { + JoiningType::RightJoining => "R", + JoiningType::LeftJoining => "L", + JoiningType::DualJoining => "D", + JoiningType::JoinCausing => "C", + JoiningType::NonJoining => "U", + JoiningType::Transparent => "T", + } + } +} + +impl Default for JoiningType { + fn default() -> JoiningType { + JoiningType::NonJoining + } +} + +impl FromStr for JoiningType { + type Err = Error; + + fn from_str(s: &str) -> Result<JoiningType, Error> { + match s { + "R" => Ok(JoiningType::RightJoining), + "L" => Ok(JoiningType::LeftJoining), + "D" => Ok(JoiningType::DualJoining), + "C" => Ok(JoiningType::JoinCausing), + "U" => Ok(JoiningType::NonJoining), + "T" => Ok(JoiningType::Transparent), + _ => err!( + "unrecognized joining type: '{}' \ + (must be one of R, L, D, C, U or T)", + s + ), + } + } +} + +impl UcdFile for ArabicShaping { + fn relative_file_path() -> &'static Path { + Path::new("ArabicShaping.txt") + } +} + +impl UcdFileByCodepoint for ArabicShaping { + fn codepoints(&self) -> CodepointIter { + self.codepoint.into_iter() + } +} + +impl FromStr for ArabicShaping { + type Err = Error; + + fn from_str(line: &str) -> Result<ArabicShaping, Error> { + lazy_static! 
{ + static ref PARTS: Regex = Regex::new( + r"(?x) + ^ + \s*(?P<codepoint>[A-F0-9]+)\s*; + \s*(?P<name>[^;]+)\s*; + \s*(?P<joining_type>[^;]+)\s*; + \s*(?P<joining_group>[^;]+) + $ + " + ) + .unwrap(); + }; + let caps = match PARTS.captures(line.trim()) { + Some(caps) => caps, + None => return err!("invalid ArabicShaping line"), + }; + + Ok(ArabicShaping { + codepoint: caps["codepoint"].parse()?, + schematic_name: caps["name"].to_string(), + joining_type: caps["joining_type"].parse()?, + joining_group: caps["joining_group"].to_string(), + }) + } +} + +#[cfg(test)] +mod tests { + use crate::common::Codepoint; + + use super::{ArabicShaping, JoiningType}; + + fn codepoint(n: u32) -> Codepoint { + Codepoint::from_u32(n).unwrap() + } + + fn s(string: &str) -> String { + string.to_string() + } + + #[test] + fn parse1() { + let line = "0600; ARABIC NUMBER SIGN; U; No_Joining_Group\n"; + let data: ArabicShaping = line.parse().unwrap(); + assert_eq!( + data, + ArabicShaping { + codepoint: codepoint(0x0600), + schematic_name: s("ARABIC NUMBER SIGN"), + joining_type: JoiningType::NonJoining, + joining_group: s("No_Joining_Group") + } + ); + } + + #[test] + fn parse2() { + let line = "063D; FARSI YEH WITH INVERTED V ABOVE; D; FARSI YEH\n"; + let data: ArabicShaping = line.parse().unwrap(); + assert_eq!( + data, + ArabicShaping { + codepoint: codepoint(0x063D), + schematic_name: s("FARSI YEH WITH INVERTED V ABOVE"), + joining_type: JoiningType::DualJoining, + joining_group: s("FARSI YEH") + } + ); + } + + #[test] + fn parse3() { + let line = + "10D23; HANIFI ROHINGYA DOTLESS KINNA YA WITH DOT ABOVE; D; HANIFI ROHINGYA KINNA YA\n"; + let data: ArabicShaping = line.parse().unwrap(); + assert_eq!( + data, + ArabicShaping { + codepoint: codepoint(0x10D23), + schematic_name: s( + "HANIFI ROHINGYA DOTLESS KINNA YA WITH DOT ABOVE" + ), + joining_type: JoiningType::DualJoining, + joining_group: s("HANIFI ROHINGYA KINNA YA") + } + ); + } +} diff --git 
a/vendor/ucd-parse/src/bidi_mirroring_glyph.rs b/vendor/ucd-parse/src/bidi_mirroring_glyph.rs new file mode 100644 index 000000000..fcfefffcb --- /dev/null +++ b/vendor/ucd-parse/src/bidi_mirroring_glyph.rs @@ -0,0 +1,107 @@ +use std::fmt; +use std::path::Path; +use std::str::FromStr; + +use lazy_static::lazy_static; +use regex::Regex; + +use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint}; +use crate::error::Error; + +/// Represents a single row in the `BidiMirroring.txt` file. +/// +/// The field names were taken from the header of BidiMirroring.txt. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct BidiMirroring { + /// The codepoint corresponding to this row. + pub codepoint: Codepoint, + /// The codepoint that has typically has a glyph that is the mirror image + /// of `codepoint`. + pub bidi_mirroring_glyph: Codepoint, +} + +impl UcdFile for BidiMirroring { + fn relative_file_path() -> &'static Path { + Path::new("BidiMirroring.txt") + } +} + +impl UcdFileByCodepoint for BidiMirroring { + fn codepoints(&self) -> CodepointIter { + self.codepoint.into_iter() + } +} + +impl FromStr for BidiMirroring { + type Err = Error; + + fn from_str(line: &str) -> Result<BidiMirroring, Error> { + lazy_static! 
{ + static ref PARTS: Regex = Regex::new( + r"(?x) + ^ + \s*(?P<codepoint>[A-F0-9]+)\s*; + \s*(?P<substitute_codepoint>[A-F0-9]+) + \s+ + \#(?:.+) + $ + " + ) + .unwrap(); + }; + let caps = match PARTS.captures(line.trim()) { + Some(caps) => caps, + None => return err!("invalid BidiMirroring line"), + }; + + Ok(BidiMirroring { + codepoint: caps["codepoint"].parse()?, + bidi_mirroring_glyph: caps["substitute_codepoint"].parse()?, + }) + } +} + +impl fmt::Display for BidiMirroring { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{};", self.codepoint)?; + write!(f, "{};", self.bidi_mirroring_glyph)?; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use crate::common::Codepoint; + + use super::BidiMirroring; + + fn codepoint(n: u32) -> Codepoint { + Codepoint::from_u32(n).unwrap() + } + + #[test] + fn parse() { + let line = "0028; 0029 # LEFT PARENTHESIS\n"; + let data: BidiMirroring = line.parse().unwrap(); + assert_eq!( + data, + BidiMirroring { + codepoint: codepoint(0x0028), + bidi_mirroring_glyph: codepoint(0x0029), + } + ); + } + + #[test] + fn parse_best_fit() { + let line = "228A; 228B # [BEST FIT] SUBSET OF WITH NOT EQUAL TO\n"; + let data: BidiMirroring = line.parse().unwrap(); + assert_eq!( + data, + BidiMirroring { + codepoint: codepoint(0x228A), + bidi_mirroring_glyph: codepoint(0x228B), + } + ); + } +} diff --git a/vendor/ucd-parse/src/case_folding.rs b/vendor/ucd-parse/src/case_folding.rs new file mode 100644 index 000000000..813fc81a1 --- /dev/null +++ b/vendor/ucd-parse/src/case_folding.rs @@ -0,0 +1,161 @@ +use std::path::Path; +use std::str::FromStr; + +use lazy_static::lazy_static; +use regex::Regex; + +use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint}; +use crate::error::Error; + +/// A single row in the `CaseFolding.txt` file. +/// +/// The contents of `CaseFolding.txt` are a convenience derived from both +/// `UnicodeData.txt` and `SpecialCasing.txt`. 
+/// +/// Note that a single codepoint may be mapped multiple times. In particular, +/// a single codepoint might have distinct `CaseStatus::Simple` and +/// `CaseStatus::Full` mappings. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct CaseFold { + /// The codepoint that is being mapped. + pub codepoint: Codepoint, + /// The case status of this mapping. + pub status: CaseStatus, + /// The actual case mapping, which is more than one codepoint if this is + /// a "full" mapping. + pub mapping: Vec<Codepoint>, +} + +impl UcdFile for CaseFold { + fn relative_file_path() -> &'static Path { + Path::new("CaseFolding.txt") + } +} + +impl UcdFileByCodepoint for CaseFold { + fn codepoints(&self) -> CodepointIter { + self.codepoint.into_iter() + } +} + +impl FromStr for CaseFold { + type Err = Error; + + fn from_str(line: &str) -> Result<CaseFold, Error> { + lazy_static! { + static ref PARTS: Regex = Regex::new( + r"(?x) + ^ + \s*(?P<codepoint>[^\s;]+)\s*; + \s*(?P<status>[^\s;]+)\s*; + \s*(?P<mapping>[^;]+)\s*; + " + ) + .unwrap(); + }; + + let caps = match PARTS.captures(line.trim()) { + Some(caps) => caps, + None => return err!("invalid CaseFolding line: '{}'", line), + }; + let mut mapping = vec![]; + for cp in caps["mapping"].split_whitespace() { + mapping.push(cp.parse()?); + } + Ok(CaseFold { + codepoint: caps["codepoint"].parse()?, + status: caps["status"].parse()?, + mapping, + }) + } +} + +/// The status of a particular case mapping. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum CaseStatus { + /// Case mappings shared by both "simple" and "full" mappings. + Common, + /// A case mapping that changes the number of codepoints. + Full, + /// A case mapping that doesn't change the number of codepoints, when it + /// differs from `Full`. + Simple, + /// Special cases (currently only for Turkic mappings) that are typically + /// excluded by default. 
Special cases don't change the number of + /// codepoints, but may changed the encoding (e.g., UTF-8) length in bytes. + Special, +} + +impl Default for CaseStatus { + fn default() -> CaseStatus { + CaseStatus::Common + } +} + +impl CaseStatus { + /// Returns true if and only if this status indicates a case mapping that + /// won't change the number of codepoints. + pub fn is_fixed(&self) -> bool { + *self != CaseStatus::Full + } +} + +impl FromStr for CaseStatus { + type Err = Error; + + fn from_str(s: &str) -> Result<CaseStatus, Error> { + match s { + "C" => Ok(CaseStatus::Common), + "F" => Ok(CaseStatus::Full), + "S" => Ok(CaseStatus::Simple), + "T" => Ok(CaseStatus::Special), + _ => err!( + "unrecognized case status: '{}' \ + (must be one of C, F, S or T)", + s + ), + } + } +} + +#[cfg(test)] +mod tests { + use super::{CaseFold, CaseStatus}; + + #[test] + fn parse_common() { + let line = + "0150; C; 0151; # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE\n"; + let row: CaseFold = line.parse().unwrap(); + assert_eq!(row.codepoint, 0x0150); + assert_eq!(row.status, CaseStatus::Common); + assert_eq!(row.mapping, vec![0x0151]); + } + + #[test] + fn parse_full() { + let line = "03B0; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS\n"; + let row: CaseFold = line.parse().unwrap(); + assert_eq!(row.codepoint, 0x03B0); + assert_eq!(row.status, CaseStatus::Full); + assert_eq!(row.mapping, vec![0x03C5, 0x0308, 0x0301]); + } + + #[test] + fn parse_simple() { + let line = "1F8F; S; 1F87; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI\n"; + let row: CaseFold = line.parse().unwrap(); + assert_eq!(row.codepoint, 0x1F8F); + assert_eq!(row.status, CaseStatus::Simple); + assert_eq!(row.mapping, vec![0x1F87]); + } + + #[test] + fn parse_special() { + let line = "0049; T; 0131; # LATIN CAPITAL LETTER I\n"; + let row: CaseFold = line.parse().unwrap(); + assert_eq!(row.codepoint, 0x0049); + assert_eq!(row.status, CaseStatus::Special); + 
assert_eq!(row.mapping, vec![0x0131]); + } +} diff --git a/vendor/ucd-parse/src/common.rs b/vendor/ucd-parse/src/common.rs new file mode 100644 index 000000000..c18be668e --- /dev/null +++ b/vendor/ucd-parse/src/common.rs @@ -0,0 +1,594 @@ +use std::char; +use std::collections::BTreeMap; +use std::fmt; +use std::fs::File; +use std::io::{self, BufRead}; +use std::marker::PhantomData; +use std::path::{Path, PathBuf}; +use std::str::FromStr; + +use lazy_static::lazy_static; +use regex::Regex; + +use crate::error::{Error, ErrorKind}; + +/// Parse a particular file in the UCD into a sequence of rows. +/// +/// The given directory should be the directory to the UCD. +pub fn parse<P, D>(ucd_dir: P) -> Result<Vec<D>, Error> +where + P: AsRef<Path>, + D: UcdFile, +{ + let mut xs = vec![]; + for result in D::from_dir(ucd_dir)? { + let x = result?; + xs.push(x); + } + Ok(xs) +} + +/// Parse a particular file in the UCD into a map from codepoint to the record. +/// +/// The given directory should be the directory to the UCD. +pub fn parse_by_codepoint<P, D>( + ucd_dir: P, +) -> Result<BTreeMap<Codepoint, D>, Error> +where + P: AsRef<Path>, + D: UcdFileByCodepoint, +{ + let mut map = BTreeMap::new(); + for result in D::from_dir(ucd_dir)? { + let x = result?; + for cp in x.codepoints() { + map.insert(cp, x.clone()); + } + } + Ok(map) +} + +/// Parse a particular file in the UCD into a map from codepoint to all +/// records associated with that codepoint. +/// +/// This is useful for files that have multiple records for each codepoint. +/// For example, the `NameAliases.txt` file lists multiple aliases for some +/// codepoints. +/// +/// The given directory should be the directory to the UCD. +pub fn parse_many_by_codepoint<P, D>( + ucd_dir: P, +) -> Result<BTreeMap<Codepoint, Vec<D>>, Error> +where + P: AsRef<Path>, + D: UcdFileByCodepoint, +{ + let mut map = BTreeMap::new(); + for result in D::from_dir(ucd_dir)? 
{ + let x = result?; + for cp in x.codepoints() { + map.entry(cp).or_insert(vec![]).push(x.clone()); + } + } + Ok(map) +} + +/// Given a path pointing at the root of the `ucd_dir`, attempts to determine +/// it's unicode version. +/// +/// This just checks the readme and the very first line of PropList.txt -- in +/// practice this works for all versions of UCD since 4.1.0. +pub fn ucd_directory_version<D: ?Sized + AsRef<Path>>( + ucd_dir: &D, +) -> Result<(u64, u64, u64), Error> { + // Avoid duplication from generic path parameter. + fn ucd_directory_version_inner( + ucd_dir: &Path, + ) -> Result<(u64, u64, u64), Error> { + lazy_static::lazy_static! { + static ref VERSION_RX: Regex = + Regex::new(r"-([0-9]+).([0-9]+).([0-9]+).txt").unwrap(); + } + + let proplist = ucd_dir.join("PropList.txt"); + let contents = first_line(&proplist)?; + let caps = match VERSION_RX.captures(&contents) { + Some(c) => c, + None => { + return err!("Failed to find version in line {:?}", contents) + } + }; + + let capture_to_num = |n| { + caps.get(n).unwrap().as_str().parse::<u64>().map_err(|e| Error { + kind: ErrorKind::Parse(format!( + "Failed to parse version from {:?} in PropList.txt: {}", + contents, e + )), + line: Some(0), + path: Some(proplist.clone()), + }) + }; + let major = capture_to_num(1)?; + let minor = capture_to_num(2)?; + let patch = capture_to_num(3)?; + + Ok((major, minor, patch)) + } + ucd_directory_version_inner(ucd_dir.as_ref()) +} + +fn first_line(path: &Path) -> Result<String, Error> { + let file = std::fs::File::open(path).map_err(|e| Error { + kind: ErrorKind::Io(e), + line: None, + path: Some(path.into()), + })?; + + let mut reader = std::io::BufReader::new(file); + let mut line_contents = String::new(); + reader.read_line(&mut line_contents).map_err(|e| Error { + kind: ErrorKind::Io(e), + line: None, + path: Some(path.into()), + })?; + Ok(line_contents) +} + +/// A helper function for parsing a common record format that associates one +/// or more codepoints 
with a string value. +pub fn parse_codepoint_association<'a>( + line: &'a str, +) -> Result<(Codepoints, &'a str), Error> { + lazy_static! { + static ref PARTS: Regex = Regex::new( + r"(?x) + ^ + \s*(?P<codepoints>[^\s;]+)\s*; + \s*(?P<property>[^;\x23]+)\s* + " + ) + .unwrap(); + }; + + let caps = match PARTS.captures(line.trim()) { + Some(caps) => caps, + None => return err!("invalid PropList line: '{}'", line), + }; + let property = match caps.name("property") { + Some(property) => property.as_str().trim(), + None => { + return err!( + "could not find property name in PropList line: '{}'", + line + ) + } + }; + Ok((caps["codepoints"].parse()?, property)) +} + +/// A helper function for parsing a sequence of space separated codepoints. +/// The sequence is permitted to be empty. +pub fn parse_codepoint_sequence(s: &str) -> Result<Vec<Codepoint>, Error> { + let mut cps = vec![]; + for cp in s.trim().split_whitespace() { + cps.push(cp.parse()?); + } + Ok(cps) +} + +/// A helper function for parsing a single test for the various break +/// algorithms. +/// +/// Upon success, this returns the UTF-8 encoded groups of codepoints along +/// with the comment associated with the test. The comment is a human readable +/// description of the test that may prove useful for debugging. +pub fn parse_break_test(line: &str) -> Result<(Vec<String>, String), Error> { + lazy_static! 
{ + static ref PARTS: Regex = Regex::new( + r"(?x) + ^ + (?:÷|×) + (?P<groups>(?:\s[0-9A-Fa-f]{4,5}\s(?:÷|×))+) + \s+ + \#(?P<comment>.+) + $ + " + ) + .unwrap(); + static ref GROUP: Regex = Regex::new( + r"(?x) + (?P<codepoint>[0-9A-Fa-f]{4,5})\s(?P<kind>÷|×) + " + ) + .unwrap(); + } + + let caps = match PARTS.captures(line.trim()) { + Some(caps) => caps, + None => return err!("invalid break test line: '{}'", line), + }; + let comment = caps["comment"].trim().to_string(); + + let mut groups = vec![]; + let mut cur = String::new(); + for cap in GROUP.captures_iter(&caps["groups"]) { + let cp: Codepoint = cap["codepoint"].parse()?; + let ch = match cp.scalar() { + Some(ch) => ch, + None => { + return err!( + "invalid codepoint '{:X}' in line: '{}'", + cp.value(), + line + ) + } + }; + cur.push(ch); + if &cap["kind"] == "÷" { + groups.push(cur); + cur = String::new(); + } + } + Ok((groups, comment)) +} + +/// Describes a single UCD file. +pub trait UcdFile: + Clone + fmt::Debug + Default + Eq + FromStr<Err = Error> + PartialEq +{ + /// The file path corresponding to this file, relative to the UCD + /// directory. + fn relative_file_path() -> &'static Path; + + /// The full file path corresponding to this file given the UCD directory + /// path. + fn file_path<P: AsRef<Path>>(ucd_dir: P) -> PathBuf { + ucd_dir.as_ref().join(Self::relative_file_path()) + } + + /// Create an iterator over each record in this UCD file. + /// + /// The parameter should correspond to the directory containing the UCD. + fn from_dir<P: AsRef<Path>>( + ucd_dir: P, + ) -> Result<UcdLineParser<File, Self>, Error> { + UcdLineParser::from_path(Self::file_path(ucd_dir)) + } +} + +/// Describes a single UCD file where every record in the file is associated +/// with one or more codepoints. +pub trait UcdFileByCodepoint: UcdFile { + /// Returns the codepoints associated with this record. + fn codepoints(&self) -> CodepointIter; +} + +/// A line oriented parser for a particular UCD file. 
+/// +/// Callers can build a line parser via the +/// [`UcdFile::from_dir`](trait.UcdFile.html) method. +/// +/// The `R` type parameter refers to the underlying `io::Read` implementation +/// from which the UCD data is read. +/// +/// The `D` type parameter refers to the type of the record parsed out of each +/// line. +#[derive(Debug)] +pub struct UcdLineParser<R, D> { + path: Option<PathBuf>, + rdr: io::BufReader<R>, + line: String, + line_number: u64, + _data: PhantomData<D>, +} + +impl<D> UcdLineParser<File, D> { + /// Create a new parser from the given file path. + pub(crate) fn from_path<P: AsRef<Path>>( + path: P, + ) -> Result<UcdLineParser<File, D>, Error> { + let path = path.as_ref(); + let file = File::open(path).map_err(|e| Error { + kind: ErrorKind::Io(e), + line: None, + path: Some(path.to_path_buf()), + })?; + Ok(UcdLineParser::new(Some(path.to_path_buf()), file)) + } +} + +impl<R: io::Read, D> UcdLineParser<R, D> { + /// Create a new parser that parses the reader given. + /// + /// The type of data parsed is determined when the `parse_next` function + /// is called by virtue of the type requested. + /// + /// Note that the reader is buffered internally, so the caller does not + /// need to provide their own buffering. 
+ pub(crate) fn new(path: Option<PathBuf>, rdr: R) -> UcdLineParser<R, D> { + UcdLineParser { + path, + rdr: io::BufReader::new(rdr), + line: String::new(), + line_number: 0, + _data: PhantomData, + } + } +} + +impl<R: io::Read, D: FromStr<Err = Error>> Iterator for UcdLineParser<R, D> { + type Item = Result<D, Error>; + + fn next(&mut self) -> Option<Result<D, Error>> { + loop { + self.line_number += 1; + self.line.clear(); + let n = match self.rdr.read_line(&mut self.line) { + Err(err) => { + return Some(Err(Error { + kind: ErrorKind::Io(err), + line: None, + path: self.path.clone(), + })) + } + Ok(n) => n, + }; + if n == 0 { + return None; + } + if !self.line.starts_with('#') && !self.line.trim().is_empty() { + break; + } + } + let line_number = self.line_number; + Some(self.line.parse().map_err(|mut err: Error| { + err.line = Some(line_number); + err + })) + } +} + +/// A representation of either a single codepoint or a range of codepoints. +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, PartialOrd, Ord)] +pub enum Codepoints { + /// A single codepoint. + Single(Codepoint), + /// A range of codepoints. 
+ Range(CodepointRange), +} + +impl Default for Codepoints { + fn default() -> Codepoints { + Codepoints::Single(Codepoint::default()) + } +} + +impl IntoIterator for Codepoints { + type IntoIter = CodepointIter; + type Item = Codepoint; + + fn into_iter(self) -> CodepointIter { + match self { + Codepoints::Single(x) => x.into_iter(), + Codepoints::Range(x) => x.into_iter(), + } + } +} + +impl FromStr for Codepoints { + type Err = Error; + + fn from_str(s: &str) -> Result<Codepoints, Error> { + if s.contains("..") { + CodepointRange::from_str(s).map(Codepoints::Range) + } else { + Codepoint::from_str(s).map(Codepoints::Single) + } + } +} + +impl fmt::Display for Codepoints { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match *self { + Codepoints::Single(ref x) => x.fmt(f), + Codepoints::Range(ref x) => x.fmt(f), + } + } +} + +impl PartialEq<u32> for Codepoints { + fn eq(&self, other: &u32) -> bool { + match *self { + Codepoints::Single(ref x) => x == other, + Codepoints::Range(ref x) => x == &(*other, *other), + } + } +} + +impl PartialEq<Codepoint> for Codepoints { + fn eq(&self, other: &Codepoint) -> bool { + match *self { + Codepoints::Single(ref x) => x == other, + Codepoints::Range(ref x) => x == &(*other, *other), + } + } +} + +impl PartialEq<(u32, u32)> for Codepoints { + fn eq(&self, other: &(u32, u32)) -> bool { + match *self { + Codepoints::Single(ref x) => &(x.value(), x.value()) == other, + Codepoints::Range(ref x) => x == other, + } + } +} + +impl PartialEq<(Codepoint, Codepoint)> for Codepoints { + fn eq(&self, other: &(Codepoint, Codepoint)) -> bool { + match *self { + Codepoints::Single(ref x) => &(*x, *x) == other, + Codepoints::Range(ref x) => x == other, + } + } +} + +/// A range of Unicode codepoints. The range is inclusive; both ends of the +/// range are guaranteed to be valid codepoints. 
+#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord)] +pub struct CodepointRange { + /// The start of the codepoint range. + pub start: Codepoint, + /// The end of the codepoint range. + pub end: Codepoint, +} + +impl IntoIterator for CodepointRange { + type IntoIter = CodepointIter; + type Item = Codepoint; + + fn into_iter(self) -> CodepointIter { + CodepointIter { next: self.start.value(), range: self } + } +} + +impl FromStr for CodepointRange { + type Err = Error; + + fn from_str(s: &str) -> Result<CodepointRange, Error> { + lazy_static! { + static ref PARTS: Regex = + Regex::new(r"^(?P<start>[A-Z0-9]+)\.\.(?P<end>[A-Z0-9]+)$") + .unwrap(); + } + let caps = match PARTS.captures(s) { + Some(caps) => caps, + None => return err!("invalid codepoint range: '{}'", s), + }; + let start = caps["start"].parse().or_else(|err| { + err!("failed to parse '{}' as a codepoint range: {}", s, err) + })?; + let end = caps["end"].parse().or_else(|err| { + err!("failed to parse '{}' as a codepoint range: {}", s, err) + })?; + Ok(CodepointRange { start, end }) + } +} + +impl fmt::Display for CodepointRange { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}..{}", self.start, self.end) + } +} + +impl PartialEq<(u32, u32)> for CodepointRange { + fn eq(&self, other: &(u32, u32)) -> bool { + &(self.start.value(), self.end.value()) == other + } +} + +impl PartialEq<(Codepoint, Codepoint)> for CodepointRange { + fn eq(&self, other: &(Codepoint, Codepoint)) -> bool { + &(self.start, self.end) == other + } +} + +/// A single Unicode codepoint. +/// +/// This type's string representation is a hexadecimal number. It is guaranteed +/// to be in the range `[0, 10FFFF]`. +/// +/// Note that unlike Rust's `char` type, this may be a surrogate codepoint. +#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord)] +pub struct Codepoint(u32); + +impl Codepoint { + /// Create a new codepoint from a `u32`. 
+ /// + /// If the given number is not a valid codepoint, then this returns an + /// error. + pub fn from_u32(n: u32) -> Result<Codepoint, Error> { + if n > 0x10FFFF { + err!("{:x} is not a valid Unicode codepoint", n) + } else { + Ok(Codepoint(n)) + } + } + + /// Return the underlying `u32` codepoint value. + pub fn value(self) -> u32 { + self.0 + } + + /// Attempt to convert this codepoint to a Unicode scalar value. + /// + /// If this is a surrogate codepoint, then this returns `None`. + pub fn scalar(self) -> Option<char> { + char::from_u32(self.0) + } +} + +impl IntoIterator for Codepoint { + type IntoIter = CodepointIter; + type Item = Codepoint; + + fn into_iter(self) -> CodepointIter { + let range = CodepointRange { start: self, end: self }; + CodepointIter { next: self.value(), range } + } +} + +impl FromStr for Codepoint { + type Err = Error; + + fn from_str(s: &str) -> Result<Codepoint, Error> { + match u32::from_str_radix(s, 16) { + Ok(n) => Codepoint::from_u32(n), + Err(err) => { + return err!( + "failed to parse '{}' as a hexadecimal codepoint: {}", + s, + err + ); + } + } + } +} + +impl fmt::Display for Codepoint { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{:04X}", self.0) + } +} + +impl PartialEq<u32> for Codepoint { + fn eq(&self, other: &u32) -> bool { + self.0 == *other + } +} + +impl PartialEq<Codepoint> for u32 { + fn eq(&self, other: &Codepoint) -> bool { + *self == other.0 + } +} + +/// An iterator over a range of Unicode codepoints. 
+#[derive(Debug)] +pub struct CodepointIter { + next: u32, + range: CodepointRange, +} + +impl Iterator for CodepointIter { + type Item = Codepoint; + + fn next(&mut self) -> Option<Codepoint> { + if self.next > self.range.end.value() { + return None; + } + let current = self.next; + self.next += 1; + Some(Codepoint::from_u32(current).unwrap()) + } +} diff --git a/vendor/ucd-parse/src/core_properties.rs b/vendor/ucd-parse/src/core_properties.rs new file mode 100644 index 000000000..9a7682b43 --- /dev/null +++ b/vendor/ucd-parse/src/core_properties.rs @@ -0,0 +1,60 @@ +use std::path::Path; +use std::str::FromStr; + +use crate::common::{ + parse_codepoint_association, CodepointIter, Codepoints, UcdFile, + UcdFileByCodepoint, +}; +use crate::error::Error; + +/// A single row in the `DerivedCoreProperties.txt` file. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct CoreProperty { + /// The codepoint or codepoint range for this entry. + pub codepoints: Codepoints, + /// The property name assigned to the codepoints in this entry. 
+ pub property: String, +} + +impl UcdFile for CoreProperty { + fn relative_file_path() -> &'static Path { + Path::new("DerivedCoreProperties.txt") + } +} + +impl UcdFileByCodepoint for CoreProperty { + fn codepoints(&self) -> CodepointIter { + self.codepoints.into_iter() + } +} + +impl FromStr for CoreProperty { + type Err = Error; + + fn from_str(line: &str) -> Result<CoreProperty, Error> { + let (codepoints, property) = parse_codepoint_association(line)?; + Ok(CoreProperty { codepoints, property: property.to_string() }) + } +} + +#[cfg(test)] +mod tests { + use super::CoreProperty; + + #[test] + fn parse_single() { + let line = + "1163D ; Case_Ignorable # Mn MODI SIGN ANUSVARA\n"; + let row: CoreProperty = line.parse().unwrap(); + assert_eq!(row.codepoints, 0x1163D); + assert_eq!(row.property, "Case_Ignorable"); + } + + #[test] + fn parse_range() { + let line = "11133..11134 ; Grapheme_Link # Mn [2] CHAKMA VIRAMA..CHAKMA MAAYYAA\n"; + let row: CoreProperty = line.parse().unwrap(); + assert_eq!(row.codepoints, (0x11133, 0x11134)); + assert_eq!(row.property, "Grapheme_Link"); + } +} diff --git a/vendor/ucd-parse/src/emoji_properties.rs b/vendor/ucd-parse/src/emoji_properties.rs new file mode 100644 index 000000000..dc5c0c884 --- /dev/null +++ b/vendor/ucd-parse/src/emoji_properties.rs @@ -0,0 +1,86 @@ +use std::path::{Path, PathBuf}; +use std::str::FromStr; + +use crate::common::{ + parse_codepoint_association, CodepointIter, Codepoints, UcdFile, + UcdFileByCodepoint, +}; +use crate::error::Error; + +/// A single row in the `emoji-data.txt` file. +/// +/// The `emoji-data.txt` file is the source of truth on several Emoji-related +/// Unicode properties. +/// +/// Note that `emoji-data.txt` is not formally part of the Unicode Character +/// Database. 
You can download the Emoji data files separately here: +/// https://unicode.org/Public/emoji/ +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct EmojiProperty { + /// The codepoint or codepoint range for this entry. + pub codepoints: Codepoints, + /// The property name assigned to the codepoints in this entry. + pub property: String, +} + +impl UcdFile for EmojiProperty { + fn relative_file_path() -> &'static Path { + Path::new("emoji/emoji-data.txt") + } + + fn file_path<P: AsRef<Path>>(ucd_dir: P) -> PathBuf { + let ucd_dir = ucd_dir.as_ref(); + // The standard location, but only on UCDs from 13.0.0 and up. + let std = ucd_dir.join(Self::relative_file_path()); + if std.exists() { + std + } else { + // If the old location does exist, use it. + let legacy = ucd_dir.join("emoji-data.txt"); + if legacy.exists() { + legacy + } else { + // This might end up in an error message, so use the standard + // one if forced to choose. Arguably we could do something like + // peek + std + } + } + } +} + +impl UcdFileByCodepoint for EmojiProperty { + fn codepoints(&self) -> CodepointIter { + self.codepoints.into_iter() + } +} + +impl FromStr for EmojiProperty { + type Err = Error; + + fn from_str(line: &str) -> Result<EmojiProperty, Error> { + let (codepoints, property) = parse_codepoint_association(line)?; + Ok(EmojiProperty { codepoints, property: property.to_string() }) + } +} + +#[cfg(test)] +mod tests { + use super::EmojiProperty; + + #[test] + fn parse_single() { + let line = "24C2 ; Emoji # 1.1 [1] (Ⓜ️) circled M\n"; + let row: EmojiProperty = line.parse().unwrap(); + assert_eq!(row.codepoints, 0x24C2); + assert_eq!(row.property, "Emoji"); + } + + #[test] + fn parse_range() { + let line = "1FA6E..1FFFD ; Extended_Pictographic# NA[1424] (️..️) <reserved-1FA6E>..<reserved-1FFFD>\n"; + let row: EmojiProperty = line.parse().unwrap(); + assert_eq!(row.codepoints, (0x1FA6E, 0x1FFFD)); + assert_eq!(row.property, "Extended_Pictographic"); + } +} diff --git 
a/vendor/ucd-parse/src/error.rs b/vendor/ucd-parse/src/error.rs new file mode 100644 index 000000000..9dafc4b33 --- /dev/null +++ b/vendor/ucd-parse/src/error.rs @@ -0,0 +1,86 @@ +use std::error; +use std::fmt; +use std::io; +use std::path::{Path, PathBuf}; + +/// Represents any kind of error that can occur while parsing the UCD. +#[derive(Debug)] +pub struct Error { + pub(crate) kind: ErrorKind, + pub(crate) line: Option<u64>, + pub(crate) path: Option<PathBuf>, +} + +/// The kind of error that occurred while parsing the UCD. +#[derive(Debug)] +pub enum ErrorKind { + /// An I/O error. + Io(io::Error), + /// A generic parse error. + Parse(String), +} + +impl Error { + /// Create a new parse error from the given message. + pub(crate) fn parse(msg: String) -> Error { + Error { kind: ErrorKind::Parse(msg), line: None, path: None } + } + + /// Return the specific kind of this error. + pub fn kind(&self) -> &ErrorKind { + &self.kind + } + + /// Return the line number at which this error occurred, if available. + pub fn line(&self) -> Option<u64> { + self.line + } + + /// Return the file path associated with this error, if one exists. + pub fn path(&self) -> Option<&Path> { + self.path.as_ref().map(|p| &**p) + } + + /// Unwrap this error into its underlying kind. + pub fn into_kind(self) -> ErrorKind { + self.kind + } + + /// Returns true if and only if this is an I/O error. + /// + /// If this returns true, the underlying `ErrorKind` is guaranteed to be + /// `ErrorKind::Io`. 
+ pub fn is_io_error(&self) -> bool { + match self.kind { + ErrorKind::Io(_) => true, + _ => false, + } + } +} + +impl error::Error for Error { + fn cause(&self) -> Option<&dyn error::Error> { + match self.kind { + ErrorKind::Io(ref err) => Some(err), + _ => None, + } + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if let Some(ref path) = self.path { + if let Some(line) = self.line { + write!(f, "{}:{}: ", path.display(), line)?; + } else { + write!(f, "{}: ", path.display())?; + } + } else if let Some(line) = self.line { + write!(f, "error on line {}: ", line)?; + } + match self.kind { + ErrorKind::Io(ref err) => write!(f, "{}", err), + ErrorKind::Parse(ref msg) => write!(f, "{}", msg), + } + } +} diff --git a/vendor/ucd-parse/src/grapheme_cluster_break.rs b/vendor/ucd-parse/src/grapheme_cluster_break.rs new file mode 100644 index 000000000..9dbf32f41 --- /dev/null +++ b/vendor/ucd-parse/src/grapheme_cluster_break.rs @@ -0,0 +1,98 @@ +use std::path::Path; +use std::str::FromStr; + +use crate::common::{ + parse_break_test, parse_codepoint_association, CodepointIter, Codepoints, + UcdFile, UcdFileByCodepoint, +}; +use crate::error::Error; + +/// A single row in the `auxiliary/GraphemeBreakProperty.txt` file. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct GraphemeClusterBreak { + /// The codepoint or codepoint range for this entry. + pub codepoints: Codepoints, + /// The property value assigned to the codepoints in this entry. 
+ pub value: String, +} + +impl UcdFile for GraphemeClusterBreak { + fn relative_file_path() -> &'static Path { + Path::new("auxiliary/GraphemeBreakProperty.txt") + } +} + +impl UcdFileByCodepoint for GraphemeClusterBreak { + fn codepoints(&self) -> CodepointIter { + self.codepoints.into_iter() + } +} + +impl FromStr for GraphemeClusterBreak { + type Err = Error; + + fn from_str(line: &str) -> Result<GraphemeClusterBreak, Error> { + let (codepoints, value) = parse_codepoint_association(line)?; + Ok(GraphemeClusterBreak { codepoints, value: value.to_string() }) + } +} + +/// A single row in the `auxiliary/GraphemeBreakTest.txt` file. +/// +/// This file defines tests for the grapheme cluster break algorithm. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct GraphemeClusterBreakTest { + /// Each string is a UTF-8 encoded group of codepoints that make up a + /// single grapheme cluster. + pub grapheme_clusters: Vec<String>, + /// A human readable description of this test. + pub comment: String, +} + +impl UcdFile for GraphemeClusterBreakTest { + fn relative_file_path() -> &'static Path { + Path::new("auxiliary/GraphemeBreakTest.txt") + } +} + +impl FromStr for GraphemeClusterBreakTest { + type Err = Error; + + fn from_str(line: &str) -> Result<GraphemeClusterBreakTest, Error> { + let (groups, comment) = parse_break_test(line)?; + Ok(GraphemeClusterBreakTest { grapheme_clusters: groups, comment }) + } +} + +#[cfg(test)] +mod tests { + use super::{GraphemeClusterBreak, GraphemeClusterBreakTest}; + + #[test] + fn parse_single() { + let line = "093B ; SpacingMark # Mc DEVANAGARI VOWEL SIGN OOE\n"; + let row: GraphemeClusterBreak = line.parse().unwrap(); + assert_eq!(row.codepoints, 0x093B); + assert_eq!(row.value, "SpacingMark"); + } + + #[test] + fn parse_range() { + let line = "1F1E6..1F1FF ; Regional_Indicator # So [26] REGIONAL INDICATOR SYMBOL LETTER A..REGIONAL INDICATOR SYMBOL LETTER Z\n"; + let row: GraphemeClusterBreak = line.parse().unwrap(); + 
assert_eq!(row.codepoints, (0x1F1E6, 0x1F1FF)); + assert_eq!(row.value, "Regional_Indicator"); + } + + #[test] + fn parse_test() { + let line = "÷ 0061 × 1F3FF ÷ 1F476 × 200D × 1F6D1 ÷ # ÷ [0.2] LATIN SMALL LETTER A (Other) × [9.0] EMOJI MODIFIER FITZPATRICK TYPE-6 (Extend) ÷ [999.0] BABY (ExtPict) × [9.0] ZERO WIDTH JOINER (ZWJ_ExtCccZwj) × [11.0] OCTAGONAL SIGN (ExtPict) ÷ [0.3]\n"; + + let row: GraphemeClusterBreakTest = line.parse().unwrap(); + assert_eq!( + row.grapheme_clusters, + vec!["\u{0061}\u{1F3FF}", "\u{1F476}\u{200D}\u{1F6D1}",] + ); + assert!(row.comment.starts_with("÷ [0.2] LATIN SMALL LETTER A")); + } +} diff --git a/vendor/ucd-parse/src/jamo_short_name.rs b/vendor/ucd-parse/src/jamo_short_name.rs new file mode 100644 index 000000000..4103dd7ee --- /dev/null +++ b/vendor/ucd-parse/src/jamo_short_name.rs @@ -0,0 +1,80 @@ +use std::path::Path; +use std::str::FromStr; + +use lazy_static::lazy_static; +use regex::Regex; + +use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint}; +use crate::error::Error; + +/// A single row in the `Jamo.txt` file. +/// +/// The `Jamo.txt` file defines the `Jamo_Short_Name` property. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct JamoShortName { + /// The codepoint corresponding to this row. + pub codepoint: Codepoint, + /// The actual "Jamo Short Name." This string contains at most 3 bytes and + /// may be empty. + pub name: String, +} + +impl UcdFile for JamoShortName { + fn relative_file_path() -> &'static Path { + Path::new("Jamo.txt") + } +} + +impl UcdFileByCodepoint for JamoShortName { + fn codepoints(&self) -> CodepointIter { + self.codepoint.into_iter() + } +} + +impl FromStr for JamoShortName { + type Err = Error; + + fn from_str(line: &str) -> Result<JamoShortName, Error> { + lazy_static! 
{ + static ref PARTS: Regex = Regex::new( + r"(?x) + ^ + (?P<codepoint>[A-Z0-9]+); + \s* + (?P<name>[A-Z]*) + " + ) + .unwrap(); + }; + + let caps = match PARTS.captures(line.trim()) { + Some(caps) => caps, + None => return err!("invalid Jamo_Short_name line"), + }; + Ok(JamoShortName { + codepoint: caps["codepoint"].parse()?, + name: caps.name("name").unwrap().as_str().to_string(), + }) + } +} + +#[cfg(test)] +mod tests { + use super::JamoShortName; + + #[test] + fn parse1() { + let line = "1164; YAE # HANGUL JUNGSEONG YAE\n"; + let row: JamoShortName = line.parse().unwrap(); + assert_eq!(row.codepoint, 0x1164); + assert_eq!(row.name, "YAE"); + } + + #[test] + fn parse2() { + let line = "110B; # HANGUL CHOSEONG IEUNG\n"; + let row: JamoShortName = line.parse().unwrap(); + assert_eq!(row.codepoint, 0x110B); + assert_eq!(row.name, ""); + } +} diff --git a/vendor/ucd-parse/src/lib.rs b/vendor/ucd-parse/src/lib.rs new file mode 100644 index 000000000..f6654658a --- /dev/null +++ b/vendor/ucd-parse/src/lib.rs @@ -0,0 +1,66 @@ +/*! +A library for parsing the Unicode character database. 
+*/ + +#![deny(missing_docs)] + +pub use crate::common::{ + parse, parse_by_codepoint, parse_many_by_codepoint, ucd_directory_version, + Codepoint, CodepointIter, CodepointRange, Codepoints, UcdFile, + UcdFileByCodepoint, UcdLineParser, +}; +pub use crate::error::{Error, ErrorKind}; + +pub use crate::age::Age; +pub use crate::arabic_shaping::ArabicShaping; +pub use crate::bidi_mirroring_glyph::BidiMirroring; +pub use crate::case_folding::{CaseFold, CaseStatus}; +pub use crate::core_properties::CoreProperty; +pub use crate::emoji_properties::EmojiProperty; +pub use crate::grapheme_cluster_break::{ + GraphemeClusterBreak, GraphemeClusterBreakTest, +}; +pub use crate::jamo_short_name::JamoShortName; +pub use crate::line_break::LineBreakTest; +pub use crate::name_aliases::{NameAlias, NameAliasLabel}; +pub use crate::prop_list::Property; +pub use crate::property_aliases::PropertyAlias; +pub use crate::property_value_aliases::PropertyValueAlias; +pub use crate::script_extensions::ScriptExtension; +pub use crate::scripts::Script; +pub use crate::sentence_break::{SentenceBreak, SentenceBreakTest}; +pub use crate::special_casing::SpecialCaseMapping; +pub use crate::unicode_data::{ + UnicodeData, UnicodeDataDecomposition, UnicodeDataDecompositionTag, + UnicodeDataExpander, UnicodeDataNumeric, +}; +pub use crate::word_break::{WordBreak, WordBreakTest}; + +macro_rules! 
err { + ($($tt:tt)*) => { + Err(crate::error::Error::parse(format!($($tt)*))) + } +} + +mod common; +mod error; + +mod age; +mod arabic_shaping; +mod bidi_mirroring_glyph; +mod case_folding; +mod core_properties; +mod emoji_properties; +mod grapheme_cluster_break; +mod jamo_short_name; +mod line_break; +mod name_aliases; +mod prop_list; +mod property_aliases; +mod property_value_aliases; +mod script_extensions; +mod scripts; +mod sentence_break; +mod special_casing; +mod unicode_data; +mod word_break; diff --git a/vendor/ucd-parse/src/line_break.rs b/vendor/ucd-parse/src/line_break.rs new file mode 100644 index 000000000..aa62fcb9e --- /dev/null +++ b/vendor/ucd-parse/src/line_break.rs @@ -0,0 +1,49 @@ +use std::path::Path; +use std::str::FromStr; + +use crate::common::{parse_break_test, UcdFile}; +use crate::error::Error; + +/// A single row in the `auxiliary/LineBreakTest.txt` file. +/// +/// This file defines tests for the line break algorithm. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct LineBreakTest { + /// Each string is a UTF-8 encoded group of codepoints that make up a + /// single line. + pub lines: Vec<String>, + /// A human readable description of this test. 
+ pub comment: String, +} + +impl UcdFile for LineBreakTest { + fn relative_file_path() -> &'static Path { + Path::new("auxiliary/LineBreakTest.txt") + } +} + +impl FromStr for LineBreakTest { + type Err = Error; + + fn from_str(line: &str) -> Result<LineBreakTest, Error> { + let (groups, comment) = parse_break_test(line)?; + Ok(LineBreakTest { lines: groups, comment }) + } +} + +#[cfg(test)] +mod tests { + use super::LineBreakTest; + + #[test] + fn parse_test() { + let line = "× 1F1F7 × 1F1FA ÷ 1F1F8 × 1F1EA ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER R (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER U (RI) ÷ [30.13] REGIONAL INDICATOR SYMBOL LETTER S (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER E (RI) ÷ [0.3]"; + + let row: LineBreakTest = line.parse().unwrap(); + assert_eq!( + row.lines, + vec!["\u{1F1F7}\u{1F1FA}", "\u{1F1F8}\u{1F1EA}",] + ); + assert!(row.comment.ends_with("(RI) ÷ [0.3]")); + } +} diff --git a/vendor/ucd-parse/src/name_aliases.rs b/vendor/ucd-parse/src/name_aliases.rs new file mode 100644 index 000000000..36c9c4b01 --- /dev/null +++ b/vendor/ucd-parse/src/name_aliases.rs @@ -0,0 +1,145 @@ +use std::path::Path; +use std::str::FromStr; + +use lazy_static::lazy_static; +use regex::Regex; + +use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint}; +use crate::error::Error; + +/// A single row in the `NameAliases.txt` file. +/// +/// Note that there are multiple rows for some codepoint. Each row provides a +/// new alias. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct NameAlias { + /// The codepoint corresponding to this row. + pub codepoint: Codepoint, + /// The alias. + pub alias: String, + /// The label of this alias. 
+ pub label: NameAliasLabel, +} + +impl UcdFile for NameAlias { + fn relative_file_path() -> &'static Path { + Path::new("NameAliases.txt") + } +} + +impl UcdFileByCodepoint for NameAlias { + fn codepoints(&self) -> CodepointIter { + self.codepoint.into_iter() + } +} + +impl FromStr for NameAlias { + type Err = Error; + + fn from_str(line: &str) -> Result<NameAlias, Error> { + lazy_static! { + static ref PARTS: Regex = Regex::new( + r"(?x) + ^ + (?P<codepoint>[A-Z0-9]+); + \s* + (?P<alias>[^;]+); + \s* + (?P<label>\S+) + " + ) + .unwrap(); + }; + + let caps = match PARTS.captures(line.trim()) { + Some(caps) => caps, + None => return err!("invalid NameAliases line"), + }; + Ok(NameAlias { + codepoint: caps["codepoint"].parse()?, + alias: caps.name("alias").unwrap().as_str().to_string(), + label: caps["label"].parse()?, + }) + } +} + +/// The label of a name alias. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum NameAliasLabel { + /// Corrections for serious problems in a character name. + Correction, + /// ISO 6429 names for C0 and C1 control functions and other commonly + /// occurring names for control codes. + Control, + /// A few widely used alternate names for format characters. + Alternate, + /// Several documented labels for C1 control code points which were + /// never actually approved in any standard. + Figment, + /// Commonly occurring abbreviations (or acronyms) for control codes, + /// format characters, spaces and variation selectors. + Abbreviation, +} + +impl Default for NameAliasLabel { + fn default() -> NameAliasLabel { + // This is arbitrary, but the Default impl is convenient. 
+ NameAliasLabel::Correction + } +} + +impl FromStr for NameAliasLabel { + type Err = Error; + + fn from_str(s: &str) -> Result<NameAliasLabel, Error> { + match s { + "correction" => Ok(NameAliasLabel::Correction), + "control" => Ok(NameAliasLabel::Control), + "alternate" => Ok(NameAliasLabel::Alternate), + "figment" => Ok(NameAliasLabel::Figment), + "abbreviation" => Ok(NameAliasLabel::Abbreviation), + unknown => err!("unknown name alias label: '{}'", unknown), + } + } +} + +#[cfg(test)] +mod tests { + use super::{NameAlias, NameAliasLabel}; + + #[test] + fn parse1() { + let line = "0000;NULL;control\n"; + let row: NameAlias = line.parse().unwrap(); + assert_eq!(row.codepoint, 0x0); + assert_eq!(row.alias, "NULL"); + assert_eq!(row.label, NameAliasLabel::Control); + } + + #[test] + fn parse2() { + let line = "000B;VERTICAL TABULATION;control\n"; + let row: NameAlias = line.parse().unwrap(); + assert_eq!(row.codepoint, 0xB); + assert_eq!(row.alias, "VERTICAL TABULATION"); + assert_eq!(row.label, NameAliasLabel::Control); + } + + #[test] + fn parse3() { + let line = "0081;HIGH OCTET PRESET;figment\n"; + let row: NameAlias = line.parse().unwrap(); + assert_eq!(row.codepoint, 0x81); + assert_eq!(row.alias, "HIGH OCTET PRESET"); + assert_eq!(row.label, NameAliasLabel::Figment); + } + + #[test] + fn parse4() { + let line = "E01EF;VS256;abbreviation\n"; + let row: NameAlias = line.parse().unwrap(); + assert_eq!(row.codepoint, 0xE01EF); + assert_eq!(row.alias, "VS256"); + assert_eq!(row.label, NameAliasLabel::Abbreviation); + } +} diff --git a/vendor/ucd-parse/src/prop_list.rs b/vendor/ucd-parse/src/prop_list.rs new file mode 100644 index 000000000..db830c57a --- /dev/null +++ b/vendor/ucd-parse/src/prop_list.rs @@ -0,0 +1,63 @@ +use std::path::Path; +use std::str::FromStr; + +use crate::common::{ + parse_codepoint_association, CodepointIter, Codepoints, UcdFile, + UcdFileByCodepoint, +}; +use crate::error::Error; + +/// A single row in the `PropList.txt` file. 
+/// +/// The `PropList.txt` file is the source of truth on several Unicode +/// properties. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct Property { + /// The codepoint or codepoint range for this entry. + pub codepoints: Codepoints, + /// The property name assigned to the codepoints in this entry. + pub property: String, +} + +impl UcdFile for Property { + fn relative_file_path() -> &'static Path { + Path::new("PropList.txt") + } +} + +impl UcdFileByCodepoint for Property { + fn codepoints(&self) -> CodepointIter { + self.codepoints.into_iter() + } +} + +impl FromStr for Property { + type Err = Error; + + fn from_str(line: &str) -> Result<Property, Error> { + let (codepoints, property) = parse_codepoint_association(line)?; + Ok(Property { codepoints, property: property.to_string() }) + } +} + +#[cfg(test)] +mod tests { + use super::Property; + + #[test] + fn parse_single() { + let line = + "061C ; Bidi_Control # Cf ARABIC LETTER MARK\n"; + let row: Property = line.parse().unwrap(); + assert_eq!(row.codepoints, 0x061C); + assert_eq!(row.property, "Bidi_Control"); + } + + #[test] + fn parse_range() { + let line = "0009..000D ; White_Space # Cc [5] <control-0009>..<control-000D>\n"; + let row: Property = line.parse().unwrap(); + assert_eq!(row.codepoints, (0x0009, 0x000D)); + assert_eq!(row.property, "White_Space"); + } +} diff --git a/vendor/ucd-parse/src/property_aliases.rs b/vendor/ucd-parse/src/property_aliases.rs new file mode 100644 index 000000000..f94a116e6 --- /dev/null +++ b/vendor/ucd-parse/src/property_aliases.rs @@ -0,0 +1,113 @@ +use std::path::Path; +use std::str::FromStr; + +use lazy_static::lazy_static; +use regex::Regex; + +use crate::common::UcdFile; +use crate::error::Error; + +/// A single row in the `PropertyAliases.txt` file. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct PropertyAlias { + /// An abbreviation for this property. + pub abbreviation: String, + /// The "long" name of this property. 
+ pub long: String, + /// Additional aliases (if present). + pub aliases: Vec<String>, +} + +impl UcdFile for PropertyAlias { + fn relative_file_path() -> &'static Path { + Path::new("PropertyAliases.txt") + } +} + +impl FromStr for PropertyAlias { + type Err = Error; + + fn from_str(line: &str) -> Result<PropertyAlias, Error> { + lazy_static! { + static ref PARTS: Regex = Regex::new( + r"(?x) + ^ + \s*(?P<abbrev>[^\s;]+)\s*; + \s*(?P<long>[^\s;]+)\s* + (?:;(?P<aliases>.*))? + " + ) + .unwrap(); + static ref ALIASES: Regex = + Regex::new(r"\s*(?P<alias>[^\s;]+)\s*;?\s*").unwrap(); + }; + + let caps = match PARTS.captures(line.trim()) { + Some(caps) => caps, + None => return err!("invalid PropertyAliases line: '{}'", line), + }; + let mut aliases = vec![]; + if let Some(m) = caps.name("aliases") { + for acaps in ALIASES.captures_iter(m.as_str()) { + let alias = acaps.name("alias").unwrap().as_str(); + aliases.push(alias.to_string()); + } + } + Ok(PropertyAlias { + abbreviation: caps.name("abbrev").unwrap().as_str().to_string(), + long: caps.name("long").unwrap().as_str().to_string(), + aliases, + }) + } +} + +#[cfg(test)] +mod tests { + use super::PropertyAlias; + + #[test] + fn parse1() { + let line = "cjkAccountingNumeric ; kAccountingNumeric\n"; + let row: PropertyAlias = line.parse().unwrap(); + assert_eq!(row.abbreviation, "cjkAccountingNumeric"); + assert_eq!(row.long, "kAccountingNumeric"); + assert!(row.aliases.is_empty()); + } + + #[test] + fn parse2() { + let line = "nv ; Numeric_Value\n"; + let row: PropertyAlias = line.parse().unwrap(); + assert_eq!(row.abbreviation, "nv"); + assert_eq!(row.long, "Numeric_Value"); + assert!(row.aliases.is_empty()); + } + + #[test] + fn parse3() { + let line = + "scf ; Simple_Case_Folding ; sfc\n"; + let row: PropertyAlias = line.parse().unwrap(); + assert_eq!(row.abbreviation, "scf"); + assert_eq!(row.long, "Simple_Case_Folding"); + assert_eq!(row.aliases, vec!["sfc"]); + } + + #[test] + fn parse4() { + let line = 
"cjkRSUnicode ; kRSUnicode ; Unicode_Radical_Stroke; URS\n"; + let row: PropertyAlias = line.parse().unwrap(); + assert_eq!(row.abbreviation, "cjkRSUnicode"); + assert_eq!(row.long, "kRSUnicode"); + assert_eq!(row.aliases, vec!["Unicode_Radical_Stroke", "URS"]); + } + + #[test] + fn parse5() { + let line = "isc ; ISO_Comment"; + let row: PropertyAlias = line.parse().unwrap(); + assert_eq!(row.abbreviation, "isc"); + assert_eq!(row.long, "ISO_Comment"); + assert!(row.aliases.is_empty()); + } +} diff --git a/vendor/ucd-parse/src/property_value_aliases.rs b/vendor/ucd-parse/src/property_value_aliases.rs new file mode 100644 index 000000000..7e8a3c890 --- /dev/null +++ b/vendor/ucd-parse/src/property_value_aliases.rs @@ -0,0 +1,185 @@ +use std::path::Path; +use std::str::FromStr; + +use lazy_static::lazy_static; +use regex::Regex; + +use crate::common::UcdFile; +use crate::error::Error; + +/// A single row in the `PropertyValueAliases.txt` file. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct PropertyValueAlias { + /// The property name for which this value alias applies. + pub property: String, + /// A numeric abbreviation for this property value, if present. (This is + /// seemingly only present for the `ccc`/`Canonical_Combining_Class` + /// property.) + pub numeric: Option<u8>, + /// An abbreviation for this property value. + pub abbreviation: String, + /// The "long" form of this property value. + pub long: String, + /// Additional value aliases (if present). + pub aliases: Vec<String>, +} + +impl UcdFile for PropertyValueAlias { + fn relative_file_path() -> &'static Path { + Path::new("PropertyValueAliases.txt") + } +} + +impl FromStr for PropertyValueAlias { + type Err = Error; + + fn from_str(line: &str) -> Result<PropertyValueAlias, Error> { + lazy_static! { + static ref PARTS: Regex = Regex::new( + r"(?x) + ^ + \s*(?P<prop>[^\s;]+)\s*; + \s*(?P<abbrev>[^\s;]+)\s*; + \s*(?P<long>[^\s;]+)\s* + (?:;(?P<aliases>.*))? 
+ " + ) + .unwrap(); + static ref PARTS_CCC: Regex = Regex::new( + r"(?x) + ^ + ccc; + \s*(?P<num_class>[0-9]+)\s*; + \s*(?P<abbrev>[^\s;]+)\s*; + \s*(?P<long>[^\s;]+) + " + ) + .unwrap(); + static ref ALIASES: Regex = + Regex::new(r"\s*(?P<alias>[^\s;]+)\s*;?\s*").unwrap(); + }; + + if line.starts_with("ccc;") { + let caps = match PARTS_CCC.captures(line.trim()) { + Some(caps) => caps, + None => { + return err!("invalid PropertyValueAliases (ccc) line") + } + }; + let n = match caps["num_class"].parse() { + Ok(n) => n, + Err(err) => { + return err!( + "failed to parse ccc number '{}': {}", + &caps["num_class"], + err + ) + } + }; + let abbrev = caps.name("abbrev").unwrap().as_str(); + let long = caps.name("long").unwrap().as_str(); + return Ok(PropertyValueAlias { + property: line[0..3].to_string(), + numeric: Some(n), + abbreviation: abbrev.to_string(), + long: long.to_string(), + aliases: vec![], + }); + } + + let caps = match PARTS.captures(line.trim()) { + Some(caps) => caps, + None => return err!("invalid PropertyValueAliases line"), + }; + let mut aliases = vec![]; + if let Some(m) = caps.name("aliases") { + for acaps in ALIASES.captures_iter(m.as_str()) { + let alias = acaps.name("alias").unwrap().as_str(); + if alias == "#" { + // This starts a comment, so stop reading. 
+ break; + } + aliases.push(alias.to_string()); + } + } + Ok(PropertyValueAlias { + property: caps.name("prop").unwrap().as_str().to_string(), + numeric: None, + abbreviation: caps.name("abbrev").unwrap().as_str().to_string(), + long: caps.name("long").unwrap().as_str().to_string(), + aliases, + }) + } +} + +#[cfg(test)] +mod tests { + use super::PropertyValueAlias; + + #[test] + fn parse1() { + let line = "blk; Arabic_PF_A ; Arabic_Presentation_Forms_A ; Arabic_Presentation_Forms-A\n"; + let row: PropertyValueAlias = line.parse().unwrap(); + assert_eq!(row.property, "blk"); + assert_eq!(row.numeric, None); + assert_eq!(row.abbreviation, "Arabic_PF_A"); + assert_eq!(row.long, "Arabic_Presentation_Forms_A"); + assert_eq!(row.aliases, vec!["Arabic_Presentation_Forms-A"]); + } + + #[test] + fn parse2() { + let line = "AHex; N ; No ; F ; False\n"; + let row: PropertyValueAlias = line.parse().unwrap(); + assert_eq!(row.property, "AHex"); + assert_eq!(row.numeric, None); + assert_eq!(row.abbreviation, "N"); + assert_eq!(row.long, "No"); + assert_eq!(row.aliases, vec!["F", "False"]); + } + + #[test] + fn parse3() { + let line = "age; 1.1 ; V1_1\n"; + let row: PropertyValueAlias = line.parse().unwrap(); + assert_eq!(row.property, "age"); + assert_eq!(row.numeric, None); + assert_eq!(row.abbreviation, "1.1"); + assert_eq!(row.long, "V1_1"); + assert!(row.aliases.is_empty()); + } + + #[test] + fn parse4() { + let line = "ccc; 0; NR ; Not_Reordered\n"; + let row: PropertyValueAlias = line.parse().unwrap(); + assert_eq!(row.property, "ccc"); + assert_eq!(row.numeric, Some(0)); + assert_eq!(row.abbreviation, "NR"); + assert_eq!(row.long, "Not_Reordered"); + assert!(row.aliases.is_empty()); + } + + #[test] + fn parse5() { + let line = + "ccc; 133; CCC133 ; CCC133 # RESERVED\n"; + let row: PropertyValueAlias = line.parse().unwrap(); + assert_eq!(row.property, "ccc"); + assert_eq!(row.numeric, Some(133)); + assert_eq!(row.abbreviation, "CCC133"); + assert_eq!(row.long, "CCC133"); 
+ assert!(row.aliases.is_empty()); + } + + #[test] + fn parse6() { + let line = "gc ; P ; Punctuation ; punct # Pc | Pd | Pe | Pf | Pi | Po | Ps\n"; + let row: PropertyValueAlias = line.parse().unwrap(); + assert_eq!(row.property, "gc"); + assert_eq!(row.numeric, None); + assert_eq!(row.abbreviation, "P"); + assert_eq!(row.long, "Punctuation"); + assert_eq!(row.aliases, vec!["punct"]); + } +} diff --git a/vendor/ucd-parse/src/script_extensions.rs b/vendor/ucd-parse/src/script_extensions.rs new file mode 100644 index 000000000..050e1f039 --- /dev/null +++ b/vendor/ucd-parse/src/script_extensions.rs @@ -0,0 +1,68 @@ +use std::path::Path; +use std::str::FromStr; + +use crate::common::{ + parse_codepoint_association, CodepointIter, Codepoints, UcdFile, + UcdFileByCodepoint, +}; +use crate::error::Error; + +/// A single row in the `ScriptExtensions.txt` file. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct ScriptExtension { + /// The codepoint or codepoint range for this entry. + pub codepoints: Codepoints, + /// The script extension names assigned to the codepoints in this entry. 
+ pub scripts: Vec<String>, +} + +impl UcdFile for ScriptExtension { + fn relative_file_path() -> &'static Path { + Path::new("ScriptExtensions.txt") + } +} + +impl UcdFileByCodepoint for ScriptExtension { + fn codepoints(&self) -> CodepointIter { + self.codepoints.into_iter() + } +} + +impl FromStr for ScriptExtension { + type Err = Error; + + fn from_str(line: &str) -> Result<ScriptExtension, Error> { + let (codepoints, scripts) = parse_codepoint_association(line)?; + Ok(ScriptExtension { + codepoints, + scripts: scripts.split_whitespace().map(str::to_string).collect(), + }) + } +} + +#[cfg(test)] +mod tests { + use super::ScriptExtension; + + #[test] + fn parse_single() { + let line = "060C ; Arab Syrc Thaa # Po ARABIC COMMA\n"; + let row: ScriptExtension = line.parse().unwrap(); + assert_eq!(row.codepoints, 0x060C); + assert_eq!(row.scripts, vec!["Arab", "Syrc", "Thaa"]); + } + + #[test] + fn parse_range() { + let line = "A836..A837 ; Deva Gujr Guru Kthi Mahj Modi Sind Takr Tirh # So [2] NORTH INDIC QUARTER MARK..NORTH INDIC PLACEHOLDER MARK\n"; + let row: ScriptExtension = line.parse().unwrap(); + assert_eq!(row.codepoints, (0xA836, 0xA837)); + assert_eq!( + row.scripts, + vec![ + "Deva", "Gujr", "Guru", "Kthi", "Mahj", "Modi", "Sind", + "Takr", "Tirh", + ] + ); + } +} diff --git a/vendor/ucd-parse/src/scripts.rs b/vendor/ucd-parse/src/scripts.rs new file mode 100644 index 000000000..6021912c4 --- /dev/null +++ b/vendor/ucd-parse/src/scripts.rs @@ -0,0 +1,59 @@ +use std::path::Path; +use std::str::FromStr; + +use crate::common::{ + parse_codepoint_association, CodepointIter, Codepoints, UcdFile, + UcdFileByCodepoint, +}; +use crate::error::Error; + +/// A single row in the `Scripts.txt` file. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct Script { + /// The codepoint or codepoint range for this entry. + pub codepoints: Codepoints, + /// The script name assigned to the codepoints in this entry. 
+ pub script: String, +} + +impl UcdFile for Script { + fn relative_file_path() -> &'static Path { + Path::new("Scripts.txt") + } +} + +impl UcdFileByCodepoint for Script { + fn codepoints(&self) -> CodepointIter { + self.codepoints.into_iter() + } +} + +impl FromStr for Script { + type Err = Error; + + fn from_str(line: &str) -> Result<Script, Error> { + let (codepoints, script) = parse_codepoint_association(line)?; + Ok(Script { codepoints, script: script.to_string() }) + } +} + +#[cfg(test)] +mod tests { + use super::Script; + + #[test] + fn parse_single() { + let line = "10A7F ; Old_South_Arabian # Po OLD SOUTH ARABIAN NUMERIC INDICATOR\n"; + let row: Script = line.parse().unwrap(); + assert_eq!(row.codepoints, 0x10A7F); + assert_eq!(row.script, "Old_South_Arabian"); + } + + #[test] + fn parse_range() { + let line = "1200..1248 ; Ethiopic # Lo [73] ETHIOPIC SYLLABLE HA..ETHIOPIC SYLLABLE QWA\n"; + let row: Script = line.parse().unwrap(); + assert_eq!(row.codepoints, (0x1200, 0x1248)); + assert_eq!(row.script, "Ethiopic"); + } +} diff --git a/vendor/ucd-parse/src/sentence_break.rs b/vendor/ucd-parse/src/sentence_break.rs new file mode 100644 index 000000000..74a6e8a08 --- /dev/null +++ b/vendor/ucd-parse/src/sentence_break.rs @@ -0,0 +1,101 @@ +use std::path::Path; +use std::str::FromStr; + +use crate::common::{ + parse_break_test, parse_codepoint_association, CodepointIter, Codepoints, + UcdFile, UcdFileByCodepoint, +}; +use crate::error::Error; + +/// A single row in the `auxiliary/SentenceBreakProperty.txt` file. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct SentenceBreak { + /// The codepoint or codepoint range for this entry. + pub codepoints: Codepoints, + /// The property value assigned to the codepoints in this entry. 
+ pub value: String, +} + +impl UcdFile for SentenceBreak { + fn relative_file_path() -> &'static Path { + Path::new("auxiliary/SentenceBreakProperty.txt") + } +} + +impl UcdFileByCodepoint for SentenceBreak { + fn codepoints(&self) -> CodepointIter { + self.codepoints.into_iter() + } +} + +impl FromStr for SentenceBreak { + type Err = Error; + + fn from_str(line: &str) -> Result<SentenceBreak, Error> { + let (codepoints, value) = parse_codepoint_association(line)?; + Ok(SentenceBreak { codepoints, value: value.to_string() }) + } +} + +/// A single row in the `auxiliary/SentenceBreakTest.txt` file. +/// +/// This file defines tests for the sentence break algorithm. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct SentenceBreakTest { + /// Each string is a UTF-8 encoded group of codepoints that make up a + /// single sentence. + pub sentences: Vec<String>, + /// A human readable description of this test. + pub comment: String, +} + +impl UcdFile for SentenceBreakTest { + fn relative_file_path() -> &'static Path { + Path::new("auxiliary/SentenceBreakTest.txt") + } +} + +impl FromStr for SentenceBreakTest { + type Err = Error; + + fn from_str(line: &str) -> Result<SentenceBreakTest, Error> { + let (groups, comment) = parse_break_test(line)?; + Ok(SentenceBreakTest { sentences: groups, comment }) + } +} + +#[cfg(test)] +mod tests { + use super::{SentenceBreak, SentenceBreakTest}; + + #[test] + fn parse_single() { + let line = "11445 ; Extend # Mc NEWA SIGN VISARGA\n"; + let row: SentenceBreak = line.parse().unwrap(); + assert_eq!(row.codepoints, 0x11445); + assert_eq!(row.value, "Extend"); + } + + #[test] + fn parse_range() { + let line = "FE31..FE32 ; SContinue # Pd [2] PRESENTATION FORM FOR VERTICAL EM DASH..PRESENTATION FORM FOR VERTICAL EN DASH\n"; + let row: SentenceBreak = line.parse().unwrap(); + assert_eq!(row.codepoints, (0xFE31, 0xFE32)); + assert_eq!(row.value, "SContinue"); + } + + #[test] + fn parse_test() { + let line = "÷ 2060 × 5B57 × 2060 × 
002E × 2060 ÷ 5B57 × 2060 × 2060 ÷ # ÷ [0.2] WORD JOINER (Format_FE) × [998.0] CJK UNIFIED IDEOGRAPH-5B57 (OLetter) × [5.0] WORD JOINER (Format_FE) × [998.0] FULL STOP (ATerm) × [5.0] WORD JOINER (Format_FE) ÷ [11.0] CJK UNIFIED IDEOGRAPH-5B57 (OLetter) × [5.0] WORD JOINER (Format_FE) × [5.0] WORD JOINER (Format_FE) ÷ [0.3]"; + + let row: SentenceBreakTest = line.parse().unwrap(); + assert_eq!( + row.sentences, + vec![ + "\u{2060}\u{5B57}\u{2060}\u{002E}\u{2060}", + "\u{5B57}\u{2060}\u{2060}", + ] + ); + assert!(row.comment.contains("[5.0] WORD JOINER (Format_FE)")); + } +} diff --git a/vendor/ucd-parse/src/special_casing.rs b/vendor/ucd-parse/src/special_casing.rs new file mode 100644 index 000000000..a8fc61ddb --- /dev/null +++ b/vendor/ucd-parse/src/special_casing.rs @@ -0,0 +1,112 @@ +use std::path::Path; +use std::str::FromStr; + +use lazy_static::lazy_static; +use regex::Regex; + +use crate::common::{ + parse_codepoint_sequence, Codepoint, CodepointIter, UcdFile, + UcdFileByCodepoint, +}; +use crate::error::Error; + +/// A single row in the `SpecialCasing.txt` file. +/// +/// Note that a single codepoint may be mapped multiple times. In particular, +/// a single codepoint might have mappings based on distinct language sensitive +/// conditions (e.g., `U+0307`). +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct SpecialCaseMapping { + /// The codepoint that is being mapped. + pub codepoint: Codepoint, + /// The lowercase mapping, which may be empty. + pub lowercase: Vec<Codepoint>, + /// The titlecase mapping, which may be empty. + pub titlecase: Vec<Codepoint>, + /// The uppercase mapping, which may be empty. + pub uppercase: Vec<Codepoint>, + /// A list of language specific conditions, see `SpecialCasing.txt` for + /// more details. 
+ pub conditions: Vec<String>, +} + +impl UcdFile for SpecialCaseMapping { + fn relative_file_path() -> &'static Path { + Path::new("SpecialCasing.txt") + } +} + +impl UcdFileByCodepoint for SpecialCaseMapping { + fn codepoints(&self) -> CodepointIter { + self.codepoint.into_iter() + } +} + +impl FromStr for SpecialCaseMapping { + type Err = Error; + + fn from_str(line: &str) -> Result<SpecialCaseMapping, Error> { + lazy_static! { + static ref PARTS: Regex = Regex::new( + r"(?x) + ^ + \s*(?P<codepoint>[^\s;]+)\s*; + \s*(?P<lower>[^;]+)\s*; + \s*(?P<title>[^;]+)\s*; + \s*(?P<upper>[^;]+)\s*; + \s*(?P<conditions>[^;\x23]+)? + " + ) + .unwrap(); + }; + + let caps = match PARTS.captures(line.trim()) { + Some(caps) => caps, + None => return err!("invalid SpecialCasing line: '{}'", line), + }; + let conditions = caps + .name("conditions") + .map(|x| { + x.as_str() + .trim() + .split_whitespace() + .map(|c| c.to_string()) + .collect() + }) + .unwrap_or(vec![]); + Ok(SpecialCaseMapping { + codepoint: caps["codepoint"].parse()?, + lowercase: parse_codepoint_sequence(&caps["lower"])?, + titlecase: parse_codepoint_sequence(&caps["title"])?, + uppercase: parse_codepoint_sequence(&caps["upper"])?, + conditions, + }) + } +} + +#[cfg(test)] +mod tests { + use super::SpecialCaseMapping; + + #[test] + fn parse_no_conds() { + let line = "1F52; 1F52; 03A5 0313 0300; 03A5 0313 0300; # GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA\n"; + let row: SpecialCaseMapping = line.parse().unwrap(); + assert_eq!(row.codepoint, 0x1F52); + assert_eq!(row.lowercase, vec![0x1F52]); + assert_eq!(row.titlecase, vec![0x03A5, 0x0313, 0x0300]); + assert_eq!(row.uppercase, vec![0x03A5, 0x0313, 0x0300]); + assert!(row.conditions.is_empty()); + } + + #[test] + fn parse_conds() { + let line = "0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE\n"; + let row: SpecialCaseMapping = line.parse().unwrap(); + assert_eq!(row.codepoint, 0x0307); + assert!(row.lowercase.is_empty()); + assert_eq!(row.titlecase, 
vec![0x0307]); + assert_eq!(row.uppercase, vec![0x0307]); + assert_eq!(row.conditions, vec!["tr", "After_I"]); + } +} diff --git a/vendor/ucd-parse/src/unicode_data.rs b/vendor/ucd-parse/src/unicode_data.rs new file mode 100644 index 000000000..87910cc1d --- /dev/null +++ b/vendor/ucd-parse/src/unicode_data.rs @@ -0,0 +1,787 @@ +use std::fmt; +use std::iter; +use std::ops::Range; +use std::path::Path; +use std::str::FromStr; + +use lazy_static::lazy_static; +use regex::Regex; + +use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint}; +use crate::error::Error; + +/// Represents a single row in the `UnicodeData.txt` file. +/// +/// These fields were taken from UAX44, Table 9, as part of the documentation +/// for the +/// [`UnicodeData.txt` file](http://www.unicode.org/reports/tr44/#UnicodeData.txt). +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct UnicodeData { + /// The codepoint corresponding to this row. + pub codepoint: Codepoint, + /// The name of this codepoint. + pub name: String, + /// The "general category" of this codepoint. + pub general_category: String, + /// The class of this codepoint used in the Canonical Ordering Algorithm. + /// + /// Note that some classes map to a particular symbol. See + /// [UAX44, Table 15](http://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values). + pub canonical_combining_class: u8, + /// The bidirectional class of this codepoint. + /// + /// Possible values are listed in + /// [UAX44, Table 13](http://www.unicode.org/reports/tr44/#Bidi_Class_Values). + pub bidi_class: String, + /// The decomposition mapping for this codepoint. This includes its + /// formatting tag (if present). + pub decomposition: UnicodeDataDecomposition, + /// A decimal numeric representation of this codepoint, if it has the + /// property `Numeric_Type=Decimal`. + pub numeric_type_decimal: Option<u8>, + /// A decimal numeric representation of this codepoint, if it has the + /// property `Numeric_Type=Digit`. 
Note that while this field is still + /// populated for existing codepoints, no new codepoints will have this + /// field populated. + pub numeric_type_digit: Option<u8>, + /// A decimal or rational numeric representation of this codepoint, if it + /// has the property `Numeric_Type=Numeric`. + pub numeric_type_numeric: Option<UnicodeDataNumeric>, + /// A boolean indicating whether this codepoint is "mirrored" in + /// bidirectional text. + pub bidi_mirrored: bool, + /// The "old" Unicode 1.0 or ISO 6429 name of this codepoint. Note that + /// this field is empty unless it is significantly different from + /// the `name` field. + pub unicode1_name: String, + /// The ISO 10464 comment field. This no longer contains any non-NULL + /// values. + pub iso_comment: String, + /// This codepoint's simple uppercase mapping, if it exists. + pub simple_uppercase_mapping: Option<Codepoint>, + /// This codepoint's simple lowercase mapping, if it exists. + pub simple_lowercase_mapping: Option<Codepoint>, + /// This codepoint's simple titlecase mapping, if it exists. + pub simple_titlecase_mapping: Option<Codepoint>, +} + +impl UcdFile for UnicodeData { + fn relative_file_path() -> &'static Path { + Path::new("UnicodeData.txt") + } +} + +impl UcdFileByCodepoint for UnicodeData { + fn codepoints(&self) -> CodepointIter { + self.codepoint.into_iter() + } +} + +impl UnicodeData { + /// Returns true if and only if this record corresponds to the start of a + /// range. + pub fn is_range_start(&self) -> bool { + self.name.starts_with('<') + && self.name.ends_with('>') + && self.name.contains("First") + } + + /// Returns true if and only if this record corresponds to the end of a + /// range. + pub fn is_range_end(&self) -> bool { + self.name.starts_with('<') + && self.name.ends_with('>') + && self.name.contains("Last") + } +} + +impl FromStr for UnicodeData { + type Err = Error; + + fn from_str(line: &str) -> Result<UnicodeData, Error> { + lazy_static! 
{ + static ref PARTS: Regex = Regex::new( + r"(?x) + ^ + ([A-Z0-9]+); # 1; codepoint + ([^;]+); # 2; name + ([^;]+); # 3; general category + ([0-9]+); # 4; canonical combining class + ([^;]+); # 5; bidi class + ([^;]*); # 6; decomposition + ([0-9]*); # 7; numeric type decimal + ([0-9]*); # 8; numeric type digit + ([-0-9/]*); # 9; numeric type numeric + ([YN]); # 10; bidi mirrored + ([^;]*); # 11; unicode1 name + ([^;]*); # 12; ISO comment + ([^;]*); # 13; simple uppercase mapping + ([^;]*); # 14; simple lowercase mapping + ([^;]*) # 15; simple titlecase mapping + $ + " + ) + .unwrap(); + }; + let caps = match PARTS.captures(line.trim()) { + Some(caps) => caps, + None => return err!("invalid UnicodeData line"), + }; + let capget = |n| caps.get(n).unwrap().as_str(); + let mut data = UnicodeData::default(); + + data.codepoint = capget(1).parse()?; + data.name = capget(2).to_string(); + data.general_category = capget(3).to_string(); + data.canonical_combining_class = match capget(4).parse() { + Ok(n) => n, + Err(err) => { + return err!( + "failed to parse canonical combining class '{}': {}", + capget(4), + err + ) + } + }; + data.bidi_class = capget(5).to_string(); + if !caps[6].is_empty() { + data.decomposition = caps[6].parse()?; + } else { + data.decomposition.push(data.codepoint)?; + } + if !capget(7).is_empty() { + data.numeric_type_decimal = Some(match capget(7).parse() { + Ok(n) => n, + Err(err) => { + return err!( + "failed to parse numeric type decimal '{}': {}", + capget(7), + err + ) + } + }); + } + if !capget(8).is_empty() { + data.numeric_type_digit = Some(match capget(8).parse() { + Ok(n) => n, + Err(err) => { + return err!( + "failed to parse numeric type digit '{}': {}", + capget(8), + err + ) + } + }); + } + if !capget(9).is_empty() { + data.numeric_type_numeric = Some(capget(9).parse()?); + } + data.bidi_mirrored = capget(10) == "Y"; + data.unicode1_name = capget(11).to_string(); + data.iso_comment = capget(12).to_string(); + if !capget(13).is_empty() 
{ + data.simple_uppercase_mapping = Some(capget(13).parse()?); + } + if !capget(14).is_empty() { + data.simple_lowercase_mapping = Some(capget(14).parse()?); + } + if !capget(15).is_empty() { + data.simple_titlecase_mapping = Some(capget(15).parse()?); + } + Ok(data) + } +} + +impl fmt::Display for UnicodeData { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{};", self.codepoint)?; + write!(f, "{};", self.name)?; + write!(f, "{};", self.general_category)?; + write!(f, "{};", self.canonical_combining_class)?; + write!(f, "{};", self.bidi_class)?; + if self.decomposition.is_canonical() + && self.decomposition.mapping() == &[self.codepoint] + { + write!(f, ";")?; + } else { + write!(f, "{};", self.decomposition)?; + } + if let Some(n) = self.numeric_type_decimal { + write!(f, "{};", n)?; + } else { + write!(f, ";")?; + } + if let Some(n) = self.numeric_type_digit { + write!(f, "{};", n)?; + } else { + write!(f, ";")?; + } + if let Some(n) = self.numeric_type_numeric { + write!(f, "{};", n)?; + } else { + write!(f, ";")?; + } + write!(f, "{};", if self.bidi_mirrored { "Y" } else { "N" })?; + write!(f, "{};", self.unicode1_name)?; + write!(f, "{};", self.iso_comment)?; + if let Some(cp) = self.simple_uppercase_mapping { + write!(f, "{};", cp)?; + } else { + write!(f, ";")?; + } + if let Some(cp) = self.simple_lowercase_mapping { + write!(f, "{};", cp)?; + } else { + write!(f, ";")?; + } + if let Some(cp) = self.simple_titlecase_mapping { + write!(f, "{}", cp)?; + } + Ok(()) + } +} + +/// Represents a decomposition mapping of a single row in the +/// `UnicodeData.txt` file. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct UnicodeDataDecomposition { + /// The formatting tag associated with this mapping, if present. + pub tag: Option<UnicodeDataDecompositionTag>, + /// The number of codepoints in this mapping. + pub len: usize, + /// The codepoints in the mapping. Entries beyond `len` in the mapping + /// are always U+0000. 
If no mapping was present, then this always contains + /// a single codepoint corresponding to this row's character. + pub mapping: [Codepoint; 18], +} + +impl UnicodeDataDecomposition { + /// Create a new decomposition mapping with the given tag and codepoints. + /// + /// If there are too many codepoints, then an error is returned. + pub fn new( + tag: Option<UnicodeDataDecompositionTag>, + mapping: &[Codepoint], + ) -> Result<UnicodeDataDecomposition, Error> { + let mut x = UnicodeDataDecomposition::default(); + x.tag = tag; + for &cp in mapping { + x.push(cp)?; + } + Ok(x) + } + + /// Add a new codepoint to this decomposition's mapping. + /// + /// If the mapping is already full, then this returns an error. + pub fn push(&mut self, cp: Codepoint) -> Result<(), Error> { + if self.len >= self.mapping.len() { + return err!( + "invalid decomposition mapping (too many codepoints)" + ); + } + self.mapping[self.len] = cp; + self.len += 1; + Ok(()) + } + + /// Return the mapping as a slice of codepoints. The slice returned + /// has length equivalent to the number of codepoints in this mapping. + pub fn mapping(&self) -> &[Codepoint] { + &self.mapping[..self.len] + } + + /// Returns true if and only if this decomposition mapping is canonical. + pub fn is_canonical(&self) -> bool { + self.tag.is_none() + } +} + +impl FromStr for UnicodeDataDecomposition { + type Err = Error; + + fn from_str(s: &str) -> Result<UnicodeDataDecomposition, Error> { + lazy_static! 
{ + static ref WITH_TAG: Regex = Regex::new( + r"^(?:<(?P<tag>[^>]+)>)?\s*(?P<chars>[\s0-9A-F]+)$" + ) + .unwrap(); + static ref CHARS: Regex = Regex::new(r"[0-9A-F]+").unwrap(); + }; + if s.is_empty() { + return err!( + "expected non-empty string for \ + UnicodeDataDecomposition value" + ); + } + let caps = match WITH_TAG.captures(s) { + Some(caps) => caps, + None => return err!("invalid decomposition value"), + }; + let mut decomp = UnicodeDataDecomposition::default(); + let mut codepoints = s; + if let Some(m) = caps.name("tag") { + decomp.tag = Some(m.as_str().parse()?); + codepoints = &caps["chars"]; + } + for m in CHARS.find_iter(codepoints) { + let cp = m.as_str().parse()?; + decomp.push(cp)?; + } + Ok(decomp) + } +} + +impl fmt::Display for UnicodeDataDecomposition { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if let Some(ref tag) = self.tag { + write!(f, "<{}> ", tag)?; + } + let mut first = true; + for cp in self.mapping() { + if !first { + write!(f, " ")?; + } + first = false; + write!(f, "{}", cp)?; + } + Ok(()) + } +} + +/// The formatting tag on a decomposition mapping. +/// +/// This is taken from +/// [UAX44, Table 14](http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings). 
+#[derive(Clone, Debug, Eq, PartialEq)] +pub enum UnicodeDataDecompositionTag { + /// <font> + Font, + /// <noBreak> + NoBreak, + /// <initial> + Initial, + /// <medial> + Medial, + /// <final> + Final, + /// <isolated> + Isolated, + /// <circle> + Circle, + /// <super> + Super, + /// <sub> + Sub, + /// <vertical> + Vertical, + /// <wide> + Wide, + /// <narrow> + Narrow, + /// <small> + Small, + /// <square> + Square, + /// <fraction> + Fraction, + /// <compat> + Compat, +} + +impl FromStr for UnicodeDataDecompositionTag { + type Err = Error; + + fn from_str(s: &str) -> Result<UnicodeDataDecompositionTag, Error> { + use self::UnicodeDataDecompositionTag::*; + Ok(match s { + "font" => Font, + "noBreak" => NoBreak, + "initial" => Initial, + "medial" => Medial, + "final" => Final, + "isolated" => Isolated, + "circle" => Circle, + "super" => Super, + "sub" => Sub, + "vertical" => Vertical, + "wide" => Wide, + "narrow" => Narrow, + "small" => Small, + "square" => Square, + "fraction" => Fraction, + "compat" => Compat, + _ => return err!("invalid decomposition formatting tag: {}", s), + }) + } +} + +impl fmt::Display for UnicodeDataDecompositionTag { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use self::UnicodeDataDecompositionTag::*; + let s = match *self { + Font => "font", + NoBreak => "noBreak", + Initial => "initial", + Medial => "medial", + Final => "final", + Isolated => "isolated", + Circle => "circle", + Super => "super", + Sub => "sub", + Vertical => "vertical", + Wide => "wide", + Narrow => "narrow", + Small => "small", + Square => "square", + Fraction => "fraction", + Compat => "compat", + }; + write!(f, "{}", s) + } +} + +/// A numeric value corresponding to characters with `Numeric_Type=Numeric`. +/// +/// A numeric value can either be a signed integer or a rational number. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum UnicodeDataNumeric { + /// An integer. + Integer(i64), + /// A rational number. 
The first is the numerator and the latter is the + /// denominator. + Rational(i64, i64), +} + +impl FromStr for UnicodeDataNumeric { + type Err = Error; + + fn from_str(s: &str) -> Result<UnicodeDataNumeric, Error> { + if s.is_empty() { + return err!( + "expected non-empty string for UnicodeDataNumeric value" + ); + } + if let Some(pos) = s.find('/') { + let (snum, sden) = (&s[..pos], &s[pos + 1..]); + let num = match snum.parse() { + Ok(num) => num, + Err(err) => { + return err!( + "invalid integer numerator '{}': {}", + snum, + err + ); + } + }; + let den = match sden.parse() { + Ok(den) => den, + Err(err) => { + return err!( + "invalid integer denominator '{}': {}", + sden, + err + ); + } + }; + Ok(UnicodeDataNumeric::Rational(num, den)) + } else { + match s.parse() { + Ok(den) => Ok(UnicodeDataNumeric::Integer(den)), + Err(err) => { + return err!( + "invalid integer denominator '{}': {}", + s, + err + ); + } + } + } + } +} + +impl fmt::Display for UnicodeDataNumeric { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match *self { + UnicodeDataNumeric::Integer(n) => write!(f, "{}", n), + UnicodeDataNumeric::Rational(n, d) => write!(f, "{}/{}", n, d), + } + } +} + +/// An iterator adapter that expands rows in `UnicodeData.txt`. +/// +/// Throughout `UnicodeData.txt`, some assigned codepoints are not explicitly +/// represented. Instead, they are represented by a pair of rows, indicating +/// a range of codepoints with the same properties. For example, the Hangul +/// syllable codepoints are represented by these two rows: +/// +/// ```ignore +/// AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; +/// D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; +/// ``` +/// +/// This iterator will wrap any iterator of `UnicodeData` and, when a range of +/// Unicode codepoints is found, it will be expanded to the appropriate +/// sequence of `UnicodeData` values. Note that all such expanded records will +/// have an empty name. 
pub struct UnicodeDataExpander<I: Iterator> {
    /// The underlying iterator.
    it: iter::Peekable<I>,
    /// A range of codepoints to emit when we've found a pair. Otherwise,
    /// `None`.
    range: CodepointRange,
}

/// A half-open range of codepoints paired with the record whose properties
/// every generated codepoint in the range should share.
struct CodepointRange {
    /// The codepoint range.
    range: Range<u32>,
    /// The start record. All subsequent records in this range are generated
    /// by cloning this and updating the codepoint/name.
    start_record: UnicodeData,
}

impl<I: Iterator<Item = UnicodeData>> UnicodeDataExpander<I> {
    /// Create a new iterator that expands pairs of `UnicodeData` range
    /// records. All other records are passed through as-is.
    pub fn new<T>(it: T) -> UnicodeDataExpander<I>
    where
        T: IntoIterator<IntoIter = I, Item = I::Item>,
    {
        UnicodeDataExpander {
            it: it.into_iter().peekable(),
            // An empty (0..0) range means "no pending expansion".
            range: CodepointRange {
                range: 0..0,
                start_record: UnicodeData::default(),
            },
        }
    }
}

impl<I: Iterator<Item = UnicodeData>> Iterator for UnicodeDataExpander<I> {
    type Item = UnicodeData;

    fn next(&mut self) -> Option<UnicodeData> {
        // First drain any records pending from a previously discovered
        // First/Last range pair.
        if let Some(udata) = self.range.next() {
            return Some(udata);
        }
        let row1 = match self.it.next() {
            None => return None,
            Some(row1) => row1,
        };
        // Only a range-start row *immediately* followed by a range-end row
        // triggers expansion; anything else passes through unchanged.
        if !row1.is_range_start()
            || !self.it.peek().map_or(false, |row2| row2.is_range_end())
        {
            return Some(row1);
        }
        let row2 = self.it.next().unwrap();
        // The range is inclusive of both endpoints, hence the `+ 1` on the
        // exclusive upper bound. Recurse so the first expanded record is
        // produced by the drain branch above.
        self.range = CodepointRange {
            range: row1.codepoint.value()..(row2.codepoint.value() + 1),
            start_record: row1,
        };
        self.next()
    }
}

impl Iterator for CodepointRange {
    type Item = UnicodeData;

    fn next(&mut self) -> Option<UnicodeData> {
        let cp = match self.range.next() {
            None => return None,
            Some(cp) => cp,
        };
        // Each generated record clones the start record, substituting the
        // current codepoint and an empty name.
        Some(UnicodeData {
            codepoint: Codepoint::from_u32(cp).unwrap(),
            name: "".to_string(),
            ..self.start_record.clone()
        })
    }
}

#[cfg(test)]
mod tests {
    use crate::common::Codepoint;

    use super::{
        UnicodeData, UnicodeDataDecomposition, UnicodeDataDecompositionTag,
        UnicodeDataNumeric,
    };

    // Shorthand: build a `Codepoint` from a known-valid scalar value.
    fn codepoint(n: u32) -> Codepoint {
        Codepoint::from_u32(n).unwrap()
    }

    // Shorthand: owned `String` from a literal.
    fn s(string: &str) -> String {
        string.to_string()
    }

    // A row with a `<compat>`-tagged decomposition mapping.
    #[test]
    fn parse1() {
        let line = "249D;PARENTHESIZED LATIN SMALL LETTER B;So;0;L;<compat> 0028 0062 0029;;;;N;;;;;\n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x249d),
                name: s("PARENTHESIZED LATIN SMALL LETTER B"),
                general_category: s("So"),
                canonical_combining_class: 0,
                bidi_class: s("L"),
                decomposition: UnicodeDataDecomposition::new(
                    Some(UnicodeDataDecompositionTag::Compat),
                    &[codepoint(0x28), codepoint(0x62), codepoint(0x29)],
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: None,
                bidi_mirrored: false,
                unicode1_name: s(""),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: None,
                simple_titlecase_mapping: None,
            }
        );
    }

    // A control character: empty decomposition defaults to the codepoint
    // itself, and the Unicode 1.0 name field is populated.
    #[test]
    fn parse2() {
        let line = "000D;<control>;Cc;0;B;;;;;N;CARRIAGE RETURN (CR);;;;\n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x000D),
                name: s("<control>"),
                general_category: s("Cc"),
                canonical_combining_class: 0,
                bidi_class: s("B"),
                decomposition: UnicodeDataDecomposition::new(
                    None,
                    &[codepoint(0x000D)]
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: None,
                bidi_mirrored: false,
                unicode1_name: s("CARRIAGE RETURN (CR)"),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: None,
                simple_titlecase_mapping: None,
            }
        );
    }

    // A rational numeric value (`1/4`) with a `<fraction>` decomposition.
    #[test]
    fn parse3() {
        let line = "00BC;VULGAR FRACTION ONE QUARTER;No;0;ON;<fraction> 0031 2044 0034;;;1/4;N;FRACTION ONE QUARTER;;;;\n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x00BC),
                name: s("VULGAR FRACTION ONE QUARTER"),
                general_category: s("No"),
                canonical_combining_class: 0,
                bidi_class: s("ON"),
                decomposition: UnicodeDataDecomposition::new(
                    Some(UnicodeDataDecompositionTag::Fraction),
                    &[codepoint(0x31), codepoint(0x2044), codepoint(0x34)],
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: Some(UnicodeDataNumeric::Rational(1, 4)),
                bidi_mirrored: false,
                unicode1_name: s("FRACTION ONE QUARTER"),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: None,
                simple_titlecase_mapping: None,
            }
        );
    }

    // A letter with a simple lowercase mapping.
    #[test]
    fn parse4() {
        let line = "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;\n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x0041),
                name: s("LATIN CAPITAL LETTER A"),
                general_category: s("Lu"),
                canonical_combining_class: 0,
                bidi_class: s("L"),
                decomposition: UnicodeDataDecomposition::new(
                    None,
                    &[codepoint(0x0041)]
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: None,
                bidi_mirrored: false,
                unicode1_name: s(""),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: Some(codepoint(0x0061)),
                simple_titlecase_mapping: None,
            }
        );
    }

    // A *negative* rational numeric value (`-1/2`).
    #[test]
    fn parse5() {
        let line = "0F33;TIBETAN DIGIT HALF ZERO;No;0;L;;;;-1/2;N;;;;;\n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x0F33),
                name: s("TIBETAN DIGIT HALF ZERO"),
                general_category: s("No"),
                canonical_combining_class: 0,
                bidi_class: s("L"),
                decomposition: UnicodeDataDecomposition::new(
                    None,
                    &[codepoint(0x0F33)]
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: Some(UnicodeDataNumeric::Rational(
                    -1, 2
                )),
                bidi_mirrored: false,
                unicode1_name: s(""),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: None,
                simple_titlecase_mapping: None,
            }
        );
    }

    // Expansion test: 2 pass-through records plus the Hangul range
    // 0xAC00..=0xD7A3 (11,172 codepoints) yields 11,174 records total.
    #[test]
    fn expander() {
        use super::UnicodeDataExpander;
        use crate::common::UcdLineParser;

        let data = "\
ABF9;MEETEI MAYEK DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
D7B0;HANGUL JUNGSEONG O-YEO;Lo;0;L;;;;;N;;;;;
";
        let records = UcdLineParser::new(None, data.as_bytes())
            .collect::<Result<Vec<_>, _>>()
            .unwrap();
        assert_eq!(UnicodeDataExpander::new(records).count(), 11174);
    }
}
diff --git a/vendor/ucd-parse/src/word_break.rs b/vendor/ucd-parse/src/word_break.rs
new file mode 100644
index 000000000..57d512667
--- /dev/null
+++ b/vendor/ucd-parse/src/word_break.rs
@@ -0,0 +1,103 @@
use std::path::Path;
use std::str::FromStr;

use crate::common::{
    parse_break_test, parse_codepoint_association, CodepointIter, Codepoints,
    UcdFile, UcdFileByCodepoint,
};
use crate::error::Error;

/// A single row in the `auxiliary/WordBreakProperty.txt` file.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct WordBreak {
    /// The codepoint or codepoint range for this entry.
    pub codepoints: Codepoints,
    /// The property value assigned to the codepoints in this entry.
    pub value: String,
}

impl UcdFile for WordBreak {
    fn relative_file_path() -> &'static Path {
        Path::new("auxiliary/WordBreakProperty.txt")
    }
}

impl UcdFileByCodepoint for WordBreak {
    fn codepoints(&self) -> CodepointIter {
        self.codepoints.into_iter()
    }
}

impl FromStr for WordBreak {
    type Err = Error;

    /// Parses one `codepoint(s) ; value` association line; comments and
    /// surrounding whitespace are handled by the shared helper.
    fn from_str(line: &str) -> Result<WordBreak, Error> {
        let (codepoints, value) = parse_codepoint_association(line)?;
        Ok(WordBreak { codepoints, value: value.to_string() })
    }
}

/// A single row in the `auxiliary/WordBreakTest.txt` file.
///
/// This file defines tests for the word break algorithm.
+#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct WordBreakTest { + /// Each string is a UTF-8 encoded group of codepoints that make up a + /// single word. + pub words: Vec<String>, + /// A human readable description of this test. + pub comment: String, +} + +impl UcdFile for WordBreakTest { + fn relative_file_path() -> &'static Path { + Path::new("auxiliary/WordBreakTest.txt") + } +} + +impl FromStr for WordBreakTest { + type Err = Error; + + fn from_str(line: &str) -> Result<WordBreakTest, Error> { + let (groups, comment) = parse_break_test(line)?; + Ok(WordBreakTest { words: groups, comment }) + } +} + +#[cfg(test)] +mod tests { + use super::{WordBreak, WordBreakTest}; + + #[test] + fn parse_single() { + let line = "0A83 ; Extend # Mc GUJARATI SIGN VISARGA\n"; + let row: WordBreak = line.parse().unwrap(); + assert_eq!(row.codepoints, 0x0A83); + assert_eq!(row.value, "Extend"); + } + + #[test] + fn parse_range() { + let line = "104A0..104A9 ; Numeric # Nd [10] OSMANYA DIGIT ZERO..OSMANYA DIGIT NINE\n"; + let row: WordBreak = line.parse().unwrap(); + assert_eq!(row.codepoints, (0x104A0, 0x104A9)); + assert_eq!(row.value, "Numeric"); + } + + #[test] + fn parse_test() { + let line = "÷ 0031 ÷ 0027 × 0308 ÷ 0061 ÷ 0027 × 2060 ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] APOSTROPHE (Single_Quote) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [999.0] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (Single_Quote) × [4.0] WORD JOINER (Format_FE) ÷ [0.3]"; + + let row: WordBreakTest = line.parse().unwrap(); + assert_eq!( + row.words, + vec![ + "\u{0031}", + "\u{0027}\u{0308}", + "\u{0061}", + "\u{0027}\u{2060}", + ] + ); + assert!(row.comment.contains("[4.0] COMBINING DIAERESIS (Extend_FE)")); + } +} |