From 698f8c2f01ea549d77d7dc3338a12e04c11057b9 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 17 Apr 2024 14:02:58 +0200 Subject: Adding upstream version 1.64.0+dfsg1. Signed-off-by: Daniel Baumann --- vendor/ucd-parse/.cargo-checksum.json | 1 + vendor/ucd-parse/Cargo.toml | 31 + vendor/ucd-parse/LICENSE-APACHE | 201 +++++++ vendor/ucd-parse/LICENSE-MIT | 21 + vendor/ucd-parse/README.md | 22 + vendor/ucd-parse/src/age.rs | 59 ++ vendor/ucd-parse/src/arabic_shaping.rs | 184 ++++++ vendor/ucd-parse/src/bidi_mirroring_glyph.rs | 107 ++++ vendor/ucd-parse/src/case_folding.rs | 161 +++++ vendor/ucd-parse/src/common.rs | 594 +++++++++++++++++++ vendor/ucd-parse/src/core_properties.rs | 60 ++ vendor/ucd-parse/src/emoji_properties.rs | 86 +++ vendor/ucd-parse/src/error.rs | 86 +++ vendor/ucd-parse/src/grapheme_cluster_break.rs | 98 +++ vendor/ucd-parse/src/jamo_short_name.rs | 80 +++ vendor/ucd-parse/src/lib.rs | 66 +++ vendor/ucd-parse/src/line_break.rs | 49 ++ vendor/ucd-parse/src/name_aliases.rs | 145 +++++ vendor/ucd-parse/src/prop_list.rs | 63 ++ vendor/ucd-parse/src/property_aliases.rs | 113 ++++ vendor/ucd-parse/src/property_value_aliases.rs | 185 ++++++ vendor/ucd-parse/src/script_extensions.rs | 68 +++ vendor/ucd-parse/src/scripts.rs | 59 ++ vendor/ucd-parse/src/sentence_break.rs | 101 ++++ vendor/ucd-parse/src/special_casing.rs | 112 ++++ vendor/ucd-parse/src/unicode_data.rs | 787 +++++++++++++++++++++++++ vendor/ucd-parse/src/word_break.rs | 103 ++++ 27 files changed, 3642 insertions(+) create mode 100644 vendor/ucd-parse/.cargo-checksum.json create mode 100644 vendor/ucd-parse/Cargo.toml create mode 100644 vendor/ucd-parse/LICENSE-APACHE create mode 100644 vendor/ucd-parse/LICENSE-MIT create mode 100644 vendor/ucd-parse/README.md create mode 100644 vendor/ucd-parse/src/age.rs create mode 100644 vendor/ucd-parse/src/arabic_shaping.rs create mode 100644 vendor/ucd-parse/src/bidi_mirroring_glyph.rs create mode 100644 vendor/ucd-parse/src/case_folding.rs 
create mode 100644 vendor/ucd-parse/src/common.rs create mode 100644 vendor/ucd-parse/src/core_properties.rs create mode 100644 vendor/ucd-parse/src/emoji_properties.rs create mode 100644 vendor/ucd-parse/src/error.rs create mode 100644 vendor/ucd-parse/src/grapheme_cluster_break.rs create mode 100644 vendor/ucd-parse/src/jamo_short_name.rs create mode 100644 vendor/ucd-parse/src/lib.rs create mode 100644 vendor/ucd-parse/src/line_break.rs create mode 100644 vendor/ucd-parse/src/name_aliases.rs create mode 100644 vendor/ucd-parse/src/prop_list.rs create mode 100644 vendor/ucd-parse/src/property_aliases.rs create mode 100644 vendor/ucd-parse/src/property_value_aliases.rs create mode 100644 vendor/ucd-parse/src/script_extensions.rs create mode 100644 vendor/ucd-parse/src/scripts.rs create mode 100644 vendor/ucd-parse/src/sentence_break.rs create mode 100644 vendor/ucd-parse/src/special_casing.rs create mode 100644 vendor/ucd-parse/src/unicode_data.rs create mode 100644 vendor/ucd-parse/src/word_break.rs (limited to 'vendor/ucd-parse') diff --git a/vendor/ucd-parse/.cargo-checksum.json b/vendor/ucd-parse/.cargo-checksum.json new file mode 100644 index 000000000..34cd1d5b5 --- /dev/null +++ b/vendor/ucd-parse/.cargo-checksum.json @@ -0,0 +1 @@ 
+{"files":{"Cargo.toml":"3a23e75f3807a38f86e8564a139135970f38c9ebc448749682b75fd4096f6d4a","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"0f96a83840e146e43c0ec96a22ec1f392e0680e6c1226e6f3ba87e0740af850f","README.md":"5af803e482641f01332bde35cc8137211714b6f100122ec548c9712a09aead55","src/age.rs":"13a9a01b2373e9eff06b547543479394843cb9103c200b3e666ca5e408369bc9","src/arabic_shaping.rs":"31075e05b33248540f10ae5a3bb14715965e109b2be40cd9c0735810903ce29b","src/bidi_mirroring_glyph.rs":"945a110e0f54eabc2f48719893da300c11b4fd1f28265ab8f7b32ce2e5e3f6e5","src/case_folding.rs":"1ec85e0fa8e8cb0315974b843d168d9cddecad40efcf8ce78de677c0f0417f34","src/common.rs":"40782238affb569c9bd89a7ce19202677ba3e1da0bb5c8f8c4439adaa375858b","src/core_properties.rs":"24b261ed0bc4b7443734d843cda58433c7727914524ac4c3cc46fc153463e8cd","src/emoji_properties.rs":"bdb24a301661592d0956db2ad945a86778e0ad8f86cd82077835bb0d2a4f144c","src/error.rs":"6df32d4c5cc9819832083f465aa4ce11d26d3b44e37a9d4274a45fd8e1314903","src/grapheme_cluster_break.rs":"f63f75f1a5a82b698d4a840b063bc650f2b2f64429830dc338c9723bf1368e0b","src/jamo_short_name.rs":"02dc272c1a7d01de5e22737a76327b94ae2d132703dbc0657e3e887ceb1d1d91","src/lib.rs":"894ecd08e4588e14de69e8b9d25e9a38e9e2f73e260855c99df13c2ee1d825d3","src/line_break.rs":"1def7f73d44c1703fd18dbd9c9fc8dd76edabed27a5061564d6521d59335a95c","src/name_aliases.rs":"497629a0499d048b0f1615c619975f149c6a1d97361b7ff16850a8291796c90d","src/prop_list.rs":"856f00f51e7e6b9b0386a9b3941582eba63eb96896c86e58a791384a1235fdec","src/property_aliases.rs":"7b6da97e45a898499f29e30346f1b7aa6b7d758184a3bfa4f0b816d20edc9851","src/property_value_aliases.rs":"4e9fbad2b32ad636e5f8dfefa082352e444e4a68822a7786ea7d4217e7afd2fb","src/script_extensions.rs":"d967e213122702df642c975765fec28811ae8351f6f5307ca67989bf0b456fba","src/scripts.rs":"04740c080bb48e99d84622e4708215b40abdd387c70347d6b264b9c7fcbbac37","src/sentence_break.rs":"ac54a7f09f75694582904509d979c6
1784fa1ec647e4d531ea1b283bc3082635","src/special_casing.rs":"de7ed50ec34a222c73e8ad6d82a2a658b4475ce312301c5110d07fa13e51cb0b","src/unicode_data.rs":"cad99e17c6d56c9029416a0f3ec1b469786864eace2a20f212f2b4a1c96b59f1","src/word_break.rs":"eea514f238dc9dea82f52efc3154fde3f215b068dd201b22c31ef1c0acf1fba3"},"package":"5269f8d35df6b8b60758343a6d742ecf09e4bca13faee32af5503aebd1e11b7c"} \ No newline at end of file diff --git a/vendor/ucd-parse/Cargo.toml b/vendor/ucd-parse/Cargo.toml new file mode 100644 index 000000000..f7efa0970 --- /dev/null +++ b/vendor/ucd-parse/Cargo.toml @@ -0,0 +1,31 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies +# +# If you believe there's an error in this file please file an +# issue against the rust-lang/cargo repository. If you're +# editing this file be aware that the upstream Cargo.toml +# will likely look very different (and much more reasonable) + +[package] +edition = "2018" +name = "ucd-parse" +version = "0.1.8" +authors = ["Andrew Gallant "] +description = "A library for parsing data files in the Unicode character database.\n" +homepage = "https://github.com/BurntSushi/ucd-generate" +documentation = "https://docs.rs/ucd-parse" +readme = "README.md" +keywords = ["unicode", "database", "character", "property"] +license = "MIT/Apache-2.0" +repository = "https://github.com/BurntSushi/ucd-generate" +[dependencies.lazy_static] +version = "1" + +[dependencies.regex] +version = "1" +features = ["std", "unicode"] +default-features = false diff --git a/vendor/ucd-parse/LICENSE-APACHE b/vendor/ucd-parse/LICENSE-APACHE new file mode 100644 index 000000000..16fe87b06 --- /dev/null +++ b/vendor/ucd-parse/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + 
http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. 
For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the 
following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/vendor/ucd-parse/LICENSE-MIT b/vendor/ucd-parse/LICENSE-MIT new file mode 100644 index 000000000..3b0a5dc09 --- /dev/null +++ b/vendor/ucd-parse/LICENSE-MIT @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2015 Andrew Gallant + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/vendor/ucd-parse/README.md b/vendor/ucd-parse/README.md new file mode 100644 index 000000000..dc3f78dc6 --- /dev/null +++ b/vendor/ucd-parse/README.md @@ -0,0 +1,22 @@ +ucd-parse +========= +A library for parsing Unicode Character Database (UCD) files into structured +data. + +[![Linux build status](https://api.travis-ci.org/BurntSushi/ucd-generate.png)](https://travis-ci.org/BurntSushi/ucd-generate) +[![](http://meritbadge.herokuapp.com/ucd-generate)](https://crates.io/crates/ucd-parse) + + +### Documentation + +https://docs.rs/ucd-parse + + +### License + +This project is licensed under either of + * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or + http://www.apache.org/licenses/LICENSE-2.0) + * MIT license ([LICENSE-MIT](LICENSE-MIT) or + http://opensource.org/licenses/MIT) +at your option. diff --git a/vendor/ucd-parse/src/age.rs b/vendor/ucd-parse/src/age.rs new file mode 100644 index 000000000..3c93f0707 --- /dev/null +++ b/vendor/ucd-parse/src/age.rs @@ -0,0 +1,59 @@ +use std::path::Path; +use std::str::FromStr; + +use crate::common::{ + parse_codepoint_association, CodepointIter, Codepoints, UcdFile, + UcdFileByCodepoint, +}; +use crate::error::Error; + +/// A single row in the `DerivedAge.txt` file. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct Age { + /// The codepoint or codepoint range for this entry. + pub codepoints: Codepoints, + /// The age assigned to the codepoints in this entry. 
+ pub age: String, +} + +impl UcdFile for Age { + fn relative_file_path() -> &'static Path { + Path::new("DerivedAge.txt") + } +} + +impl UcdFileByCodepoint for Age { + fn codepoints(&self) -> CodepointIter { + self.codepoints.into_iter() + } +} + +impl FromStr for Age { + type Err = Error; + + fn from_str(line: &str) -> Result { + let (codepoints, script) = parse_codepoint_association(line)?; + Ok(Age { codepoints, age: script.to_string() }) + } +} + +#[cfg(test)] +mod tests { + use super::Age; + + #[test] + fn parse_single() { + let line = "2BD2 ; 10.0 # GROUP MARK\n"; + let row: Age = line.parse().unwrap(); + assert_eq!(row.codepoints, 0x2BD2); + assert_eq!(row.age, "10.0"); + } + + #[test] + fn parse_range() { + let line = "11D0B..11D36 ; 10.0 # [44] MASARAM GONDI LETTER AU..MASARAM GONDI VOWEL SIGN VOCALIC R\n"; + let row: Age = line.parse().unwrap(); + assert_eq!(row.codepoints, (0x11D0B, 0x11D36)); + assert_eq!(row.age, "10.0"); + } +} diff --git a/vendor/ucd-parse/src/arabic_shaping.rs b/vendor/ucd-parse/src/arabic_shaping.rs new file mode 100644 index 000000000..d1d942a82 --- /dev/null +++ b/vendor/ucd-parse/src/arabic_shaping.rs @@ -0,0 +1,184 @@ +use std::path::Path; +use std::str::FromStr; + +use lazy_static::lazy_static; +use regex::Regex; + +use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint}; +use crate::error::Error; + +/// Represents a single row in the `ArabicShaping.txt` file. +/// +/// The field names were taken from the header of ArabicShaping.txt. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct ArabicShaping { + /// The codepoint corresponding to this row. + pub codepoint: Codepoint, + /// A short schematic name for the codepoint. + /// + /// The schematic name is descriptive of the shape, based as consistently as + /// possible on a name for the skeleton and then the diacritic marks applied + /// to the skeleton, if any. 
Note that this schematic name is considered a + /// comment, and does not constitute a formal property value. + pub schematic_name: String, + /// The "joining type" of this codepoint. + pub joining_type: JoiningType, + /// The "joining group" of this codepoint. + pub joining_group: String, +} + +/// The Joining_Type field read from ArabicShaping.txt +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum JoiningType { + RightJoining, + LeftJoining, + DualJoining, + JoinCausing, + NonJoining, + Transparent, +} + +impl JoiningType { + pub fn as_str(&self) -> &str { + match self { + JoiningType::RightJoining => "R", + JoiningType::LeftJoining => "L", + JoiningType::DualJoining => "D", + JoiningType::JoinCausing => "C", + JoiningType::NonJoining => "U", + JoiningType::Transparent => "T", + } + } +} + +impl Default for JoiningType { + fn default() -> JoiningType { + JoiningType::NonJoining + } +} + +impl FromStr for JoiningType { + type Err = Error; + + fn from_str(s: &str) -> Result { + match s { + "R" => Ok(JoiningType::RightJoining), + "L" => Ok(JoiningType::LeftJoining), + "D" => Ok(JoiningType::DualJoining), + "C" => Ok(JoiningType::JoinCausing), + "U" => Ok(JoiningType::NonJoining), + "T" => Ok(JoiningType::Transparent), + _ => err!( + "unrecognized joining type: '{}' \ + (must be one of R, L, D, C, U or T)", + s + ), + } + } +} + +impl UcdFile for ArabicShaping { + fn relative_file_path() -> &'static Path { + Path::new("ArabicShaping.txt") + } +} + +impl UcdFileByCodepoint for ArabicShaping { + fn codepoints(&self) -> CodepointIter { + self.codepoint.into_iter() + } +} + +impl FromStr for ArabicShaping { + type Err = Error; + + fn from_str(line: &str) -> Result { + lazy_static! 
{ + static ref PARTS: Regex = Regex::new( + r"(?x) + ^ + \s*(?P[A-F0-9]+)\s*; + \s*(?P[^;]+)\s*; + \s*(?P[^;]+)\s*; + \s*(?P[^;]+) + $ + " + ) + .unwrap(); + }; + let caps = match PARTS.captures(line.trim()) { + Some(caps) => caps, + None => return err!("invalid ArabicShaping line"), + }; + + Ok(ArabicShaping { + codepoint: caps["codepoint"].parse()?, + schematic_name: caps["name"].to_string(), + joining_type: caps["joining_type"].parse()?, + joining_group: caps["joining_group"].to_string(), + }) + } +} + +#[cfg(test)] +mod tests { + use crate::common::Codepoint; + + use super::{ArabicShaping, JoiningType}; + + fn codepoint(n: u32) -> Codepoint { + Codepoint::from_u32(n).unwrap() + } + + fn s(string: &str) -> String { + string.to_string() + } + + #[test] + fn parse1() { + let line = "0600; ARABIC NUMBER SIGN; U; No_Joining_Group\n"; + let data: ArabicShaping = line.parse().unwrap(); + assert_eq!( + data, + ArabicShaping { + codepoint: codepoint(0x0600), + schematic_name: s("ARABIC NUMBER SIGN"), + joining_type: JoiningType::NonJoining, + joining_group: s("No_Joining_Group") + } + ); + } + + #[test] + fn parse2() { + let line = "063D; FARSI YEH WITH INVERTED V ABOVE; D; FARSI YEH\n"; + let data: ArabicShaping = line.parse().unwrap(); + assert_eq!( + data, + ArabicShaping { + codepoint: codepoint(0x063D), + schematic_name: s("FARSI YEH WITH INVERTED V ABOVE"), + joining_type: JoiningType::DualJoining, + joining_group: s("FARSI YEH") + } + ); + } + + #[test] + fn parse3() { + let line = + "10D23; HANIFI ROHINGYA DOTLESS KINNA YA WITH DOT ABOVE; D; HANIFI ROHINGYA KINNA YA\n"; + let data: ArabicShaping = line.parse().unwrap(); + assert_eq!( + data, + ArabicShaping { + codepoint: codepoint(0x10D23), + schematic_name: s( + "HANIFI ROHINGYA DOTLESS KINNA YA WITH DOT ABOVE" + ), + joining_type: JoiningType::DualJoining, + joining_group: s("HANIFI ROHINGYA KINNA YA") + } + ); + } +} diff --git a/vendor/ucd-parse/src/bidi_mirroring_glyph.rs 
b/vendor/ucd-parse/src/bidi_mirroring_glyph.rs new file mode 100644 index 000000000..fcfefffcb --- /dev/null +++ b/vendor/ucd-parse/src/bidi_mirroring_glyph.rs @@ -0,0 +1,107 @@ +use std::fmt; +use std::path::Path; +use std::str::FromStr; + +use lazy_static::lazy_static; +use regex::Regex; + +use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint}; +use crate::error::Error; + +/// Represents a single row in the `BidiMirroring.txt` file. +/// +/// The field names were taken from the header of BidiMirroring.txt. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct BidiMirroring { + /// The codepoint corresponding to this row. + pub codepoint: Codepoint, + /// The codepoint that has typically has a glyph that is the mirror image + /// of `codepoint`. + pub bidi_mirroring_glyph: Codepoint, +} + +impl UcdFile for BidiMirroring { + fn relative_file_path() -> &'static Path { + Path::new("BidiMirroring.txt") + } +} + +impl UcdFileByCodepoint for BidiMirroring { + fn codepoints(&self) -> CodepointIter { + self.codepoint.into_iter() + } +} + +impl FromStr for BidiMirroring { + type Err = Error; + + fn from_str(line: &str) -> Result { + lazy_static! 
{ + static ref PARTS: Regex = Regex::new( + r"(?x) + ^ + \s*(?P[A-F0-9]+)\s*; + \s*(?P[A-F0-9]+) + \s+ + \#(?:.+) + $ + " + ) + .unwrap(); + }; + let caps = match PARTS.captures(line.trim()) { + Some(caps) => caps, + None => return err!("invalid BidiMirroring line"), + }; + + Ok(BidiMirroring { + codepoint: caps["codepoint"].parse()?, + bidi_mirroring_glyph: caps["substitute_codepoint"].parse()?, + }) + } +} + +impl fmt::Display for BidiMirroring { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{};", self.codepoint)?; + write!(f, "{};", self.bidi_mirroring_glyph)?; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use crate::common::Codepoint; + + use super::BidiMirroring; + + fn codepoint(n: u32) -> Codepoint { + Codepoint::from_u32(n).unwrap() + } + + #[test] + fn parse() { + let line = "0028; 0029 # LEFT PARENTHESIS\n"; + let data: BidiMirroring = line.parse().unwrap(); + assert_eq!( + data, + BidiMirroring { + codepoint: codepoint(0x0028), + bidi_mirroring_glyph: codepoint(0x0029), + } + ); + } + + #[test] + fn parse_best_fit() { + let line = "228A; 228B # [BEST FIT] SUBSET OF WITH NOT EQUAL TO\n"; + let data: BidiMirroring = line.parse().unwrap(); + assert_eq!( + data, + BidiMirroring { + codepoint: codepoint(0x228A), + bidi_mirroring_glyph: codepoint(0x228B), + } + ); + } +} diff --git a/vendor/ucd-parse/src/case_folding.rs b/vendor/ucd-parse/src/case_folding.rs new file mode 100644 index 000000000..813fc81a1 --- /dev/null +++ b/vendor/ucd-parse/src/case_folding.rs @@ -0,0 +1,161 @@ +use std::path::Path; +use std::str::FromStr; + +use lazy_static::lazy_static; +use regex::Regex; + +use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint}; +use crate::error::Error; + +/// A single row in the `CaseFolding.txt` file. +/// +/// The contents of `CaseFolding.txt` are a convenience derived from both +/// `UnicodeData.txt` and `SpecialCasing.txt`. +/// +/// Note that a single codepoint may be mapped multiple times. 
In particular, +/// a single codepoint might have distinct `CaseStatus::Simple` and +/// `CaseStatus::Full` mappings. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct CaseFold { + /// The codepoint that is being mapped. + pub codepoint: Codepoint, + /// The case status of this mapping. + pub status: CaseStatus, + /// The actual case mapping, which is more than one codepoint if this is + /// a "full" mapping. + pub mapping: Vec, +} + +impl UcdFile for CaseFold { + fn relative_file_path() -> &'static Path { + Path::new("CaseFolding.txt") + } +} + +impl UcdFileByCodepoint for CaseFold { + fn codepoints(&self) -> CodepointIter { + self.codepoint.into_iter() + } +} + +impl FromStr for CaseFold { + type Err = Error; + + fn from_str(line: &str) -> Result { + lazy_static! { + static ref PARTS: Regex = Regex::new( + r"(?x) + ^ + \s*(?P[^\s;]+)\s*; + \s*(?P[^\s;]+)\s*; + \s*(?P[^;]+)\s*; + " + ) + .unwrap(); + }; + + let caps = match PARTS.captures(line.trim()) { + Some(caps) => caps, + None => return err!("invalid CaseFolding line: '{}'", line), + }; + let mut mapping = vec![]; + for cp in caps["mapping"].split_whitespace() { + mapping.push(cp.parse()?); + } + Ok(CaseFold { + codepoint: caps["codepoint"].parse()?, + status: caps["status"].parse()?, + mapping, + }) + } +} + +/// The status of a particular case mapping. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum CaseStatus { + /// Case mappings shared by both "simple" and "full" mappings. + Common, + /// A case mapping that changes the number of codepoints. + Full, + /// A case mapping that doesn't change the number of codepoints, when it + /// differs from `Full`. + Simple, + /// Special cases (currently only for Turkic mappings) that are typically + /// excluded by default. Special cases don't change the number of + /// codepoints, but may changed the encoding (e.g., UTF-8) length in bytes. 
+ Special, +} + +impl Default for CaseStatus { + fn default() -> CaseStatus { + CaseStatus::Common + } +} + +impl CaseStatus { + /// Returns true if and only if this status indicates a case mapping that + /// won't change the number of codepoints. + pub fn is_fixed(&self) -> bool { + *self != CaseStatus::Full + } +} + +impl FromStr for CaseStatus { + type Err = Error; + + fn from_str(s: &str) -> Result { + match s { + "C" => Ok(CaseStatus::Common), + "F" => Ok(CaseStatus::Full), + "S" => Ok(CaseStatus::Simple), + "T" => Ok(CaseStatus::Special), + _ => err!( + "unrecognized case status: '{}' \ + (must be one of C, F, S or T)", + s + ), + } + } +} + +#[cfg(test)] +mod tests { + use super::{CaseFold, CaseStatus}; + + #[test] + fn parse_common() { + let line = + "0150; C; 0151; # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE\n"; + let row: CaseFold = line.parse().unwrap(); + assert_eq!(row.codepoint, 0x0150); + assert_eq!(row.status, CaseStatus::Common); + assert_eq!(row.mapping, vec![0x0151]); + } + + #[test] + fn parse_full() { + let line = "03B0; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS\n"; + let row: CaseFold = line.parse().unwrap(); + assert_eq!(row.codepoint, 0x03B0); + assert_eq!(row.status, CaseStatus::Full); + assert_eq!(row.mapping, vec![0x03C5, 0x0308, 0x0301]); + } + + #[test] + fn parse_simple() { + let line = "1F8F; S; 1F87; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI\n"; + let row: CaseFold = line.parse().unwrap(); + assert_eq!(row.codepoint, 0x1F8F); + assert_eq!(row.status, CaseStatus::Simple); + assert_eq!(row.mapping, vec![0x1F87]); + } + + #[test] + fn parse_special() { + let line = "0049; T; 0131; # LATIN CAPITAL LETTER I\n"; + let row: CaseFold = line.parse().unwrap(); + assert_eq!(row.codepoint, 0x0049); + assert_eq!(row.status, CaseStatus::Special); + assert_eq!(row.mapping, vec![0x0131]); + } +} diff --git a/vendor/ucd-parse/src/common.rs b/vendor/ucd-parse/src/common.rs new file mode 
100644 index 000000000..c18be668e --- /dev/null +++ b/vendor/ucd-parse/src/common.rs @@ -0,0 +1,594 @@ +use std::char; +use std::collections::BTreeMap; +use std::fmt; +use std::fs::File; +use std::io::{self, BufRead}; +use std::marker::PhantomData; +use std::path::{Path, PathBuf}; +use std::str::FromStr; + +use lazy_static::lazy_static; +use regex::Regex; + +use crate::error::{Error, ErrorKind}; + +/// Parse a particular file in the UCD into a sequence of rows. +/// +/// The given directory should be the directory to the UCD. +/// +/// Rows are returned in file order. The first I/O or parse error stops +/// parsing and is returned. +pub fn parse(ucd_dir: P) -> Result, Error> +where + P: AsRef, + D: UcdFile, +{ + let mut xs = vec![]; + for result in D::from_dir(ucd_dir)? { + let x = result?; + xs.push(x); + } + Ok(xs) +} + +/// Parse a particular file in the UCD into a map from codepoint to the record. +/// +/// The given directory should be the directory to the UCD. +/// +/// A record that covers several codepoints is cloned once per codepoint. If +/// more than one record covers the same codepoint, the record read last +/// replaces the earlier ones. +pub fn parse_by_codepoint( + ucd_dir: P, +) -> Result, Error> +where + P: AsRef, + D: UcdFileByCodepoint, +{ + let mut map = BTreeMap::new(); + for result in D::from_dir(ucd_dir)? { + let x = result?; + for cp in x.codepoints() { + map.insert(cp, x.clone()); + } + } + Ok(map) +} + +/// Parse a particular file in the UCD into a map from codepoint to all +/// records associated with that codepoint. +/// +/// This is useful for files that have multiple records for each codepoint. +/// For example, the `NameAliases.txt` file lists multiple aliases for some +/// codepoints. +/// +/// The given directory should be the directory to the UCD. +/// +/// The records of each codepoint are kept in file order. +pub fn parse_many_by_codepoint( + ucd_dir: P, +) -> Result>, Error> +where + P: AsRef, + D: UcdFileByCodepoint, +{ + let mut map = BTreeMap::new(); + for result in D::from_dir(ucd_dir)? { + let x = result?; + for cp in x.codepoints() { + map.entry(cp).or_insert(vec![]).push(x.clone()); + } + } + Ok(map) +} + +/// Given a path pointing at the root of the `ucd_dir`, attempts to determine +/// its Unicode version. 
+/// +/// This just checks the very first line of `PropList.txt`, which is expected +/// to contain the file name with its `-MAJOR.MINOR.PATCH.txt` suffix -- in +/// practice this works for all versions of UCD since 4.1.0. +/// +/// An error is returned if `PropList.txt` cannot be read, or if no version +/// can be found on its first line. +pub fn ucd_directory_version>( + ucd_dir: &D, +) -> Result<(u64, u64, u64), Error> { + // Avoid duplication from generic path parameter. + fn ucd_directory_version_inner( + ucd_dir: &Path, + ) -> Result<(u64, u64, u64), Error> { + lazy_static::lazy_static! { + // The dots are escaped so that they match a literal '.' only; + // unescaped, a name such as `-12345.txt` would be accepted and + // reported as version 1.3.5. + static ref VERSION_RX: Regex = + Regex::new(r"-([0-9]+)\.([0-9]+)\.([0-9]+)\.txt").unwrap(); + } + + let proplist = ucd_dir.join("PropList.txt"); + let contents = first_line(&proplist)?; + let caps = match VERSION_RX.captures(&contents) { + Some(c) => c, + None => { + return err!("Failed to find version in line {:?}", contents) + } + }; + + // All three groups are mandatory in VERSION_RX, so `get(n)` cannot + // fail for n in 1..=3; only the integer conversion can (overflow). + let capture_to_num = |n| { + caps.get(n).unwrap().as_str().parse::().map_err(|e| Error { + kind: ErrorKind::Parse(format!( + "Failed to parse version from {:?} in PropList.txt: {}", + contents, e + )), + line: Some(0), + path: Some(proplist.clone()), + }) + }; + let major = capture_to_num(1)?; + let minor = capture_to_num(2)?; + let patch = capture_to_num(3)?; + + Ok((major, minor, patch)) + } + ucd_directory_version_inner(ucd_dir.as_ref()) +} + +/// Read the first line of the file at `path`, including its line terminator +/// if there is one. An empty file yields an empty string. +/// +/// I/O errors are reported with `path` attached and no line number. +fn first_line(path: &Path) -> Result { + let file = std::fs::File::open(path).map_err(|e| Error { + kind: ErrorKind::Io(e), + line: None, + path: Some(path.into()), + })?; + + let mut reader = std::io::BufReader::new(file); + let mut line_contents = String::new(); + reader.read_line(&mut line_contents).map_err(|e| Error { + kind: ErrorKind::Io(e), + line: None, + path: Some(path.into()), + })?; + Ok(line_contents) +} + +/// A helper function for parsing a common record format that associates one +/// or more codepoints with a string value. +pub fn parse_codepoint_association<'a>( + line: &'a str, +) -> Result<(Codepoints, &'a str), Error> { + lazy_static! 
{ + static ref PARTS: Regex = Regex::new( + r"(?x) + ^ + \s*(?P[^\s;]+)\s*; + \s*(?P[^;\x23]+)\s* + " + ) + .unwrap(); + }; + + let caps = match PARTS.captures(line.trim()) { + Some(caps) => caps, + None => return err!("invalid PropList line: '{}'", line), + }; + let property = match caps.name("property") { + Some(property) => property.as_str().trim(), + None => { + return err!( + "could not find property name in PropList line: '{}'", + line + ) + } + }; + Ok((caps["codepoints"].parse()?, property)) +} + +/// A helper function for parsing a sequence of space separated codepoints. +/// The sequence is permitted to be empty. +pub fn parse_codepoint_sequence(s: &str) -> Result, Error> { + let mut cps = vec![]; + for cp in s.trim().split_whitespace() { + cps.push(cp.parse()?); + } + Ok(cps) +} + +/// A helper function for parsing a single test for the various break +/// algorithms. +/// +/// Upon success, this returns the UTF-8 encoded groups of codepoints along +/// with the comment associated with the test. The comment is a human readable +/// description of the test that may prove useful for debugging. +pub fn parse_break_test(line: &str) -> Result<(Vec, String), Error> { + lazy_static! 
{ + static ref PARTS: Regex = Regex::new( + r"(?x) + ^ + (?:÷|×) + (?P(?:\s[0-9A-Fa-f]{4,5}\s(?:÷|×))+) + \s+ + \#(?P.+) + $ + " + ) + .unwrap(); + static ref GROUP: Regex = Regex::new( + r"(?x) + (?P[0-9A-Fa-f]{4,5})\s(?P÷|×) + " + ) + .unwrap(); + } + + let caps = match PARTS.captures(line.trim()) { + Some(caps) => caps, + None => return err!("invalid break test line: '{}'", line), + }; + let comment = caps["comment"].trim().to_string(); + + let mut groups = vec![]; + let mut cur = String::new(); + for cap in GROUP.captures_iter(&caps["groups"]) { + let cp: Codepoint = cap["codepoint"].parse()?; + let ch = match cp.scalar() { + Some(ch) => ch, + None => { + return err!( + "invalid codepoint '{:X}' in line: '{}'", + cp.value(), + line + ) + } + }; + cur.push(ch); + if &cap["kind"] == "÷" { + groups.push(cur); + cur = String::new(); + } + } + Ok((groups, comment)) +} + +/// Describes a single UCD file. +pub trait UcdFile: + Clone + fmt::Debug + Default + Eq + FromStr + PartialEq +{ + /// The file path corresponding to this file, relative to the UCD + /// directory. + fn relative_file_path() -> &'static Path; + + /// The full file path corresponding to this file given the UCD directory + /// path. + fn file_path>(ucd_dir: P) -> PathBuf { + ucd_dir.as_ref().join(Self::relative_file_path()) + } + + /// Create an iterator over each record in this UCD file. + /// + /// The parameter should correspond to the directory containing the UCD. + fn from_dir>( + ucd_dir: P, + ) -> Result, Error> { + UcdLineParser::from_path(Self::file_path(ucd_dir)) + } +} + +/// Describes a single UCD file where every record in the file is associated +/// with one or more codepoints. +pub trait UcdFileByCodepoint: UcdFile { + /// Returns the codepoints associated with this record. + fn codepoints(&self) -> CodepointIter; +} + +/// A line oriented parser for a particular UCD file. +/// +/// Callers can build a line parser via the +/// [`UcdFile::from_dir`](trait.UcdFile.html) method. 
+/// +/// The `R` type parameter refers to the underlying `io::Read` implementation +/// from which the UCD data is read. +/// +/// The `D` type parameter refers to the type of the record parsed out of each +/// line. +#[derive(Debug)] +pub struct UcdLineParser { + path: Option, + rdr: io::BufReader, + line: String, + line_number: u64, + _data: PhantomData, +} + +impl UcdLineParser { + /// Create a new parser from the given file path. + /// + /// Failure to open the file is reported as an I/O error that carries the + /// path. + pub(crate) fn from_path>( + path: P, + ) -> Result, Error> { + let path = path.as_ref(); + let file = File::open(path).map_err(|e| Error { + kind: ErrorKind::Io(e), + line: None, + path: Some(path.to_path_buf()), + })?; + Ok(UcdLineParser::new(Some(path.to_path_buf()), file)) + } +} + +impl UcdLineParser { + /// Create a new parser that parses the reader given. + /// + /// The type of data parsed is determined by the `D` type parameter; + /// records are produced by iterating over the parser. + /// + /// The optional `path` is only used to annotate errors. + /// + /// Note that the reader is buffered internally, so the caller does not + /// need to provide their own buffering. + pub(crate) fn new(path: Option, rdr: R) -> UcdLineParser { + UcdLineParser { + path, + rdr: io::BufReader::new(rdr), + line: String::new(), + line_number: 0, + _data: PhantomData, + } + } +} + +impl> Iterator for UcdLineParser { + type Item = Result; + + /// Return the record parsed from the next data line. + /// + /// Lines that start with `#` and lines that contain only whitespace are + /// skipped. Errors carry the 1-based number of the offending line and, + /// when known, the path of the file being parsed. + fn next(&mut self) -> Option> { + loop { + self.line_number += 1; + self.line.clear(); + let n = match self.rdr.read_line(&mut self.line) { + Err(err) => { + return Some(Err(Error { + kind: ErrorKind::Io(err), + // The line being read is known, so report it. + line: Some(self.line_number), + path: self.path.clone(), + })) + } + Ok(n) => n, + }; + if n == 0 { + return None; + } + if !self.line.starts_with('#') && !self.line.trim().is_empty() { + break; + } + } + let line_number = self.line_number; + let path = &self.path; + Some(self.line.parse().map_err(|mut err: Error| { + err.line = Some(line_number); + // Record parsers do not know which file they are reading, so + // attach the path here unless the error already carries one. + if err.path.is_none() { + err.path = path.clone(); + } + err + })) + } +} + +/// A representation of either a single codepoint or a range of codepoints. 
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, PartialOrd, Ord)] +pub enum Codepoints { + /// A single codepoint. + Single(Codepoint), + /// A range of codepoints. + Range(CodepointRange), +} + +impl Default for Codepoints { + fn default() -> Codepoints { + Codepoints::Single(Codepoint::default()) + } +} + +impl IntoIterator for Codepoints { + type IntoIter = CodepointIter; + type Item = Codepoint; + + fn into_iter(self) -> CodepointIter { + match self { + Codepoints::Single(x) => x.into_iter(), + Codepoints::Range(x) => x.into_iter(), + } + } +} + +impl FromStr for Codepoints { + type Err = Error; + + fn from_str(s: &str) -> Result { + if s.contains("..") { + CodepointRange::from_str(s).map(Codepoints::Range) + } else { + Codepoint::from_str(s).map(Codepoints::Single) + } + } +} + +impl fmt::Display for Codepoints { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match *self { + Codepoints::Single(ref x) => x.fmt(f), + Codepoints::Range(ref x) => x.fmt(f), + } + } +} + +impl PartialEq for Codepoints { + fn eq(&self, other: &u32) -> bool { + match *self { + Codepoints::Single(ref x) => x == other, + Codepoints::Range(ref x) => x == &(*other, *other), + } + } +} + +impl PartialEq for Codepoints { + fn eq(&self, other: &Codepoint) -> bool { + match *self { + Codepoints::Single(ref x) => x == other, + Codepoints::Range(ref x) => x == &(*other, *other), + } + } +} + +impl PartialEq<(u32, u32)> for Codepoints { + fn eq(&self, other: &(u32, u32)) -> bool { + match *self { + Codepoints::Single(ref x) => &(x.value(), x.value()) == other, + Codepoints::Range(ref x) => x == other, + } + } +} + +impl PartialEq<(Codepoint, Codepoint)> for Codepoints { + fn eq(&self, other: &(Codepoint, Codepoint)) -> bool { + match *self { + Codepoints::Single(ref x) => &(*x, *x) == other, + Codepoints::Range(ref x) => x == other, + } + } +} + +/// A range of Unicode codepoints. The range is inclusive; both ends of the +/// range are guaranteed to be valid codepoints. 
+#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord)] +pub struct CodepointRange { + /// The start of the codepoint range. + pub start: Codepoint, + /// The end of the codepoint range. + pub end: Codepoint, +} + +impl IntoIterator for CodepointRange { + type IntoIter = CodepointIter; + type Item = Codepoint; + + fn into_iter(self) -> CodepointIter { + CodepointIter { next: self.start.value(), range: self } + } +} + +impl FromStr for CodepointRange { + type Err = Error; + + fn from_str(s: &str) -> Result { + lazy_static! { + static ref PARTS: Regex = + Regex::new(r"^(?P[A-Z0-9]+)\.\.(?P[A-Z0-9]+)$") + .unwrap(); + } + let caps = match PARTS.captures(s) { + Some(caps) => caps, + None => return err!("invalid codepoint range: '{}'", s), + }; + let start = caps["start"].parse().or_else(|err| { + err!("failed to parse '{}' as a codepoint range: {}", s, err) + })?; + let end = caps["end"].parse().or_else(|err| { + err!("failed to parse '{}' as a codepoint range: {}", s, err) + })?; + Ok(CodepointRange { start, end }) + } +} + +impl fmt::Display for CodepointRange { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}..{}", self.start, self.end) + } +} + +impl PartialEq<(u32, u32)> for CodepointRange { + fn eq(&self, other: &(u32, u32)) -> bool { + &(self.start.value(), self.end.value()) == other + } +} + +impl PartialEq<(Codepoint, Codepoint)> for CodepointRange { + fn eq(&self, other: &(Codepoint, Codepoint)) -> bool { + &(self.start, self.end) == other + } +} + +/// A single Unicode codepoint. +/// +/// This type's string representation is a hexadecimal number. It is guaranteed +/// to be in the range `[0, 10FFFF]`. +/// +/// Note that unlike Rust's `char` type, this may be a surrogate codepoint. +#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord)] +pub struct Codepoint(u32); + +impl Codepoint { + /// Create a new codepoint from a `u32`. 
+ /// + /// If the given number is not a valid codepoint, then this returns an + /// error. + pub fn from_u32(n: u32) -> Result { + if n > 0x10FFFF { + err!("{:x} is not a valid Unicode codepoint", n) + } else { + Ok(Codepoint(n)) + } + } + + /// Return the underlying `u32` codepoint value. + pub fn value(self) -> u32 { + self.0 + } + + /// Attempt to convert this codepoint to a Unicode scalar value. + /// + /// If this is a surrogate codepoint, then this returns `None`. + pub fn scalar(self) -> Option { + char::from_u32(self.0) + } +} + +impl IntoIterator for Codepoint { + type IntoIter = CodepointIter; + type Item = Codepoint; + + fn into_iter(self) -> CodepointIter { + let range = CodepointRange { start: self, end: self }; + CodepointIter { next: self.value(), range } + } +} + +impl FromStr for Codepoint { + type Err = Error; + + fn from_str(s: &str) -> Result { + match u32::from_str_radix(s, 16) { + Ok(n) => Codepoint::from_u32(n), + Err(err) => { + return err!( + "failed to parse '{}' as a hexadecimal codepoint: {}", + s, + err + ); + } + } + } +} + +impl fmt::Display for Codepoint { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{:04X}", self.0) + } +} + +impl PartialEq for Codepoint { + fn eq(&self, other: &u32) -> bool { + self.0 == *other + } +} + +impl PartialEq for u32 { + fn eq(&self, other: &Codepoint) -> bool { + *self == other.0 + } +} + +/// An iterator over a range of Unicode codepoints. 
+#[derive(Debug)] +pub struct CodepointIter { + next: u32, + range: CodepointRange, +} + +impl Iterator for CodepointIter { + type Item = Codepoint; + + fn next(&mut self) -> Option { + if self.next > self.range.end.value() { + return None; + } + let current = self.next; + self.next += 1; + Some(Codepoint::from_u32(current).unwrap()) + } +} diff --git a/vendor/ucd-parse/src/core_properties.rs b/vendor/ucd-parse/src/core_properties.rs new file mode 100644 index 000000000..9a7682b43 --- /dev/null +++ b/vendor/ucd-parse/src/core_properties.rs @@ -0,0 +1,60 @@ +use std::path::Path; +use std::str::FromStr; + +use crate::common::{ + parse_codepoint_association, CodepointIter, Codepoints, UcdFile, + UcdFileByCodepoint, +}; +use crate::error::Error; + +/// A single row in the `DerivedCoreProperties.txt` file. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct CoreProperty { + /// The codepoint or codepoint range for this entry. + pub codepoints: Codepoints, + /// The property name assigned to the codepoints in this entry. 
+ pub property: String, +} + +impl UcdFile for CoreProperty { + fn relative_file_path() -> &'static Path { + Path::new("DerivedCoreProperties.txt") + } +} + +impl UcdFileByCodepoint for CoreProperty { + fn codepoints(&self) -> CodepointIter { + self.codepoints.into_iter() + } +} + +impl FromStr for CoreProperty { + type Err = Error; + + fn from_str(line: &str) -> Result { + let (codepoints, property) = parse_codepoint_association(line)?; + Ok(CoreProperty { codepoints, property: property.to_string() }) + } +} + +#[cfg(test)] +mod tests { + use super::CoreProperty; + + #[test] + fn parse_single() { + let line = + "1163D ; Case_Ignorable # Mn MODI SIGN ANUSVARA\n"; + let row: CoreProperty = line.parse().unwrap(); + assert_eq!(row.codepoints, 0x1163D); + assert_eq!(row.property, "Case_Ignorable"); + } + + #[test] + fn parse_range() { + let line = "11133..11134 ; Grapheme_Link # Mn [2] CHAKMA VIRAMA..CHAKMA MAAYYAA\n"; + let row: CoreProperty = line.parse().unwrap(); + assert_eq!(row.codepoints, (0x11133, 0x11134)); + assert_eq!(row.property, "Grapheme_Link"); + } +} diff --git a/vendor/ucd-parse/src/emoji_properties.rs b/vendor/ucd-parse/src/emoji_properties.rs new file mode 100644 index 000000000..dc5c0c884 --- /dev/null +++ b/vendor/ucd-parse/src/emoji_properties.rs @@ -0,0 +1,86 @@ +use std::path::{Path, PathBuf}; +use std::str::FromStr; + +use crate::common::{ + parse_codepoint_association, CodepointIter, Codepoints, UcdFile, + UcdFileByCodepoint, +}; +use crate::error::Error; + +/// A single row in the `emoji-data.txt` file. +/// +/// The `emoji-data.txt` file is the source of truth on several Emoji-related +/// Unicode properties. +/// +/// Note that `emoji-data.txt` is not formally part of the Unicode Character +/// Database. You can download the Emoji data files separately here: +/// https://unicode.org/Public/emoji/ +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct EmojiProperty { + /// The codepoint or codepoint range for this entry. 
+ pub codepoints: Codepoints, + /// The property name assigned to the codepoints in this entry. + pub property: String, +} + +impl UcdFile for EmojiProperty { + fn relative_file_path() -> &'static Path { + Path::new("emoji/emoji-data.txt") + } + + fn file_path>(ucd_dir: P) -> PathBuf { + let ucd_dir = ucd_dir.as_ref(); + // The standard location, but only on UCDs from 13.0.0 and up. + let std = ucd_dir.join(Self::relative_file_path()); + if std.exists() { + std + } else { + // If the old location does exist, use it. + let legacy = ucd_dir.join("emoji-data.txt"); + if legacy.exists() { + legacy + } else { + // This might end up in an error message, so use the standard + // one if forced to choose. Arguably we could do something like + // peek + std + } + } + } +} + +impl UcdFileByCodepoint for EmojiProperty { + fn codepoints(&self) -> CodepointIter { + self.codepoints.into_iter() + } +} + +impl FromStr for EmojiProperty { + type Err = Error; + + fn from_str(line: &str) -> Result { + let (codepoints, property) = parse_codepoint_association(line)?; + Ok(EmojiProperty { codepoints, property: property.to_string() }) + } +} + +#[cfg(test)] +mod tests { + use super::EmojiProperty; + + #[test] + fn parse_single() { + let line = "24C2 ; Emoji # 1.1 [1] (Ⓜ️) circled M\n"; + let row: EmojiProperty = line.parse().unwrap(); + assert_eq!(row.codepoints, 0x24C2); + assert_eq!(row.property, "Emoji"); + } + + #[test] + fn parse_range() { + let line = "1FA6E..1FFFD ; Extended_Pictographic# NA[1424] (🩮️..🿽️) ..\n"; + let row: EmojiProperty = line.parse().unwrap(); + assert_eq!(row.codepoints, (0x1FA6E, 0x1FFFD)); + assert_eq!(row.property, "Extended_Pictographic"); + } +} diff --git a/vendor/ucd-parse/src/error.rs b/vendor/ucd-parse/src/error.rs new file mode 100644 index 000000000..9dafc4b33 --- /dev/null +++ b/vendor/ucd-parse/src/error.rs @@ -0,0 +1,86 @@ +use std::error; +use std::fmt; +use std::io; +use std::path::{Path, PathBuf}; + +/// Represents any kind of error that can 
occur while parsing the UCD. +#[derive(Debug)] +pub struct Error { + pub(crate) kind: ErrorKind, + pub(crate) line: Option, + pub(crate) path: Option, +} + +/// The kind of error that occurred while parsing the UCD. +#[derive(Debug)] +pub enum ErrorKind { + /// An I/O error. + Io(io::Error), + /// A generic parse error. + Parse(String), +} + +impl Error { + /// Create a new parse error from the given message. + pub(crate) fn parse(msg: String) -> Error { + Error { kind: ErrorKind::Parse(msg), line: None, path: None } + } + + /// Return the specific kind of this error. + pub fn kind(&self) -> &ErrorKind { + &self.kind + } + + /// Return the line number at which this error occurred, if available. + pub fn line(&self) -> Option { + self.line + } + + /// Return the file path associated with this error, if one exists. + pub fn path(&self) -> Option<&Path> { + self.path.as_ref().map(|p| &**p) + } + + /// Unwrap this error into its underlying kind. + pub fn into_kind(self) -> ErrorKind { + self.kind + } + + /// Returns true if and only if this is an I/O error. + /// + /// If this returns true, the underlying `ErrorKind` is guaranteed to be + /// `ErrorKind::Io`. 
+ pub fn is_io_error(&self) -> bool { + match self.kind { + ErrorKind::Io(_) => true, + _ => false, + } + } +} + +impl error::Error for Error { + fn cause(&self) -> Option<&dyn error::Error> { + match self.kind { + ErrorKind::Io(ref err) => Some(err), + _ => None, + } + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if let Some(ref path) = self.path { + if let Some(line) = self.line { + write!(f, "{}:{}: ", path.display(), line)?; + } else { + write!(f, "{}: ", path.display())?; + } + } else if let Some(line) = self.line { + write!(f, "error on line {}: ", line)?; + } + match self.kind { + ErrorKind::Io(ref err) => write!(f, "{}", err), + ErrorKind::Parse(ref msg) => write!(f, "{}", msg), + } + } +} diff --git a/vendor/ucd-parse/src/grapheme_cluster_break.rs b/vendor/ucd-parse/src/grapheme_cluster_break.rs new file mode 100644 index 000000000..9dbf32f41 --- /dev/null +++ b/vendor/ucd-parse/src/grapheme_cluster_break.rs @@ -0,0 +1,98 @@ +use std::path::Path; +use std::str::FromStr; + +use crate::common::{ + parse_break_test, parse_codepoint_association, CodepointIter, Codepoints, + UcdFile, UcdFileByCodepoint, +}; +use crate::error::Error; + +/// A single row in the `auxiliary/GraphemeBreakProperty.txt` file. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct GraphemeClusterBreak { + /// The codepoint or codepoint range for this entry. + pub codepoints: Codepoints, + /// The property value assigned to the codepoints in this entry. 
+ pub value: String, +} + +impl UcdFile for GraphemeClusterBreak { + fn relative_file_path() -> &'static Path { + Path::new("auxiliary/GraphemeBreakProperty.txt") + } +} + +impl UcdFileByCodepoint for GraphemeClusterBreak { + fn codepoints(&self) -> CodepointIter { + self.codepoints.into_iter() + } +} + +impl FromStr for GraphemeClusterBreak { + type Err = Error; + + fn from_str(line: &str) -> Result { + let (codepoints, value) = parse_codepoint_association(line)?; + Ok(GraphemeClusterBreak { codepoints, value: value.to_string() }) + } +} + +/// A single row in the `auxiliary/GraphemeBreakTest.txt` file. +/// +/// This file defines tests for the grapheme cluster break algorithm. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct GraphemeClusterBreakTest { + /// Each string is a UTF-8 encoded group of codepoints that make up a + /// single grapheme cluster. + pub grapheme_clusters: Vec, + /// A human readable description of this test. + pub comment: String, +} + +impl UcdFile for GraphemeClusterBreakTest { + fn relative_file_path() -> &'static Path { + Path::new("auxiliary/GraphemeBreakTest.txt") + } +} + +impl FromStr for GraphemeClusterBreakTest { + type Err = Error; + + fn from_str(line: &str) -> Result { + let (groups, comment) = parse_break_test(line)?; + Ok(GraphemeClusterBreakTest { grapheme_clusters: groups, comment }) + } +} + +#[cfg(test)] +mod tests { + use super::{GraphemeClusterBreak, GraphemeClusterBreakTest}; + + #[test] + fn parse_single() { + let line = "093B ; SpacingMark # Mc DEVANAGARI VOWEL SIGN OOE\n"; + let row: GraphemeClusterBreak = line.parse().unwrap(); + assert_eq!(row.codepoints, 0x093B); + assert_eq!(row.value, "SpacingMark"); + } + + #[test] + fn parse_range() { + let line = "1F1E6..1F1FF ; Regional_Indicator # So [26] REGIONAL INDICATOR SYMBOL LETTER A..REGIONAL INDICATOR SYMBOL LETTER Z\n"; + let row: GraphemeClusterBreak = line.parse().unwrap(); + assert_eq!(row.codepoints, (0x1F1E6, 0x1F1FF)); + assert_eq!(row.value, 
"Regional_Indicator"); + } + + #[test] + fn parse_test() { + let line = "÷ 0061 × 1F3FF ÷ 1F476 × 200D × 1F6D1 ÷ # ÷ [0.2] LATIN SMALL LETTER A (Other) × [9.0] EMOJI MODIFIER FITZPATRICK TYPE-6 (Extend) ÷ [999.0] BABY (ExtPict) × [9.0] ZERO WIDTH JOINER (ZWJ_ExtCccZwj) × [11.0] OCTAGONAL SIGN (ExtPict) ÷ [0.3]\n"; + + let row: GraphemeClusterBreakTest = line.parse().unwrap(); + assert_eq!( + row.grapheme_clusters, + vec!["\u{0061}\u{1F3FF}", "\u{1F476}\u{200D}\u{1F6D1}",] + ); + assert!(row.comment.starts_with("÷ [0.2] LATIN SMALL LETTER A")); + } +} diff --git a/vendor/ucd-parse/src/jamo_short_name.rs b/vendor/ucd-parse/src/jamo_short_name.rs new file mode 100644 index 000000000..4103dd7ee --- /dev/null +++ b/vendor/ucd-parse/src/jamo_short_name.rs @@ -0,0 +1,80 @@ +use std::path::Path; +use std::str::FromStr; + +use lazy_static::lazy_static; +use regex::Regex; + +use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint}; +use crate::error::Error; + +/// A single row in the `Jamo.txt` file. +/// +/// The `Jamo.txt` file defines the `Jamo_Short_Name` property. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct JamoShortName { + /// The codepoint corresponding to this row. + pub codepoint: Codepoint, + /// The actual "Jamo Short Name." This string contains at most 3 bytes and + /// may be empty. + pub name: String, +} + +impl UcdFile for JamoShortName { + fn relative_file_path() -> &'static Path { + Path::new("Jamo.txt") + } +} + +impl UcdFileByCodepoint for JamoShortName { + fn codepoints(&self) -> CodepointIter { + self.codepoint.into_iter() + } +} + +impl FromStr for JamoShortName { + type Err = Error; + + fn from_str(line: &str) -> Result { + lazy_static! 
{ + static ref PARTS: Regex = Regex::new( + r"(?x) + ^ + (?P[A-Z0-9]+); + \s* + (?P[A-Z]*) + " + ) + .unwrap(); + }; + + let caps = match PARTS.captures(line.trim()) { + Some(caps) => caps, + None => return err!("invalid Jamo_Short_name line"), + }; + Ok(JamoShortName { + codepoint: caps["codepoint"].parse()?, + name: caps.name("name").unwrap().as_str().to_string(), + }) + } +} + +#[cfg(test)] +mod tests { + use super::JamoShortName; + + #[test] + fn parse1() { + let line = "1164; YAE # HANGUL JUNGSEONG YAE\n"; + let row: JamoShortName = line.parse().unwrap(); + assert_eq!(row.codepoint, 0x1164); + assert_eq!(row.name, "YAE"); + } + + #[test] + fn parse2() { + let line = "110B; # HANGUL CHOSEONG IEUNG\n"; + let row: JamoShortName = line.parse().unwrap(); + assert_eq!(row.codepoint, 0x110B); + assert_eq!(row.name, ""); + } +} diff --git a/vendor/ucd-parse/src/lib.rs b/vendor/ucd-parse/src/lib.rs new file mode 100644 index 000000000..f6654658a --- /dev/null +++ b/vendor/ucd-parse/src/lib.rs @@ -0,0 +1,66 @@ +/*! +A library for parsing the Unicode character database. 
+*/ + +#![deny(missing_docs)] + +pub use crate::common::{ + parse, parse_by_codepoint, parse_many_by_codepoint, ucd_directory_version, + Codepoint, CodepointIter, CodepointRange, Codepoints, UcdFile, + UcdFileByCodepoint, UcdLineParser, +}; +pub use crate::error::{Error, ErrorKind}; + +pub use crate::age::Age; +pub use crate::arabic_shaping::ArabicShaping; +pub use crate::bidi_mirroring_glyph::BidiMirroring; +pub use crate::case_folding::{CaseFold, CaseStatus}; +pub use crate::core_properties::CoreProperty; +pub use crate::emoji_properties::EmojiProperty; +pub use crate::grapheme_cluster_break::{ + GraphemeClusterBreak, GraphemeClusterBreakTest, +}; +pub use crate::jamo_short_name::JamoShortName; +pub use crate::line_break::LineBreakTest; +pub use crate::name_aliases::{NameAlias, NameAliasLabel}; +pub use crate::prop_list::Property; +pub use crate::property_aliases::PropertyAlias; +pub use crate::property_value_aliases::PropertyValueAlias; +pub use crate::script_extensions::ScriptExtension; +pub use crate::scripts::Script; +pub use crate::sentence_break::{SentenceBreak, SentenceBreakTest}; +pub use crate::special_casing::SpecialCaseMapping; +pub use crate::unicode_data::{ + UnicodeData, UnicodeDataDecomposition, UnicodeDataDecompositionTag, + UnicodeDataExpander, UnicodeDataNumeric, +}; +pub use crate::word_break::{WordBreak, WordBreakTest}; + +macro_rules! 
err { + ($($tt:tt)*) => { + Err(crate::error::Error::parse(format!($($tt)*))) + } +} + +mod common; +mod error; + +mod age; +mod arabic_shaping; +mod bidi_mirroring_glyph; +mod case_folding; +mod core_properties; +mod emoji_properties; +mod grapheme_cluster_break; +mod jamo_short_name; +mod line_break; +mod name_aliases; +mod prop_list; +mod property_aliases; +mod property_value_aliases; +mod script_extensions; +mod scripts; +mod sentence_break; +mod special_casing; +mod unicode_data; +mod word_break; diff --git a/vendor/ucd-parse/src/line_break.rs b/vendor/ucd-parse/src/line_break.rs new file mode 100644 index 000000000..aa62fcb9e --- /dev/null +++ b/vendor/ucd-parse/src/line_break.rs @@ -0,0 +1,49 @@ +use std::path::Path; +use std::str::FromStr; + +use crate::common::{parse_break_test, UcdFile}; +use crate::error::Error; + +/// A single row in the `auxiliary/LineBreakTest.txt` file. +/// +/// This file defines tests for the line break algorithm. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct LineBreakTest { + /// Each string is a UTF-8 encoded group of codepoints that make up a + /// single line. + pub lines: Vec, + /// A human readable description of this test. 
+ pub comment: String, +} + +impl UcdFile for LineBreakTest { + fn relative_file_path() -> &'static Path { + Path::new("auxiliary/LineBreakTest.txt") + } +} + +impl FromStr for LineBreakTest { + type Err = Error; + + fn from_str(line: &str) -> Result { + let (groups, comment) = parse_break_test(line)?; + Ok(LineBreakTest { lines: groups, comment }) + } +} + +#[cfg(test)] +mod tests { + use super::LineBreakTest; + + #[test] + fn parse_test() { + let line = "× 1F1F7 × 1F1FA ÷ 1F1F8 × 1F1EA ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER R (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER U (RI) ÷ [30.13] REGIONAL INDICATOR SYMBOL LETTER S (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER E (RI) ÷ [0.3]"; + + let row: LineBreakTest = line.parse().unwrap(); + assert_eq!( + row.lines, + vec!["\u{1F1F7}\u{1F1FA}", "\u{1F1F8}\u{1F1EA}",] + ); + assert!(row.comment.ends_with("(RI) ÷ [0.3]")); + } +} diff --git a/vendor/ucd-parse/src/name_aliases.rs b/vendor/ucd-parse/src/name_aliases.rs new file mode 100644 index 000000000..36c9c4b01 --- /dev/null +++ b/vendor/ucd-parse/src/name_aliases.rs @@ -0,0 +1,145 @@ +use std::path::Path; +use std::str::FromStr; + +use lazy_static::lazy_static; +use regex::Regex; + +use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint}; +use crate::error::Error; + +/// A single row in the `NameAliases.txt` file. +/// +/// Note that there are multiple rows for some codepoint. Each row provides a +/// new alias. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct NameAlias { + /// The codepoint corresponding to this row. + pub codepoint: Codepoint, + /// The alias. + pub alias: String, + /// The label of this alias. 
+ pub label: NameAliasLabel, +} + +impl UcdFile for NameAlias { + fn relative_file_path() -> &'static Path { + Path::new("NameAliases.txt") + } +} + +impl UcdFileByCodepoint for NameAlias { + fn codepoints(&self) -> CodepointIter { + self.codepoint.into_iter() + } +} + +impl FromStr for NameAlias { + type Err = Error; + + fn from_str(line: &str) -> Result { + lazy_static! { + static ref PARTS: Regex = Regex::new( + r"(?x) + ^ + (?P[A-Z0-9]+); + \s* + (?P[^;]+); + \s* + (?P