From 698f8c2f01ea549d77d7dc3338a12e04c11057b9 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 17 Apr 2024 14:02:58 +0200 Subject: Adding upstream version 1.64.0+dfsg1. Signed-off-by: Daniel Baumann --- vendor/ucd-parse/src/age.rs | 59 ++ vendor/ucd-parse/src/arabic_shaping.rs | 184 ++++++ vendor/ucd-parse/src/bidi_mirroring_glyph.rs | 107 ++++ vendor/ucd-parse/src/case_folding.rs | 161 +++++ vendor/ucd-parse/src/common.rs | 594 +++++++++++++++++++ vendor/ucd-parse/src/core_properties.rs | 60 ++ vendor/ucd-parse/src/emoji_properties.rs | 86 +++ vendor/ucd-parse/src/error.rs | 86 +++ vendor/ucd-parse/src/grapheme_cluster_break.rs | 98 +++ vendor/ucd-parse/src/jamo_short_name.rs | 80 +++ vendor/ucd-parse/src/lib.rs | 66 +++ vendor/ucd-parse/src/line_break.rs | 49 ++ vendor/ucd-parse/src/name_aliases.rs | 145 +++++ vendor/ucd-parse/src/prop_list.rs | 63 ++ vendor/ucd-parse/src/property_aliases.rs | 113 ++++ vendor/ucd-parse/src/property_value_aliases.rs | 185 ++++++ vendor/ucd-parse/src/script_extensions.rs | 68 +++ vendor/ucd-parse/src/scripts.rs | 59 ++ vendor/ucd-parse/src/sentence_break.rs | 101 ++++ vendor/ucd-parse/src/special_casing.rs | 112 ++++ vendor/ucd-parse/src/unicode_data.rs | 787 +++++++++++++++++++++++++ vendor/ucd-parse/src/word_break.rs | 103 ++++ 22 files changed, 3366 insertions(+) create mode 100644 vendor/ucd-parse/src/age.rs create mode 100644 vendor/ucd-parse/src/arabic_shaping.rs create mode 100644 vendor/ucd-parse/src/bidi_mirroring_glyph.rs create mode 100644 vendor/ucd-parse/src/case_folding.rs create mode 100644 vendor/ucd-parse/src/common.rs create mode 100644 vendor/ucd-parse/src/core_properties.rs create mode 100644 vendor/ucd-parse/src/emoji_properties.rs create mode 100644 vendor/ucd-parse/src/error.rs create mode 100644 vendor/ucd-parse/src/grapheme_cluster_break.rs create mode 100644 vendor/ucd-parse/src/jamo_short_name.rs create mode 100644 vendor/ucd-parse/src/lib.rs create mode 100644 vendor/ucd-parse/src/line_break.rs 
create mode 100644 vendor/ucd-parse/src/name_aliases.rs create mode 100644 vendor/ucd-parse/src/prop_list.rs create mode 100644 vendor/ucd-parse/src/property_aliases.rs create mode 100644 vendor/ucd-parse/src/property_value_aliases.rs create mode 100644 vendor/ucd-parse/src/script_extensions.rs create mode 100644 vendor/ucd-parse/src/scripts.rs create mode 100644 vendor/ucd-parse/src/sentence_break.rs create mode 100644 vendor/ucd-parse/src/special_casing.rs create mode 100644 vendor/ucd-parse/src/unicode_data.rs create mode 100644 vendor/ucd-parse/src/word_break.rs (limited to 'vendor/ucd-parse/src') diff --git a/vendor/ucd-parse/src/age.rs b/vendor/ucd-parse/src/age.rs new file mode 100644 index 000000000..3c93f0707 --- /dev/null +++ b/vendor/ucd-parse/src/age.rs @@ -0,0 +1,59 @@ +use std::path::Path; +use std::str::FromStr; + +use crate::common::{ + parse_codepoint_association, CodepointIter, Codepoints, UcdFile, + UcdFileByCodepoint, +}; +use crate::error::Error; + +/// A single row in the `DerivedAge.txt` file. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct Age { + /// The codepoint or codepoint range for this entry. + pub codepoints: Codepoints, + /// The age assigned to the codepoints in this entry. 
+ pub age: String, +} + +impl UcdFile for Age { + fn relative_file_path() -> &'static Path { + Path::new("DerivedAge.txt") + } +} + +impl UcdFileByCodepoint for Age { + fn codepoints(&self) -> CodepointIter { + self.codepoints.into_iter() + } +} + +impl FromStr for Age { + type Err = Error; + + fn from_str(line: &str) -> Result { + let (codepoints, script) = parse_codepoint_association(line)?; + Ok(Age { codepoints, age: script.to_string() }) + } +} + +#[cfg(test)] +mod tests { + use super::Age; + + #[test] + fn parse_single() { + let line = "2BD2 ; 10.0 # GROUP MARK\n"; + let row: Age = line.parse().unwrap(); + assert_eq!(row.codepoints, 0x2BD2); + assert_eq!(row.age, "10.0"); + } + + #[test] + fn parse_range() { + let line = "11D0B..11D36 ; 10.0 # [44] MASARAM GONDI LETTER AU..MASARAM GONDI VOWEL SIGN VOCALIC R\n"; + let row: Age = line.parse().unwrap(); + assert_eq!(row.codepoints, (0x11D0B, 0x11D36)); + assert_eq!(row.age, "10.0"); + } +} diff --git a/vendor/ucd-parse/src/arabic_shaping.rs b/vendor/ucd-parse/src/arabic_shaping.rs new file mode 100644 index 000000000..d1d942a82 --- /dev/null +++ b/vendor/ucd-parse/src/arabic_shaping.rs @@ -0,0 +1,184 @@ +use std::path::Path; +use std::str::FromStr; + +use lazy_static::lazy_static; +use regex::Regex; + +use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint}; +use crate::error::Error; + +/// Represents a single row in the `ArabicShaping.txt` file. +/// +/// The field names were taken from the header of ArabicShaping.txt. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct ArabicShaping { + /// The codepoint corresponding to this row. + pub codepoint: Codepoint, + /// A short schematic name for the codepoint. + /// + /// The schematic name is descriptive of the shape, based as consistently as + /// possible on a name for the skeleton and then the diacritic marks applied + /// to the skeleton, if any. 
Note that this schematic name is considered a + /// comment, and does not constitute a formal property value. + pub schematic_name: String, + /// The "joining type" of this codepoint. + pub joining_type: JoiningType, + /// The "joining group" of this codepoint. + pub joining_group: String, +} + +/// The Joining_Type field read from ArabicShaping.txt +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum JoiningType { + RightJoining, + LeftJoining, + DualJoining, + JoinCausing, + NonJoining, + Transparent, +} + +impl JoiningType { + pub fn as_str(&self) -> &str { + match self { + JoiningType::RightJoining => "R", + JoiningType::LeftJoining => "L", + JoiningType::DualJoining => "D", + JoiningType::JoinCausing => "C", + JoiningType::NonJoining => "U", + JoiningType::Transparent => "T", + } + } +} + +impl Default for JoiningType { + fn default() -> JoiningType { + JoiningType::NonJoining + } +} + +impl FromStr for JoiningType { + type Err = Error; + + fn from_str(s: &str) -> Result { + match s { + "R" => Ok(JoiningType::RightJoining), + "L" => Ok(JoiningType::LeftJoining), + "D" => Ok(JoiningType::DualJoining), + "C" => Ok(JoiningType::JoinCausing), + "U" => Ok(JoiningType::NonJoining), + "T" => Ok(JoiningType::Transparent), + _ => err!( + "unrecognized joining type: '{}' \ + (must be one of R, L, D, C, U or T)", + s + ), + } + } +} + +impl UcdFile for ArabicShaping { + fn relative_file_path() -> &'static Path { + Path::new("ArabicShaping.txt") + } +} + +impl UcdFileByCodepoint for ArabicShaping { + fn codepoints(&self) -> CodepointIter { + self.codepoint.into_iter() + } +} + +impl FromStr for ArabicShaping { + type Err = Error; + + fn from_str(line: &str) -> Result { + lazy_static! 
{ + static ref PARTS: Regex = Regex::new( + r"(?x) + ^ + \s*(?P[A-F0-9]+)\s*; + \s*(?P[^;]+)\s*; + \s*(?P[^;]+)\s*; + \s*(?P[^;]+) + $ + " + ) + .unwrap(); + }; + let caps = match PARTS.captures(line.trim()) { + Some(caps) => caps, + None => return err!("invalid ArabicShaping line"), + }; + + Ok(ArabicShaping { + codepoint: caps["codepoint"].parse()?, + schematic_name: caps["name"].to_string(), + joining_type: caps["joining_type"].parse()?, + joining_group: caps["joining_group"].to_string(), + }) + } +} + +#[cfg(test)] +mod tests { + use crate::common::Codepoint; + + use super::{ArabicShaping, JoiningType}; + + fn codepoint(n: u32) -> Codepoint { + Codepoint::from_u32(n).unwrap() + } + + fn s(string: &str) -> String { + string.to_string() + } + + #[test] + fn parse1() { + let line = "0600; ARABIC NUMBER SIGN; U; No_Joining_Group\n"; + let data: ArabicShaping = line.parse().unwrap(); + assert_eq!( + data, + ArabicShaping { + codepoint: codepoint(0x0600), + schematic_name: s("ARABIC NUMBER SIGN"), + joining_type: JoiningType::NonJoining, + joining_group: s("No_Joining_Group") + } + ); + } + + #[test] + fn parse2() { + let line = "063D; FARSI YEH WITH INVERTED V ABOVE; D; FARSI YEH\n"; + let data: ArabicShaping = line.parse().unwrap(); + assert_eq!( + data, + ArabicShaping { + codepoint: codepoint(0x063D), + schematic_name: s("FARSI YEH WITH INVERTED V ABOVE"), + joining_type: JoiningType::DualJoining, + joining_group: s("FARSI YEH") + } + ); + } + + #[test] + fn parse3() { + let line = + "10D23; HANIFI ROHINGYA DOTLESS KINNA YA WITH DOT ABOVE; D; HANIFI ROHINGYA KINNA YA\n"; + let data: ArabicShaping = line.parse().unwrap(); + assert_eq!( + data, + ArabicShaping { + codepoint: codepoint(0x10D23), + schematic_name: s( + "HANIFI ROHINGYA DOTLESS KINNA YA WITH DOT ABOVE" + ), + joining_type: JoiningType::DualJoining, + joining_group: s("HANIFI ROHINGYA KINNA YA") + } + ); + } +} diff --git a/vendor/ucd-parse/src/bidi_mirroring_glyph.rs 
b/vendor/ucd-parse/src/bidi_mirroring_glyph.rs new file mode 100644 index 000000000..fcfefffcb --- /dev/null +++ b/vendor/ucd-parse/src/bidi_mirroring_glyph.rs @@ -0,0 +1,107 @@ +use std::fmt; +use std::path::Path; +use std::str::FromStr; + +use lazy_static::lazy_static; +use regex::Regex; + +use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint}; +use crate::error::Error; + +/// Represents a single row in the `BidiMirroring.txt` file. +/// +/// The field names were taken from the header of BidiMirroring.txt. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct BidiMirroring { + /// The codepoint corresponding to this row. + pub codepoint: Codepoint, + /// The codepoint that typically has a glyph that is the mirror image + /// of `codepoint`. + pub bidi_mirroring_glyph: Codepoint, +} + +impl UcdFile for BidiMirroring { + fn relative_file_path() -> &'static Path { + Path::new("BidiMirroring.txt") + } +} + +impl UcdFileByCodepoint for BidiMirroring { + fn codepoints(&self) -> CodepointIter { + self.codepoint.into_iter() + } +} + +impl FromStr for BidiMirroring { + type Err = Error; + + fn from_str(line: &str) -> Result { + lazy_static! 
{ + static ref PARTS: Regex = Regex::new( + r"(?x) + ^ + \s*(?P[A-F0-9]+)\s*; + \s*(?P[A-F0-9]+) + \s+ + \#(?:.+) + $ + " + ) + .unwrap(); + }; + let caps = match PARTS.captures(line.trim()) { + Some(caps) => caps, + None => return err!("invalid BidiMirroring line"), + }; + + Ok(BidiMirroring { + codepoint: caps["codepoint"].parse()?, + bidi_mirroring_glyph: caps["substitute_codepoint"].parse()?, + }) + } +} + +impl fmt::Display for BidiMirroring { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{};", self.codepoint)?; + write!(f, "{};", self.bidi_mirroring_glyph)?; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use crate::common::Codepoint; + + use super::BidiMirroring; + + fn codepoint(n: u32) -> Codepoint { + Codepoint::from_u32(n).unwrap() + } + + #[test] + fn parse() { + let line = "0028; 0029 # LEFT PARENTHESIS\n"; + let data: BidiMirroring = line.parse().unwrap(); + assert_eq!( + data, + BidiMirroring { + codepoint: codepoint(0x0028), + bidi_mirroring_glyph: codepoint(0x0029), + } + ); + } + + #[test] + fn parse_best_fit() { + let line = "228A; 228B # [BEST FIT] SUBSET OF WITH NOT EQUAL TO\n"; + let data: BidiMirroring = line.parse().unwrap(); + assert_eq!( + data, + BidiMirroring { + codepoint: codepoint(0x228A), + bidi_mirroring_glyph: codepoint(0x228B), + } + ); + } +} diff --git a/vendor/ucd-parse/src/case_folding.rs b/vendor/ucd-parse/src/case_folding.rs new file mode 100644 index 000000000..813fc81a1 --- /dev/null +++ b/vendor/ucd-parse/src/case_folding.rs @@ -0,0 +1,161 @@ +use std::path::Path; +use std::str::FromStr; + +use lazy_static::lazy_static; +use regex::Regex; + +use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint}; +use crate::error::Error; + +/// A single row in the `CaseFolding.txt` file. +/// +/// The contents of `CaseFolding.txt` are a convenience derived from both +/// `UnicodeData.txt` and `SpecialCasing.txt`. +/// +/// Note that a single codepoint may be mapped multiple times. 
In particular, +/// a single codepoint might have distinct `CaseStatus::Simple` and +/// `CaseStatus::Full` mappings. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct CaseFold { + /// The codepoint that is being mapped. + pub codepoint: Codepoint, + /// The case status of this mapping. + pub status: CaseStatus, + /// The actual case mapping, which is more than one codepoint if this is + /// a "full" mapping. + pub mapping: Vec, +} + +impl UcdFile for CaseFold { + fn relative_file_path() -> &'static Path { + Path::new("CaseFolding.txt") + } +} + +impl UcdFileByCodepoint for CaseFold { + fn codepoints(&self) -> CodepointIter { + self.codepoint.into_iter() + } +} + +impl FromStr for CaseFold { + type Err = Error; + + fn from_str(line: &str) -> Result { + lazy_static! { + static ref PARTS: Regex = Regex::new( + r"(?x) + ^ + \s*(?P[^\s;]+)\s*; + \s*(?P[^\s;]+)\s*; + \s*(?P[^;]+)\s*; + " + ) + .unwrap(); + }; + + let caps = match PARTS.captures(line.trim()) { + Some(caps) => caps, + None => return err!("invalid CaseFolding line: '{}'", line), + }; + let mut mapping = vec![]; + for cp in caps["mapping"].split_whitespace() { + mapping.push(cp.parse()?); + } + Ok(CaseFold { + codepoint: caps["codepoint"].parse()?, + status: caps["status"].parse()?, + mapping, + }) + } +} + +/// The status of a particular case mapping. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum CaseStatus { + /// Case mappings shared by both "simple" and "full" mappings. + Common, + /// A case mapping that changes the number of codepoints. + Full, + /// A case mapping that doesn't change the number of codepoints, when it + /// differs from `Full`. + Simple, + /// Special cases (currently only for Turkic mappings) that are typically + /// excluded by default. Special cases don't change the number of + /// codepoints, but may change the encoding (e.g., UTF-8) length in bytes. 
+ Special, +} + +impl Default for CaseStatus { + fn default() -> CaseStatus { + CaseStatus::Common + } +} + +impl CaseStatus { + /// Returns true if and only if this status indicates a case mapping that + /// won't change the number of codepoints. + pub fn is_fixed(&self) -> bool { + *self != CaseStatus::Full + } +} + +impl FromStr for CaseStatus { + type Err = Error; + + fn from_str(s: &str) -> Result { + match s { + "C" => Ok(CaseStatus::Common), + "F" => Ok(CaseStatus::Full), + "S" => Ok(CaseStatus::Simple), + "T" => Ok(CaseStatus::Special), + _ => err!( + "unrecognized case status: '{}' \ + (must be one of C, F, S or T)", + s + ), + } + } +} + +#[cfg(test)] +mod tests { + use super::{CaseFold, CaseStatus}; + + #[test] + fn parse_common() { + let line = + "0150; C; 0151; # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE\n"; + let row: CaseFold = line.parse().unwrap(); + assert_eq!(row.codepoint, 0x0150); + assert_eq!(row.status, CaseStatus::Common); + assert_eq!(row.mapping, vec![0x0151]); + } + + #[test] + fn parse_full() { + let line = "03B0; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS\n"; + let row: CaseFold = line.parse().unwrap(); + assert_eq!(row.codepoint, 0x03B0); + assert_eq!(row.status, CaseStatus::Full); + assert_eq!(row.mapping, vec![0x03C5, 0x0308, 0x0301]); + } + + #[test] + fn parse_simple() { + let line = "1F8F; S; 1F87; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI\n"; + let row: CaseFold = line.parse().unwrap(); + assert_eq!(row.codepoint, 0x1F8F); + assert_eq!(row.status, CaseStatus::Simple); + assert_eq!(row.mapping, vec![0x1F87]); + } + + #[test] + fn parse_special() { + let line = "0049; T; 0131; # LATIN CAPITAL LETTER I\n"; + let row: CaseFold = line.parse().unwrap(); + assert_eq!(row.codepoint, 0x0049); + assert_eq!(row.status, CaseStatus::Special); + assert_eq!(row.mapping, vec![0x0131]); + } +} diff --git a/vendor/ucd-parse/src/common.rs b/vendor/ucd-parse/src/common.rs new file mode 
100644 index 000000000..c18be668e --- /dev/null +++ b/vendor/ucd-parse/src/common.rs @@ -0,0 +1,594 @@ +use std::char; +use std::collections::BTreeMap; +use std::fmt; +use std::fs::File; +use std::io::{self, BufRead}; +use std::marker::PhantomData; +use std::path::{Path, PathBuf}; +use std::str::FromStr; + +use lazy_static::lazy_static; +use regex::Regex; + +use crate::error::{Error, ErrorKind}; + +/// Parse a particular file in the UCD into a sequence of rows. +/// +/// The given directory should be the directory to the UCD. +pub fn parse(ucd_dir: P) -> Result, Error> +where + P: AsRef, + D: UcdFile, +{ + let mut xs = vec![]; + for result in D::from_dir(ucd_dir)? { + let x = result?; + xs.push(x); + } + Ok(xs) +} + +/// Parse a particular file in the UCD into a map from codepoint to the record. +/// +/// The given directory should be the directory to the UCD. +pub fn parse_by_codepoint( + ucd_dir: P, +) -> Result, Error> +where + P: AsRef, + D: UcdFileByCodepoint, +{ + let mut map = BTreeMap::new(); + for result in D::from_dir(ucd_dir)? { + let x = result?; + for cp in x.codepoints() { + map.insert(cp, x.clone()); + } + } + Ok(map) +} + +/// Parse a particular file in the UCD into a map from codepoint to all +/// records associated with that codepoint. +/// +/// This is useful for files that have multiple records for each codepoint. +/// For example, the `NameAliases.txt` file lists multiple aliases for some +/// codepoints. +/// +/// The given directory should be the directory to the UCD. +pub fn parse_many_by_codepoint( + ucd_dir: P, +) -> Result>, Error> +where + P: AsRef, + D: UcdFileByCodepoint, +{ + let mut map = BTreeMap::new(); + for result in D::from_dir(ucd_dir)? { + let x = result?; + for cp in x.codepoints() { + map.entry(cp).or_insert(vec![]).push(x.clone()); + } + } + Ok(map) +} + +/// Given a path pointing at the root of the `ucd_dir`, attempts to determine +/// its Unicode version. 
+/// +/// This just checks the readme and the very first line of PropList.txt -- in +/// practice this works for all versions of UCD since 4.1.0. +pub fn ucd_directory_version>( + ucd_dir: &D, +) -> Result<(u64, u64, u64), Error> { + // Avoid duplication from generic path parameter. + fn ucd_directory_version_inner( + ucd_dir: &Path, + ) -> Result<(u64, u64, u64), Error> { + lazy_static::lazy_static! { + static ref VERSION_RX: Regex = + Regex::new(r"-([0-9]+).([0-9]+).([0-9]+).txt").unwrap(); + } + + let proplist = ucd_dir.join("PropList.txt"); + let contents = first_line(&proplist)?; + let caps = match VERSION_RX.captures(&contents) { + Some(c) => c, + None => { + return err!("Failed to find version in line {:?}", contents) + } + }; + + let capture_to_num = |n| { + caps.get(n).unwrap().as_str().parse::().map_err(|e| Error { + kind: ErrorKind::Parse(format!( + "Failed to parse version from {:?} in PropList.txt: {}", + contents, e + )), + line: Some(0), + path: Some(proplist.clone()), + }) + }; + let major = capture_to_num(1)?; + let minor = capture_to_num(2)?; + let patch = capture_to_num(3)?; + + Ok((major, minor, patch)) + } + ucd_directory_version_inner(ucd_dir.as_ref()) +} + +fn first_line(path: &Path) -> Result { + let file = std::fs::File::open(path).map_err(|e| Error { + kind: ErrorKind::Io(e), + line: None, + path: Some(path.into()), + })?; + + let mut reader = std::io::BufReader::new(file); + let mut line_contents = String::new(); + reader.read_line(&mut line_contents).map_err(|e| Error { + kind: ErrorKind::Io(e), + line: None, + path: Some(path.into()), + })?; + Ok(line_contents) +} + +/// A helper function for parsing a common record format that associates one +/// or more codepoints with a string value. +pub fn parse_codepoint_association<'a>( + line: &'a str, +) -> Result<(Codepoints, &'a str), Error> { + lazy_static! 
{ + static ref PARTS: Regex = Regex::new( + r"(?x) + ^ + \s*(?P[^\s;]+)\s*; + \s*(?P[^;\x23]+)\s* + " + ) + .unwrap(); + }; + + let caps = match PARTS.captures(line.trim()) { + Some(caps) => caps, + None => return err!("invalid PropList line: '{}'", line), + }; + let property = match caps.name("property") { + Some(property) => property.as_str().trim(), + None => { + return err!( + "could not find property name in PropList line: '{}'", + line + ) + } + }; + Ok((caps["codepoints"].parse()?, property)) +} + +/// A helper function for parsing a sequence of space separated codepoints. +/// The sequence is permitted to be empty. +pub fn parse_codepoint_sequence(s: &str) -> Result, Error> { + let mut cps = vec![]; + for cp in s.trim().split_whitespace() { + cps.push(cp.parse()?); + } + Ok(cps) +} + +/// A helper function for parsing a single test for the various break +/// algorithms. +/// +/// Upon success, this returns the UTF-8 encoded groups of codepoints along +/// with the comment associated with the test. The comment is a human readable +/// description of the test that may prove useful for debugging. +pub fn parse_break_test(line: &str) -> Result<(Vec, String), Error> { + lazy_static! 
{ + static ref PARTS: Regex = Regex::new( + r"(?x) + ^ + (?:÷|×) + (?P(?:\s[0-9A-Fa-f]{4,5}\s(?:÷|×))+) + \s+ + \#(?P.+) + $ + " + ) + .unwrap(); + static ref GROUP: Regex = Regex::new( + r"(?x) + (?P[0-9A-Fa-f]{4,5})\s(?P÷|×) + " + ) + .unwrap(); + } + + let caps = match PARTS.captures(line.trim()) { + Some(caps) => caps, + None => return err!("invalid break test line: '{}'", line), + }; + let comment = caps["comment"].trim().to_string(); + + let mut groups = vec![]; + let mut cur = String::new(); + for cap in GROUP.captures_iter(&caps["groups"]) { + let cp: Codepoint = cap["codepoint"].parse()?; + let ch = match cp.scalar() { + Some(ch) => ch, + None => { + return err!( + "invalid codepoint '{:X}' in line: '{}'", + cp.value(), + line + ) + } + }; + cur.push(ch); + if &cap["kind"] == "÷" { + groups.push(cur); + cur = String::new(); + } + } + Ok((groups, comment)) +} + +/// Describes a single UCD file. +pub trait UcdFile: + Clone + fmt::Debug + Default + Eq + FromStr + PartialEq +{ + /// The file path corresponding to this file, relative to the UCD + /// directory. + fn relative_file_path() -> &'static Path; + + /// The full file path corresponding to this file given the UCD directory + /// path. + fn file_path>(ucd_dir: P) -> PathBuf { + ucd_dir.as_ref().join(Self::relative_file_path()) + } + + /// Create an iterator over each record in this UCD file. + /// + /// The parameter should correspond to the directory containing the UCD. + fn from_dir>( + ucd_dir: P, + ) -> Result, Error> { + UcdLineParser::from_path(Self::file_path(ucd_dir)) + } +} + +/// Describes a single UCD file where every record in the file is associated +/// with one or more codepoints. +pub trait UcdFileByCodepoint: UcdFile { + /// Returns the codepoints associated with this record. + fn codepoints(&self) -> CodepointIter; +} + +/// A line oriented parser for a particular UCD file. +/// +/// Callers can build a line parser via the +/// [`UcdFile::from_dir`](trait.UcdFile.html) method. 
+/// +/// The `R` type parameter refers to the underlying `io::Read` implementation +/// from which the UCD data is read. +/// +/// The `D` type parameter refers to the type of the record parsed out of each +/// line. +#[derive(Debug)] +pub struct UcdLineParser { + path: Option, + rdr: io::BufReader, + line: String, + line_number: u64, + _data: PhantomData, +} + +impl UcdLineParser { + /// Create a new parser from the given file path. + pub(crate) fn from_path>( + path: P, + ) -> Result, Error> { + let path = path.as_ref(); + let file = File::open(path).map_err(|e| Error { + kind: ErrorKind::Io(e), + line: None, + path: Some(path.to_path_buf()), + })?; + Ok(UcdLineParser::new(Some(path.to_path_buf()), file)) + } +} + +impl UcdLineParser { + /// Create a new parser that parses the reader given. + /// + /// The type of data parsed is determined when the `parse_next` function + /// is called by virtue of the type requested. + /// + /// Note that the reader is buffered internally, so the caller does not + /// need to provide their own buffering. + pub(crate) fn new(path: Option, rdr: R) -> UcdLineParser { + UcdLineParser { + path, + rdr: io::BufReader::new(rdr), + line: String::new(), + line_number: 0, + _data: PhantomData, + } + } +} + +impl> Iterator for UcdLineParser { + type Item = Result; + + fn next(&mut self) -> Option> { + loop { + self.line_number += 1; + self.line.clear(); + let n = match self.rdr.read_line(&mut self.line) { + Err(err) => { + return Some(Err(Error { + kind: ErrorKind::Io(err), + line: None, + path: self.path.clone(), + })) + } + Ok(n) => n, + }; + if n == 0 { + return None; + } + if !self.line.starts_with('#') && !self.line.trim().is_empty() { + break; + } + } + let line_number = self.line_number; + Some(self.line.parse().map_err(|mut err: Error| { + err.line = Some(line_number); + err + })) + } +} + +/// A representation of either a single codepoint or a range of codepoints. 
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, PartialOrd, Ord)] +pub enum Codepoints { + /// A single codepoint. + Single(Codepoint), + /// A range of codepoints. + Range(CodepointRange), +} + +impl Default for Codepoints { + fn default() -> Codepoints { + Codepoints::Single(Codepoint::default()) + } +} + +impl IntoIterator for Codepoints { + type IntoIter = CodepointIter; + type Item = Codepoint; + + fn into_iter(self) -> CodepointIter { + match self { + Codepoints::Single(x) => x.into_iter(), + Codepoints::Range(x) => x.into_iter(), + } + } +} + +impl FromStr for Codepoints { + type Err = Error; + + fn from_str(s: &str) -> Result { + if s.contains("..") { + CodepointRange::from_str(s).map(Codepoints::Range) + } else { + Codepoint::from_str(s).map(Codepoints::Single) + } + } +} + +impl fmt::Display for Codepoints { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match *self { + Codepoints::Single(ref x) => x.fmt(f), + Codepoints::Range(ref x) => x.fmt(f), + } + } +} + +impl PartialEq for Codepoints { + fn eq(&self, other: &u32) -> bool { + match *self { + Codepoints::Single(ref x) => x == other, + Codepoints::Range(ref x) => x == &(*other, *other), + } + } +} + +impl PartialEq for Codepoints { + fn eq(&self, other: &Codepoint) -> bool { + match *self { + Codepoints::Single(ref x) => x == other, + Codepoints::Range(ref x) => x == &(*other, *other), + } + } +} + +impl PartialEq<(u32, u32)> for Codepoints { + fn eq(&self, other: &(u32, u32)) -> bool { + match *self { + Codepoints::Single(ref x) => &(x.value(), x.value()) == other, + Codepoints::Range(ref x) => x == other, + } + } +} + +impl PartialEq<(Codepoint, Codepoint)> for Codepoints { + fn eq(&self, other: &(Codepoint, Codepoint)) -> bool { + match *self { + Codepoints::Single(ref x) => &(*x, *x) == other, + Codepoints::Range(ref x) => x == other, + } + } +} + +/// A range of Unicode codepoints. The range is inclusive; both ends of the +/// range are guaranteed to be valid codepoints. 
+#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord)] +pub struct CodepointRange { + /// The start of the codepoint range. + pub start: Codepoint, + /// The end of the codepoint range. + pub end: Codepoint, +} + +impl IntoIterator for CodepointRange { + type IntoIter = CodepointIter; + type Item = Codepoint; + + fn into_iter(self) -> CodepointIter { + CodepointIter { next: self.start.value(), range: self } + } +} + +impl FromStr for CodepointRange { + type Err = Error; + + fn from_str(s: &str) -> Result { + lazy_static! { + static ref PARTS: Regex = + Regex::new(r"^(?P[A-Z0-9]+)\.\.(?P[A-Z0-9]+)$") + .unwrap(); + } + let caps = match PARTS.captures(s) { + Some(caps) => caps, + None => return err!("invalid codepoint range: '{}'", s), + }; + let start = caps["start"].parse().or_else(|err| { + err!("failed to parse '{}' as a codepoint range: {}", s, err) + })?; + let end = caps["end"].parse().or_else(|err| { + err!("failed to parse '{}' as a codepoint range: {}", s, err) + })?; + Ok(CodepointRange { start, end }) + } +} + +impl fmt::Display for CodepointRange { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}..{}", self.start, self.end) + } +} + +impl PartialEq<(u32, u32)> for CodepointRange { + fn eq(&self, other: &(u32, u32)) -> bool { + &(self.start.value(), self.end.value()) == other + } +} + +impl PartialEq<(Codepoint, Codepoint)> for CodepointRange { + fn eq(&self, other: &(Codepoint, Codepoint)) -> bool { + &(self.start, self.end) == other + } +} + +/// A single Unicode codepoint. +/// +/// This type's string representation is a hexadecimal number. It is guaranteed +/// to be in the range `[0, 10FFFF]`. +/// +/// Note that unlike Rust's `char` type, this may be a surrogate codepoint. +#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord)] +pub struct Codepoint(u32); + +impl Codepoint { + /// Create a new codepoint from a `u32`. 
+ /// + /// If the given number is not a valid codepoint, then this returns an + /// error. + pub fn from_u32(n: u32) -> Result { + if n > 0x10FFFF { + err!("{:x} is not a valid Unicode codepoint", n) + } else { + Ok(Codepoint(n)) + } + } + + /// Return the underlying `u32` codepoint value. + pub fn value(self) -> u32 { + self.0 + } + + /// Attempt to convert this codepoint to a Unicode scalar value. + /// + /// If this is a surrogate codepoint, then this returns `None`. + pub fn scalar(self) -> Option { + char::from_u32(self.0) + } +} + +impl IntoIterator for Codepoint { + type IntoIter = CodepointIter; + type Item = Codepoint; + + fn into_iter(self) -> CodepointIter { + let range = CodepointRange { start: self, end: self }; + CodepointIter { next: self.value(), range } + } +} + +impl FromStr for Codepoint { + type Err = Error; + + fn from_str(s: &str) -> Result { + match u32::from_str_radix(s, 16) { + Ok(n) => Codepoint::from_u32(n), + Err(err) => { + return err!( + "failed to parse '{}' as a hexadecimal codepoint: {}", + s, + err + ); + } + } + } +} + +impl fmt::Display for Codepoint { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{:04X}", self.0) + } +} + +impl PartialEq for Codepoint { + fn eq(&self, other: &u32) -> bool { + self.0 == *other + } +} + +impl PartialEq for u32 { + fn eq(&self, other: &Codepoint) -> bool { + *self == other.0 + } +} + +/// An iterator over a range of Unicode codepoints. 
+#[derive(Debug)] +pub struct CodepointIter { + next: u32, + range: CodepointRange, +} + +impl Iterator for CodepointIter { + type Item = Codepoint; + + fn next(&mut self) -> Option { + if self.next > self.range.end.value() { + return None; + } + let current = self.next; + self.next += 1; + Some(Codepoint::from_u32(current).unwrap()) + } +} diff --git a/vendor/ucd-parse/src/core_properties.rs b/vendor/ucd-parse/src/core_properties.rs new file mode 100644 index 000000000..9a7682b43 --- /dev/null +++ b/vendor/ucd-parse/src/core_properties.rs @@ -0,0 +1,60 @@ +use std::path::Path; +use std::str::FromStr; + +use crate::common::{ + parse_codepoint_association, CodepointIter, Codepoints, UcdFile, + UcdFileByCodepoint, +}; +use crate::error::Error; + +/// A single row in the `DerivedCoreProperties.txt` file. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct CoreProperty { + /// The codepoint or codepoint range for this entry. + pub codepoints: Codepoints, + /// The property name assigned to the codepoints in this entry. 
+ pub property: String, +} + +impl UcdFile for CoreProperty { + fn relative_file_path() -> &'static Path { + Path::new("DerivedCoreProperties.txt") + } +} + +impl UcdFileByCodepoint for CoreProperty { + fn codepoints(&self) -> CodepointIter { + self.codepoints.into_iter() + } +} + +impl FromStr for CoreProperty { + type Err = Error; + + fn from_str(line: &str) -> Result { + let (codepoints, property) = parse_codepoint_association(line)?; + Ok(CoreProperty { codepoints, property: property.to_string() }) + } +} + +#[cfg(test)] +mod tests { + use super::CoreProperty; + + #[test] + fn parse_single() { + let line = + "1163D ; Case_Ignorable # Mn MODI SIGN ANUSVARA\n"; + let row: CoreProperty = line.parse().unwrap(); + assert_eq!(row.codepoints, 0x1163D); + assert_eq!(row.property, "Case_Ignorable"); + } + + #[test] + fn parse_range() { + let line = "11133..11134 ; Grapheme_Link # Mn [2] CHAKMA VIRAMA..CHAKMA MAAYYAA\n"; + let row: CoreProperty = line.parse().unwrap(); + assert_eq!(row.codepoints, (0x11133, 0x11134)); + assert_eq!(row.property, "Grapheme_Link"); + } +} diff --git a/vendor/ucd-parse/src/emoji_properties.rs b/vendor/ucd-parse/src/emoji_properties.rs new file mode 100644 index 000000000..dc5c0c884 --- /dev/null +++ b/vendor/ucd-parse/src/emoji_properties.rs @@ -0,0 +1,86 @@ +use std::path::{Path, PathBuf}; +use std::str::FromStr; + +use crate::common::{ + parse_codepoint_association, CodepointIter, Codepoints, UcdFile, + UcdFileByCodepoint, +}; +use crate::error::Error; + +/// A single row in the `emoji-data.txt` file. +/// +/// The `emoji-data.txt` file is the source of truth on several Emoji-related +/// Unicode properties. +/// +/// Note that `emoji-data.txt` is not formally part of the Unicode Character +/// Database. You can download the Emoji data files separately here: +/// https://unicode.org/Public/emoji/ +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct EmojiProperty { + /// The codepoint or codepoint range for this entry. 
+ pub codepoints: Codepoints, + /// The property name assigned to the codepoints in this entry. + pub property: String, +} + +impl UcdFile for EmojiProperty { + fn relative_file_path() -> &'static Path { + Path::new("emoji/emoji-data.txt") + } + + fn file_path<P: AsRef<Path>>(ucd_dir: P) -> PathBuf { + let ucd_dir = ucd_dir.as_ref(); + // The standard location, but only on UCDs from 13.0.0 and up. + let std = ucd_dir.join(Self::relative_file_path()); + if std.exists() { + std + } else { + // If the old location does exist, use it. + let legacy = ucd_dir.join("emoji-data.txt"); + if legacy.exists() { + legacy + } else { + // This might end up in an error message, so use the standard + // one if forced to choose. Arguably we could do something like + // peek + std + } + } + } +} + +impl UcdFileByCodepoint for EmojiProperty { + fn codepoints(&self) -> CodepointIter { + self.codepoints.into_iter() + } +} + +impl FromStr for EmojiProperty { + type Err = Error; + + fn from_str(line: &str) -> Result<EmojiProperty, Error> { + let (codepoints, property) = parse_codepoint_association(line)?; + Ok(EmojiProperty { codepoints, property: property.to_string() }) + } +} + +#[cfg(test)] +mod tests { + use super::EmojiProperty; + + #[test] + fn parse_single() { + let line = "24C2 ; Emoji # 1.1 [1] (Ⓜ️) circled M\n"; + let row: EmojiProperty = line.parse().unwrap(); + assert_eq!(row.codepoints, 0x24C2); + assert_eq!(row.property, "Emoji"); + } + + #[test] + fn parse_range() { + let line = "1FA6E..1FFFD ; Extended_Pictographic# NA[1424] (🩮️..🿽️) <reserved-1FA6E>..<reserved-1FFFD>\n"; + let row: EmojiProperty = line.parse().unwrap(); + assert_eq!(row.codepoints, (0x1FA6E, 0x1FFFD)); + assert_eq!(row.property, "Extended_Pictographic"); + } +} diff --git a/vendor/ucd-parse/src/error.rs b/vendor/ucd-parse/src/error.rs new file mode 100644 index 000000000..9dafc4b33 --- /dev/null +++ b/vendor/ucd-parse/src/error.rs @@ -0,0 +1,86 @@ +use std::error; +use std::fmt; +use std::io; +use std::path::{Path, PathBuf}; + +/// Represents any kind of error that can 
occur while parsing the UCD. +#[derive(Debug)] +pub struct Error { + pub(crate) kind: ErrorKind, + pub(crate) line: Option<u64>, + pub(crate) path: Option<PathBuf>, +} + +/// The kind of error that occurred while parsing the UCD. +#[derive(Debug)] +pub enum ErrorKind { + /// An I/O error. + Io(io::Error), + /// A generic parse error. + Parse(String), +} + +impl Error { + /// Create a new parse error from the given message. + pub(crate) fn parse(msg: String) -> Error { + Error { kind: ErrorKind::Parse(msg), line: None, path: None } + } + + /// Return the specific kind of this error. + pub fn kind(&self) -> &ErrorKind { + &self.kind + } + + /// Return the line number at which this error occurred, if available. + pub fn line(&self) -> Option<u64> { + self.line + } + + /// Return the file path associated with this error, if one exists. + pub fn path(&self) -> Option<&Path> { + self.path.as_ref().map(|p| &**p) + } + + /// Unwrap this error into its underlying kind. + pub fn into_kind(self) -> ErrorKind { + self.kind + } + + /// Returns true if and only if this is an I/O error. + /// + /// If this returns true, the underlying `ErrorKind` is guaranteed to be + /// `ErrorKind::Io`. 
+ pub fn is_io_error(&self) -> bool { + match self.kind { + ErrorKind::Io(_) => true, + _ => false, + } + } +} + +impl error::Error for Error { + fn cause(&self) -> Option<&dyn error::Error> { + match self.kind { + ErrorKind::Io(ref err) => Some(err), + _ => None, + } + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if let Some(ref path) = self.path { + if let Some(line) = self.line { + write!(f, "{}:{}: ", path.display(), line)?; + } else { + write!(f, "{}: ", path.display())?; + } + } else if let Some(line) = self.line { + write!(f, "error on line {}: ", line)?; + } + match self.kind { + ErrorKind::Io(ref err) => write!(f, "{}", err), + ErrorKind::Parse(ref msg) => write!(f, "{}", msg), + } + } +} diff --git a/vendor/ucd-parse/src/grapheme_cluster_break.rs b/vendor/ucd-parse/src/grapheme_cluster_break.rs new file mode 100644 index 000000000..9dbf32f41 --- /dev/null +++ b/vendor/ucd-parse/src/grapheme_cluster_break.rs @@ -0,0 +1,98 @@ +use std::path::Path; +use std::str::FromStr; + +use crate::common::{ + parse_break_test, parse_codepoint_association, CodepointIter, Codepoints, + UcdFile, UcdFileByCodepoint, +}; +use crate::error::Error; + +/// A single row in the `auxiliary/GraphemeBreakProperty.txt` file. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct GraphemeClusterBreak { + /// The codepoint or codepoint range for this entry. + pub codepoints: Codepoints, + /// The property value assigned to the codepoints in this entry. 
+ pub value: String, +} + +impl UcdFile for GraphemeClusterBreak { + fn relative_file_path() -> &'static Path { + Path::new("auxiliary/GraphemeBreakProperty.txt") + } +} + +impl UcdFileByCodepoint for GraphemeClusterBreak { + fn codepoints(&self) -> CodepointIter { + self.codepoints.into_iter() + } +} + +impl FromStr for GraphemeClusterBreak { + type Err = Error; + + fn from_str(line: &str) -> Result<GraphemeClusterBreak, Error> { + let (codepoints, value) = parse_codepoint_association(line)?; + Ok(GraphemeClusterBreak { codepoints, value: value.to_string() }) + } +} + +/// A single row in the `auxiliary/GraphemeBreakTest.txt` file. +/// +/// This file defines tests for the grapheme cluster break algorithm. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct GraphemeClusterBreakTest { + /// Each string is a UTF-8 encoded group of codepoints that make up a + /// single grapheme cluster. + pub grapheme_clusters: Vec<String>, + /// A human readable description of this test. + pub comment: String, +} + +impl UcdFile for GraphemeClusterBreakTest { + fn relative_file_path() -> &'static Path { + Path::new("auxiliary/GraphemeBreakTest.txt") + } +} + +impl FromStr for GraphemeClusterBreakTest { + type Err = Error; + + fn from_str(line: &str) -> Result<GraphemeClusterBreakTest, Error> { + let (groups, comment) = parse_break_test(line)?; + Ok(GraphemeClusterBreakTest { grapheme_clusters: groups, comment }) + } +} + +#[cfg(test)] +mod tests { + use super::{GraphemeClusterBreak, GraphemeClusterBreakTest}; + + #[test] + fn parse_single() { + let line = "093B ; SpacingMark # Mc DEVANAGARI VOWEL SIGN OOE\n"; + let row: GraphemeClusterBreak = line.parse().unwrap(); + assert_eq!(row.codepoints, 0x093B); + assert_eq!(row.value, "SpacingMark"); + } + + #[test] + fn parse_range() { + let line = "1F1E6..1F1FF ; Regional_Indicator # So [26] REGIONAL INDICATOR SYMBOL LETTER A..REGIONAL INDICATOR SYMBOL LETTER Z\n"; + let row: GraphemeClusterBreak = line.parse().unwrap(); + assert_eq!(row.codepoints, (0x1F1E6, 0x1F1FF)); + assert_eq!(row.value, 
"Regional_Indicator"); + } + + #[test] + fn parse_test() { + let line = "÷ 0061 × 1F3FF ÷ 1F476 × 200D × 1F6D1 ÷ # ÷ [0.2] LATIN SMALL LETTER A (Other) × [9.0] EMOJI MODIFIER FITZPATRICK TYPE-6 (Extend) ÷ [999.0] BABY (ExtPict) × [9.0] ZERO WIDTH JOINER (ZWJ_ExtCccZwj) × [11.0] OCTAGONAL SIGN (ExtPict) ÷ [0.3]\n"; + + let row: GraphemeClusterBreakTest = line.parse().unwrap(); + assert_eq!( + row.grapheme_clusters, + vec!["\u{0061}\u{1F3FF}", "\u{1F476}\u{200D}\u{1F6D1}",] + ); + assert!(row.comment.starts_with("÷ [0.2] LATIN SMALL LETTER A")); + } +} diff --git a/vendor/ucd-parse/src/jamo_short_name.rs b/vendor/ucd-parse/src/jamo_short_name.rs new file mode 100644 index 000000000..4103dd7ee --- /dev/null +++ b/vendor/ucd-parse/src/jamo_short_name.rs @@ -0,0 +1,80 @@ +use std::path::Path; +use std::str::FromStr; + +use lazy_static::lazy_static; +use regex::Regex; + +use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint}; +use crate::error::Error; + +/// A single row in the `Jamo.txt` file. +/// +/// The `Jamo.txt` file defines the `Jamo_Short_Name` property. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct JamoShortName { + /// The codepoint corresponding to this row. + pub codepoint: Codepoint, + /// The actual "Jamo Short Name." This string contains at most 3 bytes and + /// may be empty. + pub name: String, +} + +impl UcdFile for JamoShortName { + fn relative_file_path() -> &'static Path { + Path::new("Jamo.txt") + } +} + +impl UcdFileByCodepoint for JamoShortName { + fn codepoints(&self) -> CodepointIter { + self.codepoint.into_iter() + } +} + +impl FromStr for JamoShortName { + type Err = Error; + + fn from_str(line: &str) -> Result<JamoShortName, Error> { + lazy_static! 
{ + static ref PARTS: Regex = Regex::new( + r"(?x) + ^ + (?P<codepoint>[A-Z0-9]+); + \s* + (?P<name>[A-Z]*) + " + ) + .unwrap(); + }; + + let caps = match PARTS.captures(line.trim()) { + Some(caps) => caps, + None => return err!("invalid Jamo_Short_name line"), + }; + Ok(JamoShortName { + codepoint: caps["codepoint"].parse()?, + name: caps.name("name").unwrap().as_str().to_string(), + }) + } +} + +#[cfg(test)] +mod tests { + use super::JamoShortName; + + #[test] + fn parse1() { + let line = "1164; YAE # HANGUL JUNGSEONG YAE\n"; + let row: JamoShortName = line.parse().unwrap(); + assert_eq!(row.codepoint, 0x1164); + assert_eq!(row.name, "YAE"); + } + + #[test] + fn parse2() { + let line = "110B; # HANGUL CHOSEONG IEUNG\n"; + let row: JamoShortName = line.parse().unwrap(); + assert_eq!(row.codepoint, 0x110B); + assert_eq!(row.name, ""); + } +} diff --git a/vendor/ucd-parse/src/lib.rs b/vendor/ucd-parse/src/lib.rs new file mode 100644 index 000000000..f6654658a --- /dev/null +++ b/vendor/ucd-parse/src/lib.rs @@ -0,0 +1,66 @@ +/*! +A library for parsing the Unicode character database. 
+*/ + +#![deny(missing_docs)] + +pub use crate::common::{ + parse, parse_by_codepoint, parse_many_by_codepoint, ucd_directory_version, + Codepoint, CodepointIter, CodepointRange, Codepoints, UcdFile, + UcdFileByCodepoint, UcdLineParser, +}; +pub use crate::error::{Error, ErrorKind}; + +pub use crate::age::Age; +pub use crate::arabic_shaping::ArabicShaping; +pub use crate::bidi_mirroring_glyph::BidiMirroring; +pub use crate::case_folding::{CaseFold, CaseStatus}; +pub use crate::core_properties::CoreProperty; +pub use crate::emoji_properties::EmojiProperty; +pub use crate::grapheme_cluster_break::{ + GraphemeClusterBreak, GraphemeClusterBreakTest, +}; +pub use crate::jamo_short_name::JamoShortName; +pub use crate::line_break::LineBreakTest; +pub use crate::name_aliases::{NameAlias, NameAliasLabel}; +pub use crate::prop_list::Property; +pub use crate::property_aliases::PropertyAlias; +pub use crate::property_value_aliases::PropertyValueAlias; +pub use crate::script_extensions::ScriptExtension; +pub use crate::scripts::Script; +pub use crate::sentence_break::{SentenceBreak, SentenceBreakTest}; +pub use crate::special_casing::SpecialCaseMapping; +pub use crate::unicode_data::{ + UnicodeData, UnicodeDataDecomposition, UnicodeDataDecompositionTag, + UnicodeDataExpander, UnicodeDataNumeric, +}; +pub use crate::word_break::{WordBreak, WordBreakTest}; + +macro_rules! 
err { + ($($tt:tt)*) => { + Err(crate::error::Error::parse(format!($($tt)*))) + } +} + +mod common; +mod error; + +mod age; +mod arabic_shaping; +mod bidi_mirroring_glyph; +mod case_folding; +mod core_properties; +mod emoji_properties; +mod grapheme_cluster_break; +mod jamo_short_name; +mod line_break; +mod name_aliases; +mod prop_list; +mod property_aliases; +mod property_value_aliases; +mod script_extensions; +mod scripts; +mod sentence_break; +mod special_casing; +mod unicode_data; +mod word_break; diff --git a/vendor/ucd-parse/src/line_break.rs b/vendor/ucd-parse/src/line_break.rs new file mode 100644 index 000000000..aa62fcb9e --- /dev/null +++ b/vendor/ucd-parse/src/line_break.rs @@ -0,0 +1,49 @@ +use std::path::Path; +use std::str::FromStr; + +use crate::common::{parse_break_test, UcdFile}; +use crate::error::Error; + +/// A single row in the `auxiliary/LineBreakTest.txt` file. +/// +/// This file defines tests for the line break algorithm. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct LineBreakTest { + /// Each string is a UTF-8 encoded group of codepoints that make up a + /// single line. + pub lines: Vec<String>, + /// A human readable description of this test. 
+ pub comment: String, +} + +impl UcdFile for LineBreakTest { + fn relative_file_path() -> &'static Path { + Path::new("auxiliary/LineBreakTest.txt") + } +} + +impl FromStr for LineBreakTest { + type Err = Error; + + fn from_str(line: &str) -> Result<LineBreakTest, Error> { + let (groups, comment) = parse_break_test(line)?; + Ok(LineBreakTest { lines: groups, comment }) + } +} + +#[cfg(test)] +mod tests { + use super::LineBreakTest; + + #[test] + fn parse_test() { + let line = "× 1F1F7 × 1F1FA ÷ 1F1F8 × 1F1EA ÷ # × [0.3] REGIONAL INDICATOR SYMBOL LETTER R (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER U (RI) ÷ [30.13] REGIONAL INDICATOR SYMBOL LETTER S (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER E (RI) ÷ [0.3]"; + + let row: LineBreakTest = line.parse().unwrap(); + assert_eq!( + row.lines, + vec!["\u{1F1F7}\u{1F1FA}", "\u{1F1F8}\u{1F1EA}",] + ); + assert!(row.comment.ends_with("(RI) ÷ [0.3]")); + } +} diff --git a/vendor/ucd-parse/src/name_aliases.rs b/vendor/ucd-parse/src/name_aliases.rs new file mode 100644 index 000000000..36c9c4b01 --- /dev/null +++ b/vendor/ucd-parse/src/name_aliases.rs @@ -0,0 +1,145 @@ +use std::path::Path; +use std::str::FromStr; + +use lazy_static::lazy_static; +use regex::Regex; + +use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint}; +use crate::error::Error; + +/// A single row in the `NameAliases.txt` file. +/// +/// Note that there are multiple rows for some codepoint. Each row provides a +/// new alias. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct NameAlias { + /// The codepoint corresponding to this row. + pub codepoint: Codepoint, + /// The alias. + pub alias: String, + /// The label of this alias. 
+ pub label: NameAliasLabel, +} + +impl UcdFile for NameAlias { + fn relative_file_path() -> &'static Path { + Path::new("NameAliases.txt") + } +} + +impl UcdFileByCodepoint for NameAlias { + fn codepoints(&self) -> CodepointIter { + self.codepoint.into_iter() + } +} + +impl FromStr for NameAlias { + type Err = Error; + + fn from_str(line: &str) -> Result { + lazy_static! { + static ref PARTS: Regex = Regex::new( + r"(?x) + ^ + (?P[A-Z0-9]+); + \s* + (?P[^;]+); + \s* + (?P