diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 12:02:58 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 12:02:58 +0000 |
commit | 698f8c2f01ea549d77d7dc3338a12e04c11057b9 (patch) | |
tree | 173a775858bd501c378080a10dca74132f05bc50 /vendor/ucd-parse/src/case_folding.rs | |
parent | Initial commit. (diff) | |
download | rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.tar.xz rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.zip |
Adding upstream version 1.64.0+dfsg1.upstream/1.64.0+dfsg1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/ucd-parse/src/case_folding.rs')
-rw-r--r-- | vendor/ucd-parse/src/case_folding.rs | 161 |
1 files changed, 161 insertions, 0 deletions
diff --git a/vendor/ucd-parse/src/case_folding.rs b/vendor/ucd-parse/src/case_folding.rs new file mode 100644 index 000000000..813fc81a1 --- /dev/null +++ b/vendor/ucd-parse/src/case_folding.rs @@ -0,0 +1,161 @@ +use std::path::Path; +use std::str::FromStr; + +use lazy_static::lazy_static; +use regex::Regex; + +use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint}; +use crate::error::Error; + +/// A single row in the `CaseFolding.txt` file. +/// +/// The contents of `CaseFolding.txt` are a convenience derived from both +/// `UnicodeData.txt` and `SpecialCasing.txt`. +/// +/// Note that a single codepoint may be mapped multiple times. In particular, +/// a single codepoint might have distinct `CaseStatus::Simple` and +/// `CaseStatus::Full` mappings. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct CaseFold { + /// The codepoint that is being mapped. + pub codepoint: Codepoint, + /// The case status of this mapping. + pub status: CaseStatus, + /// The actual case mapping, which is more than one codepoint if this is + /// a "full" mapping. + pub mapping: Vec<Codepoint>, +} + +impl UcdFile for CaseFold { + fn relative_file_path() -> &'static Path { + Path::new("CaseFolding.txt") + } +} + +impl UcdFileByCodepoint for CaseFold { + fn codepoints(&self) -> CodepointIter { + self.codepoint.into_iter() + } +} + +impl FromStr for CaseFold { + type Err = Error; + + fn from_str(line: &str) -> Result<CaseFold, Error> { + lazy_static! { + static ref PARTS: Regex = Regex::new( + r"(?x) + ^ + \s*(?P<codepoint>[^\s;]+)\s*; + \s*(?P<status>[^\s;]+)\s*; + \s*(?P<mapping>[^;]+)\s*; + " + ) + .unwrap(); + }; + + let caps = match PARTS.captures(line.trim()) { + Some(caps) => caps, + None => return err!("invalid CaseFolding line: '{}'", line), + }; + let mut mapping = vec![]; + for cp in caps["mapping"].split_whitespace() { + mapping.push(cp.parse()?); + } + Ok(CaseFold { + codepoint: caps["codepoint"].parse()?, + status: caps["status"].parse()?, + mapping, + }) + } +} + +/// The status of a particular case mapping. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum CaseStatus { + /// Case mappings shared by both "simple" and "full" mappings. + Common, + /// A case mapping that changes the number of codepoints. + Full, + /// A case mapping that doesn't change the number of codepoints, when it + /// differs from `Full`. + Simple, + /// Special cases (currently only for Turkic mappings) that are typically + /// excluded by default. Special cases don't change the number of + /// codepoints, but may changed the encoding (e.g., UTF-8) length in bytes. + Special, +} + +impl Default for CaseStatus { + fn default() -> CaseStatus { + CaseStatus::Common + } +} + +impl CaseStatus { + /// Returns true if and only if this status indicates a case mapping that + /// won't change the number of codepoints. + pub fn is_fixed(&self) -> bool { + *self != CaseStatus::Full + } +} + +impl FromStr for CaseStatus { + type Err = Error; + + fn from_str(s: &str) -> Result<CaseStatus, Error> { + match s { + "C" => Ok(CaseStatus::Common), + "F" => Ok(CaseStatus::Full), + "S" => Ok(CaseStatus::Simple), + "T" => Ok(CaseStatus::Special), + _ => err!( + "unrecognized case status: '{}' \ + (must be one of C, F, S or T)", + s + ), + } + } +} + +#[cfg(test)] +mod tests { + use super::{CaseFold, CaseStatus}; + + #[test] + fn parse_common() { + let line = + "0150; C; 0151; # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE\n"; + let row: CaseFold = line.parse().unwrap(); + assert_eq!(row.codepoint, 0x0150); + assert_eq!(row.status, CaseStatus::Common); + assert_eq!(row.mapping, vec![0x0151]); + } + + #[test] + fn parse_full() { + let line = "03B0; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS\n"; + let row: CaseFold = line.parse().unwrap(); + assert_eq!(row.codepoint, 0x03B0); + assert_eq!(row.status, CaseStatus::Full); + assert_eq!(row.mapping, vec![0x03C5, 0x0308, 0x0301]); + } + + #[test] + fn parse_simple() { + let line = "1F8F; S; 1F87; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI\n"; + let row: CaseFold = line.parse().unwrap(); + assert_eq!(row.codepoint, 0x1F8F); + assert_eq!(row.status, CaseStatus::Simple); + assert_eq!(row.mapping, vec![0x1F87]); + } + + #[test] + fn parse_special() { + let line = "0049; T; 0131; # LATIN CAPITAL LETTER I\n"; + let row: CaseFold = line.parse().unwrap(); + assert_eq!(row.codepoint, 0x0049); + assert_eq!(row.status, CaseStatus::Special); + assert_eq!(row.mapping, vec![0x0131]); + } +} |