use std::char; use std::collections::BTreeMap; use std::fmt; use std::fs::File; use std::io::{self, BufRead}; use std::marker::PhantomData; use std::path::{Path, PathBuf}; use std::str::FromStr; use once_cell::sync::Lazy; use regex::Regex; use crate::error::{Error, ErrorKind}; /// Parse a particular file in the UCD into a sequence of rows. /// /// The given directory should be the directory to the UCD. pub fn parse(ucd_dir: P) -> Result, Error> where P: AsRef, D: UcdFile, { let mut xs = vec![]; for result in D::from_dir(ucd_dir)? { let x = result?; xs.push(x); } Ok(xs) } /// Parse a particular file in the UCD into a map from codepoint to the record. /// /// The given directory should be the directory to the UCD. pub fn parse_by_codepoint( ucd_dir: P, ) -> Result, Error> where P: AsRef, D: UcdFileByCodepoint, { let mut map = BTreeMap::new(); for result in D::from_dir(ucd_dir)? { let x = result?; for cp in x.codepoints() { map.insert(cp, x.clone()); } } Ok(map) } /// Parse a particular file in the UCD into a map from codepoint to all /// records associated with that codepoint. /// /// This is useful for files that have multiple records for each codepoint. /// For example, the `NameAliases.txt` file lists multiple aliases for some /// codepoints. /// /// The given directory should be the directory to the UCD. pub fn parse_many_by_codepoint( ucd_dir: P, ) -> Result>, Error> where P: AsRef, D: UcdFileByCodepoint, { let mut map = BTreeMap::new(); for result in D::from_dir(ucd_dir)? { let x = result?; for cp in x.codepoints() { map.entry(cp).or_insert(vec![]).push(x.clone()); } } Ok(map) } /// Given a path pointing at the root of the `ucd_dir`, attempts to determine /// it's unicode version. /// /// This just checks the readme and the very first line of PropList.txt -- in /// practice this works for all versions of UCD since 4.1.0. pub fn ucd_directory_version>( ucd_dir: &D, ) -> Result<(u64, u64, u64), Error> { // Avoid duplication from generic path parameter. fn ucd_directory_version_inner( ucd_dir: &Path, ) -> Result<(u64, u64, u64), Error> { static VERSION_RX: Lazy = Lazy::new(|| { Regex::new(r"-([0-9]+).([0-9]+).([0-9]+).txt").unwrap() }); let proplist = ucd_dir.join("PropList.txt"); let contents = first_line(&proplist)?; let caps = match VERSION_RX.captures(&contents) { Some(c) => c, None => { return err!("Failed to find version in line {:?}", contents) } }; let capture_to_num = |n| { caps.get(n).unwrap().as_str().parse::().map_err(|e| Error { kind: ErrorKind::Parse(format!( "Failed to parse version from {:?} in PropList.txt: {}", contents, e )), line: Some(0), path: Some(proplist.clone()), }) }; let major = capture_to_num(1)?; let minor = capture_to_num(2)?; let patch = capture_to_num(3)?; Ok((major, minor, patch)) } ucd_directory_version_inner(ucd_dir.as_ref()) } fn first_line(path: &Path) -> Result { let file = std::fs::File::open(path).map_err(|e| Error { kind: ErrorKind::Io(e), line: None, path: Some(path.into()), })?; let mut reader = std::io::BufReader::new(file); let mut line_contents = String::new(); reader.read_line(&mut line_contents).map_err(|e| Error { kind: ErrorKind::Io(e), line: None, path: Some(path.into()), })?; Ok(line_contents) } /// A helper function for parsing a common record format that associates one /// or more codepoints with a string value. pub fn parse_codepoint_association<'a>( line: &'a str, ) -> Result<(Codepoints, &'a str), Error> { static PARTS: Lazy = Lazy::new(|| { Regex::new( r"(?x) ^ \s*(?P[^\s;]+)\s*; \s*(?P[^;\x23]+)\s* ", ) .unwrap() }); let caps = match PARTS.captures(line.trim()) { Some(caps) => caps, None => return err!("invalid PropList line: '{}'", line), }; let property = match caps.name("property") { Some(property) => property.as_str().trim(), None => { return err!( "could not find property name in PropList line: '{}'", line ) } }; Ok((caps["codepoints"].parse()?, property)) } /// A helper function for parsing a sequence of space separated codepoints. /// The sequence is permitted to be empty. pub fn parse_codepoint_sequence(s: &str) -> Result, Error> { let mut cps = vec![]; for cp in s.trim().split_whitespace() { cps.push(cp.parse()?); } Ok(cps) } /// A helper function for parsing a single test for the various break /// algorithms. /// /// Upon success, this returns the UTF-8 encoded groups of codepoints along /// with the comment associated with the test. The comment is a human readable /// description of the test that may prove useful for debugging. pub fn parse_break_test(line: &str) -> Result<(Vec, String), Error> { static PARTS: Lazy = Lazy::new(|| { Regex::new( r"(?x) ^ (?:÷|×) (?P(?:\s[0-9A-Fa-f]{4,5}\s(?:÷|×))+) \s+ \#(?P.+) $ ", ) .unwrap() }); static GROUP: Lazy = Lazy::new(|| { Regex::new( r"(?x) (?P[0-9A-Fa-f]{4,5})\s(?P÷|×) ", ) .unwrap() }); let caps = match PARTS.captures(line.trim()) { Some(caps) => caps, None => return err!("invalid break test line: '{}'", line), }; let comment = caps["comment"].trim().to_string(); let mut groups = vec![]; let mut cur = String::new(); for cap in GROUP.captures_iter(&caps["groups"]) { let cp: Codepoint = cap["codepoint"].parse()?; let ch = match cp.scalar() { Some(ch) => ch, None => { return err!( "invalid codepoint '{:X}' in line: '{}'", cp.value(), line ) } }; cur.push(ch); if &cap["kind"] == "÷" { groups.push(cur); cur = String::new(); } } Ok((groups, comment)) } /// Describes a single UCD file. pub trait UcdFile: Clone + fmt::Debug + Default + Eq + FromStr + PartialEq { /// The file path corresponding to this file, relative to the UCD /// directory. fn relative_file_path() -> &'static Path; /// The full file path corresponding to this file given the UCD directory /// path. fn file_path>(ucd_dir: P) -> PathBuf { ucd_dir.as_ref().join(Self::relative_file_path()) } /// Create an iterator over each record in this UCD file. /// /// The parameter should correspond to the directory containing the UCD. fn from_dir>( ucd_dir: P, ) -> Result, Error> { UcdLineParser::from_path(Self::file_path(ucd_dir)) } } /// Describes a single UCD file where every record in the file is associated /// with one or more codepoints. pub trait UcdFileByCodepoint: UcdFile { /// Returns the codepoints associated with this record. fn codepoints(&self) -> CodepointIter; } /// A line oriented parser for a particular UCD file. /// /// Callers can build a line parser via the /// [`UcdFile::from_dir`](trait.UcdFile.html) method. /// /// The `R` type parameter refers to the underlying `io::Read` implementation /// from which the UCD data is read. /// /// The `D` type parameter refers to the type of the record parsed out of each /// line. #[derive(Debug)] pub struct UcdLineParser { path: Option, rdr: io::BufReader, line: String, line_number: u64, _data: PhantomData, } impl UcdLineParser { /// Create a new parser from the given file path. pub(crate) fn from_path>( path: P, ) -> Result, Error> { let path = path.as_ref(); let file = File::open(path).map_err(|e| Error { kind: ErrorKind::Io(e), line: None, path: Some(path.to_path_buf()), })?; Ok(UcdLineParser::new(Some(path.to_path_buf()), file)) } } impl UcdLineParser { /// Create a new parser that parses the reader given. /// /// The type of data parsed is determined when the `parse_next` function /// is called by virtue of the type requested. /// /// Note that the reader is buffered internally, so the caller does not /// need to provide their own buffering. pub(crate) fn new(path: Option, rdr: R) -> UcdLineParser { UcdLineParser { path, rdr: io::BufReader::new(rdr), line: String::new(), line_number: 0, _data: PhantomData, } } } impl> Iterator for UcdLineParser { type Item = Result; fn next(&mut self) -> Option> { loop { self.line_number += 1; self.line.clear(); let n = match self.rdr.read_line(&mut self.line) { Err(err) => { return Some(Err(Error { kind: ErrorKind::Io(err), line: None, path: self.path.clone(), })) } Ok(n) => n, }; if n == 0 { return None; } if !self.line.starts_with('#') && !self.line.trim().is_empty() { break; } } let line_number = self.line_number; Some(self.line.parse().map_err(|mut err: Error| { err.line = Some(line_number); err })) } } /// A representation of either a single codepoint or a range of codepoints. #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, PartialOrd, Ord)] pub enum Codepoints { /// A single codepoint. Single(Codepoint), /// A range of codepoints. Range(CodepointRange), } impl Default for Codepoints { fn default() -> Codepoints { Codepoints::Single(Codepoint::default()) } } impl IntoIterator for Codepoints { type IntoIter = CodepointIter; type Item = Codepoint; fn into_iter(self) -> CodepointIter { match self { Codepoints::Single(x) => x.into_iter(), Codepoints::Range(x) => x.into_iter(), } } } impl FromStr for Codepoints { type Err = Error; fn from_str(s: &str) -> Result { if s.contains("..") { CodepointRange::from_str(s).map(Codepoints::Range) } else { Codepoint::from_str(s).map(Codepoints::Single) } } } impl fmt::Display for Codepoints { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match *self { Codepoints::Single(ref x) => x.fmt(f), Codepoints::Range(ref x) => x.fmt(f), } } } impl PartialEq for Codepoints { fn eq(&self, other: &u32) -> bool { match *self { Codepoints::Single(ref x) => x == other, Codepoints::Range(ref x) => x == &(*other, *other), } } } impl PartialEq for Codepoints { fn eq(&self, other: &Codepoint) -> bool { match *self { Codepoints::Single(ref x) => x == other, Codepoints::Range(ref x) => x == &(*other, *other), } } } impl PartialEq<(u32, u32)> for Codepoints { fn eq(&self, other: &(u32, u32)) -> bool { match *self { Codepoints::Single(ref x) => &(x.value(), x.value()) == other, Codepoints::Range(ref x) => x == other, } } } impl PartialEq<(Codepoint, Codepoint)> for Codepoints { fn eq(&self, other: &(Codepoint, Codepoint)) -> bool { match *self { Codepoints::Single(ref x) => &(*x, *x) == other, Codepoints::Range(ref x) => x == other, } } } /// A range of Unicode codepoints. The range is inclusive; both ends of the /// range are guaranteed to be valid codepoints. #[derive( Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord, )] pub struct CodepointRange { /// The start of the codepoint range. pub start: Codepoint, /// The end of the codepoint range. pub end: Codepoint, } impl IntoIterator for CodepointRange { type IntoIter = CodepointIter; type Item = Codepoint; fn into_iter(self) -> CodepointIter { CodepointIter { next: self.start.value(), range: self } } } impl FromStr for CodepointRange { type Err = Error; fn from_str(s: &str) -> Result { static PARTS: Lazy = Lazy::new(|| { Regex::new(r"^(?P[A-Z0-9]+)\.\.(?P[A-Z0-9]+)$") .unwrap() }); let caps = match PARTS.captures(s) { Some(caps) => caps, None => return err!("invalid codepoint range: '{}'", s), }; let start = caps["start"].parse().or_else(|err| { err!("failed to parse '{}' as a codepoint range: {}", s, err) })?; let end = caps["end"].parse().or_else(|err| { err!("failed to parse '{}' as a codepoint range: {}", s, err) })?; Ok(CodepointRange { start, end }) } } impl fmt::Display for CodepointRange { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}..{}", self.start, self.end) } } impl PartialEq<(u32, u32)> for CodepointRange { fn eq(&self, other: &(u32, u32)) -> bool { &(self.start.value(), self.end.value()) == other } } impl PartialEq<(Codepoint, Codepoint)> for CodepointRange { fn eq(&self, other: &(Codepoint, Codepoint)) -> bool { &(self.start, self.end) == other } } /// A single Unicode codepoint. /// /// This type's string representation is a hexadecimal number. It is guaranteed /// to be in the range `[0, 10FFFF]`. /// /// Note that unlike Rust's `char` type, this may be a surrogate codepoint. #[derive( Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord, )] pub struct Codepoint(u32); impl Codepoint { /// Create a new codepoint from a `u32`. /// /// If the given number is not a valid codepoint, then this returns an /// error. pub fn from_u32(n: u32) -> Result { if n > 0x10FFFF { err!("{:x} is not a valid Unicode codepoint", n) } else { Ok(Codepoint(n)) } } /// Return the underlying `u32` codepoint value. pub fn value(self) -> u32 { self.0 } /// Attempt to convert this codepoint to a Unicode scalar value. /// /// If this is a surrogate codepoint, then this returns `None`. pub fn scalar(self) -> Option { char::from_u32(self.0) } } impl IntoIterator for Codepoint { type IntoIter = CodepointIter; type Item = Codepoint; fn into_iter(self) -> CodepointIter { let range = CodepointRange { start: self, end: self }; CodepointIter { next: self.value(), range } } } impl FromStr for Codepoint { type Err = Error; fn from_str(s: &str) -> Result { match u32::from_str_radix(s, 16) { Ok(n) => Codepoint::from_u32(n), Err(err) => { return err!( "failed to parse '{}' as a hexadecimal codepoint: {}", s, err ); } } } } impl fmt::Display for Codepoint { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{:04X}", self.0) } } impl PartialEq for Codepoint { fn eq(&self, other: &u32) -> bool { self.0 == *other } } impl PartialEq for u32 { fn eq(&self, other: &Codepoint) -> bool { *self == other.0 } } /// An iterator over a range of Unicode codepoints. #[derive(Debug)] pub struct CodepointIter { next: u32, range: CodepointRange, } impl Iterator for CodepointIter { type Item = Codepoint; fn next(&mut self) -> Option { if self.next > self.range.end.value() { return None; } let current = self.next; self.next += 1; Some(Codepoint::from_u32(current).unwrap()) } }