use std::fmt; use std::iter; use std::ops::Range; use std::path::Path; use std::str::FromStr; use lazy_static::lazy_static; use regex::Regex; use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint}; use crate::error::Error; /// Represents a single row in the `UnicodeData.txt` file. /// /// These fields were taken from UAX44, Table 9, as part of the documentation /// for the /// [`UnicodeData.txt` file](http://www.unicode.org/reports/tr44/#UnicodeData.txt). #[derive(Clone, Debug, Default, Eq, PartialEq)] pub struct UnicodeData { /// The codepoint corresponding to this row. pub codepoint: Codepoint, /// The name of this codepoint. pub name: String, /// The "general category" of this codepoint. pub general_category: String, /// The class of this codepoint used in the Canonical Ordering Algorithm. /// /// Note that some classes map to a particular symbol. See /// [UAX44, Table 15](http://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values). pub canonical_combining_class: u8, /// The bidirectional class of this codepoint. /// /// Possible values are listed in /// [UAX44, Table 13](http://www.unicode.org/reports/tr44/#Bidi_Class_Values). pub bidi_class: String, /// The decomposition mapping for this codepoint. This includes its /// formatting tag (if present). pub decomposition: UnicodeDataDecomposition, /// A decimal numeric representation of this codepoint, if it has the /// property `Numeric_Type=Decimal`. pub numeric_type_decimal: Option, /// A decimal numeric representation of this codepoint, if it has the /// property `Numeric_Type=Digit`. Note that while this field is still /// populated for existing codepoints, no new codepoints will have this /// field populated. pub numeric_type_digit: Option, /// A decimal or rational numeric representation of this codepoint, if it /// has the property `Numeric_Type=Numeric`. pub numeric_type_numeric: Option, /// A boolean indicating whether this codepoint is "mirrored" in /// bidirectional text. pub bidi_mirrored: bool, /// The "old" Unicode 1.0 or ISO 6429 name of this codepoint. Note that /// this field is empty unless it is significantly different from /// the `name` field. pub unicode1_name: String, /// The ISO 10464 comment field. This no longer contains any non-NULL /// values. pub iso_comment: String, /// This codepoint's simple uppercase mapping, if it exists. pub simple_uppercase_mapping: Option, /// This codepoint's simple lowercase mapping, if it exists. pub simple_lowercase_mapping: Option, /// This codepoint's simple titlecase mapping, if it exists. pub simple_titlecase_mapping: Option, } impl UcdFile for UnicodeData { fn relative_file_path() -> &'static Path { Path::new("UnicodeData.txt") } } impl UcdFileByCodepoint for UnicodeData { fn codepoints(&self) -> CodepointIter { self.codepoint.into_iter() } } impl UnicodeData { /// Returns true if and only if this record corresponds to the start of a /// range. pub fn is_range_start(&self) -> bool { self.name.starts_with('<') && self.name.ends_with('>') && self.name.contains("First") } /// Returns true if and only if this record corresponds to the end of a /// range. pub fn is_range_end(&self) -> bool { self.name.starts_with('<') && self.name.ends_with('>') && self.name.contains("Last") } } impl FromStr for UnicodeData { type Err = Error; fn from_str(line: &str) -> Result { lazy_static! { static ref PARTS: Regex = Regex::new( r"(?x) ^ ([A-Z0-9]+); # 1; codepoint ([^;]+); # 2; name ([^;]+); # 3; general category ([0-9]+); # 4; canonical combining class ([^;]+); # 5; bidi class ([^;]*); # 6; decomposition ([0-9]*); # 7; numeric type decimal ([0-9]*); # 8; numeric type digit ([-0-9/]*); # 9; numeric type numeric ([YN]); # 10; bidi mirrored ([^;]*); # 11; unicode1 name ([^;]*); # 12; ISO comment ([^;]*); # 13; simple uppercase mapping ([^;]*); # 14; simple lowercase mapping ([^;]*) # 15; simple titlecase mapping $ " ) .unwrap(); }; let caps = match PARTS.captures(line.trim()) { Some(caps) => caps, None => return err!("invalid UnicodeData line"), }; let capget = |n| caps.get(n).unwrap().as_str(); let mut data = UnicodeData::default(); data.codepoint = capget(1).parse()?; data.name = capget(2).to_string(); data.general_category = capget(3).to_string(); data.canonical_combining_class = match capget(4).parse() { Ok(n) => n, Err(err) => { return err!( "failed to parse canonical combining class '{}': {}", capget(4), err ) } }; data.bidi_class = capget(5).to_string(); if !caps[6].is_empty() { data.decomposition = caps[6].parse()?; } else { data.decomposition.push(data.codepoint)?; } if !capget(7).is_empty() { data.numeric_type_decimal = Some(match capget(7).parse() { Ok(n) => n, Err(err) => { return err!( "failed to parse numeric type decimal '{}': {}", capget(7), err ) } }); } if !capget(8).is_empty() { data.numeric_type_digit = Some(match capget(8).parse() { Ok(n) => n, Err(err) => { return err!( "failed to parse numeric type digit '{}': {}", capget(8), err ) } }); } if !capget(9).is_empty() { data.numeric_type_numeric = Some(capget(9).parse()?); } data.bidi_mirrored = capget(10) == "Y"; data.unicode1_name = capget(11).to_string(); data.iso_comment = capget(12).to_string(); if !capget(13).is_empty() { data.simple_uppercase_mapping = Some(capget(13).parse()?); } if !capget(14).is_empty() { data.simple_lowercase_mapping = Some(capget(14).parse()?); } if !capget(15).is_empty() { data.simple_titlecase_mapping = Some(capget(15).parse()?); } Ok(data) } } impl fmt::Display for UnicodeData { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{};", self.codepoint)?; write!(f, "{};", self.name)?; write!(f, "{};", self.general_category)?; write!(f, "{};", self.canonical_combining_class)?; write!(f, "{};", self.bidi_class)?; if self.decomposition.is_canonical() && self.decomposition.mapping() == &[self.codepoint] { write!(f, ";")?; } else { write!(f, "{};", self.decomposition)?; } if let Some(n) = self.numeric_type_decimal { write!(f, "{};", n)?; } else { write!(f, ";")?; } if let Some(n) = self.numeric_type_digit { write!(f, "{};", n)?; } else { write!(f, ";")?; } if let Some(n) = self.numeric_type_numeric { write!(f, "{};", n)?; } else { write!(f, ";")?; } write!(f, "{};", if self.bidi_mirrored { "Y" } else { "N" })?; write!(f, "{};", self.unicode1_name)?; write!(f, "{};", self.iso_comment)?; if let Some(cp) = self.simple_uppercase_mapping { write!(f, "{};", cp)?; } else { write!(f, ";")?; } if let Some(cp) = self.simple_lowercase_mapping { write!(f, "{};", cp)?; } else { write!(f, ";")?; } if let Some(cp) = self.simple_titlecase_mapping { write!(f, "{}", cp)?; } Ok(()) } } /// Represents a decomposition mapping of a single row in the /// `UnicodeData.txt` file. #[derive(Clone, Debug, Default, Eq, PartialEq)] pub struct UnicodeDataDecomposition { /// The formatting tag associated with this mapping, if present. pub tag: Option, /// The number of codepoints in this mapping. pub len: usize, /// The codepoints in the mapping. Entries beyond `len` in the mapping /// are always U+0000. If no mapping was present, then this always contains /// a single codepoint corresponding to this row's character. pub mapping: [Codepoint; 18], } impl UnicodeDataDecomposition { /// Create a new decomposition mapping with the given tag and codepoints. /// /// If there are too many codepoints, then an error is returned. pub fn new( tag: Option, mapping: &[Codepoint], ) -> Result { let mut x = UnicodeDataDecomposition::default(); x.tag = tag; for &cp in mapping { x.push(cp)?; } Ok(x) } /// Add a new codepoint to this decomposition's mapping. /// /// If the mapping is already full, then this returns an error. pub fn push(&mut self, cp: Codepoint) -> Result<(), Error> { if self.len >= self.mapping.len() { return err!( "invalid decomposition mapping (too many codepoints)" ); } self.mapping[self.len] = cp; self.len += 1; Ok(()) } /// Return the mapping as a slice of codepoints. The slice returned /// has length equivalent to the number of codepoints in this mapping. pub fn mapping(&self) -> &[Codepoint] { &self.mapping[..self.len] } /// Returns true if and only if this decomposition mapping is canonical. pub fn is_canonical(&self) -> bool { self.tag.is_none() } } impl FromStr for UnicodeDataDecomposition { type Err = Error; fn from_str(s: &str) -> Result { lazy_static! { static ref WITH_TAG: Regex = Regex::new( r"^(?:<(?P[^>]+)>)?\s*(?P[\s0-9A-F]+)$" ) .unwrap(); static ref CHARS: Regex = Regex::new(r"[0-9A-F]+").unwrap(); }; if s.is_empty() { return err!( "expected non-empty string for \ UnicodeDataDecomposition value" ); } let caps = match WITH_TAG.captures(s) { Some(caps) => caps, None => return err!("invalid decomposition value"), }; let mut decomp = UnicodeDataDecomposition::default(); let mut codepoints = s; if let Some(m) = caps.name("tag") { decomp.tag = Some(m.as_str().parse()?); codepoints = &caps["chars"]; } for m in CHARS.find_iter(codepoints) { let cp = m.as_str().parse()?; decomp.push(cp)?; } Ok(decomp) } } impl fmt::Display for UnicodeDataDecomposition { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { if let Some(ref tag) = self.tag { write!(f, "<{}> ", tag)?; } let mut first = true; for cp in self.mapping() { if !first { write!(f, " ")?; } first = false; write!(f, "{}", cp)?; } Ok(()) } } /// The formatting tag on a decomposition mapping. /// /// This is taken from /// [UAX44, Table 14](http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings). #[derive(Clone, Debug, Eq, PartialEq)] pub enum UnicodeDataDecompositionTag { /// Font, /// NoBreak, /// Initial, /// Medial, /// Final, /// Isolated, /// Circle, /// Super, /// Sub, /// Vertical, /// Wide, /// Narrow, /// Small, /// Square, /// Fraction, /// Compat, } impl FromStr for UnicodeDataDecompositionTag { type Err = Error; fn from_str(s: &str) -> Result { use self::UnicodeDataDecompositionTag::*; Ok(match s { "font" => Font, "noBreak" => NoBreak, "initial" => Initial, "medial" => Medial, "final" => Final, "isolated" => Isolated, "circle" => Circle, "super" => Super, "sub" => Sub, "vertical" => Vertical, "wide" => Wide, "narrow" => Narrow, "small" => Small, "square" => Square, "fraction" => Fraction, "compat" => Compat, _ => return err!("invalid decomposition formatting tag: {}", s), }) } } impl fmt::Display for UnicodeDataDecompositionTag { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use self::UnicodeDataDecompositionTag::*; let s = match *self { Font => "font", NoBreak => "noBreak", Initial => "initial", Medial => "medial", Final => "final", Isolated => "isolated", Circle => "circle", Super => "super", Sub => "sub", Vertical => "vertical", Wide => "wide", Narrow => "narrow", Small => "small", Square => "square", Fraction => "fraction", Compat => "compat", }; write!(f, "{}", s) } } /// A numeric value corresponding to characters with `Numeric_Type=Numeric`. /// /// A numeric value can either be a signed integer or a rational number. #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub enum UnicodeDataNumeric { /// An integer. Integer(i64), /// A rational number. The first is the numerator and the latter is the /// denominator. Rational(i64, i64), } impl FromStr for UnicodeDataNumeric { type Err = Error; fn from_str(s: &str) -> Result { if s.is_empty() { return err!( "expected non-empty string for UnicodeDataNumeric value" ); } if let Some(pos) = s.find('/') { let (snum, sden) = (&s[..pos], &s[pos + 1..]); let num = match snum.parse() { Ok(num) => num, Err(err) => { return err!( "invalid integer numerator '{}': {}", snum, err ); } }; let den = match sden.parse() { Ok(den) => den, Err(err) => { return err!( "invalid integer denominator '{}': {}", sden, err ); } }; Ok(UnicodeDataNumeric::Rational(num, den)) } else { match s.parse() { Ok(den) => Ok(UnicodeDataNumeric::Integer(den)), Err(err) => { return err!( "invalid integer denominator '{}': {}", s, err ); } } } } } impl fmt::Display for UnicodeDataNumeric { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match *self { UnicodeDataNumeric::Integer(n) => write!(f, "{}", n), UnicodeDataNumeric::Rational(n, d) => write!(f, "{}/{}", n, d), } } } /// An iterator adapter that expands rows in `UnicodeData.txt`. /// /// Throughout `UnicodeData.txt`, some assigned codepoints are not explicitly /// represented. Instead, they are represented by a pair of rows, indicating /// a range of codepoints with the same properties. For example, the Hangul /// syllable codepoints are represented by these two rows: /// /// ```ignore /// AC00;;Lo;0;L;;;;;N;;;;; /// D7A3;;Lo;0;L;;;;;N;;;;; /// ``` /// /// This iterator will wrap any iterator of `UnicodeData` and, when a range of /// Unicode codepoints is found, it will be expanded to the appropriate /// sequence of `UnicodeData` values. Note that all such expanded records will /// have an empty name. pub struct UnicodeDataExpander { /// The underlying iterator. it: iter::Peekable, /// A range of codepoints to emit when we've found a pair. Otherwise, /// `None`. range: CodepointRange, } struct CodepointRange { /// The codepoint range. range: Range, /// The start record. All subsequent records in this range are generated /// by cloning this and updating the codepoint/name. start_record: UnicodeData, } impl> UnicodeDataExpander { /// Create a new iterator that expands pairs of `UnicodeData` range /// records. All other records are passed through as-is. pub fn new(it: T) -> UnicodeDataExpander where T: IntoIterator, { UnicodeDataExpander { it: it.into_iter().peekable(), range: CodepointRange { range: 0..0, start_record: UnicodeData::default(), }, } } } impl> Iterator for UnicodeDataExpander { type Item = UnicodeData; fn next(&mut self) -> Option { if let Some(udata) = self.range.next() { return Some(udata); } let row1 = match self.it.next() { None => return None, Some(row1) => row1, }; if !row1.is_range_start() || !self.it.peek().map_or(false, |row2| row2.is_range_end()) { return Some(row1); } let row2 = self.it.next().unwrap(); self.range = CodepointRange { range: row1.codepoint.value()..(row2.codepoint.value() + 1), start_record: row1, }; self.next() } } impl Iterator for CodepointRange { type Item = UnicodeData; fn next(&mut self) -> Option { let cp = match self.range.next() { None => return None, Some(cp) => cp, }; Some(UnicodeData { codepoint: Codepoint::from_u32(cp).unwrap(), name: "".to_string(), ..self.start_record.clone() }) } } #[cfg(test)] mod tests { use crate::common::Codepoint; use super::{ UnicodeData, UnicodeDataDecomposition, UnicodeDataDecompositionTag, UnicodeDataNumeric, }; fn codepoint(n: u32) -> Codepoint { Codepoint::from_u32(n).unwrap() } fn s(string: &str) -> String { string.to_string() } #[test] fn parse1() { let line = "249D;PARENTHESIZED LATIN SMALL LETTER B;So;0;L; 0028 0062 0029;;;;N;;;;;\n"; let data: UnicodeData = line.parse().unwrap(); assert_eq!( data, UnicodeData { codepoint: codepoint(0x249d), name: s("PARENTHESIZED LATIN SMALL LETTER B"), general_category: s("So"), canonical_combining_class: 0, bidi_class: s("L"), decomposition: UnicodeDataDecomposition::new( Some(UnicodeDataDecompositionTag::Compat), &[codepoint(0x28), codepoint(0x62), codepoint(0x29)], ) .unwrap(), numeric_type_decimal: None, numeric_type_digit: None, numeric_type_numeric: None, bidi_mirrored: false, unicode1_name: s(""), iso_comment: s(""), simple_uppercase_mapping: None, simple_lowercase_mapping: None, simple_titlecase_mapping: None, } ); } #[test] fn parse2() { let line = "000D;;Cc;0;B;;;;;N;CARRIAGE RETURN (CR);;;;\n"; let data: UnicodeData = line.parse().unwrap(); assert_eq!( data, UnicodeData { codepoint: codepoint(0x000D), name: s(""), general_category: s("Cc"), canonical_combining_class: 0, bidi_class: s("B"), decomposition: UnicodeDataDecomposition::new( None, &[codepoint(0x000D)] ) .unwrap(), numeric_type_decimal: None, numeric_type_digit: None, numeric_type_numeric: None, bidi_mirrored: false, unicode1_name: s("CARRIAGE RETURN (CR)"), iso_comment: s(""), simple_uppercase_mapping: None, simple_lowercase_mapping: None, simple_titlecase_mapping: None, } ); } #[test] fn parse3() { let line = "00BC;VULGAR FRACTION ONE QUARTER;No;0;ON; 0031 2044 0034;;;1/4;N;FRACTION ONE QUARTER;;;;\n"; let data: UnicodeData = line.parse().unwrap(); assert_eq!( data, UnicodeData { codepoint: codepoint(0x00BC), name: s("VULGAR FRACTION ONE QUARTER"), general_category: s("No"), canonical_combining_class: 0, bidi_class: s("ON"), decomposition: UnicodeDataDecomposition::new( Some(UnicodeDataDecompositionTag::Fraction), &[codepoint(0x31), codepoint(0x2044), codepoint(0x34)], ) .unwrap(), numeric_type_decimal: None, numeric_type_digit: None, numeric_type_numeric: Some(UnicodeDataNumeric::Rational(1, 4)), bidi_mirrored: false, unicode1_name: s("FRACTION ONE QUARTER"), iso_comment: s(""), simple_uppercase_mapping: None, simple_lowercase_mapping: None, simple_titlecase_mapping: None, } ); } #[test] fn parse4() { let line = "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;\n"; let data: UnicodeData = line.parse().unwrap(); assert_eq!( data, UnicodeData { codepoint: codepoint(0x0041), name: s("LATIN CAPITAL LETTER A"), general_category: s("Lu"), canonical_combining_class: 0, bidi_class: s("L"), decomposition: UnicodeDataDecomposition::new( None, &[codepoint(0x0041)] ) .unwrap(), numeric_type_decimal: None, numeric_type_digit: None, numeric_type_numeric: None, bidi_mirrored: false, unicode1_name: s(""), iso_comment: s(""), simple_uppercase_mapping: None, simple_lowercase_mapping: Some(codepoint(0x0061)), simple_titlecase_mapping: None, } ); } #[test] fn parse5() { let line = "0F33;TIBETAN DIGIT HALF ZERO;No;0;L;;;;-1/2;N;;;;;\n"; let data: UnicodeData = line.parse().unwrap(); assert_eq!( data, UnicodeData { codepoint: codepoint(0x0F33), name: s("TIBETAN DIGIT HALF ZERO"), general_category: s("No"), canonical_combining_class: 0, bidi_class: s("L"), decomposition: UnicodeDataDecomposition::new( None, &[codepoint(0x0F33)] ) .unwrap(), numeric_type_decimal: None, numeric_type_digit: None, numeric_type_numeric: Some(UnicodeDataNumeric::Rational( -1, 2 )), bidi_mirrored: false, unicode1_name: s(""), iso_comment: s(""), simple_uppercase_mapping: None, simple_lowercase_mapping: None, simple_titlecase_mapping: None, } ); } #[test] fn expander() { use super::UnicodeDataExpander; use crate::common::UcdLineParser; let data = "\ ABF9;MEETEI MAYEK DIGIT NINE;Nd;0;L;;9;9;9;N;;;;; AC00;;Lo;0;L;;;;;N;;;;; D7A3;;Lo;0;L;;;;;N;;;;; D7B0;HANGUL JUNGSEONG O-YEO;Lo;0;L;;;;;N;;;;; "; let records = UcdLineParser::new(None, data.as_bytes()) .collect::, _>>() .unwrap(); assert_eq!(UnicodeDataExpander::new(records).count(), 11174); } }