diff options
Diffstat (limited to 'vendor/ucd-parse/src/unicode_data.rs')
-rw-r--r-- | vendor/ucd-parse/src/unicode_data.rs | 787 |
1 files changed, 787 insertions, 0 deletions
diff --git a/vendor/ucd-parse/src/unicode_data.rs b/vendor/ucd-parse/src/unicode_data.rs new file mode 100644 index 000000000..87910cc1d --- /dev/null +++ b/vendor/ucd-parse/src/unicode_data.rs @@ -0,0 +1,787 @@ +use std::fmt; +use std::iter; +use std::ops::Range; +use std::path::Path; +use std::str::FromStr; + +use lazy_static::lazy_static; +use regex::Regex; + +use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint}; +use crate::error::Error; + +/// Represents a single row in the `UnicodeData.txt` file. +/// +/// These fields were taken from UAX44, Table 9, as part of the documentation +/// for the +/// [`UnicodeData.txt` file](http://www.unicode.org/reports/tr44/#UnicodeData.txt). +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct UnicodeData { + /// The codepoint corresponding to this row. + pub codepoint: Codepoint, + /// The name of this codepoint. + pub name: String, + /// The "general category" of this codepoint. + pub general_category: String, + /// The class of this codepoint used in the Canonical Ordering Algorithm. + /// + /// Note that some classes map to a particular symbol. See + /// [UAX44, Table 15](http://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values). + pub canonical_combining_class: u8, + /// The bidirectional class of this codepoint. + /// + /// Possible values are listed in + /// [UAX44, Table 13](http://www.unicode.org/reports/tr44/#Bidi_Class_Values). + pub bidi_class: String, + /// The decomposition mapping for this codepoint. This includes its + /// formatting tag (if present). + pub decomposition: UnicodeDataDecomposition, + /// A decimal numeric representation of this codepoint, if it has the + /// property `Numeric_Type=Decimal`. + pub numeric_type_decimal: Option<u8>, + /// A decimal numeric representation of this codepoint, if it has the + /// property `Numeric_Type=Digit`. Note that while this field is still + /// populated for existing codepoints, no new codepoints will have this + /// field populated. + pub numeric_type_digit: Option<u8>, + /// A decimal or rational numeric representation of this codepoint, if it + /// has the property `Numeric_Type=Numeric`. + pub numeric_type_numeric: Option<UnicodeDataNumeric>, + /// A boolean indicating whether this codepoint is "mirrored" in + /// bidirectional text. + pub bidi_mirrored: bool, + /// The "old" Unicode 1.0 or ISO 6429 name of this codepoint. Note that + /// this field is empty unless it is significantly different from + /// the `name` field. + pub unicode1_name: String, + /// The ISO 10464 comment field. This no longer contains any non-NULL + /// values. + pub iso_comment: String, + /// This codepoint's simple uppercase mapping, if it exists. + pub simple_uppercase_mapping: Option<Codepoint>, + /// This codepoint's simple lowercase mapping, if it exists. + pub simple_lowercase_mapping: Option<Codepoint>, + /// This codepoint's simple titlecase mapping, if it exists. + pub simple_titlecase_mapping: Option<Codepoint>, +} + +impl UcdFile for UnicodeData { + fn relative_file_path() -> &'static Path { + Path::new("UnicodeData.txt") + } +} + +impl UcdFileByCodepoint for UnicodeData { + fn codepoints(&self) -> CodepointIter { + self.codepoint.into_iter() + } +} + +impl UnicodeData { + /// Returns true if and only if this record corresponds to the start of a + /// range. + pub fn is_range_start(&self) -> bool { + self.name.starts_with('<') + && self.name.ends_with('>') + && self.name.contains("First") + } + + /// Returns true if and only if this record corresponds to the end of a + /// range. + pub fn is_range_end(&self) -> bool { + self.name.starts_with('<') + && self.name.ends_with('>') + && self.name.contains("Last") + } +} + +impl FromStr for UnicodeData { + type Err = Error; + + fn from_str(line: &str) -> Result<UnicodeData, Error> { + lazy_static! { + static ref PARTS: Regex = Regex::new( + r"(?x) + ^ + ([A-Z0-9]+); # 1; codepoint + ([^;]+); # 2; name + ([^;]+); # 3; general category + ([0-9]+); # 4; canonical combining class + ([^;]+); # 5; bidi class + ([^;]*); # 6; decomposition + ([0-9]*); # 7; numeric type decimal + ([0-9]*); # 8; numeric type digit + ([-0-9/]*); # 9; numeric type numeric + ([YN]); # 10; bidi mirrored + ([^;]*); # 11; unicode1 name + ([^;]*); # 12; ISO comment + ([^;]*); # 13; simple uppercase mapping + ([^;]*); # 14; simple lowercase mapping + ([^;]*) # 15; simple titlecase mapping + $ + " + ) + .unwrap(); + }; + let caps = match PARTS.captures(line.trim()) { + Some(caps) => caps, + None => return err!("invalid UnicodeData line"), + }; + let capget = |n| caps.get(n).unwrap().as_str(); + let mut data = UnicodeData::default(); + + data.codepoint = capget(1).parse()?; + data.name = capget(2).to_string(); + data.general_category = capget(3).to_string(); + data.canonical_combining_class = match capget(4).parse() { + Ok(n) => n, + Err(err) => { + return err!( + "failed to parse canonical combining class '{}': {}", + capget(4), + err + ) + } + }; + data.bidi_class = capget(5).to_string(); + if !caps[6].is_empty() { + data.decomposition = caps[6].parse()?; + } else { + data.decomposition.push(data.codepoint)?; + } + if !capget(7).is_empty() { + data.numeric_type_decimal = Some(match capget(7).parse() { + Ok(n) => n, + Err(err) => { + return err!( + "failed to parse numeric type decimal '{}': {}", + capget(7), + err + ) + } + }); + } + if !capget(8).is_empty() { + data.numeric_type_digit = Some(match capget(8).parse() { + Ok(n) => n, + Err(err) => { + return err!( + "failed to parse numeric type digit '{}': {}", + capget(8), + err + ) + } + }); + } + if !capget(9).is_empty() { + data.numeric_type_numeric = Some(capget(9).parse()?); + } + data.bidi_mirrored = capget(10) == "Y"; + data.unicode1_name = capget(11).to_string(); + data.iso_comment = capget(12).to_string(); + if !capget(13).is_empty() { + data.simple_uppercase_mapping = Some(capget(13).parse()?); + } + if !capget(14).is_empty() { + data.simple_lowercase_mapping = Some(capget(14).parse()?); + } + if !capget(15).is_empty() { + data.simple_titlecase_mapping = Some(capget(15).parse()?); + } + Ok(data) + } +} + +impl fmt::Display for UnicodeData { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{};", self.codepoint)?; + write!(f, "{};", self.name)?; + write!(f, "{};", self.general_category)?; + write!(f, "{};", self.canonical_combining_class)?; + write!(f, "{};", self.bidi_class)?; + if self.decomposition.is_canonical() + && self.decomposition.mapping() == &[self.codepoint] + { + write!(f, ";")?; + } else { + write!(f, "{};", self.decomposition)?; + } + if let Some(n) = self.numeric_type_decimal { + write!(f, "{};", n)?; + } else { + write!(f, ";")?; + } + if let Some(n) = self.numeric_type_digit { + write!(f, "{};", n)?; + } else { + write!(f, ";")?; + } + if let Some(n) = self.numeric_type_numeric { + write!(f, "{};", n)?; + } else { + write!(f, ";")?; + } + write!(f, "{};", if self.bidi_mirrored { "Y" } else { "N" })?; + write!(f, "{};", self.unicode1_name)?; + write!(f, "{};", self.iso_comment)?; + if let Some(cp) = self.simple_uppercase_mapping { + write!(f, "{};", cp)?; + } else { + write!(f, ";")?; + } + if let Some(cp) = self.simple_lowercase_mapping { + write!(f, "{};", cp)?; + } else { + write!(f, ";")?; + } + if let Some(cp) = self.simple_titlecase_mapping { + write!(f, "{}", cp)?; + } + Ok(()) + } +} + +/// Represents a decomposition mapping of a single row in the +/// `UnicodeData.txt` file. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct UnicodeDataDecomposition { + /// The formatting tag associated with this mapping, if present. + pub tag: Option<UnicodeDataDecompositionTag>, + /// The number of codepoints in this mapping. + pub len: usize, + /// The codepoints in the mapping. Entries beyond `len` in the mapping + /// are always U+0000. If no mapping was present, then this always contains + /// a single codepoint corresponding to this row's character. + pub mapping: [Codepoint; 18], +} + +impl UnicodeDataDecomposition { + /// Create a new decomposition mapping with the given tag and codepoints. + /// + /// If there are too many codepoints, then an error is returned. + pub fn new( + tag: Option<UnicodeDataDecompositionTag>, + mapping: &[Codepoint], + ) -> Result<UnicodeDataDecomposition, Error> { + let mut x = UnicodeDataDecomposition::default(); + x.tag = tag; + for &cp in mapping { + x.push(cp)?; + } + Ok(x) + } + + /// Add a new codepoint to this decomposition's mapping. + /// + /// If the mapping is already full, then this returns an error. + pub fn push(&mut self, cp: Codepoint) -> Result<(), Error> { + if self.len >= self.mapping.len() { + return err!( + "invalid decomposition mapping (too many codepoints)" + ); + } + self.mapping[self.len] = cp; + self.len += 1; + Ok(()) + } + + /// Return the mapping as a slice of codepoints. The slice returned + /// has length equivalent to the number of codepoints in this mapping. + pub fn mapping(&self) -> &[Codepoint] { + &self.mapping[..self.len] + } + + /// Returns true if and only if this decomposition mapping is canonical. + pub fn is_canonical(&self) -> bool { + self.tag.is_none() + } +} + +impl FromStr for UnicodeDataDecomposition { + type Err = Error; + + fn from_str(s: &str) -> Result<UnicodeDataDecomposition, Error> { + lazy_static! { + static ref WITH_TAG: Regex = Regex::new( + r"^(?:<(?P<tag>[^>]+)>)?\s*(?P<chars>[\s0-9A-F]+)$" + ) + .unwrap(); + static ref CHARS: Regex = Regex::new(r"[0-9A-F]+").unwrap(); + }; + if s.is_empty() { + return err!( + "expected non-empty string for \ + UnicodeDataDecomposition value" + ); + } + let caps = match WITH_TAG.captures(s) { + Some(caps) => caps, + None => return err!("invalid decomposition value"), + }; + let mut decomp = UnicodeDataDecomposition::default(); + let mut codepoints = s; + if let Some(m) = caps.name("tag") { + decomp.tag = Some(m.as_str().parse()?); + codepoints = &caps["chars"]; + } + for m in CHARS.find_iter(codepoints) { + let cp = m.as_str().parse()?; + decomp.push(cp)?; + } + Ok(decomp) + } +} + +impl fmt::Display for UnicodeDataDecomposition { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if let Some(ref tag) = self.tag { + write!(f, "<{}> ", tag)?; + } + let mut first = true; + for cp in self.mapping() { + if !first { + write!(f, " ")?; + } + first = false; + write!(f, "{}", cp)?; + } + Ok(()) + } +} + +/// The formatting tag on a decomposition mapping. +/// +/// This is taken from +/// [UAX44, Table 14](http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings). +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum UnicodeDataDecompositionTag { + /// <font> + Font, + /// <noBreak> + NoBreak, + /// <initial> + Initial, + /// <medial> + Medial, + /// <final> + Final, + /// <isolated> + Isolated, + /// <circle> + Circle, + /// <super> + Super, + /// <sub> + Sub, + /// <vertical> + Vertical, + /// <wide> + Wide, + /// <narrow> + Narrow, + /// <small> + Small, + /// <square> + Square, + /// <fraction> + Fraction, + /// <compat> + Compat, +} + +impl FromStr for UnicodeDataDecompositionTag { + type Err = Error; + + fn from_str(s: &str) -> Result<UnicodeDataDecompositionTag, Error> { + use self::UnicodeDataDecompositionTag::*; + Ok(match s { + "font" => Font, + "noBreak" => NoBreak, + "initial" => Initial, + "medial" => Medial, + "final" => Final, + "isolated" => Isolated, + "circle" => Circle, + "super" => Super, + "sub" => Sub, + "vertical" => Vertical, + "wide" => Wide, + "narrow" => Narrow, + "small" => Small, + "square" => Square, + "fraction" => Fraction, + "compat" => Compat, + _ => return err!("invalid decomposition formatting tag: {}", s), + }) + } +} + +impl fmt::Display for UnicodeDataDecompositionTag { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use self::UnicodeDataDecompositionTag::*; + let s = match *self { + Font => "font", + NoBreak => "noBreak", + Initial => "initial", + Medial => "medial", + Final => "final", + Isolated => "isolated", + Circle => "circle", + Super => "super", + Sub => "sub", + Vertical => "vertical", + Wide => "wide", + Narrow => "narrow", + Small => "small", + Square => "square", + Fraction => "fraction", + Compat => "compat", + }; + write!(f, "{}", s) + } +} + +/// A numeric value corresponding to characters with `Numeric_Type=Numeric`. +/// +/// A numeric value can either be a signed integer or a rational number. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum UnicodeDataNumeric { + /// An integer. + Integer(i64), + /// A rational number. The first is the numerator and the latter is the + /// denominator. + Rational(i64, i64), +} + +impl FromStr for UnicodeDataNumeric { + type Err = Error; + + fn from_str(s: &str) -> Result<UnicodeDataNumeric, Error> { + if s.is_empty() { + return err!( + "expected non-empty string for UnicodeDataNumeric value" + ); + } + if let Some(pos) = s.find('/') { + let (snum, sden) = (&s[..pos], &s[pos + 1..]); + let num = match snum.parse() { + Ok(num) => num, + Err(err) => { + return err!( + "invalid integer numerator '{}': {}", + snum, + err + ); + } + }; + let den = match sden.parse() { + Ok(den) => den, + Err(err) => { + return err!( + "invalid integer denominator '{}': {}", + sden, + err + ); + } + }; + Ok(UnicodeDataNumeric::Rational(num, den)) + } else { + match s.parse() { + Ok(den) => Ok(UnicodeDataNumeric::Integer(den)), + Err(err) => { + return err!( + "invalid integer denominator '{}': {}", + s, + err + ); + } + } + } + } +} + +impl fmt::Display for UnicodeDataNumeric { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match *self { + UnicodeDataNumeric::Integer(n) => write!(f, "{}", n), + UnicodeDataNumeric::Rational(n, d) => write!(f, "{}/{}", n, d), + } + } +} + +/// An iterator adapter that expands rows in `UnicodeData.txt`. +/// +/// Throughout `UnicodeData.txt`, some assigned codepoints are not explicitly +/// represented. Instead, they are represented by a pair of rows, indicating +/// a range of codepoints with the same properties. For example, the Hangul +/// syllable codepoints are represented by these two rows: +/// +/// ```ignore +/// AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; +/// D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; +/// ``` +/// +/// This iterator will wrap any iterator of `UnicodeData` and, when a range of +/// Unicode codepoints is found, it will be expanded to the appropriate +/// sequence of `UnicodeData` values. Note that all such expanded records will +/// have an empty name. +pub struct UnicodeDataExpander<I: Iterator> { + /// The underlying iterator. + it: iter::Peekable<I>, + /// A range of codepoints to emit when we've found a pair. Otherwise, + /// `None`. + range: CodepointRange, +} + +struct CodepointRange { + /// The codepoint range. + range: Range<u32>, + /// The start record. All subsequent records in this range are generated + /// by cloning this and updating the codepoint/name. + start_record: UnicodeData, +} + +impl<I: Iterator<Item = UnicodeData>> UnicodeDataExpander<I> { + /// Create a new iterator that expands pairs of `UnicodeData` range + /// records. All other records are passed through as-is. + pub fn new<T>(it: T) -> UnicodeDataExpander<I> + where + T: IntoIterator<IntoIter = I, Item = I::Item>, + { + UnicodeDataExpander { + it: it.into_iter().peekable(), + range: CodepointRange { + range: 0..0, + start_record: UnicodeData::default(), + }, + } + } +} + +impl<I: Iterator<Item = UnicodeData>> Iterator for UnicodeDataExpander<I> { + type Item = UnicodeData; + + fn next(&mut self) -> Option<UnicodeData> { + if let Some(udata) = self.range.next() { + return Some(udata); + } + let row1 = match self.it.next() { + None => return None, + Some(row1) => row1, + }; + if !row1.is_range_start() + || !self.it.peek().map_or(false, |row2| row2.is_range_end()) + { + return Some(row1); + } + let row2 = self.it.next().unwrap(); + self.range = CodepointRange { + range: row1.codepoint.value()..(row2.codepoint.value() + 1), + start_record: row1, + }; + self.next() + } +} + +impl Iterator for CodepointRange { + type Item = UnicodeData; + + fn next(&mut self) -> Option<UnicodeData> { + let cp = match self.range.next() { + None => return None, + Some(cp) => cp, + }; + Some(UnicodeData { + codepoint: Codepoint::from_u32(cp).unwrap(), + name: "".to_string(), + ..self.start_record.clone() + }) + } +} + +#[cfg(test)] +mod tests { + use crate::common::Codepoint; + + use super::{ + UnicodeData, UnicodeDataDecomposition, UnicodeDataDecompositionTag, + UnicodeDataNumeric, + }; + + fn codepoint(n: u32) -> Codepoint { + Codepoint::from_u32(n).unwrap() + } + + fn s(string: &str) -> String { + string.to_string() + } + + #[test] + fn parse1() { + let line = "249D;PARENTHESIZED LATIN SMALL LETTER B;So;0;L;<compat> 0028 0062 0029;;;;N;;;;;\n"; + let data: UnicodeData = line.parse().unwrap(); + assert_eq!( + data, + UnicodeData { + codepoint: codepoint(0x249d), + name: s("PARENTHESIZED LATIN SMALL LETTER B"), + general_category: s("So"), + canonical_combining_class: 0, + bidi_class: s("L"), + decomposition: UnicodeDataDecomposition::new( + Some(UnicodeDataDecompositionTag::Compat), + &[codepoint(0x28), codepoint(0x62), codepoint(0x29)], + ) + .unwrap(), + numeric_type_decimal: None, + numeric_type_digit: None, + numeric_type_numeric: None, + bidi_mirrored: false, + unicode1_name: s(""), + iso_comment: s(""), + simple_uppercase_mapping: None, + simple_lowercase_mapping: None, + simple_titlecase_mapping: None, + } + ); + } + + #[test] + fn parse2() { + let line = "000D;<control>;Cc;0;B;;;;;N;CARRIAGE RETURN (CR);;;;\n"; + let data: UnicodeData = line.parse().unwrap(); + assert_eq!( + data, + UnicodeData { + codepoint: codepoint(0x000D), + name: s("<control>"), + general_category: s("Cc"), + canonical_combining_class: 0, + bidi_class: s("B"), + decomposition: UnicodeDataDecomposition::new( + None, + &[codepoint(0x000D)] + ) + .unwrap(), + numeric_type_decimal: None, + numeric_type_digit: None, + numeric_type_numeric: None, + bidi_mirrored: false, + unicode1_name: s("CARRIAGE RETURN (CR)"), + iso_comment: s(""), + simple_uppercase_mapping: None, + simple_lowercase_mapping: None, + simple_titlecase_mapping: None, + } + ); + } + + #[test] + fn parse3() { + let line = "00BC;VULGAR FRACTION ONE QUARTER;No;0;ON;<fraction> 0031 2044 0034;;;1/4;N;FRACTION ONE QUARTER;;;;\n"; + let data: UnicodeData = line.parse().unwrap(); + assert_eq!( + data, + UnicodeData { + codepoint: codepoint(0x00BC), + name: s("VULGAR FRACTION ONE QUARTER"), + general_category: s("No"), + canonical_combining_class: 0, + bidi_class: s("ON"), + decomposition: UnicodeDataDecomposition::new( + Some(UnicodeDataDecompositionTag::Fraction), + &[codepoint(0x31), codepoint(0x2044), codepoint(0x34)], + ) + .unwrap(), + numeric_type_decimal: None, + numeric_type_digit: None, + numeric_type_numeric: Some(UnicodeDataNumeric::Rational(1, 4)), + bidi_mirrored: false, + unicode1_name: s("FRACTION ONE QUARTER"), + iso_comment: s(""), + simple_uppercase_mapping: None, + simple_lowercase_mapping: None, + simple_titlecase_mapping: None, + } + ); + } + + #[test] + fn parse4() { + let line = "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;\n"; + let data: UnicodeData = line.parse().unwrap(); + assert_eq!( + data, + UnicodeData { + codepoint: codepoint(0x0041), + name: s("LATIN CAPITAL LETTER A"), + general_category: s("Lu"), + canonical_combining_class: 0, + bidi_class: s("L"), + decomposition: UnicodeDataDecomposition::new( + None, + &[codepoint(0x0041)] + ) + .unwrap(), + numeric_type_decimal: None, + numeric_type_digit: None, + numeric_type_numeric: None, + bidi_mirrored: false, + unicode1_name: s(""), + iso_comment: s(""), + simple_uppercase_mapping: None, + simple_lowercase_mapping: Some(codepoint(0x0061)), + simple_titlecase_mapping: None, + } + ); + } + + #[test] + fn parse5() { + let line = "0F33;TIBETAN DIGIT HALF ZERO;No;0;L;;;;-1/2;N;;;;;\n"; + let data: UnicodeData = line.parse().unwrap(); + assert_eq!( + data, + UnicodeData { + codepoint: codepoint(0x0F33), + name: s("TIBETAN DIGIT HALF ZERO"), + general_category: s("No"), + canonical_combining_class: 0, + bidi_class: s("L"), + decomposition: UnicodeDataDecomposition::new( + None, + &[codepoint(0x0F33)] + ) + .unwrap(), + numeric_type_decimal: None, + numeric_type_digit: None, + numeric_type_numeric: Some(UnicodeDataNumeric::Rational( + -1, 2 + )), + bidi_mirrored: false, + unicode1_name: s(""), + iso_comment: s(""), + simple_uppercase_mapping: None, + simple_lowercase_mapping: None, + simple_titlecase_mapping: None, + } + ); + } + + #[test] + fn expander() { + use super::UnicodeDataExpander; + use crate::common::UcdLineParser; + + let data = "\ +ABF9;MEETEI MAYEK DIGIT NINE;Nd;0;L;;9;9;9;N;;;;; +AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; +D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; +D7B0;HANGUL JUNGSEONG O-YEO;Lo;0;L;;;;;N;;;;; +"; + let records = UcdLineParser::new(None, data.as_bytes()) + .collect::<Result<Vec<_>, _>>() + .unwrap(); + assert_eq!(UnicodeDataExpander::new(records).count(), 11174); + } +} |