summaryrefslogtreecommitdiffstats
path: root/vendor/ucd-parse/src/unicode_data.rs
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/ucd-parse/src/unicode_data.rs')
-rw-r--r--vendor/ucd-parse/src/unicode_data.rs787
1 files changed, 787 insertions, 0 deletions
diff --git a/vendor/ucd-parse/src/unicode_data.rs b/vendor/ucd-parse/src/unicode_data.rs
new file mode 100644
index 000000000..87910cc1d
--- /dev/null
+++ b/vendor/ucd-parse/src/unicode_data.rs
@@ -0,0 +1,787 @@
+use std::fmt;
+use std::iter;
+use std::ops::Range;
+use std::path::Path;
+use std::str::FromStr;
+
+use lazy_static::lazy_static;
+use regex::Regex;
+
+use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint};
+use crate::error::Error;
+
+/// Represents a single row in the `UnicodeData.txt` file.
+///
+/// These fields were taken from UAX44, Table 9, as part of the documentation
+/// for the
+/// [`UnicodeData.txt` file](http://www.unicode.org/reports/tr44/#UnicodeData.txt).
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct UnicodeData {
+ /// The codepoint corresponding to this row.
+ pub codepoint: Codepoint,
+ /// The name of this codepoint.
+ pub name: String,
+ /// The "general category" of this codepoint.
+ pub general_category: String,
+ /// The class of this codepoint used in the Canonical Ordering Algorithm.
+ ///
+ /// Note that some classes map to a particular symbol. See
+ /// [UAX44, Table 15](http://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values).
+ pub canonical_combining_class: u8,
+ /// The bidirectional class of this codepoint.
+ ///
+ /// Possible values are listed in
+ /// [UAX44, Table 13](http://www.unicode.org/reports/tr44/#Bidi_Class_Values).
+ pub bidi_class: String,
+ /// The decomposition mapping for this codepoint, including its formatting tag
+ /// (if present). Parsing an empty field yields a mapping to this codepoint itself.
+ pub decomposition: UnicodeDataDecomposition,
+ /// A decimal numeric representation of this codepoint, if it has the
+ /// property `Numeric_Type=Decimal`.
+ pub numeric_type_decimal: Option<u8>,
+ /// A decimal numeric representation of this codepoint, if it has the
+ /// property `Numeric_Type=Digit`. Note that while this field is still
+ /// populated for existing codepoints, no new codepoints will have this
+ /// field populated.
+ pub numeric_type_digit: Option<u8>,
+ /// A decimal or rational numeric representation of this codepoint, if it
+ /// has the property `Numeric_Type=Numeric`.
+ pub numeric_type_numeric: Option<UnicodeDataNumeric>,
+ /// A boolean indicating whether this codepoint is "mirrored" in
+ /// bidirectional text. This is `true` when the row's field is `Y`.
+ pub bidi_mirrored: bool,
+ /// The "old" Unicode 1.0 or ISO 6429 name of this codepoint. Note that
+ /// this field is empty unless it is significantly different from
+ /// the `name` field.
+ pub unicode1_name: String,
+ /// The ISO 10646 comment field. This no longer contains any non-NULL
+ /// values.
+ pub iso_comment: String,
+ /// This codepoint's simple uppercase mapping, if it exists.
+ pub simple_uppercase_mapping: Option<Codepoint>,
+ /// This codepoint's simple lowercase mapping, if it exists.
+ pub simple_lowercase_mapping: Option<Codepoint>,
+ /// This codepoint's simple titlecase mapping, if it exists.
+ pub simple_titlecase_mapping: Option<Codepoint>,
+}
+
+impl UcdFile for UnicodeData {
+    /// The relative file path of this UCD file, i.e. `UnicodeData.txt`.
+    fn relative_file_path() -> &'static Path {
+        const FILE_NAME: &str = "UnicodeData.txt";
+        Path::new(FILE_NAME)
+    }
+}
+
+impl UcdFileByCodepoint for UnicodeData {
+    /// Returns the iterator obtained from this row's `codepoint` field.
+    fn codepoints(&self) -> CodepointIter {
+        let cp = self.codepoint;
+        cp.into_iter()
+    }
+}
+
+impl UnicodeData {
+    /// Returns true if and only if this record corresponds to the start of a
+    /// range, i.e. its name looks like `<..., First>`.
+    pub fn is_range_start(&self) -> bool {
+        self.is_bracketed_name_with("First")
+    }
+
+    /// Returns true if and only if this record corresponds to the end of a
+    /// range, i.e. its name looks like `<..., Last>`.
+    pub fn is_range_end(&self) -> bool {
+        self.is_bracketed_name_with("Last")
+    }
+
+    /// Returns true when the name is wrapped in `<` and `>` and contains
+    /// `marker` somewhere.
+    fn is_bracketed_name_with(&self, marker: &str) -> bool {
+        let name = self.name.as_str();
+        name.starts_with('<') && name.ends_with('>') && name.contains(marker)
+    }
+}
+
+impl FromStr for UnicodeData {
+    type Err = Error;
+
+    /// Parses one line of `UnicodeData.txt` into a `UnicodeData` record.
+    ///
+    /// The line is trimmed and must consist of exactly 15 `;`-separated
+    /// fields. Empty optional fields become `None`, and an empty
+    /// decomposition field becomes a mapping holding only the row's own
+    /// codepoint.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the line does not have the expected layout or if
+    /// any individual field fails to parse.
+    fn from_str(line: &str) -> Result<UnicodeData, Error> {
+        lazy_static! {
+            // Verbose (`(?x)`) pattern: whitespace and `#` comments inside
+            // the pattern are ignored by the regex engine.
+            static ref PARTS: Regex = Regex::new(
+                r"(?x)
+ ^
+ ([A-Z0-9]+); # 1; codepoint
+ ([^;]+); # 2; name
+ ([^;]+); # 3; general category
+ ([0-9]+); # 4; canonical combining class
+ ([^;]+); # 5; bidi class
+ ([^;]*); # 6; decomposition
+ ([0-9]*); # 7; numeric type decimal
+ ([0-9]*); # 8; numeric type digit
+ ([-0-9/]*); # 9; numeric type numeric
+ ([YN]); # 10; bidi mirrored
+ ([^;]*); # 11; unicode1 name
+ ([^;]*); # 12; ISO comment
+ ([^;]*); # 13; simple uppercase mapping
+ ([^;]*); # 14; simple lowercase mapping
+ ([^;]*) # 15; simple titlecase mapping
+ $
+ "
+            )
+            .unwrap();
+        };
+        let caps = if let Some(caps) = PARTS.captures(line.trim()) {
+            caps
+        } else {
+            return err!("invalid UnicodeData line");
+        };
+        // No group in the pattern is optional, so every group participates
+        // in a successful match.
+        let field = |index: usize| caps.get(index).unwrap().as_str();
+        // Like `field`, but an empty field is reported as `None`.
+        let optional =
+            |index: usize| Some(field(index)).filter(|text| !text.is_empty());
+
+        // The fallible fields are parsed in file order so that the first
+        // invalid field determines the reported error.
+        let codepoint: Codepoint = field(1).parse()?;
+        let canonical_combining_class: u8 = match field(4).parse() {
+            Ok(class) => class,
+            Err(err) => {
+                return err!(
+                    "failed to parse canonical combining class '{}': {}",
+                    field(4),
+                    err
+                )
+            }
+        };
+        let decomposition: UnicodeDataDecomposition = match optional(6) {
+            Some(text) => text.parse()?,
+            None => {
+                // No explicit mapping: the codepoint maps to itself.
+                let mut own = UnicodeDataDecomposition::default();
+                own.push(codepoint)?;
+                own
+            }
+        };
+        let numeric_type_decimal: Option<u8> = match optional(7) {
+            None => None,
+            Some(text) => match text.parse() {
+                Ok(value) => Some(value),
+                Err(err) => {
+                    return err!(
+                        "failed to parse numeric type decimal '{}': {}",
+                        text,
+                        err
+                    )
+                }
+            },
+        };
+        let numeric_type_digit: Option<u8> = match optional(8) {
+            None => None,
+            Some(text) => match text.parse() {
+                Ok(value) => Some(value),
+                Err(err) => {
+                    return err!(
+                        "failed to parse numeric type digit '{}': {}",
+                        text,
+                        err
+                    )
+                }
+            },
+        };
+        let numeric_type_numeric: Option<UnicodeDataNumeric> =
+            match optional(9) {
+                Some(text) => Some(text.parse()?),
+                None => None,
+            };
+        let simple_uppercase_mapping: Option<Codepoint> = match optional(13) {
+            Some(text) => Some(text.parse()?),
+            None => None,
+        };
+        let simple_lowercase_mapping: Option<Codepoint> = match optional(14) {
+            Some(text) => Some(text.parse()?),
+            None => None,
+        };
+        let simple_titlecase_mapping: Option<Codepoint> = match optional(15) {
+            Some(text) => Some(text.parse()?),
+            None => None,
+        };
+
+        Ok(UnicodeData {
+            codepoint,
+            name: field(2).to_string(),
+            general_category: field(3).to_string(),
+            canonical_combining_class,
+            bidi_class: field(5).to_string(),
+            decomposition,
+            numeric_type_decimal,
+            numeric_type_digit,
+            numeric_type_numeric,
+            bidi_mirrored: field(10) == "Y",
+            unicode1_name: field(11).to_string(),
+            iso_comment: field(12).to_string(),
+            simple_uppercase_mapping,
+            simple_lowercase_mapping,
+            simple_titlecase_mapping,
+        })
+    }
+}
+
+impl fmt::Display for UnicodeData {
+    /// Writes this record as 15 `;`-separated fields.
+    ///
+    /// Absent optional values are written as empty fields. A canonical
+    /// decomposition consisting only of this row's own codepoint is also
+    /// written as an empty field.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // Writes `value` (if any) followed by the `;` field separator.
+        fn optional_field<T: fmt::Display>(
+            f: &mut fmt::Formatter<'_>,
+            value: Option<T>,
+        ) -> fmt::Result {
+            match value {
+                Some(v) => write!(f, "{};", v),
+                None => write!(f, ";"),
+            }
+        }
+
+        write!(f, "{};", self.codepoint)?;
+        write!(f, "{};", self.name)?;
+        write!(f, "{};", self.general_category)?;
+        write!(f, "{};", self.canonical_combining_class)?;
+        write!(f, "{};", self.bidi_class)?;
+
+        let maps_to_itself = self.decomposition.is_canonical()
+            && self.decomposition.mapping() == &[self.codepoint];
+        if maps_to_itself {
+            write!(f, ";")?;
+        } else {
+            write!(f, "{};", self.decomposition)?;
+        }
+
+        optional_field(f, self.numeric_type_decimal.as_ref())?;
+        optional_field(f, self.numeric_type_digit.as_ref())?;
+        optional_field(f, self.numeric_type_numeric.as_ref())?;
+        write!(f, "{};", if self.bidi_mirrored { "Y" } else { "N" })?;
+        write!(f, "{};", self.unicode1_name)?;
+        write!(f, "{};", self.iso_comment)?;
+        optional_field(f, self.simple_uppercase_mapping.as_ref())?;
+        optional_field(f, self.simple_lowercase_mapping.as_ref())?;
+        // The final field carries no trailing separator.
+        match self.simple_titlecase_mapping.as_ref() {
+            Some(cp) => write!(f, "{}", cp),
+            None => Ok(()),
+        }
+    }
+}
+
+/// Represents a decomposition mapping of a single row in the
+/// `UnicodeData.txt` file, e.g. `<compat> 0028 0062 0029`.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct UnicodeDataDecomposition {
+ /// The formatting tag associated with this mapping, if present.
+ pub tag: Option<UnicodeDataDecompositionTag>,
+ /// The number of codepoints in this mapping, i.e. how many leading entries of `mapping` are in use.
+ pub len: usize,
+ /// The codepoints in the mapping, of which at most 18 can be stored. Entries
+ /// beyond `len` are always U+0000. If no mapping was present, then this always
+ /// contains a single codepoint corresponding to this row's character.
+ pub mapping: [Codepoint; 18],
+}
+
+impl UnicodeDataDecomposition {
+    /// Builds a decomposition mapping from a formatting tag and a sequence
+    /// of codepoints.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if `mapping` holds more codepoints than can be
+    /// stored.
+    pub fn new(
+        tag: Option<UnicodeDataDecompositionTag>,
+        mapping: &[Codepoint],
+    ) -> Result<UnicodeDataDecomposition, Error> {
+        let mut decomp =
+            UnicodeDataDecomposition { tag, ..Default::default() };
+        mapping.iter().try_for_each(|&cp| decomp.push(cp))?;
+        Ok(decomp)
+    }
+
+    /// Appends one codepoint to the end of this decomposition's mapping.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error, leaving the mapping untouched, if there is no free
+    /// slot left.
+    pub fn push(&mut self, cp: Codepoint) -> Result<(), Error> {
+        // `get_mut` yields `None` exactly when `len` has reached capacity.
+        let slot = match self.mapping.get_mut(self.len) {
+            Some(slot) => slot,
+            None => {
+                return err!(
+                    "invalid decomposition mapping (too many codepoints)"
+                )
+            }
+        };
+        *slot = cp;
+        self.len += 1;
+        Ok(())
+    }
+
+    /// Returns the codepoints of this mapping as a slice whose length is the
+    /// number of codepoints pushed so far.
+    pub fn mapping(&self) -> &[Codepoint] {
+        let in_use = self.len;
+        &self.mapping[..in_use]
+    }
+
+    /// Returns true if and only if this decomposition mapping is canonical,
+    /// which is the case when it carries no formatting tag.
+    pub fn is_canonical(&self) -> bool {
+        match self.tag {
+            None => true,
+            Some(_) => false,
+        }
+    }
+}
+
+impl FromStr for UnicodeDataDecomposition {
+    type Err = Error;
+
+    /// Parses a decomposition field such as `<compat> 0028 0062 0029` or
+    /// `0041 030A`: an optional `<tag>` followed by hexadecimal codepoints.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if `s` is empty, does not have the expected shape,
+    /// carries an unknown tag, or lists an invalid or excessive codepoint.
+    fn from_str(s: &str) -> Result<UnicodeDataDecomposition, Error> {
+        lazy_static! {
+            static ref WITH_TAG: Regex = Regex::new(
+                r"^(?:<(?P<tag>[^>]+)>)?\s*(?P<chars>[\s0-9A-F]+)$"
+            )
+            .unwrap();
+            static ref CHARS: Regex = Regex::new(r"[0-9A-F]+").unwrap();
+        };
+        if s.is_empty() {
+            return err!(
+                "expected non-empty string for \
+                 UnicodeDataDecomposition value"
+            );
+        }
+        let caps = if let Some(caps) = WITH_TAG.captures(s) {
+            caps
+        } else {
+            return err!("invalid decomposition value");
+        };
+        // With a tag, only the `chars` group is scanned for codepoints;
+        // without one, the whole input is scanned.
+        let (tag, chars): (Option<UnicodeDataDecompositionTag>, &str) =
+            match caps.name("tag") {
+                Some(m) => (Some(m.as_str().parse()?), &caps["chars"]),
+                None => (None, s),
+            };
+        let mut decomp =
+            UnicodeDataDecomposition { tag, ..Default::default() };
+        for found in CHARS.find_iter(chars) {
+            decomp.push(found.as_str().parse()?)?;
+        }
+        Ok(decomp)
+    }
+}
+
+impl fmt::Display for UnicodeDataDecomposition {
+    /// Writes the optional `<tag> ` prefix followed by the mapped codepoints
+    /// separated by single spaces.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        if let Some(tag) = self.tag.as_ref() {
+            write!(f, "<{}> ", tag)?;
+        }
+        for (index, cp) in self.mapping().iter().enumerate() {
+            // A separator goes between codepoints, never before the first.
+            if index != 0 {
+                write!(f, " ")?;
+            }
+            write!(f, "{}", cp)?;
+        }
+        Ok(())
+    }
+}
+
+/// The formatting tag on a decomposition mapping, e.g. the `compat` in `<compat> 0028 0062 0029`.
+///
+/// This is taken from
+/// [UAX44, Table 14](http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings).
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum UnicodeDataDecompositionTag {
+ /// `<font>`
+ Font,
+ /// `<noBreak>`
+ NoBreak,
+ /// `<initial>`
+ Initial,
+ /// `<medial>`
+ Medial,
+ /// `<final>`
+ Final,
+ /// `<isolated>`
+ Isolated,
+ /// `<circle>`
+ Circle,
+ /// `<super>`
+ Super,
+ /// `<sub>`
+ Sub,
+ /// `<vertical>`
+ Vertical,
+ /// `<wide>`
+ Wide,
+ /// `<narrow>`
+ Narrow,
+ /// `<small>`
+ Small,
+ /// `<square>`
+ Square,
+ /// `<fraction>`
+ Fraction,
+ /// `<compat>`
+ Compat,
+}
+
+impl FromStr for UnicodeDataDecompositionTag {
+    type Err = Error;
+
+    /// Parses a tag name given without its surrounding angle brackets, e.g.
+    /// `noBreak`. Matching is case-sensitive.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if `s` is not one of the known tag names.
+    fn from_str(s: &str) -> Result<UnicodeDataDecompositionTag, Error> {
+        use self::UnicodeDataDecompositionTag as Tag;
+
+        let tag = match s {
+            "font" => Tag::Font,
+            "noBreak" => Tag::NoBreak,
+            "initial" => Tag::Initial,
+            "medial" => Tag::Medial,
+            "final" => Tag::Final,
+            "isolated" => Tag::Isolated,
+            "circle" => Tag::Circle,
+            "super" => Tag::Super,
+            "sub" => Tag::Sub,
+            "vertical" => Tag::Vertical,
+            "wide" => Tag::Wide,
+            "narrow" => Tag::Narrow,
+            "small" => Tag::Small,
+            "square" => Tag::Square,
+            "fraction" => Tag::Fraction,
+            "compat" => Tag::Compat,
+            unknown => {
+                return err!(
+                    "invalid decomposition formatting tag: {}",
+                    unknown
+                )
+            }
+        };
+        Ok(tag)
+    }
+}
+
+impl fmt::Display for UnicodeDataDecompositionTag {
+    /// Writes the tag name without angle brackets, using the same spelling
+    /// that the `FromStr` implementation accepts.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        use self::UnicodeDataDecompositionTag as Tag;
+
+        f.write_str(match self {
+            Tag::Font => "font",
+            Tag::NoBreak => "noBreak",
+            Tag::Initial => "initial",
+            Tag::Medial => "medial",
+            Tag::Final => "final",
+            Tag::Isolated => "isolated",
+            Tag::Circle => "circle",
+            Tag::Super => "super",
+            Tag::Sub => "sub",
+            Tag::Vertical => "vertical",
+            Tag::Wide => "wide",
+            Tag::Narrow => "narrow",
+            Tag::Small => "small",
+            Tag::Square => "square",
+            Tag::Fraction => "fraction",
+            Tag::Compat => "compat",
+        })
+    }
+}
+
+/// A numeric value corresponding to characters with `Numeric_Type=Numeric`.
+///
+/// A numeric value can either be a signed integer or a rational number.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum UnicodeDataNumeric {
+ /// An integer, written in the file without a `/`.
+ Integer(i64),
+ /// A rational number, written in the file as `numerator/denominator`. The
+ /// first value is the numerator and the second is the denominator.
+ Rational(i64, i64),
+}
+
+impl FromStr for UnicodeDataNumeric {
+    type Err = Error;
+
+    /// Parses a numeric field: either a plain signed integer such as `9`, or
+    /// a rational written as `numerator/denominator` such as `-1/2`.
+    ///
+    /// The value is split at the first `/`; without one, the whole string
+    /// is parsed as an integer.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if `s` is empty or if any component is not a valid
+    /// `i64`. The message names the offending component (integer,
+    /// numerator or denominator).
+    fn from_str(s: &str) -> Result<UnicodeDataNumeric, Error> {
+        if s.is_empty() {
+            return err!(
+                "expected non-empty string for UnicodeDataNumeric value"
+            );
+        }
+        if let Some(pos) = s.find('/') {
+            let (snum, sden) = (&s[..pos], &s[pos + 1..]);
+            let num = match snum.parse() {
+                Ok(num) => num,
+                Err(err) => {
+                    return err!(
+                        "invalid integer numerator '{}': {}",
+                        snum,
+                        err
+                    );
+                }
+            };
+            let den = match sden.parse() {
+                Ok(den) => den,
+                Err(err) => {
+                    return err!(
+                        "invalid integer denominator '{}': {}",
+                        sden,
+                        err
+                    );
+                }
+            };
+            Ok(UnicodeDataNumeric::Rational(num, den))
+        } else {
+            match s.parse() {
+                Ok(n) => Ok(UnicodeDataNumeric::Integer(n)),
+                Err(err) => {
+                    // There is no denominator in this branch, so the error
+                    // must not mention one.
+                    return err!("invalid integer '{}': {}", s, err);
+                }
+            }
+        }
+    }
+}
+
+impl fmt::Display for UnicodeDataNumeric {
+    /// Writes an integer as-is and a rational as `numerator/denominator`.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            UnicodeDataNumeric::Integer(value) => write!(f, "{}", value),
+            UnicodeDataNumeric::Rational(num, den) => {
+                write!(f, "{}/{}", num, den)
+            }
+        }
+    }
+}
+
+/// An iterator adapter that expands rows in `UnicodeData.txt`.
+///
+/// Throughout `UnicodeData.txt`, some assigned codepoints are not explicitly
+/// represented. Instead, they are represented by a pair of rows, indicating
+/// a range of codepoints with the same properties. For example, the Hangul
+/// syllable codepoints are represented by these two rows:
+///
+/// ```ignore
+/// AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
+/// D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
+/// ```
+///
+/// This iterator will wrap any iterator of `UnicodeData` and, when a range of
+/// Unicode codepoints is found, it will be expanded to the appropriate
+/// sequence of `UnicodeData` values. Note that all such expanded records will
+/// have an empty name.
+pub struct UnicodeDataExpander<I: Iterator> {
+ /// The underlying iterator.
+ it: iter::Peekable<I>,
+ /// The codepoints still to emit for the most recently found First/Last pair.
+ /// Initially the empty range `0..0`, i.e. nothing is pending.
+ range: CodepointRange,
+}
+
+struct CodepointRange {
+ /// The remaining codepoint values to emit (half-open: the end is excluded).
+ range: Range<u32>,
+ /// The start record. All subsequent records in this range are generated
+ /// by cloning this and replacing the codepoint and clearing the name.
+ start_record: UnicodeData,
+}
+
+impl<I: Iterator<Item = UnicodeData>> UnicodeDataExpander<I> {
+    /// Wraps `it` in an iterator that expands First/Last pairs of
+    /// `UnicodeData` range records into one record per codepoint. All other
+    /// records are passed through as-is.
+    pub fn new<T>(it: T) -> UnicodeDataExpander<I>
+    where
+        T: IntoIterator<IntoIter = I, Item = I::Item>,
+    {
+        // Start with an empty range so that nothing is pending.
+        let nothing_pending = CodepointRange {
+            range: 0..0,
+            start_record: UnicodeData::default(),
+        };
+        let rows = it.into_iter().peekable();
+        UnicodeDataExpander { it: rows, range: nothing_pending }
+    }
+}
+
+impl<I: Iterator<Item = UnicodeData>> Iterator for UnicodeDataExpander<I> {
+    type Item = UnicodeData;
+
+    /// Yields the next record: first anything left of a pending range, then
+    /// the next row of the underlying iterator, expanding it when it starts
+    /// a First/Last pair.
+    fn next(&mut self) -> Option<UnicodeData> {
+        loop {
+            // Drain a pending expansion before pulling new rows.
+            if let Some(expanded) = self.range.next() {
+                return Some(expanded);
+            }
+            let first = self.it.next()?;
+            // Only peek at the following row when `first` opens a range.
+            let is_pair = first.is_range_start()
+                && self.it.peek().map_or(false, |row| row.is_range_end());
+            if !is_pair {
+                return Some(first);
+            }
+            let last = self.it.next().unwrap();
+            let (lo, hi) = (first.codepoint.value(), last.codepoint.value());
+            // The "Last" row is inclusive, hence the `+ 1`.
+            self.range =
+                CodepointRange { range: lo..(hi + 1), start_record: first };
+        }
+    }
+}
+
+impl Iterator for CodepointRange {
+    type Item = UnicodeData;
+
+    /// Yields a copy of the start record for the next codepoint in the
+    /// range, with its codepoint replaced and its name emptied.
+    fn next(&mut self) -> Option<UnicodeData> {
+        let template = &self.start_record;
+        self.range.next().map(|cp| UnicodeData {
+            codepoint: Codepoint::from_u32(cp).unwrap(),
+            name: String::new(),
+            ..template.clone()
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::common::Codepoint;
+
+    use super::{
+        UnicodeData, UnicodeDataDecomposition, UnicodeDataDecompositionTag,
+        UnicodeDataNumeric,
+    };
+
+    /// Builds a `Codepoint`, panicking if `Codepoint::from_u32` rejects `n`.
+    fn codepoint(n: u32) -> Codepoint {
+        Codepoint::from_u32(n).unwrap()
+    }
+
+    /// Shorthand for turning a string slice into an owned `String`.
+    fn s(string: &str) -> String {
+        String::from(string)
+    }
+
+    /// Parses a single `UnicodeData.txt` line, panicking on failure.
+    fn parse(line: &str) -> UnicodeData {
+        line.parse().unwrap()
+    }
+
+    /// Builds the expected decomposition from a tag and raw codepoint
+    /// values.
+    fn decomposition(
+        tag: Option<UnicodeDataDecompositionTag>,
+        values: &[u32],
+    ) -> UnicodeDataDecomposition {
+        let mapping: Vec<Codepoint> =
+            values.iter().map(|&n| codepoint(n)).collect();
+        UnicodeDataDecomposition::new(tag, &mapping).unwrap()
+    }
+
+    // In the expected values below, every field that is not spelled out
+    // takes its `Default` value: `0`, `false`, `None` or an empty string.
+
+    #[test]
+    fn parse1() {
+        let got = parse("249D;PARENTHESIZED LATIN SMALL LETTER B;So;0;L;<compat> 0028 0062 0029;;;;N;;;;;\n");
+        let want = UnicodeData {
+            codepoint: codepoint(0x249d),
+            name: s("PARENTHESIZED LATIN SMALL LETTER B"),
+            general_category: s("So"),
+            bidi_class: s("L"),
+            decomposition: decomposition(
+                Some(UnicodeDataDecompositionTag::Compat),
+                &[0x28, 0x62, 0x29],
+            ),
+            ..UnicodeData::default()
+        };
+        assert_eq!(got, want);
+    }
+
+    #[test]
+    fn parse2() {
+        let got =
+            parse("000D;<control>;Cc;0;B;;;;;N;CARRIAGE RETURN (CR);;;;\n");
+        let want = UnicodeData {
+            codepoint: codepoint(0x000D),
+            name: s("<control>"),
+            general_category: s("Cc"),
+            bidi_class: s("B"),
+            decomposition: decomposition(None, &[0x000D]),
+            unicode1_name: s("CARRIAGE RETURN (CR)"),
+            ..UnicodeData::default()
+        };
+        assert_eq!(got, want);
+    }
+
+    #[test]
+    fn parse3() {
+        let got = parse("00BC;VULGAR FRACTION ONE QUARTER;No;0;ON;<fraction> 0031 2044 0034;;;1/4;N;FRACTION ONE QUARTER;;;;\n");
+        let want = UnicodeData {
+            codepoint: codepoint(0x00BC),
+            name: s("VULGAR FRACTION ONE QUARTER"),
+            general_category: s("No"),
+            bidi_class: s("ON"),
+            decomposition: decomposition(
+                Some(UnicodeDataDecompositionTag::Fraction),
+                &[0x31, 0x2044, 0x34],
+            ),
+            numeric_type_numeric: Some(UnicodeDataNumeric::Rational(1, 4)),
+            unicode1_name: s("FRACTION ONE QUARTER"),
+            ..UnicodeData::default()
+        };
+        assert_eq!(got, want);
+    }
+
+    #[test]
+    fn parse4() {
+        let got = parse("0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;\n");
+        let want = UnicodeData {
+            codepoint: codepoint(0x0041),
+            name: s("LATIN CAPITAL LETTER A"),
+            general_category: s("Lu"),
+            bidi_class: s("L"),
+            decomposition: decomposition(None, &[0x0041]),
+            simple_lowercase_mapping: Some(codepoint(0x0061)),
+            ..UnicodeData::default()
+        };
+        assert_eq!(got, want);
+    }
+
+    #[test]
+    fn parse5() {
+        let got = parse("0F33;TIBETAN DIGIT HALF ZERO;No;0;L;;;;-1/2;N;;;;;\n");
+        let want = UnicodeData {
+            codepoint: codepoint(0x0F33),
+            name: s("TIBETAN DIGIT HALF ZERO"),
+            general_category: s("No"),
+            bidi_class: s("L"),
+            decomposition: decomposition(None, &[0x0F33]),
+            numeric_type_numeric: Some(UnicodeDataNumeric::Rational(-1, 2)),
+            ..UnicodeData::default()
+        };
+        assert_eq!(got, want);
+    }
+
+    #[test]
+    fn expander() {
+        use super::UnicodeDataExpander;
+        use crate::common::UcdLineParser;
+
+        let data = "\
+ABF9;MEETEI MAYEK DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
+AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
+D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
+D7B0;HANGUL JUNGSEONG O-YEO;Lo;0;L;;;;;N;;;;;
+";
+        let parsed: Result<Vec<UnicodeData>, _> =
+            UcdLineParser::new(None, data.as_bytes()).collect();
+        let expanded = UnicodeDataExpander::new(parsed.unwrap());
+        // 11172 codepoints in AC00..=D7A3 plus the two ordinary rows.
+        assert_eq!(expanded.count(), 11174);
+    }
+}