From 698f8c2f01ea549d77d7dc3338a12e04c11057b9 Mon Sep 17 00:00:00 2001
From: Daniel Baumann <daniel.baumann@progress-linux.org>
Date: Wed, 17 Apr 2024 14:02:58 +0200
Subject: Adding upstream version 1.64.0+dfsg1.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
---
 vendor/ucd-parse/src/age.rs                    |  59 ++
 vendor/ucd-parse/src/arabic_shaping.rs         | 184 ++++++
 vendor/ucd-parse/src/bidi_mirroring_glyph.rs   | 107 ++++
 vendor/ucd-parse/src/case_folding.rs           | 161 +++++
 vendor/ucd-parse/src/common.rs                 | 594 +++++++++++++++++++
 vendor/ucd-parse/src/core_properties.rs        |  60 ++
 vendor/ucd-parse/src/emoji_properties.rs       |  86 +++
 vendor/ucd-parse/src/error.rs                  |  86 +++
 vendor/ucd-parse/src/grapheme_cluster_break.rs |  98 +++
 vendor/ucd-parse/src/jamo_short_name.rs        |  80 +++
 vendor/ucd-parse/src/lib.rs                    |  66 +++
 vendor/ucd-parse/src/line_break.rs             |  49 ++
 vendor/ucd-parse/src/name_aliases.rs           | 145 +++++
 vendor/ucd-parse/src/prop_list.rs              |  63 ++
 vendor/ucd-parse/src/property_aliases.rs       | 113 ++++
 vendor/ucd-parse/src/property_value_aliases.rs | 185 ++++++
 vendor/ucd-parse/src/script_extensions.rs      |  68 +++
 vendor/ucd-parse/src/scripts.rs                |  59 ++
 vendor/ucd-parse/src/sentence_break.rs         | 101 ++++
 vendor/ucd-parse/src/special_casing.rs         | 112 ++++
 vendor/ucd-parse/src/unicode_data.rs           | 787 +++++++++++++++++++++++++
 vendor/ucd-parse/src/word_break.rs             | 103 ++++
 22 files changed, 3366 insertions(+)
 create mode 100644 vendor/ucd-parse/src/age.rs
 create mode 100644 vendor/ucd-parse/src/arabic_shaping.rs
 create mode 100644 vendor/ucd-parse/src/bidi_mirroring_glyph.rs
 create mode 100644 vendor/ucd-parse/src/case_folding.rs
 create mode 100644 vendor/ucd-parse/src/common.rs
 create mode 100644 vendor/ucd-parse/src/core_properties.rs
 create mode 100644 vendor/ucd-parse/src/emoji_properties.rs
 create mode 100644 vendor/ucd-parse/src/error.rs
 create mode 100644 vendor/ucd-parse/src/grapheme_cluster_break.rs
 create mode 100644 vendor/ucd-parse/src/jamo_short_name.rs
 create mode 100644 vendor/ucd-parse/src/lib.rs
 create mode 100644 vendor/ucd-parse/src/line_break.rs
 create mode 100644 vendor/ucd-parse/src/name_aliases.rs
 create mode 100644 vendor/ucd-parse/src/prop_list.rs
 create mode 100644 vendor/ucd-parse/src/property_aliases.rs
 create mode 100644 vendor/ucd-parse/src/property_value_aliases.rs
 create mode 100644 vendor/ucd-parse/src/script_extensions.rs
 create mode 100644 vendor/ucd-parse/src/scripts.rs
 create mode 100644 vendor/ucd-parse/src/sentence_break.rs
 create mode 100644 vendor/ucd-parse/src/special_casing.rs
 create mode 100644 vendor/ucd-parse/src/unicode_data.rs
 create mode 100644 vendor/ucd-parse/src/word_break.rs

(limited to 'vendor/ucd-parse/src')

diff --git a/vendor/ucd-parse/src/age.rs b/vendor/ucd-parse/src/age.rs
new file mode 100644
index 000000000..3c93f0707
--- /dev/null
+++ b/vendor/ucd-parse/src/age.rs
@@ -0,0 +1,59 @@
+use std::path::Path;
+use std::str::FromStr;
+
+use crate::common::{
+    parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
+    UcdFileByCodepoint,
+};
+use crate::error::Error;
+
+/// One parsed record from the `DerivedAge.txt` file.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct Age {
+    /// The single codepoint or codepoint range covered by this record.
+    pub codepoints: Codepoints,
+    /// The age value listed for `codepoints`, stored as text.
+    pub age: String,
+}
+
+impl UcdFile for Age {
+    fn relative_file_path() -> &'static Path {
+        Path::new("DerivedAge.txt")
+    }
+}
+
+impl UcdFileByCodepoint for Age {
+    fn codepoints(&self) -> CodepointIter {
+        self.codepoints.into_iter()
+    }
+}
+
+impl FromStr for Age {
+    type Err = Error;
+
+    fn from_str(line: &str) -> Result<Age, Error> {
+        let (codepoints, age) = parse_codepoint_association(line)?;
+        Ok(Age { codepoints, age: age.to_owned() })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::Age;
+
+    #[test]
+    fn parse_single() {
+        let input = "2BD2          ; 10.0 #       GROUP MARK\n";
+        let parsed = input.parse::<Age>().unwrap();
+        assert_eq!(parsed.codepoints, 0x2BD2);
+        assert_eq!(parsed.age, "10.0");
+    }
+
+    #[test]
+    fn parse_range() {
+        let input = "11D0B..11D36  ; 10.0 #  [44] MASARAM GONDI LETTER AU..MASARAM GONDI VOWEL SIGN VOCALIC R\n";
+        let parsed = input.parse::<Age>().unwrap();
+        assert_eq!(parsed.codepoints, (0x11D0B, 0x11D36));
+        assert_eq!(parsed.age, "10.0");
+    }
+}
diff --git a/vendor/ucd-parse/src/arabic_shaping.rs b/vendor/ucd-parse/src/arabic_shaping.rs
new file mode 100644
index 000000000..d1d942a82
--- /dev/null
+++ b/vendor/ucd-parse/src/arabic_shaping.rs
@@ -0,0 +1,184 @@
+use std::path::Path;
+use std::str::FromStr;
+
+use lazy_static::lazy_static;
+use regex::Regex;
+
+use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint};
+use crate::error::Error;
+
+/// Represents a single row in the `ArabicShaping.txt` file.
+///
+/// The field names were taken from the header of `ArabicShaping.txt`.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct ArabicShaping {
+    /// The codepoint corresponding to this row.
+    pub codepoint: Codepoint,
+    /// A short schematic name for the codepoint.
+    ///
+    /// The schematic name is descriptive of the shape, based as consistently as
+    /// possible on a name for the skeleton and then the diacritic marks applied
+    /// to the skeleton, if any. Note that this schematic name is considered a
+    /// comment, and does not constitute a formal property value.
+    pub schematic_name: String,
+    /// The "joining type" of this codepoint, parsed from its one-letter code.
+    pub joining_type: JoiningType,
+    /// The "joining group" of this codepoint, kept as text from the row.
+    pub joining_group: String,
+}
+
+/// The `Joining_Type` field read from `ArabicShaping.txt`.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum JoiningType {
+    RightJoining, // "R"
+    LeftJoining,  // "L"
+    DualJoining,  // "D"
+    JoinCausing,  // "C"
+    NonJoining,   // "U"
+    Transparent,  // "T"
+}
+
+impl JoiningType {
+    pub fn as_str(&self) -> &str {
+        match *self {
+            Self::RightJoining => "R",
+            Self::LeftJoining => "L",
+            Self::DualJoining => "D",
+            Self::JoinCausing => "C",
+            Self::NonJoining => "U",
+            Self::Transparent => "T",
+        }
+    }
+}
+
+impl Default for JoiningType {
+    fn default() -> Self {
+        Self::NonJoining
+    }
+}
+
+impl FromStr for JoiningType {
+    type Err = Error;
+
+    fn from_str(s: &str) -> Result<JoiningType, Error> {
+        Ok(match s {
+            "R" => JoiningType::RightJoining,
+            "L" => JoiningType::LeftJoining,
+            "D" => JoiningType::DualJoining,
+            "C" => JoiningType::JoinCausing,
+            "U" => JoiningType::NonJoining,
+            "T" => JoiningType::Transparent,
+            _ => return err!(
+                "unrecognized joining type: '{}' \
+                 (must be one of R, L, D, C, U or T)",
+                s
+            ),
+        })
+    }
+}
+
+impl UcdFile for ArabicShaping {
+    fn relative_file_path() -> &'static Path {
+        Path::new("ArabicShaping.txt")
+    }
+}
+
+impl UcdFileByCodepoint for ArabicShaping {
+    fn codepoints(&self) -> CodepointIter {
+        self.codepoint.into_iter()
+    }
+}
+
+impl FromStr for ArabicShaping {
+    type Err = Error;
+
+    fn from_str(line: &str) -> Result<ArabicShaping, Error> {
+        lazy_static! {
+            static ref PARTS: Regex = Regex::new(
+                r"(?x)
+                ^
+                \s*(?P<codepoint>[A-F0-9]+)\s*;
+                \s*(?P<name>[^;]+)\s*;
+                \s*(?P<joining_type>[^;]+)\s*;
+                \s*(?P<joining_group>[^;]+)
+                $
+                "
+            )
+            .unwrap();
+        };
+        let fields = match PARTS.captures(line.trim()) {
+            None => return err!("invalid ArabicShaping line"),
+            Some(fields) => fields,
+        };
+        // Fields are parsed top to bottom; a bad codepoint is reported first.
+        Ok(ArabicShaping {
+            codepoint: fields["codepoint"].parse()?,
+            schematic_name: fields["name"].to_owned(),
+            joining_type: fields["joining_type"].parse()?,
+            joining_group: fields["joining_group"].to_owned(),
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::common::Codepoint;
+
+    use super::{ArabicShaping, JoiningType};
+
+    fn codepoint(n: u32) -> Codepoint {
+        Codepoint::from_u32(n).unwrap()
+    }
+
+    fn s(string: &str) -> String {
+        String::from(string)
+    }
+
+    #[test]
+    fn parse1() {
+        let input = "0600; ARABIC NUMBER SIGN; U; No_Joining_Group\n";
+        let row = input.parse::<ArabicShaping>().unwrap();
+        assert_eq!(
+            row,
+            ArabicShaping {
+                codepoint: codepoint(0x0600),
+                schematic_name: s("ARABIC NUMBER SIGN"),
+                joining_type: JoiningType::NonJoining,
+                joining_group: s("No_Joining_Group")
+            }
+        );
+    }
+
+    #[test]
+    fn parse2() {
+        let input = "063D; FARSI YEH WITH INVERTED V ABOVE; D; FARSI YEH\n";
+        let row = input.parse::<ArabicShaping>().unwrap();
+        assert_eq!(
+            row,
+            ArabicShaping {
+                codepoint: codepoint(0x063D),
+                schematic_name: s("FARSI YEH WITH INVERTED V ABOVE"),
+                joining_type: JoiningType::DualJoining,
+                joining_group: s("FARSI YEH")
+            }
+        );
+    }
+
+    #[test]
+    fn parse3() {
+        let input =
+            "10D23; HANIFI ROHINGYA DOTLESS KINNA YA WITH DOT ABOVE; D; HANIFI ROHINGYA KINNA YA\n";
+        let row = input.parse::<ArabicShaping>().unwrap();
+        assert_eq!(
+            row,
+            ArabicShaping {
+                codepoint: codepoint(0x10D23),
+                schematic_name: s(
+                    "HANIFI ROHINGYA DOTLESS KINNA YA WITH DOT ABOVE"
+                ),
+                joining_type: JoiningType::DualJoining,
+                joining_group: s("HANIFI ROHINGYA KINNA YA")
+            }
+        );
+    }
+}
diff --git a/vendor/ucd-parse/src/bidi_mirroring_glyph.rs b/vendor/ucd-parse/src/bidi_mirroring_glyph.rs
new file mode 100644
index 000000000..fcfefffcb
--- /dev/null
+++ b/vendor/ucd-parse/src/bidi_mirroring_glyph.rs
@@ -0,0 +1,107 @@
+use std::fmt;
+use std::path::Path;
+use std::str::FromStr;
+
+use lazy_static::lazy_static;
+use regex::Regex;
+
+use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint};
+use crate::error::Error;
+
+/// Represents a single row in the `BidiMirroring.txt` file.
+///
+/// The field names were taken from the header of `BidiMirroring.txt`.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct BidiMirroring {
+    /// The codepoint corresponding to this row.
+    pub codepoint: Codepoint,
+    /// The codepoint that typically has a glyph that is the mirror image
+    /// of `codepoint`.
+    pub bidi_mirroring_glyph: Codepoint,
+}
+
+impl UcdFile for BidiMirroring {
+    fn relative_file_path() -> &'static Path {
+        Path::new("BidiMirroring.txt")
+    }
+}
+
+impl UcdFileByCodepoint for BidiMirroring {
+    fn codepoints(&self) -> CodepointIter {
+        self.codepoint.into_iter()
+    }
+}
+
+impl FromStr for BidiMirroring {
+    type Err = Error;
+
+    fn from_str(line: &str) -> Result<BidiMirroring, Error> {
+        lazy_static! {
+            static ref PARTS: Regex = Regex::new(
+                r"(?x)
+                ^
+                \s*(?P<codepoint>[A-F0-9]+)\s*;
+                \s*(?P<substitute_codepoint>[A-F0-9]+)
+                \s+
+                \#(?:.+)
+                $
+                "
+            )
+            .unwrap();
+        };
+        let fields = match PARTS.captures(line.trim()) {
+            None => return err!("invalid BidiMirroring line"),
+            Some(fields) => fields,
+        };
+        // The trailing `#` comment must be present but is otherwise ignored.
+        Ok(BidiMirroring {
+            codepoint: fields["codepoint"].parse()?,
+            bidi_mirroring_glyph: fields["substitute_codepoint"].parse()?,
+        })
+    }
+}
+
+impl fmt::Display for BidiMirroring {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // Each field is followed by a `;`; no trailing comment is written.
+        write!(f, "{};", self.codepoint)?;
+        write!(f, "{};", self.bidi_mirroring_glyph)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::common::Codepoint;
+
+    use super::BidiMirroring;
+
+    fn codepoint(n: u32) -> Codepoint {
+        Codepoint::from_u32(n).unwrap()
+    }
+
+    #[test]
+    fn parse() {
+        let input = "0028; 0029 # LEFT PARENTHESIS\n";
+        let row = input.parse::<BidiMirroring>().unwrap();
+        assert_eq!(
+            row,
+            BidiMirroring {
+                codepoint: codepoint(0x0028),
+                bidi_mirroring_glyph: codepoint(0x0029),
+            }
+        );
+    }
+
+    #[test]
+    fn parse_best_fit() {
+        let input = "228A; 228B # [BEST FIT] SUBSET OF WITH NOT EQUAL TO\n";
+        let row = input.parse::<BidiMirroring>().unwrap();
+        assert_eq!(
+            row,
+            BidiMirroring {
+                codepoint: codepoint(0x228A),
+                bidi_mirroring_glyph: codepoint(0x228B),
+            }
+        );
+    }
+}
diff --git a/vendor/ucd-parse/src/case_folding.rs b/vendor/ucd-parse/src/case_folding.rs
new file mode 100644
index 000000000..813fc81a1
--- /dev/null
+++ b/vendor/ucd-parse/src/case_folding.rs
@@ -0,0 +1,161 @@
+use std::path::Path;
+use std::str::FromStr;
+
+use lazy_static::lazy_static;
+use regex::Regex;
+
+use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint};
+use crate::error::Error;
+
+/// A single row in the `CaseFolding.txt` file.
+///
+/// The contents of `CaseFolding.txt` are a convenience derived from both
+/// `UnicodeData.txt` and `SpecialCasing.txt`.
+///
+/// Note that a single codepoint may be mapped multiple times. In particular,
+/// a single codepoint might have distinct `CaseStatus::Simple` and
+/// `CaseStatus::Full` mappings.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct CaseFold {
+    /// The codepoint that is being mapped.
+    pub codepoint: Codepoint,
+    /// The case status of this mapping (`C`, `F`, `S` or `T` in the file).
+    pub status: CaseStatus,
+    /// The actual case mapping, in the order listed in the row. It is more
+    /// than one codepoint if this is a "full" mapping.
+    pub mapping: Vec<Codepoint>,
+}
+
+impl UcdFile for CaseFold {
+    fn relative_file_path() -> &'static Path {
+        Path::new("CaseFolding.txt")
+    }
+}
+
+impl UcdFileByCodepoint for CaseFold {
+    fn codepoints(&self) -> CodepointIter {
+        self.codepoint.into_iter()
+    }
+}
+
+impl FromStr for CaseFold {
+    type Err = Error;
+
+    fn from_str(line: &str) -> Result<CaseFold, Error> {
+        lazy_static! {
+            static ref PARTS: Regex = Regex::new(
+                r"(?x)
+                ^
+                \s*(?P<codepoint>[^\s;]+)\s*;
+                \s*(?P<status>[^\s;]+)\s*;
+                \s*(?P<mapping>[^;]+)\s*;
+                "
+            )
+            .unwrap();
+        };
+
+        let fields = match PARTS.captures(line.trim()) {
+            None => return err!("invalid CaseFolding line: '{}'", line),
+            Some(fields) => fields,
+        };
+        let mapping = fields["mapping"]
+            .split_whitespace()
+            .map(|cp| cp.parse())
+            .collect::<Result<Vec<_>, Error>>()?;
+        Ok(CaseFold {
+            codepoint: fields["codepoint"].parse()?,
+            status: fields["status"].parse()?,
+            mapping,
+        })
+    }
+}
+
+/// The status of a particular case mapping.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum CaseStatus {
+    /// Case mappings shared by both "simple" and "full" mappings.
+    Common,
+    /// A case mapping that changes the number of codepoints.
+    Full,
+    /// A case mapping that doesn't change the number of codepoints, when it
+    /// differs from `Full`.
+    Simple,
+    /// Special cases (currently only for Turkic mappings) that are typically
+    /// excluded by default. Special cases don't change the number of
+    /// codepoints, but may change the encoding (e.g., UTF-8) length in bytes.
+    Special,
+}
+
+impl Default for CaseStatus {
+    fn default() -> Self {
+        Self::Common
+    }
+}
+
+impl CaseStatus {
+    /// Reports whether a mapping with this status keeps the number of
+    /// codepoints unchanged, i.e. whether it is anything other than `Full`.
+    pub fn is_fixed(&self) -> bool {
+        !matches!(self, Self::Full)
+    }
+}
+
+impl FromStr for CaseStatus {
+    type Err = Error;
+
+    fn from_str(s: &str) -> Result<CaseStatus, Error> {
+        Ok(match s {
+            "C" => CaseStatus::Common,
+            "F" => CaseStatus::Full,
+            "S" => CaseStatus::Simple,
+            "T" => CaseStatus::Special,
+            _ => return err!(
+                "unrecognized case status: '{}' \
+                 (must be one of C, F, S or T)",
+                s
+            ),
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::{CaseFold, CaseStatus};
+
+    #[test]
+    fn parse_common() {
+        let input =
+            "0150; C; 0151; # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE\n";
+        let fold = input.parse::<CaseFold>().unwrap();
+        assert_eq!(fold.codepoint, 0x0150);
+        assert_eq!(fold.status, CaseStatus::Common);
+        assert_eq!(fold.mapping, vec![0x0151]);
+    }
+
+    #[test]
+    fn parse_full() {
+        let input = "03B0; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS\n";
+        let fold = input.parse::<CaseFold>().unwrap();
+        assert_eq!(fold.codepoint, 0x03B0);
+        assert_eq!(fold.status, CaseStatus::Full);
+        assert_eq!(fold.mapping, vec![0x03C5, 0x0308, 0x0301]);
+    }
+
+    #[test]
+    fn parse_simple() {
+        let input = "1F8F; S; 1F87; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI\n";
+        let fold = input.parse::<CaseFold>().unwrap();
+        assert_eq!(fold.codepoint, 0x1F8F);
+        assert_eq!(fold.status, CaseStatus::Simple);
+        assert_eq!(fold.mapping, vec![0x1F87]);
+    }
+
+    #[test]
+    fn parse_special() {
+        let input = "0049; T; 0131; # LATIN CAPITAL LETTER I\n";
+        let fold = input.parse::<CaseFold>().unwrap();
+        assert_eq!(fold.codepoint, 0x0049);
+        assert_eq!(fold.status, CaseStatus::Special);
+        assert_eq!(fold.mapping, vec![0x0131]);
+    }
+}
diff --git a/vendor/ucd-parse/src/common.rs b/vendor/ucd-parse/src/common.rs
new file mode 100644
index 000000000..c18be668e
--- /dev/null
+++ b/vendor/ucd-parse/src/common.rs
@@ -0,0 +1,594 @@
+use std::char;
+use std::collections::BTreeMap;
+use std::fmt;
+use std::fs::File;
+use std::io::{self, BufRead};
+use std::marker::PhantomData;
+use std::path::{Path, PathBuf};
+use std::str::FromStr;
+
+use lazy_static::lazy_static;
+use regex::Regex;
+
+use crate::error::{Error, ErrorKind};
+
+/// Parse one UCD file into a vector of its records, in file order.
+///
+/// `ucd_dir` is the directory containing the UCD; the file read from it is
+/// chosen by the record type `D`.
+///
+/// The first I/O or parse error aborts parsing and is returned.
+pub fn parse<P, D>(ucd_dir: P) -> Result<Vec<D>, Error>
+where
+    P: AsRef<Path>,
+    D: UcdFile,
+{
+    // Collecting into a `Result` stops at the first failing record.
+    let records = D::from_dir(ucd_dir)?;
+    records.collect()
+}
+
+/// Parse one UCD file into a map keyed by codepoint.
+///
+/// A record covering several codepoints is cloned under each of them, and a
+/// later record replaces an earlier one stored for the same codepoint.
+/// The given directory should be the directory to the UCD.
+pub fn parse_by_codepoint<P, D>(
+    ucd_dir: P,
+) -> Result<BTreeMap<Codepoint, D>, Error>
+where
+    P: AsRef<Path>,
+    D: UcdFileByCodepoint,
+{
+    let mut by_cp = BTreeMap::new();
+    for record in D::from_dir(ucd_dir)? {
+        let record = record?;
+        by_cp.extend(record.codepoints().map(|cp| (cp, record.clone())));
+    }
+    Ok(by_cp)
+}
+
+/// Parse one UCD file into a map from each codepoint to every record that
+/// mentions it, in file order.
+///
+/// Use this for files that may list the same codepoint more than once.
+/// For example, the `NameAliases.txt` file lists multiple aliases for some
+/// codepoints.
+///
+/// The given directory should be the directory to the UCD.
+pub fn parse_many_by_codepoint<P, D>(
+    ucd_dir: P,
+) -> Result<BTreeMap<Codepoint, Vec<D>>, Error>
+where
+    P: AsRef<Path>,
+    D: UcdFileByCodepoint,
+{
+    let mut by_cp: BTreeMap<Codepoint, Vec<D>> = BTreeMap::new();
+    for record in D::from_dir(ucd_dir)? {
+        let record = record?;
+        for cp in record.codepoints() {
+            by_cp.entry(cp).or_default().push(record.clone());
+        }
+    }
+    Ok(by_cp)
+}
+
+/// Given a path pointing at the root of the `ucd_dir`, attempts to determine
+/// its Unicode version, returned as `(major, minor, patch)`.
+///
+/// This only inspects the very first line of `PropList.txt` -- in practice
+/// this works for all versions of UCD since 4.1.0.
+pub fn ucd_directory_version<D: ?Sized + AsRef<Path>>(
+    ucd_dir: &D,
+) -> Result<(u64, u64, u64), Error> {
+    // Avoid duplication from generic path parameter.
+    fn ucd_directory_version_inner(
+        ucd_dir: &Path,
+    ) -> Result<(u64, u64, u64), Error> {
+        lazy_static::lazy_static! {
+            // Dots are escaped so that they only match a literal `.`.
+            static ref VERSION_RX: Regex =
+                Regex::new(r"-([0-9]+)\.([0-9]+)\.([0-9]+)\.txt").unwrap();
+        }
+        let proplist = ucd_dir.join("PropList.txt");
+        let contents = first_line(&proplist)?;
+        let caps = match VERSION_RX.captures(&contents) {
+            Some(c) => c,
+            None => {
+                return err!("Failed to find version in line {:?}", contents)
+            }
+        };
+        let capture_to_num = |n| {
+            caps.get(n).unwrap().as_str().parse::<u64>().map_err(|e| Error {
+                kind: ErrorKind::Parse(format!(
+                    "Failed to parse version from {:?} in PropList.txt: {}",
+                    contents, e
+                )),
+                // The version is on the first line; line numbers are 1-based.
+                line: Some(1),
+                path: Some(proplist.clone()),
+            })
+        };
+        let major = capture_to_num(1)?;
+        let minor = capture_to_num(2)?;
+        let patch = capture_to_num(3)?;
+
+        Ok((major, minor, patch))
+    }
+    ucd_directory_version_inner(ucd_dir.as_ref())
+}
+
+/// Reads and returns the first line of the file at `path`, including its
+/// line terminator if present. I/O failures carry `path` but no line number.
+fn first_line(path: &Path) -> Result<String, Error> {
+    // Opening and reading fail the same way, so they share one conversion.
+    let io_error = |e: io::Error| Error {
+        kind: ErrorKind::Io(e),
+        line: None,
+        path: Some(path.into()),
+    };
+
+    let file = File::open(path).map_err(io_error)?;
+    let mut reader = io::BufReader::new(file);
+    let mut line_contents = String::new();
+    reader.read_line(&mut line_contents).map_err(io_error)?;
+    Ok(line_contents)
+}
+
+/// Parses a `<codepoints> ; <value>` record, yielding the codepoints and the
+/// value with surrounding whitespace removed (borrowed from `line`).
+pub fn parse_codepoint_association<'a>(
+    line: &'a str,
+) -> Result<(Codepoints, &'a str), Error> {
+    lazy_static! {
+        static ref PARTS: Regex = Regex::new(
+            r"(?x)
+            ^
+            \s*(?P<codepoints>[^\s;]+)\s*;
+            \s*(?P<property>[^;\x23]+)\s*
+            "
+        )
+        .unwrap();
+    };
+
+    let caps = match PARTS.captures(line.trim()) {
+        None => return err!("invalid PropList line: '{}'", line),
+        Some(caps) => caps,
+    };
+    let property = match caps.name("property") {
+        None => {
+            return err!(
+                "could not find property name in PropList line: '{}'",
+                line
+            )
+        }
+        Some(found) => found.as_str().trim(),
+    };
+    Ok((caps["codepoints"].parse()?, property))
+}
+
+/// Parses zero or more whitespace separated codepoints.
+///
+/// Empty or all-whitespace input yields an empty vector; the first token
+/// that is not a valid codepoint aborts parsing with its error.
+pub fn parse_codepoint_sequence(s: &str) -> Result<Vec<Codepoint>, Error> {
+    // `split_whitespace` ignores leading and trailing whitespace by itself.
+    let tokens = s.split_whitespace();
+    tokens.map(|token| token.parse()).collect()
+}
+
+/// Parses one test line for the various break algorithms.
+///
+/// On success this returns the codepoints of the line split into groups at
+/// every `÷` marker, each group encoded as a UTF-8 string, together with the
+/// line's trailing comment. The comment is a human readable description of
+/// the test that may prove useful for debugging.
+pub fn parse_break_test(line: &str) -> Result<(Vec<String>, String), Error> {
+    lazy_static! {
+        static ref PARTS: Regex = Regex::new(
+            r"(?x)
+            ^
+            (?:÷|×)
+            (?P<groups>(?:\s[0-9A-Fa-f]{4,5}\s(?:÷|×))+)
+            \s+
+            \#(?P<comment>.+)
+            $
+            "
+        )
+        .unwrap();
+        static ref GROUP: Regex = Regex::new(
+            r"(?x)
+            (?P<codepoint>[0-9A-Fa-f]{4,5})\s(?P<kind>÷|×)
+            "
+        )
+        .unwrap();
+    }
+
+    let caps = match PARTS.captures(line.trim()) {
+        None => return err!("invalid break test line: '{}'", line),
+        Some(caps) => caps,
+    };
+    let comment = caps["comment"].trim().to_string();
+
+    let mut groups = vec![];
+    let mut pending = String::new();
+    for group in GROUP.captures_iter(&caps["groups"]) {
+        let cp: Codepoint = group["codepoint"].parse()?;
+        let ch = match cp.scalar() {
+            Some(ch) => ch,
+            None => {
+                return err!(
+                    "invalid codepoint '{:X}' in line: '{}'",
+                    cp.value(),
+                    line
+                )
+            }
+        };
+        pending.push(ch);
+        if &group["kind"] == "÷" {
+            // A `÷` closes the current group; start collecting a new one.
+            groups.push(std::mem::take(&mut pending));
+        }
+    }
+    Ok((groups, comment))
+}
+
+/// Describes a single UCD file whose lines parse into values of `Self`.
+pub trait UcdFile:
+    Clone + fmt::Debug + Default + Eq + FromStr<Err = Error> + PartialEq
+{
+    /// The file path corresponding to this file, relative to the UCD
+    /// directory.
+    fn relative_file_path() -> &'static Path;
+
+    /// The full file path corresponding to this file, formed by joining
+    /// `ucd_dir` with `relative_file_path()`.
+    fn file_path<P: AsRef<Path>>(ucd_dir: P) -> PathBuf {
+        ucd_dir.as_ref().join(Self::relative_file_path())
+    }
+
+    /// Create an iterator over each record in this UCD file.
+    ///
+    /// `ucd_dir` is the UCD directory; failing to open the file is an error.
+    fn from_dir<P: AsRef<Path>>(
+        ucd_dir: P,
+    ) -> Result<UcdLineParser<File, Self>, Error> {
+        UcdLineParser::from_path(Self::file_path(ucd_dir))
+    }
+}
+
+/// Describes a single UCD file where every record in the file is associated
+/// with one or more codepoints.
+pub trait UcdFileByCodepoint: UcdFile {
+    /// Returns an iterator over the codepoints associated with this record.
+    fn codepoints(&self) -> CodepointIter;
+}
+
+/// A line oriented parser for a particular UCD file.
+///
+/// Callers can build a line parser via the
+/// [`UcdFile::from_dir`](trait.UcdFile.html) method.
+///
+/// The `R` type parameter refers to the underlying `io::Read` implementation
+/// from which the UCD data is read.
+///
+/// The `D` type parameter refers to the type of the record parsed out of each
+/// line.
+#[derive(Debug)]
+pub struct UcdLineParser<R, D> {
+    path: Option<PathBuf>, // attached to I/O errors when present
+    rdr: io::BufReader<R>,
+    line: String, // buffer reused for the line most recently read
+    line_number: u64, // number of the line most recently read, from 1
+    _data: PhantomData<D>,
+}
+
+impl<D> UcdLineParser<File, D> {
+    /// Opens the file at `path` and returns a parser over its contents. If
+    /// the file cannot be opened, the I/O error is returned with the path.
+    pub(crate) fn from_path<P: AsRef<Path>>(
+        path: P,
+    ) -> Result<UcdLineParser<File, D>, Error> {
+        let path = path.as_ref().to_path_buf();
+        let opened = File::open(&path).map_err(ErrorKind::Io);
+        match opened {
+            Ok(file) => Ok(UcdLineParser::new(Some(path), file)),
+            Err(kind) => Err(Error { kind, line: None, path: Some(path) }),
+        }
+    }
+}
+
+impl<R: io::Read, D> UcdLineParser<R, D> {
+    /// Builds a parser over an arbitrary reader.
+    ///
+    /// `path`, when given, is attached to I/O errors reported by the
+    /// parser. The record type is fixed by the `D` type parameter.
+    ///
+    /// The reader is wrapped in a buffered reader here, so callers do not
+    /// need to supply their own buffering.
+    pub(crate) fn new(path: Option<PathBuf>, rdr: R) -> UcdLineParser<R, D> {
+        UcdLineParser {
+            rdr: io::BufReader::new(rdr),
+            path,
+            line_number: 0,
+            line: String::new(),
+            _data: PhantomData,
+        }
+    }
+}
+
+impl<R: io::Read, D: FromStr<Err = Error>> Iterator for UcdLineParser<R, D> {
+    type Item = Result<D, Error>;
+
+    /// Yields the next record; blank lines and `#` lines are skipped.
+    fn next(&mut self) -> Option<Result<D, Error>> {
+        loop {
+            self.line_number += 1;
+            self.line.clear();
+            match self.rdr.read_line(&mut self.line) {
+                Err(err) => {
+                    return Some(Err(Error {
+                        kind: ErrorKind::Io(err),
+                        line: None,
+                        path: self.path.clone(),
+                    }))
+                }
+                Ok(0) => return None, // end of input
+                Ok(_) => {}
+            }
+            let is_comment = self.line.starts_with('#');
+            if !(is_comment || self.line.trim().is_empty()) {
+                break;
+            }
+        }
+        let line_number = self.line_number;
+        Some(self.line.parse().map_err(|mut err: Error| {
+            err.line = Some(line_number);
+            err
+        }))
+    }
+}
+
+/// Either one codepoint or an inclusive range of codepoints.
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, PartialOrd, Ord)]
+pub enum Codepoints {
+    /// Exactly one codepoint.
+    Single(Codepoint),
+    /// A range of codepoints.
+    Range(CodepointRange),
+}
+
+impl Default for Codepoints {
+    fn default() -> Self {
+        Self::Single(Codepoint::default())
+    }
+}
+
+impl IntoIterator for Codepoints {
+    type IntoIter = CodepointIter;
+    type Item = Codepoint;
+
+    fn into_iter(self) -> CodepointIter {
+        match self {
+            Self::Single(cp) => cp.into_iter(),
+            Self::Range(range) => range.into_iter(),
+        }
+    }
+}
+
+impl FromStr for Codepoints {
+    type Err = Error;
+
+    fn from_str(s: &str) -> Result<Codepoints, Error> {
+        // Only ranges contain `..`; anything else is tried as one codepoint.
+        if !s.contains("..") {
+            return Codepoint::from_str(s).map(Codepoints::Single);
+        }
+        CodepointRange::from_str(s).map(Codepoints::Range)
+    }
+}
+
+impl fmt::Display for Codepoints {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            Self::Single(cp) => fmt::Display::fmt(cp, f),
+            Self::Range(range) => fmt::Display::fmt(range, f),
+        }
+    }
+}
+
+impl PartialEq<u32> for Codepoints {
+    fn eq(&self, other: &u32) -> bool {
+        match self {
+            Self::Single(cp) => cp == other,
+            Self::Range(range) => range == &(*other, *other),
+        }
+    }
+}
+
+impl PartialEq<Codepoint> for Codepoints {
+    fn eq(&self, other: &Codepoint) -> bool {
+        match self {
+            Self::Single(cp) => cp == other,
+            Self::Range(range) => range == &(*other, *other),
+        }
+    }
+}
+
+impl PartialEq<(u32, u32)> for Codepoints {
+    fn eq(&self, other: &(u32, u32)) -> bool {
+        match self {
+            Self::Single(cp) => &(cp.value(), cp.value()) == other,
+            Self::Range(range) => range == other,
+        }
+    }
+}
+
+impl PartialEq<(Codepoint, Codepoint)> for Codepoints {
+    fn eq(&self, other: &(Codepoint, Codepoint)) -> bool {
+        match self {
+            Self::Single(cp) => &(*cp, *cp) == other,
+            Self::Range(range) => range == other,
+        }
+    }
+}
+
+/// An inclusive range of Unicode codepoints. Both ends are valid codepoints,
+/// but nothing enforces `start <= end`; a reversed range iterates as empty.
+#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord)]
+pub struct CodepointRange {
+    /// The first codepoint in the range (inclusive).
+    pub start: Codepoint,
+    /// The last codepoint in the range (inclusive).
+    pub end: Codepoint,
+}
+
+impl IntoIterator for CodepointRange {
+    type IntoIter = CodepointIter;
+    type Item = Codepoint;
+    /// Starts iteration at `start`; `end` is the last codepoint yielded.
+    fn into_iter(self) -> Self::IntoIter {
+        CodepointIter { range: self, next: self.start.value() }
+    }
+}
+
+impl FromStr for CodepointRange {
+    type Err = Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        lazy_static! {
+            static ref PARTS: Regex =
+                Regex::new(r"^(?P<start>[A-Z0-9]+)\.\.(?P<end>[A-Z0-9]+)$")
+                    .unwrap();
+        }
+        let caps = match PARTS.captures(s) {
+            None => return err!("invalid codepoint range: '{}'", s),
+            Some(caps) => caps,
+        };
+        // Both endpoints are hex codepoints and share one failure message.
+        let endpoint = |name: &str| -> Result<Codepoint, Error> {
+            caps[name].parse().or_else(|err: Error| {
+                err!("failed to parse '{}' as a codepoint range: {}", s, err)
+            })
+        };
+        Ok(Self { start: endpoint("start")?, end: endpoint("end")? })
+    }
+}
+
+impl fmt::Display for CodepointRange {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_fmt(format_args!("{}..{}", self.start, self.end))
+    }
+}
+
+impl PartialEq<(u32, u32)> for CodepointRange {
+    fn eq(&self, other: &(u32, u32)) -> bool {
+        self.start.value() == other.0 && self.end.value() == other.1
+    }
+}
+
+impl PartialEq<(Codepoint, Codepoint)> for CodepointRange {
+    fn eq(&self, other: &(Codepoint, Codepoint)) -> bool {
+        self.start == other.0 && self.end == other.1
+    }
+}
+
+/// A single Unicode codepoint.
+///
+/// Parsed from and displayed as a hexadecimal number. `from_u32` rejects
+/// anything above `10FFFF`, so the value is always in `[0, 10FFFF]`.
+///
+/// Unlike Rust's `char`, this type can also hold a surrogate codepoint.
+#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord)]
+pub struct Codepoint(u32);
+
+impl Codepoint {
+    /// Build a codepoint from its numeric value.
+    ///
+    /// Values above `0x10FFFF` are not Unicode codepoints and produce an
+    /// error.
+    pub fn from_u32(n: u32) -> Result<Codepoint, Error> {
+        if n <= 0x10FFFF {
+            Ok(Codepoint(n))
+        } else {
+            err!("{:x} is not a valid Unicode codepoint", n)
+        }
+    }
+
+    /// The numeric value of this codepoint.
+    pub fn value(self) -> u32 {
+        self.0
+    }
+
+    /// This codepoint as a Unicode scalar value (a Rust `char`).
+    ///
+    /// Surrogate codepoints are not scalar values, so they yield `None`.
+    pub fn scalar(self) -> Option<char> {
+        char::from_u32(self.value())
+    }
+}
+
+impl IntoIterator for Codepoint {
+    type IntoIter = CodepointIter;
+    type Item = Codepoint;
+
+    /// Yields just this codepoint, via the one-element range `self..self`.
+    fn into_iter(self) -> Self::IntoIter {
+        CodepointRange { start: self, end: self }.into_iter()
+    }
+}
+
+impl FromStr for Codepoint {
+    type Err = Error;
+
+    /// Parses a hexadecimal codepoint such as `1F600`; the value must not
+    /// exceed `10FFFF`.
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let value = u32::from_str_radix(s, 16).or_else(|err| {
+            err!(
+                "failed to parse '{}' as a hexadecimal codepoint: {}",
+                s,
+                err
+            )
+        })?;
+        Codepoint::from_u32(value)
+    }
+}
+
+impl fmt::Display for Codepoint {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "{:04X}", self.value())
+    }
+}
+
+impl PartialEq<u32> for Codepoint {
+    fn eq(&self, other: &u32) -> bool {
+        self.value() == *other
+    }
+}
+
+impl PartialEq<Codepoint> for u32 {
+    fn eq(&self, other: &Codepoint) -> bool {
+        *self == other.value()
+    }
+}
+
+/// An iterator over an inclusive range of Unicode codepoints.
+#[derive(Debug)]
+pub struct CodepointIter {
+    next: u32, // value of the next codepoint to yield
+    range: CodepointRange, // iteration stops once `next` passes `range.end`
+}
+
+impl Iterator for CodepointIter {
+    type Item = Codepoint;
+    /// Yields the next codepoint, or `None` once past the end of the range.
+    fn next(&mut self) -> Option<Self::Item> {
+        let current = self.next;
+        if current > self.range.end.value() {
+            return None;
+        }
+        self.next = current + 1;
+        Some(Codepoint::from_u32(current).unwrap())
+    }
+}
diff --git a/vendor/ucd-parse/src/core_properties.rs b/vendor/ucd-parse/src/core_properties.rs
new file mode 100644
index 000000000..9a7682b43
--- /dev/null
+++ b/vendor/ucd-parse/src/core_properties.rs
@@ -0,0 +1,60 @@
+use std::path::Path;
+use std::str::FromStr;
+
+use crate::common::{
+    parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
+    UcdFileByCodepoint,
+};
+use crate::error::Error;
+
+/// One parsed data row of the `DerivedCoreProperties.txt` file.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct CoreProperty {
+    /// The single codepoint or codepoint range this row applies to.
+    pub codepoints: Codepoints,
+    /// The name of the property assigned to those codepoints.
+    pub property: String,
+}
+
+impl UcdFile for CoreProperty {
+    fn relative_file_path() -> &'static Path {
+        "DerivedCoreProperties.txt".as_ref()
+    }
+}
+
+impl UcdFileByCodepoint for CoreProperty {
+    fn codepoints(&self) -> CodepointIter {
+        IntoIterator::into_iter(self.codepoints)
+    }
+}
+
+impl FromStr for CoreProperty {
+    type Err = Error;
+    /// Parses one data row into its codepoints and property name.
+    fn from_str(line: &str) -> Result<Self, Self::Err> {
+        let (codepoints, name) = parse_codepoint_association(line)?;
+        Ok(Self { codepoints, property: name.to_string() })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::CoreProperty;
+
+    #[test]
+    fn parse_single() {
+        let line =
+            "1163D         ; Case_Ignorable # Mn       MODI SIGN ANUSVARA\n";
+        let row = line.parse::<CoreProperty>().unwrap();
+        assert_eq!(row.codepoints, 0x1163D);
+        assert_eq!(row.property, "Case_Ignorable");
+    }
+
+    #[test]
+    fn parse_range() {
+        let line = "11133..11134  ; Grapheme_Link # Mn   [2] CHAKMA VIRAMA..CHAKMA MAAYYAA\n";
+        let row = line.parse::<CoreProperty>().unwrap();
+        assert_eq!(row.codepoints, (0x11133, 0x11134));
+        assert_eq!(row.property, "Grapheme_Link");
+    }
+}
diff --git a/vendor/ucd-parse/src/emoji_properties.rs b/vendor/ucd-parse/src/emoji_properties.rs
new file mode 100644
index 000000000..dc5c0c884
--- /dev/null
+++ b/vendor/ucd-parse/src/emoji_properties.rs
@@ -0,0 +1,86 @@
+use std::path::{Path, PathBuf};
+use std::str::FromStr;
+
+use crate::common::{
+    parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
+    UcdFileByCodepoint,
+};
+use crate::error::Error;
+
+/// One parsed data row of the `emoji-data.txt` file.
+///
+/// The `emoji-data.txt` file is the source of truth on several Emoji-related
+/// Unicode properties.
+///
+/// Note that `emoji-data.txt` is not formally part of the Unicode Character
+/// Database. The Emoji data files can be downloaded separately from:
+/// https://unicode.org/Public/emoji/
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct EmojiProperty {
+    /// The single codepoint or codepoint range this row applies to.
+    pub codepoints: Codepoints,
+    /// The name of the property assigned to those codepoints.
+    pub property: String,
+}
+
+impl UcdFile for EmojiProperty {
+    fn relative_file_path() -> &'static Path {
+        Path::new("emoji/emoji-data.txt")
+    }
+
+    /// Locate `emoji-data.txt` under `ucd_dir`.
+    ///
+    /// The standard `emoji/` location is preferred, then the older top-level
+    /// one. When neither exists, the standard path is returned so that any
+    /// later error message names it.
+    fn file_path<P: AsRef<Path>>(ucd_dir: P) -> PathBuf {
+        let ucd_dir = ucd_dir.as_ref();
+        // The standard location, but only on UCDs from 13.0.0 and up.
+        let standard = ucd_dir.join(Self::relative_file_path());
+        if standard.exists() {
+            return standard;
+        }
+        // Otherwise fall back to the old top-level location if it exists.
+        let legacy = ucd_dir.join("emoji-data.txt");
+        if legacy.exists() {
+            return legacy;
+        }
+        standard
+    }
+}
+
+impl UcdFileByCodepoint for EmojiProperty {
+    fn codepoints(&self) -> CodepointIter {
+        IntoIterator::into_iter(self.codepoints)
+    }
+}
+
+impl FromStr for EmojiProperty {
+    type Err = Error;
+    /// Parses one data row into its codepoints and property name.
+    fn from_str(line: &str) -> Result<Self, Self::Err> {
+        let (codepoints, name) = parse_codepoint_association(line)?;
+        Ok(Self { codepoints, property: name.to_string() })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::EmojiProperty;
+
+    #[test]
+    fn parse_single() {
+        let line = "24C2          ; Emoji                #  1.1  [1] (Ⓜ️)       circled M\n";
+        let row = line.parse::<EmojiProperty>().unwrap();
+        assert_eq!(row.codepoints, 0x24C2);
+        assert_eq!(row.property, "Emoji");
+    }
+
+    #[test]
+    fn parse_range() {
+        let line = "1FA6E..1FFFD  ; Extended_Pictographic#   NA[1424] (🩮️..🿽️)   <reserved-1FA6E>..<reserved-1FFFD>\n";
+        let row = line.parse::<EmojiProperty>().unwrap();
+        assert_eq!(row.codepoints, (0x1FA6E, 0x1FFFD));
+        assert_eq!(row.property, "Extended_Pictographic");
+    }
+}
diff --git a/vendor/ucd-parse/src/error.rs b/vendor/ucd-parse/src/error.rs
new file mode 100644
index 000000000..9dafc4b33
--- /dev/null
+++ b/vendor/ucd-parse/src/error.rs
@@ -0,0 +1,86 @@
+use std::error;
+use std::fmt;
+use std::io;
+use std::path::{Path, PathBuf};
+
+/// An I/O or parse error, optionally tagged with a file path and a line.
+#[derive(Debug)]
+pub struct Error {
+    pub(crate) kind: ErrorKind, // what went wrong
+    pub(crate) line: Option<u64>, // line number, if one was recorded
+    pub(crate) path: Option<PathBuf>, // file path, if one was recorded
+}
+
+/// The kind of error that occurred while parsing the UCD.
+#[derive(Debug)]
+pub enum ErrorKind {
+    /// An underlying I/O error.
+    Io(io::Error),
+    /// A generic parse error, carrying a human readable message.
+    Parse(String),
+}
+
+impl Error {
+    /// Build a parse error carrying `msg`.
+    ///
+    /// The new error has no line number or file path attached.
+    pub(crate) fn parse(msg: String) -> Error {
+        Error { line: None, path: None, kind: ErrorKind::Parse(msg) }
+    }
+
+    /// The specific kind of this error.
+    pub fn kind(&self) -> &ErrorKind {
+        &self.kind
+    }
+
+    /// The line number this error refers to, when one was recorded.
+    pub fn line(&self) -> Option<u64> {
+        self.line
+    }
+
+    /// The file path this error refers to, when one was recorded.
+    pub fn path(&self) -> Option<&Path> {
+        self.path.as_deref()
+    }
+
+    /// Consume this error, returning its underlying kind.
+    pub fn into_kind(self) -> ErrorKind {
+        self.kind
+    }
+
+    /// Whether this error wraps an I/O error.
+    ///
+    /// A `true` result means the underlying `ErrorKind` is guaranteed to be
+    /// `ErrorKind::Io`; every other kind (that is, `ErrorKind::Parse`) yields
+    /// `false`.
+    pub fn is_io_error(&self) -> bool {
+        matches!(self.kind, ErrorKind::Io(_))
+    }
+}
+
+impl error::Error for Error {
+    fn source(&self) -> Option<&(dyn error::Error + 'static)> {
+        match self.kind {
+            ErrorKind::Io(ref err) => Some(err), // `cause` forwards here
+            ErrorKind::Parse(_) => None, // a parse error wraps nothing
+        }
+    }
+}
+
+impl fmt::Display for Error {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // Location prefix first: `path:line: `, `path: `, or, with no path,
+        // `error on line N: `. Nothing is written when neither is known.
+        match (self.path.as_ref(), self.line) {
+            (Some(p), Some(n)) => write!(f, "{}:{}: ", p.display(), n)?,
+            (Some(p), None) => write!(f, "{}: ", p.display())?,
+            (None, Some(n)) => write!(f, "error on line {}: ", n)?,
+            (None, None) => {}
+        }
+        // Then the message itself.
+        match &self.kind {
+            ErrorKind::Io(err) => write!(f, "{}", err),
+            ErrorKind::Parse(msg) => write!(f, "{}", msg),
+        }
+    }
+}
diff --git a/vendor/ucd-parse/src/grapheme_cluster_break.rs b/vendor/ucd-parse/src/grapheme_cluster_break.rs
new file mode 100644
index 000000000..9dbf32f41
--- /dev/null
+++ b/vendor/ucd-parse/src/grapheme_cluster_break.rs
@@ -0,0 +1,98 @@
+use std::path::Path;
+use std::str::FromStr;
+
+use crate::common::{
+    parse_break_test, parse_codepoint_association, CodepointIter, Codepoints,
+    UcdFile, UcdFileByCodepoint,
+};
+use crate::error::Error;
+
+/// One parsed data row of the `auxiliary/GraphemeBreakProperty.txt` file.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct GraphemeClusterBreak {
+    /// The single codepoint or codepoint range this row applies to.
+    pub codepoints: Codepoints,
+    /// The property value assigned to those codepoints.
+    pub value: String,
+}
+
+impl UcdFile for GraphemeClusterBreak {
+    fn relative_file_path() -> &'static Path {
+        "auxiliary/GraphemeBreakProperty.txt".as_ref()
+    }
+}
+
+impl UcdFileByCodepoint for GraphemeClusterBreak {
+    fn codepoints(&self) -> CodepointIter {
+        IntoIterator::into_iter(self.codepoints)
+    }
+}
+
+impl FromStr for GraphemeClusterBreak {
+    type Err = Error;
+    /// Parses one data row into its codepoints and property value.
+    fn from_str(line: &str) -> Result<Self, Self::Err> {
+        let (codepoints, text) = parse_codepoint_association(line)?;
+        Ok(Self { codepoints, value: text.to_string() })
+    }
+}
+
+/// One parsed test case from the `auxiliary/GraphemeBreakTest.txt` file.
+///
+/// Each row is a test for the grapheme cluster break algorithm.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct GraphemeClusterBreakTest {
+    /// The expected grapheme clusters. Each string is the UTF-8 encoding of
+    /// the group of codepoints that make up one cluster.
+    pub grapheme_clusters: Vec<String>,
+    /// A human readable description of this test.
+    pub comment: String,
+}
+
+impl UcdFile for GraphemeClusterBreakTest {
+    fn relative_file_path() -> &'static Path {
+        "auxiliary/GraphemeBreakTest.txt".as_ref()
+    }
+}
+
+impl FromStr for GraphemeClusterBreakTest {
+    type Err = Error;
+    /// Parses one test row into its expected clusters and trailing comment.
+    fn from_str(line: &str) -> Result<Self, Self::Err> {
+        let (grapheme_clusters, comment) = parse_break_test(line)?;
+        Ok(Self { grapheme_clusters, comment })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::{GraphemeClusterBreak, GraphemeClusterBreakTest};
+
+    #[test]
+    fn parse_single() {
+        let line = "093B          ; SpacingMark # Mc       DEVANAGARI VOWEL SIGN OOE\n";
+        let row = line.parse::<GraphemeClusterBreak>().unwrap();
+        assert_eq!(row.codepoints, 0x093B);
+        assert_eq!(row.value, "SpacingMark");
+    }
+
+    #[test]
+    fn parse_range() {
+        let line = "1F1E6..1F1FF  ; Regional_Indicator # So  [26] REGIONAL INDICATOR SYMBOL LETTER A..REGIONAL INDICATOR SYMBOL LETTER Z\n";
+        let row = line.parse::<GraphemeClusterBreak>().unwrap();
+        assert_eq!(row.codepoints, (0x1F1E6, 0x1F1FF));
+        assert_eq!(row.value, "Regional_Indicator");
+    }
+
+    #[test]
+    fn parse_test() {
+        let line = "÷ 0061 × 1F3FF ÷ 1F476 × 200D × 1F6D1 ÷	#  ÷ [0.2] LATIN SMALL LETTER A (Other) × [9.0] EMOJI MODIFIER FITZPATRICK TYPE-6 (Extend) ÷ [999.0] BABY (ExtPict) × [9.0] ZERO WIDTH JOINER (ZWJ_ExtCccZwj) × [11.0] OCTAGONAL SIGN (ExtPict) ÷ [0.3]\n";
+
+        let row = line.parse::<GraphemeClusterBreakTest>().unwrap();
+        assert_eq!(
+            row.grapheme_clusters,
+            vec!["\u{0061}\u{1F3FF}", "\u{1F476}\u{200D}\u{1F6D1}",]
+        );
+        assert!(row.comment.starts_with("÷ [0.2] LATIN SMALL LETTER A"));
+    }
+}
diff --git a/vendor/ucd-parse/src/jamo_short_name.rs b/vendor/ucd-parse/src/jamo_short_name.rs
new file mode 100644
index 000000000..4103dd7ee
--- /dev/null
+++ b/vendor/ucd-parse/src/jamo_short_name.rs
@@ -0,0 +1,80 @@
+use std::path::Path;
+use std::str::FromStr;
+
+use lazy_static::lazy_static;
+use regex::Regex;
+
+use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint};
+use crate::error::Error;
+
+/// One parsed data row of the `Jamo.txt` file.
+///
+/// The `Jamo.txt` file defines the `Jamo_Short_Name` property.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct JamoShortName {
+    /// The codepoint corresponding to this row.
+    pub codepoint: Codepoint,
+    /// The "Jamo Short Name": zero or more ASCII uppercase letters. Described
+    /// upstream as at most 3 bytes; the parser itself does not check that.
+    pub name: String,
+}
+
+impl UcdFile for JamoShortName {
+    fn relative_file_path() -> &'static Path {
+        "Jamo.txt".as_ref()
+    }
+}
+
+impl UcdFileByCodepoint for JamoShortName {
+    fn codepoints(&self) -> CodepointIter {
+        IntoIterator::into_iter(self.codepoint)
+    }
+}
+
+impl FromStr for JamoShortName {
+    type Err = Error;
+
+    /// Parses a `CODEPOINT; NAME` row; the short name may be empty.
+    fn from_str(line: &str) -> Result<Self, Self::Err> {
+        lazy_static! {
+            static ref PARTS: Regex = Regex::new(
+                r"(?x)
+                ^
+                (?P<codepoint>[A-Z0-9]+);
+                \s*
+                (?P<name>[A-Z]*)
+                "
+            )
+            .unwrap();
+        };
+
+        let fields = match PARTS.captures(line.trim()) {
+            None => return err!("invalid Jamo_Short_name line"),
+            Some(fields) => fields,
+        };
+        let codepoint: Codepoint = fields["codepoint"].parse()?;
+        let name = fields["name"].to_string();
+        Ok(JamoShortName { codepoint, name })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::JamoShortName;
+
+    #[test]
+    fn parse1() {
+        let line = "1164; YAE # HANGUL JUNGSEONG YAE\n";
+        let row = line.parse::<JamoShortName>().unwrap();
+        assert_eq!(row.codepoint, 0x1164);
+        assert_eq!(row.name, "YAE");
+    }
+
+    #[test]
+    fn parse2() {
+        let line = "110B;     # HANGUL CHOSEONG IEUNG\n";
+        let row = line.parse::<JamoShortName>().unwrap();
+        assert_eq!(row.codepoint, 0x110B);
+        assert_eq!(row.name, "");
+    }
+}
diff --git a/vendor/ucd-parse/src/lib.rs b/vendor/ucd-parse/src/lib.rs
new file mode 100644
index 000000000..f6654658a
--- /dev/null
+++ b/vendor/ucd-parse/src/lib.rs
@@ -0,0 +1,66 @@
+/*!
+A library for parsing the Unicode character database.
+*/
+
+#![deny(missing_docs)]
+
+pub use crate::common::{
+    parse, parse_by_codepoint, parse_many_by_codepoint, ucd_directory_version,
+    Codepoint, CodepointIter, CodepointRange, Codepoints, UcdFile,
+    UcdFileByCodepoint, UcdLineParser,
+};
+pub use crate::error::{Error, ErrorKind};
+
+pub use crate::age::Age;
+pub use crate::arabic_shaping::ArabicShaping;
+pub use crate::bidi_mirroring_glyph::BidiMirroring;
+pub use crate::case_folding::{CaseFold, CaseStatus};
+pub use crate::core_properties::CoreProperty;
+pub use crate::emoji_properties::EmojiProperty;
+pub use crate::grapheme_cluster_break::{
+    GraphemeClusterBreak, GraphemeClusterBreakTest,
+};
+pub use crate::jamo_short_name::JamoShortName;
+pub use crate::line_break::LineBreakTest;
+pub use crate::name_aliases::{NameAlias, NameAliasLabel};
+pub use crate::prop_list::Property;
+pub use crate::property_aliases::PropertyAlias;
+pub use crate::property_value_aliases::PropertyValueAlias;
+pub use crate::script_extensions::ScriptExtension;
+pub use crate::scripts::Script;
+pub use crate::sentence_break::{SentenceBreak, SentenceBreakTest};
+pub use crate::special_casing::SpecialCaseMapping;
+pub use crate::unicode_data::{
+    UnicodeData, UnicodeDataDecomposition, UnicodeDataDecompositionTag,
+    UnicodeDataExpander, UnicodeDataNumeric,
+};
+pub use crate::word_break::{WordBreak, WordBreakTest};
+
+macro_rules! err { // `err!(fmt, args..)` => `Err(Error::parse(format!(..)))`
+    ($($tt:tt)*) => {
+        Err(crate::error::Error::parse(format!($($tt)*)))
+    }
+}
+
+mod common;
+mod error;
+
+mod age;
+mod arabic_shaping;
+mod bidi_mirroring_glyph;
+mod case_folding;
+mod core_properties;
+mod emoji_properties;
+mod grapheme_cluster_break;
+mod jamo_short_name;
+mod line_break;
+mod name_aliases;
+mod prop_list;
+mod property_aliases;
+mod property_value_aliases;
+mod script_extensions;
+mod scripts;
+mod sentence_break;
+mod special_casing;
+mod unicode_data;
+mod word_break;
diff --git a/vendor/ucd-parse/src/line_break.rs b/vendor/ucd-parse/src/line_break.rs
new file mode 100644
index 000000000..aa62fcb9e
--- /dev/null
+++ b/vendor/ucd-parse/src/line_break.rs
@@ -0,0 +1,49 @@
+use std::path::Path;
+use std::str::FromStr;
+
+use crate::common::{parse_break_test, UcdFile};
+use crate::error::Error;
+
+/// One parsed test case from the `auxiliary/LineBreakTest.txt` file.
+///
+/// Each row is a test for the line break algorithm.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct LineBreakTest {
+    /// The expected lines. Each string is the UTF-8 encoding of the group of
+    /// codepoints that make up a single line.
+    pub lines: Vec<String>,
+    /// A human readable description of this test.
+    pub comment: String,
+}
+
+impl UcdFile for LineBreakTest {
+    fn relative_file_path() -> &'static Path {
+        "auxiliary/LineBreakTest.txt".as_ref()
+    }
+}
+
+impl FromStr for LineBreakTest {
+    type Err = Error;
+    /// Parses one test row into its expected lines and trailing comment.
+    fn from_str(line: &str) -> Result<Self, Self::Err> {
+        let (lines, comment) = parse_break_test(line)?;
+        Ok(Self { lines, comment })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::LineBreakTest;
+
+    #[test]
+    fn parse_test() {
+        let line = "× 1F1F7 × 1F1FA ÷ 1F1F8 × 1F1EA ÷   #  × [0.3] REGIONAL INDICATOR SYMBOL LETTER R (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER U (RI) ÷ [30.13] REGIONAL INDICATOR SYMBOL LETTER S (RI) × [30.11] REGIONAL INDICATOR SYMBOL LETTER E (RI) ÷ [0.3]";
+
+        let row = line.parse::<LineBreakTest>().unwrap();
+        assert_eq!(
+            row.lines,
+            vec!["\u{1F1F7}\u{1F1FA}", "\u{1F1F8}\u{1F1EA}",]
+        );
+        assert!(row.comment.ends_with("(RI) ÷ [0.3]"));
+    }
+}
diff --git a/vendor/ucd-parse/src/name_aliases.rs b/vendor/ucd-parse/src/name_aliases.rs
new file mode 100644
index 000000000..36c9c4b01
--- /dev/null
+++ b/vendor/ucd-parse/src/name_aliases.rs
@@ -0,0 +1,145 @@
+use std::path::Path;
+use std::str::FromStr;
+
+use lazy_static::lazy_static;
+use regex::Regex;
+
+use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint};
+use crate::error::Error;
+
+/// One parsed data row of the `NameAliases.txt` file.
+///
+/// Some codepoints have several rows in the file; each row contributes one
+/// more alias.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct NameAlias {
+    /// The codepoint corresponding to this row.
+    pub codepoint: Codepoint,
+    /// The alias text, taken from the row's second field.
+    pub alias: String,
+    /// The label (category) of this alias, from the row's third field.
+    pub label: NameAliasLabel,
+}
+
+impl UcdFile for NameAlias {
+    fn relative_file_path() -> &'static Path {
+        "NameAliases.txt".as_ref()
+    }
+}
+
+impl UcdFileByCodepoint for NameAlias {
+    fn codepoints(&self) -> CodepointIter {
+        IntoIterator::into_iter(self.codepoint)
+    }
+}
+
+impl FromStr for NameAlias {
+    type Err = Error;
+
+    /// Parses a `CODEPOINT;ALIAS;LABEL` row.
+    fn from_str(line: &str) -> Result<Self, Self::Err> {
+        lazy_static! {
+            static ref PARTS: Regex = Regex::new(
+                r"(?x)
+                ^
+                (?P<codepoint>[A-Z0-9]+);
+                \s*
+                (?P<alias>[^;]+);
+                \s*
+                (?P<label>\S+)
+                "
+            )
+            .unwrap();
+        };
+
+        let fields = match PARTS.captures(line.trim()) {
+            None => return err!("invalid NameAliases line"),
+            Some(fields) => fields,
+        };
+        let codepoint: Codepoint = fields["codepoint"].parse()?;
+        let alias = fields["alias"].to_string();
+        let label: NameAliasLabel = fields["label"].parse()?;
+        Ok(NameAlias { codepoint, alias, label })
+    }
+}
+
+/// The label of a name alias, parsed from its lowercase name in the file.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum NameAliasLabel {
+    /// Corrections for serious problems in a character name.
+    Correction,
+    /// ISO 6429 names for C0 and C1 control functions and other commonly
+    /// occurring names for control codes.
+    Control,
+    /// A few widely used alternate names for format characters.
+    Alternate,
+    /// Several documented labels for C1 control code points which were
+    /// never actually approved in any standard.
+    Figment,
+    /// Commonly occurring abbreviations (or acronyms) for control codes,
+    /// format characters, spaces and variation selectors.
+    Abbreviation,
+}
+
+impl Default for NameAliasLabel {
+    fn default() -> Self {
+        // Arbitrary choice; it lets `NameAlias` derive `Default`.
+        Self::Correction
+    }
+}
+
+impl FromStr for NameAliasLabel {
+    type Err = Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        Ok(match s {
+            "correction" => NameAliasLabel::Correction,
+            "control" => NameAliasLabel::Control,
+            "alternate" => NameAliasLabel::Alternate,
+            "figment" => NameAliasLabel::Figment,
+            "abbreviation" => NameAliasLabel::Abbreviation,
+            _ => return err!("unknown name alias label: '{}'", s),
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::{NameAlias, NameAliasLabel};
+
+    #[test]
+    fn parse1() {
+        let line = "0000;NULL;control\n";
+        let row = line.parse::<NameAlias>().unwrap();
+        assert_eq!(row.codepoint, 0x0);
+        assert_eq!(row.alias, "NULL");
+        assert_eq!(row.label, NameAliasLabel::Control);
+    }
+
+    #[test]
+    fn parse2() {
+        let line = "000B;VERTICAL TABULATION;control\n";
+        let row = line.parse::<NameAlias>().unwrap();
+        assert_eq!(row.codepoint, 0xB);
+        assert_eq!(row.alias, "VERTICAL TABULATION");
+        assert_eq!(row.label, NameAliasLabel::Control);
+    }
+
+    #[test]
+    fn parse3() {
+        let line = "0081;HIGH OCTET PRESET;figment\n";
+        let row = line.parse::<NameAlias>().unwrap();
+        assert_eq!(row.codepoint, 0x81);
+        assert_eq!(row.alias, "HIGH OCTET PRESET");
+        assert_eq!(row.label, NameAliasLabel::Figment);
+    }
+
+    #[test]
+    fn parse4() {
+        let line = "E01EF;VS256;abbreviation\n";
+        let row = line.parse::<NameAlias>().unwrap();
+        assert_eq!(row.codepoint, 0xE01EF);
+        assert_eq!(row.alias, "VS256");
+        assert_eq!(row.label, NameAliasLabel::Abbreviation);
+    }
+}
diff --git a/vendor/ucd-parse/src/prop_list.rs b/vendor/ucd-parse/src/prop_list.rs
new file mode 100644
index 000000000..db830c57a
--- /dev/null
+++ b/vendor/ucd-parse/src/prop_list.rs
@@ -0,0 +1,63 @@
+use std::path::Path;
+use std::str::FromStr;
+
+use crate::common::{
+    parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
+    UcdFileByCodepoint,
+};
+use crate::error::Error;
+
+/// One parsed data row of the `PropList.txt` file.
+///
+/// The `PropList.txt` file is the source of truth on several Unicode
+/// properties.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct Property {
+    /// The single codepoint or codepoint range this row applies to.
+    pub codepoints: Codepoints,
+    /// The name of the property assigned to those codepoints.
+    pub property: String,
+}
+
+impl UcdFile for Property {
+    fn relative_file_path() -> &'static Path {
+        "PropList.txt".as_ref()
+    }
+}
+
+impl UcdFileByCodepoint for Property {
+    fn codepoints(&self) -> CodepointIter {
+        IntoIterator::into_iter(self.codepoints)
+    }
+}
+
+impl FromStr for Property {
+    type Err = Error;
+    /// Parses one data row into its codepoints and property name.
+    fn from_str(line: &str) -> Result<Self, Self::Err> {
+        let (codepoints, name) = parse_codepoint_association(line)?;
+        Ok(Self { codepoints, property: name.to_string() })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::Property;
+
+    #[test]
+    fn parse_single() {
+        let line =
+            "061C          ; Bidi_Control # Cf       ARABIC LETTER MARK\n";
+        let row = line.parse::<Property>().unwrap();
+        assert_eq!(row.codepoints, 0x061C);
+        assert_eq!(row.property, "Bidi_Control");
+    }
+
+    #[test]
+    fn parse_range() {
+        let line = "0009..000D    ; White_Space # Cc   [5] <control-0009>..<control-000D>\n";
+        let row = line.parse::<Property>().unwrap();
+        assert_eq!(row.codepoints, (0x0009, 0x000D));
+        assert_eq!(row.property, "White_Space");
+    }
+}
diff --git a/vendor/ucd-parse/src/property_aliases.rs b/vendor/ucd-parse/src/property_aliases.rs
new file mode 100644
index 000000000..f94a116e6
--- /dev/null
+++ b/vendor/ucd-parse/src/property_aliases.rs
@@ -0,0 +1,113 @@
+use std::path::Path;
+use std::str::FromStr;
+
+use lazy_static::lazy_static;
+use regex::Regex;
+
+use crate::common::UcdFile;
+use crate::error::Error;
+
+/// One parsed data row of the `PropertyAliases.txt` file.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct PropertyAlias {
+    /// The abbreviation for this property (the row's first field).
+    pub abbreviation: String,
+    /// The "long" name of this property (the row's second field).
+    pub long: String,
+    /// Any additional aliases from later fields; empty when there are none.
+    pub aliases: Vec<String>,
+}
+
+impl UcdFile for PropertyAlias {
+    fn relative_file_path() -> &'static Path {
+        "PropertyAliases.txt".as_ref()
+    }
+}
+
+impl FromStr for PropertyAlias {
+    type Err = Error;
+    /// Parses `abbrev ; long [; alias]*`; the alias fields are optional.
+    fn from_str(line: &str) -> Result<Self, Self::Err> {
+        lazy_static! {
+            static ref PARTS: Regex = Regex::new(
+                r"(?x)
+                ^
+                \s*(?P<abbrev>[^\s;]+)\s*;
+                \s*(?P<long>[^\s;]+)\s*
+                (?:;(?P<aliases>.*))?
+                "
+            )
+            .unwrap();
+            static ref ALIASES: Regex =
+                Regex::new(r"\s*(?P<alias>[^\s;]+)\s*;?\s*").unwrap();
+        };
+
+        let caps = match PARTS.captures(line.trim()) {
+            None => return err!("invalid PropertyAliases line: '{}'", line),
+            Some(caps) => caps,
+        };
+        let aliases = match caps.name("aliases") {
+            None => vec![],
+            Some(rest) => ALIASES
+                .captures_iter(rest.as_str())
+                .map(|acaps| acaps["alias"].to_string())
+                .collect(),
+        };
+        Ok(PropertyAlias {
+            abbreviation: caps["abbrev"].to_string(),
+            long: caps["long"].to_string(),
+            aliases,
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::PropertyAlias;
+
+    #[test]
+    fn parse1() {
+        let line = "cjkAccountingNumeric     ; kAccountingNumeric\n";
+        let row = line.parse::<PropertyAlias>().unwrap();
+        assert_eq!(row.abbreviation, "cjkAccountingNumeric");
+        assert_eq!(row.long, "kAccountingNumeric");
+        assert!(row.aliases.is_empty());
+    }
+
+    #[test]
+    fn parse2() {
+        let line = "nv                       ; Numeric_Value\n";
+        let row = line.parse::<PropertyAlias>().unwrap();
+        assert_eq!(row.abbreviation, "nv");
+        assert_eq!(row.long, "Numeric_Value");
+        assert!(row.aliases.is_empty());
+    }
+
+    #[test]
+    fn parse3() {
+        let line =
+            "scf                      ; Simple_Case_Folding         ; sfc\n";
+        let row = line.parse::<PropertyAlias>().unwrap();
+        assert_eq!(row.abbreviation, "scf");
+        assert_eq!(row.long, "Simple_Case_Folding");
+        assert_eq!(row.aliases, vec!["sfc"]);
+    }
+
+    #[test]
+    fn parse4() {
+        let line = "cjkRSUnicode             ; kRSUnicode                  ; Unicode_Radical_Stroke; URS\n";
+        let row = line.parse::<PropertyAlias>().unwrap();
+        assert_eq!(row.abbreviation, "cjkRSUnicode");
+        assert_eq!(row.long, "kRSUnicode");
+        assert_eq!(row.aliases, vec!["Unicode_Radical_Stroke", "URS"]);
+    }
+
+    #[test]
+    fn parse5() {
+        let line = "isc                      ; ISO_Comment";
+        let row = line.parse::<PropertyAlias>().unwrap();
+        assert_eq!(row.abbreviation, "isc");
+        assert_eq!(row.long, "ISO_Comment");
+        assert!(row.aliases.is_empty());
+    }
+}
diff --git a/vendor/ucd-parse/src/property_value_aliases.rs b/vendor/ucd-parse/src/property_value_aliases.rs
new file mode 100644
index 000000000..7e8a3c890
--- /dev/null
+++ b/vendor/ucd-parse/src/property_value_aliases.rs
@@ -0,0 +1,185 @@
+use std::path::Path;
+use std::str::FromStr;
+
+use lazy_static::lazy_static;
+use regex::Regex;
+
+use crate::common::UcdFile;
+use crate::error::Error;
+
+/// A single row in the `PropertyValueAliases.txt` file.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct PropertyValueAlias {
+    /// The property name for which this value alias applies.
+    pub property: String,
+    /// A numeric abbreviation for this property value, if present. (The
+    /// parser only populates this for `ccc`/`Canonical_Combining_Class`
+    /// rows; every other row gets `None`.)
+    pub numeric: Option<u8>,
+    /// An abbreviation for this property value.
+    pub abbreviation: String,
+    /// The "long" form of this property value.
+    pub long: String,
+    /// Additional value aliases in row order; empty when there are none.
+    pub aliases: Vec<String>,
+}
+
+impl UcdFile for PropertyValueAlias {
+    fn relative_file_path() -> &'static Path {
+        Path::new("PropertyValueAliases.txt") // data file holding these rows
+    }
+}
+
+impl FromStr for PropertyValueAlias {
+    type Err = Error;
+
+    /// Parses one row; `ccc` rows carry an extra leading numeric field.
+    fn from_str(line: &str) -> Result<PropertyValueAlias, Error> {
+        lazy_static! {
+            static ref PARTS: Regex = Regex::new(
+                r"(?x)
+                ^
+                \s*(?P<prop>[^\s;]+)\s*;
+                \s*(?P<abbrev>[^\s;]+)\s*;
+                \s*(?P<long>[^\s;]+)\s*
+                (?:;(?P<aliases>.*))?
+                "
+            )
+            .unwrap();
+            static ref PARTS_CCC: Regex = Regex::new(
+                r"(?x)
+                ^
+                ccc;
+                \s*(?P<num_class>[0-9]+)\s*;
+                \s*(?P<abbrev>[^\s;]+)\s*;
+                \s*(?P<long>[^\s;]+)
+                "
+            )
+            .unwrap();
+            static ref ALIASES: Regex =
+                Regex::new(r"\s*(?P<alias>[^\s;]+)\s*;?\s*").unwrap();
+        };
+
+        // Trim once so the `ccc` check and both regexes see the same text.
+        let line = line.trim();
+        if line.starts_with("ccc;") {
+            let caps = match PARTS_CCC.captures(line) {
+                Some(caps) => caps,
+                None => {
+                    return err!("invalid PropertyValueAliases (ccc) line")
+                }
+            };
+            let n = match caps["num_class"].parse() {
+                Ok(n) => n,
+                Err(err) => {
+                    return err!(
+                        "failed to parse ccc number '{}': {}",
+                        &caps["num_class"],
+                        err
+                    )
+                }
+            };
+            return Ok(PropertyValueAlias {
+                property: line[0..3].to_string(),
+                numeric: Some(n),
+                abbreviation: caps["abbrev"].to_string(),
+                long: caps["long"].to_string(),
+                aliases: vec![],
+            });
+        }
+
+        let caps = match PARTS.captures(line) {
+            Some(caps) => caps,
+            None => return err!("invalid PropertyValueAliases line"),
+        };
+        let mut aliases = vec![];
+        if let Some(m) = caps.name("aliases") {
+            for acaps in ALIASES.captures_iter(m.as_str()) {
+                let alias = acaps.name("alias").unwrap().as_str();
+                if alias.starts_with('#') {
+                    break; // A `#` starts a trailing comment; stop reading.
+                }
+                aliases.push(alias.to_string());
+            }
+        }
+        Ok(PropertyValueAlias {
+            property: caps.name("prop").unwrap().as_str().to_string(),
+            numeric: None,
+            abbreviation: caps.name("abbrev").unwrap().as_str().to_string(),
+            long: caps.name("long").unwrap().as_str().to_string(),
+            aliases,
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests { // each case parses one literal row into a `PropertyValueAlias`
+    use super::PropertyValueAlias;
+
+    #[test]
+    fn parse1() { // one extra alias after the long name
+        let line = "blk; Arabic_PF_A                      ; Arabic_Presentation_Forms_A      ; Arabic_Presentation_Forms-A\n";
+        let row: PropertyValueAlias = line.parse().unwrap();
+        assert_eq!(row.property, "blk");
+        assert_eq!(row.numeric, None);
+        assert_eq!(row.abbreviation, "Arabic_PF_A");
+        assert_eq!(row.long, "Arabic_Presentation_Forms_A");
+        assert_eq!(row.aliases, vec!["Arabic_Presentation_Forms-A"]);
+    }
+
+    #[test]
+    fn parse2() { // two extra aliases, kept in row order
+        let line = "AHex; N                               ; No                               ; F                                ; False\n";
+        let row: PropertyValueAlias = line.parse().unwrap();
+        assert_eq!(row.property, "AHex");
+        assert_eq!(row.numeric, None);
+        assert_eq!(row.abbreviation, "N");
+        assert_eq!(row.long, "No");
+        assert_eq!(row.aliases, vec!["F", "False"]);
+    }
+
+    #[test]
+    fn parse3() { // non-`ccc` row: `numeric` stays `None` even for `1.1`
+        let line = "age; 1.1                              ; V1_1\n";
+        let row: PropertyValueAlias = line.parse().unwrap();
+        assert_eq!(row.property, "age");
+        assert_eq!(row.numeric, None);
+        assert_eq!(row.abbreviation, "1.1");
+        assert_eq!(row.long, "V1_1");
+        assert!(row.aliases.is_empty());
+    }
+
+    #[test]
+    fn parse4() { // `ccc` row: the second field becomes `numeric`
+        let line = "ccc;   0; NR                         ; Not_Reordered\n";
+        let row: PropertyValueAlias = line.parse().unwrap();
+        assert_eq!(row.property, "ccc");
+        assert_eq!(row.numeric, Some(0));
+        assert_eq!(row.abbreviation, "NR");
+        assert_eq!(row.long, "Not_Reordered");
+        assert!(row.aliases.is_empty());
+    }
+
+    #[test]
+    fn parse5() { // `ccc` row with a trailing `#` comment, which is ignored
+        let line =
+            "ccc; 133; CCC133                     ; CCC133 # RESERVED\n";
+        let row: PropertyValueAlias = line.parse().unwrap();
+        assert_eq!(row.property, "ccc");
+        assert_eq!(row.numeric, Some(133));
+        assert_eq!(row.abbreviation, "CCC133");
+        assert_eq!(row.long, "CCC133");
+        assert!(row.aliases.is_empty());
+    }
+
+    #[test]
+    fn parse6() { // a `#` comment after the last alias is not an alias
+        let line = "gc ; P                                ; Punctuation                      ; punct                            # Pc | Pd | Pe | Pf | Pi | Po | Ps\n";
+        let row: PropertyValueAlias = line.parse().unwrap();
+        assert_eq!(row.property, "gc");
+        assert_eq!(row.numeric, None);
+        assert_eq!(row.abbreviation, "P");
+        assert_eq!(row.long, "Punctuation");
+        assert_eq!(row.aliases, vec!["punct"]);
+    }
+}
diff --git a/vendor/ucd-parse/src/script_extensions.rs b/vendor/ucd-parse/src/script_extensions.rs
new file mode 100644
index 000000000..050e1f039
--- /dev/null
+++ b/vendor/ucd-parse/src/script_extensions.rs
@@ -0,0 +1,68 @@
+use std::path::Path;
+use std::str::FromStr;
+
+use crate::common::{
+    parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
+    UcdFileByCodepoint,
+};
+use crate::error::Error;
+
+/// A single row in the `ScriptExtensions.txt` file.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct ScriptExtension {
+    /// The codepoint or codepoint range for this entry.
+    pub codepoints: Codepoints,
+    /// The script names for this entry, one per whitespace-separated token.
+    pub scripts: Vec<String>,
+}
+
+impl UcdFile for ScriptExtension {
+    fn relative_file_path() -> &'static Path {
+        Path::new("ScriptExtensions.txt") // data file holding these rows
+    }
+}
+
+impl UcdFileByCodepoint for ScriptExtension {
+    fn codepoints(&self) -> CodepointIter {
+        self.codepoints.into_iter() // iterator built from this row's `codepoints` field
+    }
+}
+
+impl FromStr for ScriptExtension {
+    type Err = Error;
+
+    /// Parses one row; the text associated with the codepoints is split on
+    /// whitespace, yielding one script name per token.
+    fn from_str(line: &str) -> Result<ScriptExtension, Error> {
+        let (codepoints, names) = parse_codepoint_association(line)?;
+        let scripts = names.split_whitespace().map(String::from).collect();
+        Ok(ScriptExtension { codepoints, scripts })
+    }
+}
+
+#[cfg(test)]
+mod tests { // each case parses one literal row into a `ScriptExtension`
+    use super::ScriptExtension;
+
+    #[test]
+    fn parse_single() { // single codepoint; the `#` comment is not a script
+        let line = "060C          ; Arab Syrc Thaa # Po       ARABIC COMMA\n";
+        let row: ScriptExtension = line.parse().unwrap();
+        assert_eq!(row.codepoints, 0x060C);
+        assert_eq!(row.scripts, vec!["Arab", "Syrc", "Thaa"]);
+    }
+
+    #[test]
+    fn parse_range() { // `start..end` range with nine scripts
+        let line = "A836..A837    ; Deva Gujr Guru Kthi Mahj Modi Sind Takr Tirh # So   [2] NORTH INDIC QUARTER MARK..NORTH INDIC PLACEHOLDER MARK\n";
+        let row: ScriptExtension = line.parse().unwrap();
+        assert_eq!(row.codepoints, (0xA836, 0xA837));
+        assert_eq!(
+            row.scripts,
+            vec![
+                "Deva", "Gujr", "Guru", "Kthi", "Mahj", "Modi", "Sind",
+                "Takr", "Tirh",
+            ]
+        );
+    }
+}
diff --git a/vendor/ucd-parse/src/scripts.rs b/vendor/ucd-parse/src/scripts.rs
new file mode 100644
index 000000000..6021912c4
--- /dev/null
+++ b/vendor/ucd-parse/src/scripts.rs
@@ -0,0 +1,59 @@
+use std::path::Path;
+use std::str::FromStr;
+
+use crate::common::{
+    parse_codepoint_association, CodepointIter, Codepoints, UcdFile,
+    UcdFileByCodepoint,
+};
+use crate::error::Error;
+
+/// A single row in the `Scripts.txt` file.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct Script {
+    /// The codepoint or codepoint range for this entry.
+    pub codepoints: Codepoints,
+    /// The script name assigned to the codepoints (e.g. `Ethiopic`).
+    pub script: String,
+}
+
+impl UcdFile for Script {
+    fn relative_file_path() -> &'static Path {
+        Path::new("Scripts.txt") // data file holding these rows
+    }
+}
+
+impl UcdFileByCodepoint for Script {
+    fn codepoints(&self) -> CodepointIter {
+        self.codepoints.into_iter() // iterator built from this row's `codepoints` field
+    }
+}
+
+impl FromStr for Script {
+    type Err = Error;
+
+    fn from_str(line: &str) -> Result<Script, Error> {
+        let (codepoints, script) = parse_codepoint_association(line)?; // shared row splitter; its error is returned as-is
+        Ok(Script { codepoints, script: script.to_string() }) // the associated text is stored whole
+    }
+}
+
+#[cfg(test)]
+mod tests { // each case parses one literal row into a `Script`
+    use super::Script;
+
+    #[test]
+    fn parse_single() { // single codepoint; the `#` comment is dropped
+        let line = "10A7F         ; Old_South_Arabian # Po       OLD SOUTH ARABIAN NUMERIC INDICATOR\n";
+        let row: Script = line.parse().unwrap();
+        assert_eq!(row.codepoints, 0x10A7F);
+        assert_eq!(row.script, "Old_South_Arabian");
+    }
+
+    #[test]
+    fn parse_range() { // `start..end` codepoint range
+        let line = "1200..1248    ; Ethiopic # Lo  [73] ETHIOPIC SYLLABLE HA..ETHIOPIC SYLLABLE QWA\n";
+        let row: Script = line.parse().unwrap();
+        assert_eq!(row.codepoints, (0x1200, 0x1248));
+        assert_eq!(row.script, "Ethiopic");
+    }
+}
diff --git a/vendor/ucd-parse/src/sentence_break.rs b/vendor/ucd-parse/src/sentence_break.rs
new file mode 100644
index 000000000..74a6e8a08
--- /dev/null
+++ b/vendor/ucd-parse/src/sentence_break.rs
@@ -0,0 +1,101 @@
+use std::path::Path;
+use std::str::FromStr;
+
+use crate::common::{
+    parse_break_test, parse_codepoint_association, CodepointIter, Codepoints,
+    UcdFile, UcdFileByCodepoint,
+};
+use crate::error::Error;
+
+/// A single row in the `auxiliary/SentenceBreakProperty.txt` file.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct SentenceBreak {
+    /// The codepoint or codepoint range for this entry.
+    pub codepoints: Codepoints,
+    /// The property value assigned to the codepoints (e.g. `Extend`).
+    pub value: String,
+}
+
+impl UcdFile for SentenceBreak {
+    fn relative_file_path() -> &'static Path {
+        Path::new("auxiliary/SentenceBreakProperty.txt") // data file holding these rows
+    }
+}
+
+impl UcdFileByCodepoint for SentenceBreak {
+    fn codepoints(&self) -> CodepointIter {
+        self.codepoints.into_iter() // iterator built from this row's `codepoints` field
+    }
+}
+
+impl FromStr for SentenceBreak {
+    type Err = Error;
+
+    fn from_str(line: &str) -> Result<SentenceBreak, Error> {
+        let (codepoints, value) = parse_codepoint_association(line)?; // shared row splitter; its error is returned as-is
+        Ok(SentenceBreak { codepoints, value: value.to_string() }) // the associated text is stored whole
+    }
+}
+
+/// A single row in the `auxiliary/SentenceBreakTest.txt` file.
+///
+/// This file defines tests for the sentence break algorithm.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct SentenceBreakTest {
+    /// The expected sentences, in order. Each string is the UTF-8 encoding of
+    /// the codepoints that make up one sentence.
+    pub sentences: Vec<String>,
+    /// A human readable description of this test.
+    pub comment: String,
+}
+
+impl UcdFile for SentenceBreakTest {
+    fn relative_file_path() -> &'static Path {
+        Path::new("auxiliary/SentenceBreakTest.txt") // data file holding these rows
+    }
+}
+
+impl FromStr for SentenceBreakTest {
+    type Err = Error;
+
+    fn from_str(line: &str) -> Result<SentenceBreakTest, Error> {
+        let (groups, comment) = parse_break_test(line)?; // shared break-test parser; its error is returned as-is
+        Ok(SentenceBreakTest { sentences: groups, comment }) // each parsed group is one sentence
+    }
+}
+
+#[cfg(test)]
+mod tests { // property rows and one break-test row
+    use super::{SentenceBreak, SentenceBreakTest};
+
+    #[test]
+    fn parse_single() { // single codepoint; the `#` comment is dropped
+        let line = "11445         ; Extend # Mc       NEWA SIGN VISARGA\n";
+        let row: SentenceBreak = line.parse().unwrap();
+        assert_eq!(row.codepoints, 0x11445);
+        assert_eq!(row.value, "Extend");
+    }
+
+    #[test]
+    fn parse_range() { // `start..end` codepoint range
+        let line = "FE31..FE32    ; SContinue # Pd   [2] PRESENTATION FORM FOR VERTICAL EM DASH..PRESENTATION FORM FOR VERTICAL EN DASH\n";
+        let row: SentenceBreak = line.parse().unwrap();
+        assert_eq!(row.codepoints, (0xFE31, 0xFE32));
+        assert_eq!(row.value, "SContinue");
+    }
+
+    #[test]
+    fn parse_test() { // `÷` separates the two expected sentences; `×` joins codepoints
+        let line = "÷ 2060 × 5B57 × 2060 × 002E × 2060 ÷ 5B57 × 2060 × 2060 ÷	#  ÷ [0.2] WORD JOINER (Format_FE) × [998.0] CJK UNIFIED IDEOGRAPH-5B57 (OLetter) × [5.0] WORD JOINER (Format_FE) × [998.0] FULL STOP (ATerm) × [5.0] WORD JOINER (Format_FE) ÷ [11.0] CJK UNIFIED IDEOGRAPH-5B57 (OLetter) × [5.0] WORD JOINER (Format_FE) × [5.0] WORD JOINER (Format_FE) ÷ [0.3]";
+
+        let row: SentenceBreakTest = line.parse().unwrap();
+        assert_eq!(
+            row.sentences,
+            vec![
+                "\u{2060}\u{5B57}\u{2060}\u{002E}\u{2060}",
+                "\u{5B57}\u{2060}\u{2060}",
+            ]
+        );
+        assert!(row.comment.contains("[5.0] WORD JOINER (Format_FE)"));
+    }
+}
diff --git a/vendor/ucd-parse/src/special_casing.rs b/vendor/ucd-parse/src/special_casing.rs
new file mode 100644
index 000000000..a8fc61ddb
--- /dev/null
+++ b/vendor/ucd-parse/src/special_casing.rs
@@ -0,0 +1,112 @@
+use std::path::Path;
+use std::str::FromStr;
+
+use lazy_static::lazy_static;
+use regex::Regex;
+
+use crate::common::{
+    parse_codepoint_sequence, Codepoint, CodepointIter, UcdFile,
+    UcdFileByCodepoint,
+};
+use crate::error::Error;
+
+/// A single row in the `SpecialCasing.txt` file.
+///
+/// Note that a single codepoint may be mapped multiple times. In particular,
+/// a single codepoint might have mappings based on distinct language sensitive
+/// conditions (e.g., `U+0307`).
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct SpecialCaseMapping {
+    /// The codepoint that is being mapped.
+    pub codepoint: Codepoint,
+    /// The lowercase mapping, which may be empty.
+    pub lowercase: Vec<Codepoint>,
+    /// The titlecase mapping, which may be empty.
+    pub titlecase: Vec<Codepoint>,
+    /// The uppercase mapping, which may be empty.
+    pub uppercase: Vec<Codepoint>,
+    /// The conditions on this mapping, one per whitespace-separated token
+    /// (empty if the row has none). See `SpecialCasing.txt` for details.
+    pub conditions: Vec<String>,
+}
+
+impl UcdFile for SpecialCaseMapping {
+    fn relative_file_path() -> &'static Path {
+        Path::new("SpecialCasing.txt") // data file holding these rows
+    }
+}
+
+impl UcdFileByCodepoint for SpecialCaseMapping {
+    fn codepoints(&self) -> CodepointIter {
+        self.codepoint.into_iter() // iterator built from the single `codepoint` field
+    }
+}
+
+impl FromStr for SpecialCaseMapping {
+    type Err = Error;
+
+    /// Parses one row: a codepoint, its lower/title/upper mappings and an
+    /// optional whitespace-separated list of conditions.
+    fn from_str(line: &str) -> Result<SpecialCaseMapping, Error> {
+        lazy_static! {
+            static ref PARTS: Regex = Regex::new(
+                r"(?x)
+                ^
+                \s*(?P<codepoint>[^\s;]+)\s*;
+                \s*(?P<lower>[^;]+)\s*;
+                \s*(?P<title>[^;]+)\s*;
+                \s*(?P<upper>[^;]+)\s*;
+                \s*(?P<conditions>[^;\x23]+)?
+                "
+            )
+            .unwrap();
+        };
+
+        let fields = match PARTS.captures(line.trim()) {
+            None => return err!("invalid SpecialCasing line: '{}'", line),
+            Some(fields) => fields,
+        };
+        let conditions = fields
+            .name("conditions")
+            .map(|m| m.as_str().split_whitespace().map(String::from).collect())
+            .unwrap_or_default();
+        let codepoint = fields["codepoint"].parse()?;
+        let lowercase = parse_codepoint_sequence(&fields["lower"])?;
+        let titlecase = parse_codepoint_sequence(&fields["title"])?;
+        let uppercase = parse_codepoint_sequence(&fields["upper"])?;
+        Ok(SpecialCaseMapping {
+            codepoint,
+            lowercase,
+            titlecase,
+            uppercase,
+            conditions,
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests { // each case parses one literal row into a `SpecialCaseMapping`
+    use super::SpecialCaseMapping;
+
+    #[test]
+    fn parse_no_conds() { // fifth field is only a `#` comment: no conditions
+        let line = "1F52; 1F52; 03A5 0313 0300; 03A5 0313 0300; # GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA\n";
+        let row: SpecialCaseMapping = line.parse().unwrap();
+        assert_eq!(row.codepoint, 0x1F52);
+        assert_eq!(row.lowercase, vec![0x1F52]);
+        assert_eq!(row.titlecase, vec![0x03A5, 0x0313, 0x0300]);
+        assert_eq!(row.uppercase, vec![0x03A5, 0x0313, 0x0300]);
+        assert!(row.conditions.is_empty());
+    }
+
+    #[test]
+    fn parse_conds() { // empty lowercase field plus two conditions
+        let line = "0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE\n";
+        let row: SpecialCaseMapping = line.parse().unwrap();
+        assert_eq!(row.codepoint, 0x0307);
+        assert!(row.lowercase.is_empty());
+        assert_eq!(row.titlecase, vec![0x0307]);
+        assert_eq!(row.uppercase, vec![0x0307]);
+        assert_eq!(row.conditions, vec!["tr", "After_I"]);
+    }
+}
diff --git a/vendor/ucd-parse/src/unicode_data.rs b/vendor/ucd-parse/src/unicode_data.rs
new file mode 100644
index 000000000..87910cc1d
--- /dev/null
+++ b/vendor/ucd-parse/src/unicode_data.rs
@@ -0,0 +1,787 @@
+use std::fmt;
+use std::iter;
+use std::ops::Range;
+use std::path::Path;
+use std::str::FromStr;
+
+use lazy_static::lazy_static;
+use regex::Regex;
+
+use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint};
+use crate::error::Error;
+
+/// Represents a single row in the `UnicodeData.txt` file.
+///
+/// These fields were taken from UAX44, Table 9, as part of the documentation
+/// for the
+/// [`UnicodeData.txt` file](http://www.unicode.org/reports/tr44/#UnicodeData.txt).
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct UnicodeData {
+    /// The codepoint corresponding to this row.
+    pub codepoint: Codepoint,
+    /// The name of this codepoint.
+    pub name: String,
+    /// The "general category" of this codepoint.
+    pub general_category: String,
+    /// The class of this codepoint used in the Canonical Ordering Algorithm.
+    ///
+    /// Note that some classes map to a particular symbol. See
+    /// [UAX44, Table 15](http://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values).
+    pub canonical_combining_class: u8,
+    /// The bidirectional class of this codepoint.
+    ///
+    /// Possible values are listed in
+    /// [UAX44, Table 13](http://www.unicode.org/reports/tr44/#Bidi_Class_Values).
+    pub bidi_class: String,
+    /// The decomposition mapping for this codepoint. This includes its
+    /// formatting tag (if present).
+    pub decomposition: UnicodeDataDecomposition,
+    /// A decimal numeric representation of this codepoint, if it has the
+    /// property `Numeric_Type=Decimal`.
+    pub numeric_type_decimal: Option<u8>,
+    /// A decimal numeric representation of this codepoint, if it has the
+    /// property `Numeric_Type=Digit`. Note that while this field is still
+    /// populated for existing codepoints, no new codepoints will have this
+    /// field populated.
+    pub numeric_type_digit: Option<u8>,
+    /// A decimal or rational numeric representation of this codepoint, if it
+    /// has the property `Numeric_Type=Numeric`.
+    pub numeric_type_numeric: Option<UnicodeDataNumeric>,
+    /// A boolean indicating whether this codepoint is "mirrored" in
+    /// bidirectional text.
+    pub bidi_mirrored: bool,
+    /// The "old" Unicode 1.0 or ISO 6429 name of this codepoint. Note that
+    /// this field is empty unless it is significantly different from
+    /// the `name` field.
+    pub unicode1_name: String,
+    /// The ISO 10646 comment field. This no longer contains any non-NULL
+    /// values.
+    pub iso_comment: String,
+    /// This codepoint's simple uppercase mapping, if it exists.
+    pub simple_uppercase_mapping: Option<Codepoint>,
+    /// This codepoint's simple lowercase mapping, if it exists.
+    pub simple_lowercase_mapping: Option<Codepoint>,
+    /// This codepoint's simple titlecase mapping, if it exists.
+    pub simple_titlecase_mapping: Option<Codepoint>,
+}
+
+impl UcdFile for UnicodeData {
+    fn relative_file_path() -> &'static Path {
+        Path::new("UnicodeData.txt") // data file holding these rows
+    }
+}
+
+impl UcdFileByCodepoint for UnicodeData {
+    fn codepoints(&self) -> CodepointIter {
+        self.codepoint.into_iter() // iterator built from the single `codepoint` field
+    }
+}
+
+impl UnicodeData {
+    /// Reports whether this record opens a range: its name is wrapped in
+    /// `<...>` and contains the word `First`.
+    pub fn is_range_start(&self) -> bool {
+        let name = self.name.as_str();
+        let bracketed = name.starts_with('<') && name.ends_with('>');
+        bracketed && name.contains("First")
+    }
+
+    /// Reports whether this record closes a range: its name is wrapped in
+    /// `<...>` and contains the word `Last`.
+    pub fn is_range_end(&self) -> bool {
+        let name = self.name.as_str();
+        let bracketed = name.starts_with('<') && name.ends_with('>');
+        bracketed && name.contains("Last")
+    }
+}
+
+impl FromStr for UnicodeData {
+    type Err = Error;
+
+    fn from_str(line: &str) -> Result<UnicodeData, Error> {
+        lazy_static! {
+            static ref PARTS: Regex = Regex::new(
+                r"(?x)
+                ^
+                ([A-Z0-9]+);  #  1; codepoint
+                ([^;]+);      #  2; name
+                ([^;]+);      #  3; general category
+                ([0-9]+);     #  4; canonical combining class
+                ([^;]+);      #  5; bidi class
+                ([^;]*);      #  6; decomposition
+                ([0-9]*);     #  7; numeric type decimal
+                ([0-9]*);     #  8; numeric type digit
+                ([-0-9/]*);   #  9; numeric type numeric
+                ([YN]);       # 10; bidi mirrored
+                ([^;]*);      # 11; unicode1 name
+                ([^;]*);      # 12; ISO comment
+                ([^;]*);      # 13; simple uppercase mapping
+                ([^;]*);      # 14; simple lowercase mapping
+                ([^;]*)       # 15; simple titlecase mapping
+                $
+                "
+            )
+            .unwrap();
+        };
+        let caps = match PARTS.captures(line.trim()) { // surrounding whitespace is ignored
+            Some(caps) => caps,
+            None => return err!("invalid UnicodeData line"),
+        };
+        let capget = |n| caps.get(n).unwrap().as_str(); // all 15 groups are mandatory in PARTS, so a match always has them
+        let mut data = UnicodeData::default();
+
+        data.codepoint = capget(1).parse()?;
+        data.name = capget(2).to_string();
+        data.general_category = capget(3).to_string();
+        data.canonical_combining_class = match capget(4).parse() {
+            Ok(n) => n,
+            Err(err) => {
+                return err!(
+                    "failed to parse canonical combining class '{}': {}",
+                    capget(4),
+                    err
+                )
+            }
+        };
+        data.bidi_class = capget(5).to_string();
+        if !caps[6].is_empty() {
+            data.decomposition = caps[6].parse()?;
+        } else {
+            data.decomposition.push(data.codepoint)?; // no mapping given: store the codepoint itself
+        }
+        if !capget(7).is_empty() {
+            data.numeric_type_decimal = Some(match capget(7).parse() {
+                Ok(n) => n,
+                Err(err) => {
+                    return err!(
+                        "failed to parse numeric type decimal '{}': {}",
+                        capget(7),
+                        err
+                    )
+                }
+            });
+        }
+        if !capget(8).is_empty() {
+            data.numeric_type_digit = Some(match capget(8).parse() {
+                Ok(n) => n,
+                Err(err) => {
+                    return err!(
+                        "failed to parse numeric type digit '{}': {}",
+                        capget(8),
+                        err
+                    )
+                }
+            });
+        }
+        if !capget(9).is_empty() {
+            data.numeric_type_numeric = Some(capget(9).parse()?);
+        }
+        data.bidi_mirrored = capget(10) == "Y"; // group 10 can only be `Y` or `N`
+        data.unicode1_name = capget(11).to_string();
+        data.iso_comment = capget(12).to_string();
+        if !capget(13).is_empty() { // empty case-mapping fields stay `None`
+            data.simple_uppercase_mapping = Some(capget(13).parse()?);
+        }
+        if !capget(14).is_empty() {
+            data.simple_lowercase_mapping = Some(capget(14).parse()?);
+        }
+        if !capget(15).is_empty() {
+            data.simple_titlecase_mapping = Some(capget(15).parse()?);
+        }
+        Ok(data)
+    }
+}
+
+impl fmt::Display for UnicodeData {
+    /// Writes the record as fifteen `;`-separated fields, from the codepoint
+    /// through the simple titlecase mapping. An optional value that is
+    /// absent becomes an empty field, and the final field is written
+    /// without a trailing `;`.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "{};", self.codepoint)?;
+        write!(f, "{};", self.name)?;
+        write!(f, "{};", self.general_category)?;
+        write!(f, "{};", self.canonical_combining_class)?;
+        write!(f, "{};", self.bidi_class)?;
+        let decomp = &self.decomposition;
+        if decomp.is_canonical() && decomp.mapping() == &[self.codepoint] {
+            // A codepoint that maps only to itself is left as an empty field.
+            write!(f, ";")?;
+        } else {
+            write!(f, "{};", decomp)?;
+        }
+        match self.numeric_type_decimal {
+            Some(n) => write!(f, "{};", n)?,
+            None => write!(f, ";")?,
+        }
+        match self.numeric_type_digit {
+            Some(n) => write!(f, "{};", n)?,
+            None => write!(f, ";")?,
+        }
+        match self.numeric_type_numeric {
+            Some(n) => write!(f, "{};", n)?,
+            None => write!(f, ";")?,
+        }
+        write!(f, "{};", if self.bidi_mirrored { "Y" } else { "N" })?;
+        write!(f, "{};", self.unicode1_name)?;
+        write!(f, "{};", self.iso_comment)?;
+        match self.simple_uppercase_mapping {
+            Some(cp) => write!(f, "{};", cp)?,
+            None => write!(f, ";")?,
+        }
+        match self.simple_lowercase_mapping {
+            Some(cp) => write!(f, "{};", cp)?,
+            None => write!(f, ";")?,
+        }
+        // Last field: nothing is written when it is absent.
+        if let Some(cp) = self.simple_titlecase_mapping {
+            write!(f, "{}", cp)?;
+        }
+        Ok(())
+    }
+}
+
+/// Represents a decomposition mapping of a single row in the
+/// `UnicodeData.txt` file.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct UnicodeDataDecomposition {
+    /// The formatting tag associated with this mapping, if present.
+    pub tag: Option<UnicodeDataDecompositionTag>,
+    /// The number of codepoints in this mapping (`push` caps it at 18).
+    pub len: usize,
+    /// The codepoints in the mapping. Entries beyond `len` in the mapping
+    /// are always U+0000. If no mapping was present, then this always contains
+    /// a single codepoint corresponding to this row's character.
+    pub mapping: [Codepoint; 18],
+}
+
+impl UnicodeDataDecomposition {
+    /// Builds a decomposition from a tag and a list of codepoints.
+    ///
+    /// Returns an error if `mapping` holds more codepoints than fit.
+    pub fn new(
+        tag: Option<UnicodeDataDecompositionTag>,
+        mapping: &[Codepoint],
+    ) -> Result<UnicodeDataDecomposition, Error> {
+        let mut decomp = Self { tag, ..Self::default() };
+        // Reuse `push` so the capacity check lives in one place.
+        mapping.iter().try_for_each(|&cp| decomp.push(cp))?;
+        Ok(decomp)
+    }
+
+    /// Appends one codepoint to the mapping.
+    ///
+    /// Returns an error, leaving the mapping untouched, if it is full.
+    pub fn push(&mut self, cp: Codepoint) -> Result<(), Error> {
+        match self.mapping.get_mut(self.len) {
+            Some(slot) => {
+                *slot = cp;
+                self.len += 1;
+                Ok(())
+            }
+            None => {
+                err!("invalid decomposition mapping (too many codepoints)")
+            }
+        }
+    }
+
+    /// The codepoints stored so far, as a slice of exactly `len` elements.
+    /// Unused trailing slots of the backing array are not included.
+    pub fn mapping(&self) -> &[Codepoint] {
+        &self.mapping[..self.len]
+    }
+
+    /// True exactly when no formatting tag is attached to this mapping.
+    pub fn is_canonical(&self) -> bool {
+        self.tag.is_none()
+    }
+}
+
+impl FromStr for UnicodeDataDecomposition {
+    type Err = Error;
+
+    /// Parses an optional `<tag>` followed by hexadecimal codepoints (digits
+    /// `0-9` and uppercase `A-F`) separated by whitespace.
+    fn from_str(s: &str) -> Result<UnicodeDataDecomposition, Error> {
+        lazy_static! {
+            static ref WITH_TAG: Regex = Regex::new(
+                r"^(?:<(?P<tag>[^>]+)>)?\s*(?P<chars>[\s0-9A-F]+)$"
+            )
+            .unwrap();
+            static ref CHARS: Regex = Regex::new(r"[0-9A-F]+").unwrap();
+        };
+        if s.is_empty() {
+            return err!(
+                "expected non-empty string for \
+                 UnicodeDataDecomposition value"
+            );
+        }
+        let parts = match WITH_TAG.captures(s) {
+            None => return err!("invalid decomposition value"),
+            Some(parts) => parts,
+        };
+        let mut decomp = UnicodeDataDecomposition::default();
+        if let Some(tag) = parts.name("tag") {
+            decomp.tag = Some(tag.as_str().parse()?);
+        }
+        // `chars` holds every codepoint, whether or not a tag preceded them.
+        for hex in CHARS.find_iter(&parts["chars"]) {
+            decomp.push(hex.as_str().parse()?)?;
+        }
+        Ok(decomp)
+    }
+}
+
+impl fmt::Display for UnicodeDataDecomposition {
+    /// Writes `<tag> ` (if tagged) and then the space-separated codepoints.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        if let Some(tag) = &self.tag {
+            write!(f, "<{}> ", tag)?;
+        }
+        for (i, cp) in self.mapping().iter().enumerate() {
+            // The separator goes between codepoints, never before the first.
+            if i > 0 {
+                write!(f, " ")?;
+            }
+            write!(f, "{}", cp)?;
+        }
+        Ok(())
+    }
+}
+
+/// The formatting tag on a decomposition mapping.
+///
+/// This is taken from
+/// [UAX44, Table 14](http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings).
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum UnicodeDataDecompositionTag {
+    /// The `<font>` tag.
+    Font,
+    /// The `<noBreak>` tag.
+    NoBreak,
+    /// The `<initial>` tag.
+    Initial,
+    /// The `<medial>` tag.
+    Medial,
+    /// The `<final>` tag.
+    Final,
+    /// The `<isolated>` tag.
+    Isolated,
+    /// The `<circle>` tag.
+    Circle,
+    /// The `<super>` tag.
+    Super,
+    /// The `<sub>` tag.
+    Sub,
+    /// The `<vertical>` tag.
+    Vertical,
+    /// The `<wide>` tag.
+    Wide,
+    /// The `<narrow>` tag.
+    Narrow,
+    /// The `<small>` tag.
+    Small,
+    /// The `<square>` tag.
+    Square,
+    /// The `<fraction>` tag.
+    Fraction,
+    /// The `<compat>` tag.
+    Compat,
+}
+
+impl FromStr for UnicodeDataDecompositionTag {
+    type Err = Error;
+
+    /// Maps an exact, case-sensitive tag name (no angle brackets) to its tag.
+    fn from_str(s: &str) -> Result<UnicodeDataDecompositionTag, Error> {
+        Ok(match s {
+            "font" => Self::Font,
+            "noBreak" => Self::NoBreak,
+            "initial" => Self::Initial,
+            "medial" => Self::Medial,
+            "final" => Self::Final,
+            "isolated" => Self::Isolated,
+            "circle" => Self::Circle,
+            "super" => Self::Super,
+            "sub" => Self::Sub,
+            "vertical" => Self::Vertical,
+            "wide" => Self::Wide,
+            "narrow" => Self::Narrow,
+            "small" => Self::Small,
+            "square" => Self::Square,
+            "fraction" => Self::Fraction,
+            "compat" => Self::Compat,
+            _ => return err!("invalid decomposition formatting tag: {}", s),
+        })
+    }
+}
+
+impl fmt::Display for UnicodeDataDecompositionTag {
+    /// Writes the bare tag name, e.g. `noBreak`, without angle brackets.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let name = match self {
+            Self::Font => "font",
+            Self::NoBreak => "noBreak",
+            Self::Initial => "initial",
+            Self::Medial => "medial",
+            Self::Final => "final",
+            Self::Isolated => "isolated",
+            Self::Circle => "circle",
+            Self::Super => "super",
+            Self::Sub => "sub",
+            Self::Vertical => "vertical",
+            Self::Wide => "wide",
+            Self::Narrow => "narrow",
+            Self::Small => "small",
+            Self::Square => "square",
+            Self::Fraction => "fraction",
+            Self::Compat => "compat",
+        };
+        f.write_str(name)
+    }
+}
+
+/// A numeric value corresponding to characters with `Numeric_Type=Numeric`.
+///
+/// A value is either a signed integer (`5`) or a rational number (`-1/2`).
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum UnicodeDataNumeric {
+    /// An integer, written as a plain decimal number.
+    Integer(i64),
+    /// A rational, written `numerator/denominator`. It is stored exactly as
+    /// parsed: it is not reduced and the denominator is not validated.
+    Rational(i64, i64),
+}
+
+impl FromStr for UnicodeDataNumeric {
+    type Err = Error;
+
+    /// Parses either a signed integer (`5`) or a rational (`-1/2`).
+    ///
+    /// Returns an error if `s` is empty or if any part is not a valid `i64`.
+    fn from_str(s: &str) -> Result<UnicodeDataNumeric, Error> {
+        if s.is_empty() {
+            return err!(
+                "expected non-empty string for UnicodeDataNumeric value"
+            );
+        }
+        // A '/' separates the numerator from the denominator of a rational.
+        if let Some(pos) = s.find('/') {
+            let (snum, sden) = (&s[..pos], &s[pos + 1..]);
+            let num = match snum.parse() {
+                Ok(num) => num,
+                Err(err) => {
+                    return err!(
+                        "invalid integer numerator '{}': {}",
+                        snum,
+                        err
+                    );
+                }
+            };
+            let den = match sden.parse() {
+                Ok(den) => den,
+                Err(err) => {
+                    return err!(
+                        "invalid integer denominator '{}': {}",
+                        sden,
+                        err
+                    );
+                }
+            };
+            Ok(UnicodeDataNumeric::Rational(num, den))
+        } else {
+            // A plain integer has no denominator, so a parse failure here
+            // must not be reported as an invalid denominator.
+            match s.parse() {
+                Ok(n) => Ok(UnicodeDataNumeric::Integer(n)),
+                Err(err) => err!("invalid integer '{}': {}", s, err),
+            }
+        }
+    }
+}
+
+impl fmt::Display for UnicodeDataNumeric {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            Self::Integer(value) => write!(f, "{}", value), // e.g. `5`
+            Self::Rational(num, den) => write!(f, "{}/{}", num, den), // `-1/2`
+        }
+    }
+}
+
+/// An iterator adapter that expands rows in `UnicodeData.txt`.
+///
+/// Throughout `UnicodeData.txt`, some assigned codepoints are not explicitly
+/// represented. Instead, they are represented by a pair of rows, indicating
+/// a range of codepoints with the same properties. For example, the Hangul
+/// syllable codepoints are represented by these two rows:
+///
+/// ```text
+/// AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
+/// D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
+/// ```
+///
+/// This iterator will wrap any iterator of `UnicodeData` and, when a range of
+/// Unicode codepoints is found, it will be expanded to the appropriate
+/// sequence of `UnicodeData` values. Note that all such expanded records will
+/// have an empty name.
+pub struct UnicodeDataExpander<I: Iterator> {
+    /// The underlying iterator; peeked to detect the end row of a pair.
+    it: iter::Peekable<I>,
+    /// The codepoints still to be emitted for the most recently found pair
+    /// of range rows. This range is empty when no expansion is in progress.
+    range: CodepointRange,
+}
+
+struct CodepointRange {
+    /// The codepoints that remain to be emitted (the end is exclusive).
+    range: Range<u32>,
+    /// The start record. Every record in this range is generated by cloning
+    /// this, replacing the codepoint and clearing the name.
+    start_record: UnicodeData,
+}
+
+impl<I: Iterator<Item = UnicodeData>> UnicodeDataExpander<I> {
+    /// Create an expander over the records produced by `it`.
+    ///
+    /// Each pair of range records is replaced by one record per codepoint in
+    /// the range. All other records are passed through as-is.
+    pub fn new<T>(it: T) -> UnicodeDataExpander<I>
+    where
+        T: IntoIterator<IntoIter = I, Item = I::Item>,
+    {
+        // The empty range makes the first call to `next` read from `it`.
+        let it = it.into_iter().peekable();
+        let start_record = UnicodeData::default();
+        let range = CodepointRange { range: 0..0, start_record };
+        UnicodeDataExpander { it, range }
+    }
+}
+
+impl<I: Iterator<Item = UnicodeData>> Iterator for UnicodeDataExpander<I> {
+    type Item = UnicodeData;
+
+    fn next(&mut self) -> Option<UnicodeData> {
+        loop {
+            // Drain any range that is still being expanded first.
+            if let Some(expanded) = self.range.next() {
+                return Some(expanded);
+            }
+            let first = self.it.next()?;
+            let is_pair = first.is_range_start()
+                && self.it.peek().map_or(false, |last| last.is_range_end());
+            if !is_pair {
+                return Some(first);
+            }
+            // `peek` just returned the range's last record, so this cannot
+            // fail. The range end is exclusive, hence the `+ 1`.
+            let last = self.it.next().unwrap();
+            let (lo, hi) = (first.codepoint.value(), last.codepoint.value());
+            self.range =
+                CodepointRange { range: lo..(hi + 1), start_record: first };
+        }
+    }
+}
+
+impl Iterator for CodepointRange {
+    type Item = UnicodeData;
+
+    /// Yields a copy of the start record for the next codepoint in the
+    /// range, with its name cleared, or `None` once the range is exhausted.
+    fn next(&mut self) -> Option<UnicodeData> {
+        let value = self.range.next()?;
+        let codepoint = Codepoint::from_u32(value).unwrap();
+        // Only the codepoint and the name differ from the start record.
+        let mut record = self.start_record.clone();
+        record.codepoint = codepoint;
+        record.name = String::new();
+        Some(record)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::common::Codepoint;
+
+    use super::{
+        UnicodeData, UnicodeDataDecomposition, UnicodeDataDecompositionTag,
+        UnicodeDataNumeric,
+    };
+
+    /// Shorthand for building a codepoint from a value known to be valid.
+    fn codepoint(n: u32) -> Codepoint {
+        Codepoint::from_u32(n).unwrap()
+    }
+
+    fn text(string: &str) -> String {
+        String::from(string)
+    }
+
+    /// The untagged decomposition of a codepoint that maps only to itself.
+    fn identity(n: u32) -> UnicodeDataDecomposition {
+        UnicodeDataDecomposition::new(None, &[codepoint(n)]).unwrap()
+    }
+
+    /// A row with a tagged (`<compat>`) decomposition mapping.
+    #[test]
+    fn parse1() {
+        let row = "249D;PARENTHESIZED LATIN SMALL LETTER B;So;0;L;<compat> 0028 0062 0029;;;;N;;;;;\n";
+        let parsed: UnicodeData = row.parse().unwrap();
+        assert_eq!(
+            parsed,
+            UnicodeData {
+                codepoint: codepoint(0x249d),
+                name: text("PARENTHESIZED LATIN SMALL LETTER B"),
+                general_category: text("So"),
+                canonical_combining_class: 0,
+                bidi_class: text("L"),
+                decomposition: UnicodeDataDecomposition::new(
+                    Some(UnicodeDataDecompositionTag::Compat),
+                    &[codepoint(0x28), codepoint(0x62), codepoint(0x29)],
+                )
+                .unwrap(),
+                numeric_type_decimal: None,
+                numeric_type_digit: None,
+                numeric_type_numeric: None,
+                bidi_mirrored: false,
+                unicode1_name: text(""),
+                iso_comment: text(""),
+                simple_uppercase_mapping: None,
+                simple_lowercase_mapping: None,
+                simple_titlecase_mapping: None,
+            }
+        );
+    }
+
+    /// An empty decomposition field maps the codepoint to itself.
+    #[test]
+    fn parse2() {
+        let row = "000D;<control>;Cc;0;B;;;;;N;CARRIAGE RETURN (CR);;;;\n";
+        let parsed: UnicodeData = row.parse().unwrap();
+        assert_eq!(
+            parsed,
+            UnicodeData {
+                codepoint: codepoint(0x000D),
+                name: text("<control>"),
+                general_category: text("Cc"),
+                canonical_combining_class: 0,
+                bidi_class: text("B"),
+                decomposition: identity(0x000D),
+                numeric_type_decimal: None,
+                numeric_type_digit: None,
+                numeric_type_numeric: None,
+                bidi_mirrored: false,
+                unicode1_name: text("CARRIAGE RETURN (CR)"),
+                iso_comment: text(""),
+                simple_uppercase_mapping: None,
+                simple_lowercase_mapping: None,
+                simple_titlecase_mapping: None,
+            }
+        );
+    }
+
+    /// A row with a `<fraction>` decomposition and a rational numeric value.
+    #[test]
+    fn parse3() {
+        let row = "00BC;VULGAR FRACTION ONE QUARTER;No;0;ON;<fraction> 0031 2044 0034;;;1/4;N;FRACTION ONE QUARTER;;;;\n";
+        let parsed: UnicodeData = row.parse().unwrap();
+        assert_eq!(
+            parsed,
+            UnicodeData {
+                codepoint: codepoint(0x00BC),
+                name: text("VULGAR FRACTION ONE QUARTER"),
+                general_category: text("No"),
+                canonical_combining_class: 0,
+                bidi_class: text("ON"),
+                decomposition: UnicodeDataDecomposition::new(
+                    Some(UnicodeDataDecompositionTag::Fraction),
+                    &[codepoint(0x31), codepoint(0x2044), codepoint(0x34)],
+                )
+                .unwrap(),
+                numeric_type_decimal: None,
+                numeric_type_digit: None,
+                numeric_type_numeric: Some(UnicodeDataNumeric::Rational(1, 4)),
+                bidi_mirrored: false,
+                unicode1_name: text("FRACTION ONE QUARTER"),
+                iso_comment: text(""),
+                simple_uppercase_mapping: None,
+                simple_lowercase_mapping: None,
+                simple_titlecase_mapping: None,
+            }
+        );
+    }
+
+    /// A row that carries a simple lowercase mapping.
+    #[test]
+    fn parse4() {
+        let row = "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;\n";
+        let parsed: UnicodeData = row.parse().unwrap();
+        assert_eq!(
+            parsed,
+            UnicodeData {
+                codepoint: codepoint(0x0041),
+                name: text("LATIN CAPITAL LETTER A"),
+                general_category: text("Lu"),
+                canonical_combining_class: 0,
+                bidi_class: text("L"),
+                decomposition: identity(0x0041),
+                numeric_type_decimal: None,
+                numeric_type_digit: None,
+                numeric_type_numeric: None,
+                bidi_mirrored: false,
+                unicode1_name: text(""),
+                iso_comment: text(""),
+                simple_uppercase_mapping: None,
+                simple_lowercase_mapping: Some(codepoint(0x0061)),
+                simple_titlecase_mapping: None,
+            }
+        );
+    }
+
+    /// A row whose numeric value is a negative rational.
+    #[test]
+    fn parse5() {
+        let row = "0F33;TIBETAN DIGIT HALF ZERO;No;0;L;;;;-1/2;N;;;;;\n";
+        let parsed: UnicodeData = row.parse().unwrap();
+        assert_eq!(
+            parsed,
+            UnicodeData {
+                codepoint: codepoint(0x0F33),
+                name: text("TIBETAN DIGIT HALF ZERO"),
+                general_category: text("No"),
+                canonical_combining_class: 0,
+                bidi_class: text("L"),
+                decomposition: identity(0x0F33),
+                numeric_type_decimal: None,
+                numeric_type_digit: None,
+                numeric_type_numeric: Some(UnicodeDataNumeric::Rational(
+                    -1, 2
+                )),
+                bidi_mirrored: false,
+                unicode1_name: text(""),
+                iso_comment: text(""),
+                simple_uppercase_mapping: None,
+                simple_lowercase_mapping: None,
+                simple_titlecase_mapping: None,
+            }
+        );
+    }
+
+    /// 11174 = the 11172 codepoints from AC00 to D7A3 plus 2 ordinary rows.
+    #[test]
+    fn expander() {
+        use super::UnicodeDataExpander;
+        use crate::common::UcdLineParser;
+
+        let rows = "\
+ABF9;MEETEI MAYEK DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
+AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
+D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
+D7B0;HANGUL JUNGSEONG O-YEO;Lo;0;L;;;;;N;;;;;
+";
+        let parsed = UcdLineParser::new(None, rows.as_bytes())
+            .collect::<Result<Vec<_>, _>>()
+            .unwrap();
+        assert_eq!(UnicodeDataExpander::new(parsed).count(), 11174);
+    }
+}
diff --git a/vendor/ucd-parse/src/word_break.rs b/vendor/ucd-parse/src/word_break.rs
new file mode 100644
index 000000000..57d512667
--- /dev/null
+++ b/vendor/ucd-parse/src/word_break.rs
@@ -0,0 +1,103 @@
+use std::path::Path;
+use std::str::FromStr;
+
+use crate::common::{
+    parse_break_test, parse_codepoint_association, CodepointIter, Codepoints,
+    UcdFile, UcdFileByCodepoint,
+};
+use crate::error::Error;
+
+/// A single row in the `auxiliary/WordBreakProperty.txt` file.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct WordBreak {
+    /// The codepoint or codepoint range that this row applies to.
+    pub codepoints: Codepoints,
+    /// The property value assigned to those codepoints, e.g. `Extend`.
+    pub value: String,
+}
+
+impl UcdFile for WordBreak {
+    fn relative_file_path() -> &'static Path {
+        Path::new("auxiliary/WordBreakProperty.txt") // source of these rows
+    }
+}
+
+impl UcdFileByCodepoint for WordBreak {
+    fn codepoints(&self) -> CodepointIter {
+        self.codepoints.into_iter() // iterates this row's `codepoints` field
+    }
+}
+
+impl FromStr for WordBreak {
+    type Err = Error;
+
+    fn from_str(line: &str) -> Result<WordBreak, Error> {
+        let (codepoints, value) = parse_codepoint_association(line)?;
+        Ok(WordBreak { codepoints, value: value.to_string() }) // owned copy
+    }
+}
+
+/// A single row in the `auxiliary/WordBreakTest.txt` file.
+///
+/// This file defines tests for the word break algorithm.
+#[derive(Clone, Debug, Default, Eq, PartialEq)]
+pub struct WordBreakTest {
+    /// The expected words, in order. Each string is the UTF-8 encoding of
+    /// the codepoints that make up a single word.
+    pub words: Vec<String>,
+    /// A human readable description of this test.
+    pub comment: String,
+}
+
+impl UcdFile for WordBreakTest {
+    fn relative_file_path() -> &'static Path {
+        Path::new("auxiliary/WordBreakTest.txt") // source of these rows
+    }
+}
+
+impl FromStr for WordBreakTest {
+    type Err = Error;
+
+    fn from_str(line: &str) -> Result<WordBreakTest, Error> {
+        let (words, comment) = parse_break_test(line)?; // groups, then comment
+        Ok(WordBreakTest { words, comment })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::{WordBreak, WordBreakTest};
+
+    /// A property row that names a single codepoint.
+    #[test]
+    fn parse_single() {
+        let input = "0A83          ; Extend # Mc       GUJARATI SIGN VISARGA\n";
+        let parsed: WordBreak = input.parse().unwrap();
+        assert_eq!(parsed.codepoints, 0x0A83);
+        assert_eq!(parsed.value, "Extend");
+    }
+
+    /// A property row that names a `first..last` codepoint range.
+    #[test]
+    fn parse_range() {
+        let input = "104A0..104A9  ; Numeric # Nd  [10] OSMANYA DIGIT ZERO..OSMANYA DIGIT NINE\n";
+        let parsed: WordBreak = input.parse().unwrap();
+        assert_eq!(parsed.codepoints, (0x104A0, 0x104A9));
+        assert_eq!(parsed.value, "Numeric");
+    }
+
+    /// A break test row: a new word is expected after each `÷`, while `×`
+    /// keeps the following codepoint in the current word.
+    #[test]
+    fn parse_test() {
+        let input = "÷ 0031 ÷ 0027 × 0308 ÷ 0061 ÷ 0027 × 2060 ÷	#  ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] APOSTROPHE (Single_Quote) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [999.0] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (Single_Quote) × [4.0] WORD JOINER (Format_FE) ÷ [0.3]";
+
+        let parsed: WordBreakTest = input.parse().unwrap();
+        let expected = vec![
+            "\u{0031}", "\u{0027}\u{0308}", "\u{0061}", "\u{0027}\u{2060}",
+        ];
+        assert_eq!(parsed.words, expected);
+        let needle = "[4.0] COMBINING DIAERESIS (Extend_FE)";
+        assert!(parsed.comment.contains(needle));
+    }
+}
-- 
cgit v1.2.3