summaryrefslogtreecommitdiffstats
path: root/vendor/ucd-parse/src/common.rs
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/ucd-parse/src/common.rs')
-rw-r--r--vendor/ucd-parse/src/common.rs594
1 files changed, 594 insertions, 0 deletions
diff --git a/vendor/ucd-parse/src/common.rs b/vendor/ucd-parse/src/common.rs
new file mode 100644
index 000000000..c18be668e
--- /dev/null
+++ b/vendor/ucd-parse/src/common.rs
@@ -0,0 +1,594 @@
+use std::char;
+use std::collections::BTreeMap;
+use std::fmt;
+use std::fs::File;
+use std::io::{self, BufRead};
+use std::marker::PhantomData;
+use std::path::{Path, PathBuf};
+use std::str::FromStr;
+
+use lazy_static::lazy_static;
+use regex::Regex;
+
+use crate::error::{Error, ErrorKind};
+
+/// Parse a particular file in the UCD into a sequence of rows.
+///
+/// The given directory should be the directory to the UCD.
+pub fn parse<P, D>(ucd_dir: P) -> Result<Vec<D>, Error>
+where
+ P: AsRef<Path>,
+ D: UcdFile,
+{
+ let mut xs = vec![];
+ for result in D::from_dir(ucd_dir)? {
+ let x = result?;
+ xs.push(x);
+ }
+ Ok(xs)
+}
+
+/// Parse a particular file in the UCD into a map from codepoint to the record.
+///
+/// The given directory should be the directory to the UCD.
+pub fn parse_by_codepoint<P, D>(
+ ucd_dir: P,
+) -> Result<BTreeMap<Codepoint, D>, Error>
+where
+ P: AsRef<Path>,
+ D: UcdFileByCodepoint,
+{
+ let mut map = BTreeMap::new();
+ for result in D::from_dir(ucd_dir)? {
+ let x = result?;
+ for cp in x.codepoints() {
+ map.insert(cp, x.clone());
+ }
+ }
+ Ok(map)
+}
+
+/// Parse a particular file in the UCD into a map from codepoint to all
+/// records associated with that codepoint.
+///
+/// This is useful for files that have multiple records for each codepoint.
+/// For example, the `NameAliases.txt` file lists multiple aliases for some
+/// codepoints.
+///
+/// The given directory should be the directory to the UCD.
+pub fn parse_many_by_codepoint<P, D>(
+ ucd_dir: P,
+) -> Result<BTreeMap<Codepoint, Vec<D>>, Error>
+where
+ P: AsRef<Path>,
+ D: UcdFileByCodepoint,
+{
+ let mut map = BTreeMap::new();
+ for result in D::from_dir(ucd_dir)? {
+ let x = result?;
+ for cp in x.codepoints() {
+ map.entry(cp).or_insert(vec![]).push(x.clone());
+ }
+ }
+ Ok(map)
+}
+
+/// Given a path pointing at the root of the `ucd_dir`, attempts to determine
+/// it's unicode version.
+///
+/// This just checks the readme and the very first line of PropList.txt -- in
+/// practice this works for all versions of UCD since 4.1.0.
+pub fn ucd_directory_version<D: ?Sized + AsRef<Path>>(
+ ucd_dir: &D,
+) -> Result<(u64, u64, u64), Error> {
+ // Avoid duplication from generic path parameter.
+ fn ucd_directory_version_inner(
+ ucd_dir: &Path,
+ ) -> Result<(u64, u64, u64), Error> {
+ lazy_static::lazy_static! {
+ static ref VERSION_RX: Regex =
+ Regex::new(r"-([0-9]+).([0-9]+).([0-9]+).txt").unwrap();
+ }
+
+ let proplist = ucd_dir.join("PropList.txt");
+ let contents = first_line(&proplist)?;
+ let caps = match VERSION_RX.captures(&contents) {
+ Some(c) => c,
+ None => {
+ return err!("Failed to find version in line {:?}", contents)
+ }
+ };
+
+ let capture_to_num = |n| {
+ caps.get(n).unwrap().as_str().parse::<u64>().map_err(|e| Error {
+ kind: ErrorKind::Parse(format!(
+ "Failed to parse version from {:?} in PropList.txt: {}",
+ contents, e
+ )),
+ line: Some(0),
+ path: Some(proplist.clone()),
+ })
+ };
+ let major = capture_to_num(1)?;
+ let minor = capture_to_num(2)?;
+ let patch = capture_to_num(3)?;
+
+ Ok((major, minor, patch))
+ }
+ ucd_directory_version_inner(ucd_dir.as_ref())
+}
+
+fn first_line(path: &Path) -> Result<String, Error> {
+ let file = std::fs::File::open(path).map_err(|e| Error {
+ kind: ErrorKind::Io(e),
+ line: None,
+ path: Some(path.into()),
+ })?;
+
+ let mut reader = std::io::BufReader::new(file);
+ let mut line_contents = String::new();
+ reader.read_line(&mut line_contents).map_err(|e| Error {
+ kind: ErrorKind::Io(e),
+ line: None,
+ path: Some(path.into()),
+ })?;
+ Ok(line_contents)
+}
+
+/// A helper function for parsing a common record format that associates one
+/// or more codepoints with a string value.
+pub fn parse_codepoint_association<'a>(
+ line: &'a str,
+) -> Result<(Codepoints, &'a str), Error> {
+ lazy_static! {
+ static ref PARTS: Regex = Regex::new(
+ r"(?x)
+ ^
+ \s*(?P<codepoints>[^\s;]+)\s*;
+ \s*(?P<property>[^;\x23]+)\s*
+ "
+ )
+ .unwrap();
+ };
+
+ let caps = match PARTS.captures(line.trim()) {
+ Some(caps) => caps,
+ None => return err!("invalid PropList line: '{}'", line),
+ };
+ let property = match caps.name("property") {
+ Some(property) => property.as_str().trim(),
+ None => {
+ return err!(
+ "could not find property name in PropList line: '{}'",
+ line
+ )
+ }
+ };
+ Ok((caps["codepoints"].parse()?, property))
+}
+
+/// A helper function for parsing a sequence of space separated codepoints.
+/// The sequence is permitted to be empty.
+pub fn parse_codepoint_sequence(s: &str) -> Result<Vec<Codepoint>, Error> {
+ let mut cps = vec![];
+ for cp in s.trim().split_whitespace() {
+ cps.push(cp.parse()?);
+ }
+ Ok(cps)
+}
+
+/// A helper function for parsing a single test for the various break
+/// algorithms.
+///
+/// Upon success, this returns the UTF-8 encoded groups of codepoints along
+/// with the comment associated with the test. The comment is a human readable
+/// description of the test that may prove useful for debugging.
+pub fn parse_break_test(line: &str) -> Result<(Vec<String>, String), Error> {
+ lazy_static! {
+ static ref PARTS: Regex = Regex::new(
+ r"(?x)
+ ^
+ (?:÷|×)
+ (?P<groups>(?:\s[0-9A-Fa-f]{4,5}\s(?:÷|×))+)
+ \s+
+ \#(?P<comment>.+)
+ $
+ "
+ )
+ .unwrap();
+ static ref GROUP: Regex = Regex::new(
+ r"(?x)
+ (?P<codepoint>[0-9A-Fa-f]{4,5})\s(?P<kind>÷|×)
+ "
+ )
+ .unwrap();
+ }
+
+ let caps = match PARTS.captures(line.trim()) {
+ Some(caps) => caps,
+ None => return err!("invalid break test line: '{}'", line),
+ };
+ let comment = caps["comment"].trim().to_string();
+
+ let mut groups = vec![];
+ let mut cur = String::new();
+ for cap in GROUP.captures_iter(&caps["groups"]) {
+ let cp: Codepoint = cap["codepoint"].parse()?;
+ let ch = match cp.scalar() {
+ Some(ch) => ch,
+ None => {
+ return err!(
+ "invalid codepoint '{:X}' in line: '{}'",
+ cp.value(),
+ line
+ )
+ }
+ };
+ cur.push(ch);
+ if &cap["kind"] == "÷" {
+ groups.push(cur);
+ cur = String::new();
+ }
+ }
+ Ok((groups, comment))
+}
+
+/// Describes a single UCD file.
+pub trait UcdFile:
+ Clone + fmt::Debug + Default + Eq + FromStr<Err = Error> + PartialEq
+{
+ /// The file path corresponding to this file, relative to the UCD
+ /// directory.
+ fn relative_file_path() -> &'static Path;
+
+ /// The full file path corresponding to this file given the UCD directory
+ /// path.
+ fn file_path<P: AsRef<Path>>(ucd_dir: P) -> PathBuf {
+ ucd_dir.as_ref().join(Self::relative_file_path())
+ }
+
+ /// Create an iterator over each record in this UCD file.
+ ///
+ /// The parameter should correspond to the directory containing the UCD.
+ fn from_dir<P: AsRef<Path>>(
+ ucd_dir: P,
+ ) -> Result<UcdLineParser<File, Self>, Error> {
+ UcdLineParser::from_path(Self::file_path(ucd_dir))
+ }
+}
+
+/// Describes a single UCD file where every record in the file is associated
+/// with one or more codepoints.
+pub trait UcdFileByCodepoint: UcdFile {
+ /// Returns the codepoints associated with this record.
+ fn codepoints(&self) -> CodepointIter;
+}
+
+/// A line oriented parser for a particular UCD file.
+///
+/// Callers can build a line parser via the
+/// [`UcdFile::from_dir`](trait.UcdFile.html) method.
+///
+/// The `R` type parameter refers to the underlying `io::Read` implementation
+/// from which the UCD data is read.
+///
+/// The `D` type parameter refers to the type of the record parsed out of each
+/// line.
+#[derive(Debug)]
+pub struct UcdLineParser<R, D> {
+ path: Option<PathBuf>,
+ rdr: io::BufReader<R>,
+ line: String,
+ line_number: u64,
+ _data: PhantomData<D>,
+}
+
+impl<D> UcdLineParser<File, D> {
+ /// Create a new parser from the given file path.
+ pub(crate) fn from_path<P: AsRef<Path>>(
+ path: P,
+ ) -> Result<UcdLineParser<File, D>, Error> {
+ let path = path.as_ref();
+ let file = File::open(path).map_err(|e| Error {
+ kind: ErrorKind::Io(e),
+ line: None,
+ path: Some(path.to_path_buf()),
+ })?;
+ Ok(UcdLineParser::new(Some(path.to_path_buf()), file))
+ }
+}
+
+impl<R: io::Read, D> UcdLineParser<R, D> {
+ /// Create a new parser that parses the reader given.
+ ///
+ /// The type of data parsed is determined when the `parse_next` function
+ /// is called by virtue of the type requested.
+ ///
+ /// Note that the reader is buffered internally, so the caller does not
+ /// need to provide their own buffering.
+ pub(crate) fn new(path: Option<PathBuf>, rdr: R) -> UcdLineParser<R, D> {
+ UcdLineParser {
+ path,
+ rdr: io::BufReader::new(rdr),
+ line: String::new(),
+ line_number: 0,
+ _data: PhantomData,
+ }
+ }
+}
+
+impl<R: io::Read, D: FromStr<Err = Error>> Iterator for UcdLineParser<R, D> {
+ type Item = Result<D, Error>;
+
+ fn next(&mut self) -> Option<Result<D, Error>> {
+ loop {
+ self.line_number += 1;
+ self.line.clear();
+ let n = match self.rdr.read_line(&mut self.line) {
+ Err(err) => {
+ return Some(Err(Error {
+ kind: ErrorKind::Io(err),
+ line: None,
+ path: self.path.clone(),
+ }))
+ }
+ Ok(n) => n,
+ };
+ if n == 0 {
+ return None;
+ }
+ if !self.line.starts_with('#') && !self.line.trim().is_empty() {
+ break;
+ }
+ }
+ let line_number = self.line_number;
+ Some(self.line.parse().map_err(|mut err: Error| {
+ err.line = Some(line_number);
+ err
+ }))
+ }
+}
+
+/// A representation of either a single codepoint or a range of codepoints.
+#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq, PartialOrd, Ord)]
+pub enum Codepoints {
+ /// A single codepoint.
+ Single(Codepoint),
+ /// A range of codepoints.
+ Range(CodepointRange),
+}
+
+impl Default for Codepoints {
+ fn default() -> Codepoints {
+ Codepoints::Single(Codepoint::default())
+ }
+}
+
+impl IntoIterator for Codepoints {
+ type IntoIter = CodepointIter;
+ type Item = Codepoint;
+
+ fn into_iter(self) -> CodepointIter {
+ match self {
+ Codepoints::Single(x) => x.into_iter(),
+ Codepoints::Range(x) => x.into_iter(),
+ }
+ }
+}
+
+impl FromStr for Codepoints {
+ type Err = Error;
+
+ fn from_str(s: &str) -> Result<Codepoints, Error> {
+ if s.contains("..") {
+ CodepointRange::from_str(s).map(Codepoints::Range)
+ } else {
+ Codepoint::from_str(s).map(Codepoints::Single)
+ }
+ }
+}
+
+impl fmt::Display for Codepoints {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ match *self {
+ Codepoints::Single(ref x) => x.fmt(f),
+ Codepoints::Range(ref x) => x.fmt(f),
+ }
+ }
+}
+
+impl PartialEq<u32> for Codepoints {
+ fn eq(&self, other: &u32) -> bool {
+ match *self {
+ Codepoints::Single(ref x) => x == other,
+ Codepoints::Range(ref x) => x == &(*other, *other),
+ }
+ }
+}
+
+impl PartialEq<Codepoint> for Codepoints {
+ fn eq(&self, other: &Codepoint) -> bool {
+ match *self {
+ Codepoints::Single(ref x) => x == other,
+ Codepoints::Range(ref x) => x == &(*other, *other),
+ }
+ }
+}
+
+impl PartialEq<(u32, u32)> for Codepoints {
+ fn eq(&self, other: &(u32, u32)) -> bool {
+ match *self {
+ Codepoints::Single(ref x) => &(x.value(), x.value()) == other,
+ Codepoints::Range(ref x) => x == other,
+ }
+ }
+}
+
+impl PartialEq<(Codepoint, Codepoint)> for Codepoints {
+ fn eq(&self, other: &(Codepoint, Codepoint)) -> bool {
+ match *self {
+ Codepoints::Single(ref x) => &(*x, *x) == other,
+ Codepoints::Range(ref x) => x == other,
+ }
+ }
+}
+
+/// A range of Unicode codepoints. The range is inclusive; both ends of the
+/// range are guaranteed to be valid codepoints.
+#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord)]
+pub struct CodepointRange {
+ /// The start of the codepoint range.
+ pub start: Codepoint,
+ /// The end of the codepoint range.
+ pub end: Codepoint,
+}
+
+impl IntoIterator for CodepointRange {
+ type IntoIter = CodepointIter;
+ type Item = Codepoint;
+
+ fn into_iter(self) -> CodepointIter {
+ CodepointIter { next: self.start.value(), range: self }
+ }
+}
+
+impl FromStr for CodepointRange {
+ type Err = Error;
+
+ fn from_str(s: &str) -> Result<CodepointRange, Error> {
+ lazy_static! {
+ static ref PARTS: Regex =
+ Regex::new(r"^(?P<start>[A-Z0-9]+)\.\.(?P<end>[A-Z0-9]+)$")
+ .unwrap();
+ }
+ let caps = match PARTS.captures(s) {
+ Some(caps) => caps,
+ None => return err!("invalid codepoint range: '{}'", s),
+ };
+ let start = caps["start"].parse().or_else(|err| {
+ err!("failed to parse '{}' as a codepoint range: {}", s, err)
+ })?;
+ let end = caps["end"].parse().or_else(|err| {
+ err!("failed to parse '{}' as a codepoint range: {}", s, err)
+ })?;
+ Ok(CodepointRange { start, end })
+ }
+}
+
+impl fmt::Display for CodepointRange {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ write!(f, "{}..{}", self.start, self.end)
+ }
+}
+
+impl PartialEq<(u32, u32)> for CodepointRange {
+ fn eq(&self, other: &(u32, u32)) -> bool {
+ &(self.start.value(), self.end.value()) == other
+ }
+}
+
+impl PartialEq<(Codepoint, Codepoint)> for CodepointRange {
+ fn eq(&self, other: &(Codepoint, Codepoint)) -> bool {
+ &(self.start, self.end) == other
+ }
+}
+
+/// A single Unicode codepoint.
+///
+/// This type's string representation is a hexadecimal number. It is guaranteed
+/// to be in the range `[0, 10FFFF]`.
+///
+/// Note that unlike Rust's `char` type, this may be a surrogate codepoint.
+#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord)]
+pub struct Codepoint(u32);
+
+impl Codepoint {
+ /// Create a new codepoint from a `u32`.
+ ///
+ /// If the given number is not a valid codepoint, then this returns an
+ /// error.
+ pub fn from_u32(n: u32) -> Result<Codepoint, Error> {
+ if n > 0x10FFFF {
+ err!("{:x} is not a valid Unicode codepoint", n)
+ } else {
+ Ok(Codepoint(n))
+ }
+ }
+
+ /// Return the underlying `u32` codepoint value.
+ pub fn value(self) -> u32 {
+ self.0
+ }
+
+ /// Attempt to convert this codepoint to a Unicode scalar value.
+ ///
+ /// If this is a surrogate codepoint, then this returns `None`.
+ pub fn scalar(self) -> Option<char> {
+ char::from_u32(self.0)
+ }
+}
+
+impl IntoIterator for Codepoint {
+ type IntoIter = CodepointIter;
+ type Item = Codepoint;
+
+ fn into_iter(self) -> CodepointIter {
+ let range = CodepointRange { start: self, end: self };
+ CodepointIter { next: self.value(), range }
+ }
+}
+
+impl FromStr for Codepoint {
+ type Err = Error;
+
+ fn from_str(s: &str) -> Result<Codepoint, Error> {
+ match u32::from_str_radix(s, 16) {
+ Ok(n) => Codepoint::from_u32(n),
+ Err(err) => {
+ return err!(
+ "failed to parse '{}' as a hexadecimal codepoint: {}",
+ s,
+ err
+ );
+ }
+ }
+ }
+}
+
+impl fmt::Display for Codepoint {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ write!(f, "{:04X}", self.0)
+ }
+}
+
+impl PartialEq<u32> for Codepoint {
+ fn eq(&self, other: &u32) -> bool {
+ self.0 == *other
+ }
+}
+
+impl PartialEq<Codepoint> for u32 {
+ fn eq(&self, other: &Codepoint) -> bool {
+ *self == other.0
+ }
+}
+
+/// An iterator over a range of Unicode codepoints.
+#[derive(Debug)]
+pub struct CodepointIter {
+ next: u32,
+ range: CodepointRange,
+}
+
+impl Iterator for CodepointIter {
+ type Item = Codepoint;
+
+ fn next(&mut self) -> Option<Codepoint> {
+ if self.next > self.range.end.value() {
+ return None;
+ }
+ let current = self.next;
+ self.next += 1;
+ Some(Codepoint::from_u32(current).unwrap())
+ }
+}