summaryrefslogtreecommitdiffstats
path: root/vendor/ucd-parse/src/special_casing.rs
blob: a8fc61ddbf9e21dc475547a477423d2f5f8ffd6b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
use std::path::Path;
use std::str::FromStr;

use lazy_static::lazy_static;
use regex::Regex;

use crate::common::{
    parse_codepoint_sequence, Codepoint, CodepointIter, UcdFile,
    UcdFileByCodepoint,
};
use crate::error::Error;

/// A single row in the `SpecialCasing.txt` file.
///
/// A row is produced by this type's `FromStr` implementation from one line
/// of the form `codepoint; lower; title; upper; [conditions] # comment`,
/// where each of the first four fields is terminated by a `;`.
///
/// Note that a single codepoint may be mapped multiple times. In particular,
/// a single codepoint might have mappings based on distinct language sensitive
/// conditions (e.g., `U+0307`).
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct SpecialCaseMapping {
    /// The codepoint that is being mapped (the first field of the line).
    pub codepoint: Codepoint,
    /// The lowercase mapping (the second field), which may be empty.
    pub lowercase: Vec<Codepoint>,
    /// The titlecase mapping (the third field), which may be empty.
    pub titlecase: Vec<Codepoint>,
    /// The uppercase mapping (the fourth field), which may be empty.
    pub uppercase: Vec<Codepoint>,
    /// A list of language specific conditions, see `SpecialCasing.txt` for
    /// more details.
    ///
    /// Each entry is one whitespace-separated token of the optional
    /// condition field (e.g. `tr` and `After_I`). The list is empty when the
    /// line carries no condition field.
    pub conditions: Vec<String>,
}

impl UcdFile for SpecialCaseMapping {
    /// Returns the relative path naming the file this row type is read
    /// from: `SpecialCasing.txt`.
    fn relative_file_path() -> &'static Path {
        // Kept as a named constant so the file name is easy to spot.
        const FILE_NAME: &str = "SpecialCasing.txt";
        Path::new(FILE_NAME)
    }
}

impl UcdFileByCodepoint for SpecialCaseMapping {
    /// Returns an iterator over the codepoints associated with this row.
    ///
    /// The iterator is obtained by calling `into_iter()` on the row's
    /// `codepoint` field, i.e. the codepoint that is being mapped.
    fn codepoints(&self) -> CodepointIter {
        self.codepoint.into_iter()
    }
}

impl FromStr for SpecialCaseMapping {
    type Err = Error;

    /// Parses a single data line of `SpecialCasing.txt` into a row.
    ///
    /// The line is trimmed and must then start with four `;`-terminated
    /// fields: the codepoint, followed by the lowercase, titlecase and
    /// uppercase mappings. Any text after the fourth field, up to the next
    /// `;` or `#`, is split on whitespace and stored as the row's
    /// conditions; if there is no such text the condition list is empty.
    ///
    /// # Errors
    ///
    /// Returns an error if the line does not have the shape described above,
    /// or if the codepoint or any of the three mappings fails to parse.
    fn from_str(line: &str) -> Result<SpecialCaseMapping, Error> {
        lazy_static! {
            // Compiled once and reused across calls. `(?x)` enables verbose
            // mode, in which a literal `#` would start a comment, hence the
            // `\x23` escape in the `conditions` character class.
            static ref PARTS: Regex = Regex::new(
                r"(?x)
                ^
                \s*(?P<codepoint>[^\s;]+)\s*;
                \s*(?P<lower>[^;]+)\s*;
                \s*(?P<title>[^;]+)\s*;
                \s*(?P<upper>[^;]+)\s*;
                \s*(?P<conditions>[^;\x23]+)?
                "
            )
            .unwrap();
        }

        let caps = match PARTS.captures(line.trim()) {
            Some(caps) => caps,
            None => return err!("invalid SpecialCasing line: '{}'", line),
        };
        // `split_whitespace` already skips leading and trailing whitespace,
        // so the captured text does not need a separate `trim`. A missing
        // condition field simply yields an empty list.
        let conditions: Vec<String> = caps
            .name("conditions")
            .map(|m| m.as_str().split_whitespace().map(String::from).collect())
            .unwrap_or_default();
        Ok(SpecialCaseMapping {
            codepoint: caps["codepoint"].parse()?,
            lowercase: parse_codepoint_sequence(&caps["lower"])?,
            titlecase: parse_codepoint_sequence(&caps["title"])?,
            uppercase: parse_codepoint_sequence(&caps["upper"])?,
            conditions,
        })
    }
}

#[cfg(test)]
mod tests {
    use super::SpecialCaseMapping;

    /// A row without a condition field parses all three mappings and ends
    /// up with an empty condition list; the trailing `#` comment is ignored.
    #[test]
    fn parse_no_conds() {
        let line = "1F52; 1F52; 03A5 0313 0300; 03A5 0313 0300; # GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA\n";
        let row: SpecialCaseMapping = line.parse().unwrap();
        assert_eq!(row.codepoint, 0x1F52);
        assert_eq!(row.lowercase, vec![0x1F52]);
        assert_eq!(row.titlecase, vec![0x03A5, 0x0313, 0x0300]);
        assert_eq!(row.uppercase, vec![0x03A5, 0x0313, 0x0300]);
        assert!(row.conditions.is_empty());
    }

    /// A row with a blank lowercase field and a condition field yields an
    /// empty lowercase mapping and one condition per whitespace-separated
    /// token.
    #[test]
    fn parse_conds() {
        let line = "0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE\n";
        let row: SpecialCaseMapping = line.parse().unwrap();
        assert_eq!(row.codepoint, 0x0307);
        assert!(row.lowercase.is_empty());
        assert_eq!(row.titlecase, vec![0x0307]);
        assert_eq!(row.uppercase, vec![0x0307]);
        assert_eq!(row.conditions, vec!["tr", "After_I"]);
    }

    /// A line that lacks the `;`-terminated fields must be rejected instead
    /// of being turned into a partially filled row.
    #[test]
    fn parse_invalid_line() {
        let line = "this is not a SpecialCasing row\n";
        assert!(line.parse::<SpecialCaseMapping>().is_err());
    }
}