use alloc::{
    string::{String, ToString},
    vec::Vec,
};

use crate::hir;

/// An inclusive range of codepoints from a generated file (hence the static
/// lifetime).
type Range = &'static [(char, char)];

/// An error that occurs when dealing with Unicode.
///
/// We don't impl the Error trait here because these always get converted
/// into other public errors. (This error type isn't exported.)
#[derive(Debug)]
pub enum Error {
    /// The requested property (or the data backing it) could not be found.
    PropertyNotFound,
    /// The property was found, but the requested value for it was not.
    PropertyValueNotFound,
    /// The data for a Perl class (`\w`, `\s` or `\d`) is unavailable.
    // Not used when unicode-perl is enabled.
    #[allow(dead_code)]
    PerlClassNotFound,
}

/// An error that occurs when Unicode-aware simple case folding fails.
///
/// This error can occur when the case mapping tables necessary for Unicode
/// aware case folding are unavailable. This only occurs when the
/// `unicode-case` feature is disabled. (The feature is enabled by default.)
#[derive(Debug)]
pub struct CaseFoldError(());

#[cfg(feature = "std")]
impl std::error::Error for CaseFoldError {}

impl core::fmt::Display for CaseFoldError {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        write!(
            f,
            "Unicode-aware case folding is not available \
             (probably because the unicode-case feature is not enabled)"
        )
    }
}

/// An error that occurs when the Unicode-aware `\w` class is unavailable.
///
/// This error can occur when the data tables necessary for the Unicode aware
/// Perl character class `\w` are unavailable. This only occurs when the
/// `unicode-perl` feature is disabled. (The feature is enabled by default.)
#[derive(Debug)]
pub struct UnicodeWordError(());

#[cfg(feature = "std")]
impl std::error::Error for UnicodeWordError {}

impl core::fmt::Display for UnicodeWordError {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        write!(
            f,
            "Unicode-aware \\w class is not available \
             (probably because the unicode-perl feature is not enabled)"
        )
    }
}

/// A state oriented traverser of the simple case folding table.
/// /// A case folder can be constructed via `SimpleCaseFolder::new()`, which will /// return an error if the underlying case folding table is unavailable. /// /// After construction, it is expected that callers will use /// `SimpleCaseFolder::mapping` by calling it with codepoints in strictly /// increasing order. For example, calling it on `b` and then on `a` is illegal /// and will result in a panic. /// /// The main idea of this type is that it tries hard to make mapping lookups /// fast by exploiting the structure of the underlying table, and the ordering /// assumption enables this. #[derive(Debug)] pub struct SimpleCaseFolder { /// The simple case fold table. It's a sorted association list, where the /// keys are Unicode scalar values and the values are the corresponding /// equivalence class (not including the key) of the "simple" case folded /// Unicode scalar values. table: &'static [(char, &'static [char])], /// The last codepoint that was used for a lookup. last: Option, /// The index to the entry in `table` corresponding to the smallest key `k` /// such that `k > k0`, where `k0` is the most recent key lookup. Note that /// in particular, `k0` may not be in the table! next: usize, } impl SimpleCaseFolder { /// Create a new simple case folder, returning an error if the underlying /// case folding table is unavailable. pub fn new() -> Result { #[cfg(not(feature = "unicode-case"))] { Err(CaseFoldError(())) } #[cfg(feature = "unicode-case")] { Ok(SimpleCaseFolder { table: crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE, last: None, next: 0, }) } } /// Return the equivalence class of case folded codepoints for the given /// codepoint. The equivalence class returned never includes the codepoint /// given. If the given codepoint has no case folded codepoints (i.e., /// no entry in the underlying case folding table), then this returns an /// empty slice. 
/// /// # Panics /// /// This panics when called with a `c` that is less than or equal to the /// previous call. In other words, callers need to use this method with /// strictly increasing values of `c`. pub fn mapping(&mut self, c: char) -> &'static [char] { if let Some(last) = self.last { assert!( last < c, "got codepoint U+{:X} which occurs before \ last codepoint U+{:X}", u32::from(c), u32::from(last), ); } self.last = Some(c); if self.next >= self.table.len() { return &[]; } let (k, v) = self.table[self.next]; if k == c { self.next += 1; return v; } match self.get(c) { Err(i) => { self.next = i; &[] } Ok(i) => { // Since we require lookups to proceed // in order, anything we find should be // after whatever we thought might be // next. Otherwise, the caller is either // going out of order or we would have // found our next key at 'self.next'. assert!(i > self.next); self.next = i + 1; self.table[i].1 } } } /// Returns true if and only if the given range overlaps with any region /// of the underlying case folding table. That is, when true, there exists /// at least one codepoint in the inclusive range `[start, end]` that has /// a non-trivial equivalence class of case folded codepoints. Conversely, /// when this returns false, all codepoints in the range `[start, end]` /// correspond to the trivial equivalence class of case folded codepoints, /// i.e., itself. /// /// This is useful to call before iterating over the codepoints in the /// range and looking up the mapping for each. If you know none of the /// mappings will return anything, then you might be able to skip doing it /// altogether. /// /// # Panics /// /// This panics when `end < start`. 
pub fn overlaps(&self, start: char, end: char) -> bool { use core::cmp::Ordering; assert!(start <= end); self.table .binary_search_by(|&(c, _)| { if start <= c && c <= end { Ordering::Equal } else if c > end { Ordering::Greater } else { Ordering::Less } }) .is_ok() } /// Returns the index at which `c` occurs in the simple case fold table. If /// `c` does not occur, then this returns an `i` such that `table[i-1].0 < /// c` and `table[i].0 > c`. fn get(&self, c: char) -> Result { self.table.binary_search_by_key(&c, |&(c1, _)| c1) } } /// A query for finding a character class defined by Unicode. This supports /// either use of a property name directly, or lookup by property value. The /// former generally refers to Binary properties (see UTS#44, Table 8), but /// as a special exception (see UTS#18, Section 1.2) both general categories /// (an enumeration) and scripts (a catalog) are supported as if each of their /// possible values were a binary property. /// /// In all circumstances, property names and values are normalized and /// canonicalized. That is, `GC == gc == GeneralCategory == general_category`. /// /// The lifetime `'a` refers to the shorter of the lifetimes of property name /// and property value. #[derive(Debug)] pub enum ClassQuery<'a> { /// Return a class corresponding to a Unicode binary property, named by /// a single letter. OneLetter(char), /// Return a class corresponding to a Unicode binary property. /// /// Note that, by special exception (see UTS#18, Section 1.2), both /// general category values and script values are permitted here as if /// they were a binary property. Binary(&'a str), /// Return a class corresponding to all codepoints whose property /// (identified by `property_name`) corresponds to the given value /// (identified by `property_value`). ByValue { /// A property name. property_name: &'a str, /// A property value. 
property_value: &'a str, }, } impl<'a> ClassQuery<'a> { fn canonicalize(&self) -> Result { match *self { ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()), ClassQuery::Binary(name) => self.canonical_binary(name), ClassQuery::ByValue { property_name, property_value } => { let property_name = symbolic_name_normalize(property_name); let property_value = symbolic_name_normalize(property_value); let canon_name = match canonical_prop(&property_name)? { None => return Err(Error::PropertyNotFound), Some(canon_name) => canon_name, }; Ok(match canon_name { "General_Category" => { let canon = match canonical_gencat(&property_value)? { None => return Err(Error::PropertyValueNotFound), Some(canon) => canon, }; CanonicalClassQuery::GeneralCategory(canon) } "Script" => { let canon = match canonical_script(&property_value)? { None => return Err(Error::PropertyValueNotFound), Some(canon) => canon, }; CanonicalClassQuery::Script(canon) } _ => { let vals = match property_values(canon_name)? { None => return Err(Error::PropertyValueNotFound), Some(vals) => vals, }; let canon_val = match canonical_value(vals, &property_value) { None => { return Err(Error::PropertyValueNotFound) } Some(canon_val) => canon_val, }; CanonicalClassQuery::ByValue { property_name: canon_name, property_value: canon_val, } } }) } } } fn canonical_binary( &self, name: &str, ) -> Result { let norm = symbolic_name_normalize(name); // This is a special case where 'cf' refers to the 'Format' general // category, but where the 'cf' abbreviation is also an abbreviation // for the 'Case_Folding' property. But we want to treat it as // a general category. (Currently, we don't even support the // 'Case_Folding' property. But if we do in the future, users will be // required to spell it out.) // // Also 'sc' refers to the 'Currency_Symbol' general category, but is // also the abbreviation for the 'Script' property. 
So we avoid calling // 'canonical_prop' for it too, which would erroneously normalize it // to 'Script'. // // Another case: 'lc' is an abbreviation for the 'Cased_Letter' // general category, but is also an abbreviation for the 'Lowercase_Mapping' // property. We don't currently support the latter, so as with 'cf' // above, we treat 'lc' as 'Cased_Letter'. if norm != "cf" && norm != "sc" && norm != "lc" { if let Some(canon) = canonical_prop(&norm)? { return Ok(CanonicalClassQuery::Binary(canon)); } } if let Some(canon) = canonical_gencat(&norm)? { return Ok(CanonicalClassQuery::GeneralCategory(canon)); } if let Some(canon) = canonical_script(&norm)? { return Ok(CanonicalClassQuery::Script(canon)); } Err(Error::PropertyNotFound) } } /// Like ClassQuery, but its parameters have been canonicalized. This also /// differentiates binary properties from flattened general categories and /// scripts. #[derive(Debug, Eq, PartialEq)] enum CanonicalClassQuery { /// The canonical binary property name. Binary(&'static str), /// The canonical general category name. GeneralCategory(&'static str), /// The canonical script name. Script(&'static str), /// An arbitrary association between property and value, both of which /// have been canonicalized. /// /// Note that by construction, the property name of ByValue will never /// be General_Category or Script. Those two cases are subsumed by the /// eponymous variants. ByValue { /// The canonical property name. property_name: &'static str, /// The canonical property value. property_value: &'static str, }, } /// Looks up a Unicode class given a query. If one doesn't exist, then /// `None` is returned. pub fn class(query: ClassQuery<'_>) -> Result { use self::CanonicalClassQuery::*; match query.canonicalize()? 
{ Binary(name) => bool_property(name), GeneralCategory(name) => gencat(name), Script(name) => script(name), ByValue { property_name: "Age", property_value } => { let mut class = hir::ClassUnicode::empty(); for set in ages(property_value)? { class.union(&hir_class(set)); } Ok(class) } ByValue { property_name: "Script_Extensions", property_value } => { script_extension(property_value) } ByValue { property_name: "Grapheme_Cluster_Break", property_value, } => gcb(property_value), ByValue { property_name: "Sentence_Break", property_value } => { sb(property_value) } ByValue { property_name: "Word_Break", property_value } => { wb(property_value) } _ => { // What else should we support? Err(Error::PropertyNotFound) } } } /// Returns a Unicode aware class for \w. /// /// This returns an error if the data is not available for \w. pub fn perl_word() -> Result { #[cfg(not(feature = "unicode-perl"))] fn imp() -> Result { Err(Error::PerlClassNotFound) } #[cfg(feature = "unicode-perl")] fn imp() -> Result { use crate::unicode_tables::perl_word::PERL_WORD; Ok(hir_class(PERL_WORD)) } imp() } /// Returns a Unicode aware class for \s. /// /// This returns an error if the data is not available for \s. pub fn perl_space() -> Result { #[cfg(not(any(feature = "unicode-perl", feature = "unicode-bool")))] fn imp() -> Result { Err(Error::PerlClassNotFound) } #[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))] fn imp() -> Result { use crate::unicode_tables::perl_space::WHITE_SPACE; Ok(hir_class(WHITE_SPACE)) } #[cfg(feature = "unicode-bool")] fn imp() -> Result { use crate::unicode_tables::property_bool::WHITE_SPACE; Ok(hir_class(WHITE_SPACE)) } imp() } /// Returns a Unicode aware class for \d. /// /// This returns an error if the data is not available for \d. 
pub fn perl_digit() -> Result<hir::ClassUnicode, Error> {
    #[cfg(not(any(feature = "unicode-perl", feature = "unicode-gencat")))]
    fn imp() -> Result<hir::ClassUnicode, Error> {
        Err(Error::PerlClassNotFound)
    }

    #[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))]
    fn imp() -> Result<hir::ClassUnicode, Error> {
        use crate::unicode_tables::perl_decimal::DECIMAL_NUMBER;
        Ok(hir_class(DECIMAL_NUMBER))
    }

    #[cfg(feature = "unicode-gencat")]
    fn imp() -> Result<hir::ClassUnicode, Error> {
        use crate::unicode_tables::general_category::DECIMAL_NUMBER;
        Ok(hir_class(DECIMAL_NUMBER))
    }

    imp()
}

/// Build a Unicode HIR class from a sequence of Unicode scalar value ranges.
pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode {
    let hir_ranges: Vec<hir::ClassUnicodeRange> = ranges
        .iter()
        .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e))
        .collect();
    hir::ClassUnicode::new(hir_ranges)
}

/// Returns true only if the given codepoint is in the `\w` character class.
///
/// If the `unicode-perl` feature is not enabled, then this returns an error.
pub fn is_word_character(c: char) -> Result<bool, UnicodeWordError> {
    #[cfg(not(feature = "unicode-perl"))]
    fn imp(_: char) -> Result<bool, UnicodeWordError> {
        Err(UnicodeWordError(()))
    }

    #[cfg(feature = "unicode-perl")]
    fn imp(c: char) -> Result<bool, UnicodeWordError> {
        use crate::{is_word_byte, unicode_tables::perl_word::PERL_WORD};

        // Codepoints that fit in a single byte are checked with
        // `is_word_byte` first, before searching the table.
        if u8::try_from(c).map_or(false, is_word_byte) {
            return Ok(true);
        }
        Ok(PERL_WORD
            .binary_search_by(|&(start, end)| {
                use core::cmp::Ordering;

                if start <= c && c <= end {
                    Ordering::Equal
                } else if start > c {
                    Ordering::Greater
                } else {
                    Ordering::Less
                }
            })
            .is_ok())
    }

    imp(c)
}

/// A mapping of property values for a specific property.
///
/// The first element of each tuple is a normalized property value while the
/// second element of each tuple is the corresponding canonical property
/// value.
type PropertyValues = &'static [(&'static str, &'static str)]; fn canonical_gencat( normalized_value: &str, ) -> Result, Error> { Ok(match normalized_value { "any" => Some("Any"), "assigned" => Some("Assigned"), "ascii" => Some("ASCII"), _ => { let gencats = property_values("General_Category")?.unwrap(); canonical_value(gencats, normalized_value) } }) } fn canonical_script( normalized_value: &str, ) -> Result, Error> { let scripts = property_values("Script")?.unwrap(); Ok(canonical_value(scripts, normalized_value)) } /// Find the canonical property name for the given normalized property name. /// /// If no such property exists, then `None` is returned. /// /// The normalized property name must have been normalized according to /// UAX44 LM3, which can be done using `symbolic_name_normalize`. /// /// If the property names data is not available, then an error is returned. fn canonical_prop( normalized_name: &str, ) -> Result, Error> { #[cfg(not(any( feature = "unicode-age", feature = "unicode-bool", feature = "unicode-gencat", feature = "unicode-perl", feature = "unicode-script", feature = "unicode-segment", )))] fn imp(_: &str) -> Result, Error> { Err(Error::PropertyNotFound) } #[cfg(any( feature = "unicode-age", feature = "unicode-bool", feature = "unicode-gencat", feature = "unicode-perl", feature = "unicode-script", feature = "unicode-segment", ))] fn imp(name: &str) -> Result, Error> { use crate::unicode_tables::property_names::PROPERTY_NAMES; Ok(PROPERTY_NAMES .binary_search_by_key(&name, |&(n, _)| n) .ok() .map(|i| PROPERTY_NAMES[i].1)) } imp(normalized_name) } /// Find the canonical property value for the given normalized property /// value. /// /// The given property values should correspond to the values for the property /// under question, which can be found using `property_values`. /// /// If no such property value exists, then `None` is returned. 
/// /// The normalized property value must have been normalized according to /// UAX44 LM3, which can be done using `symbolic_name_normalize`. fn canonical_value( vals: PropertyValues, normalized_value: &str, ) -> Option<&'static str> { vals.binary_search_by_key(&normalized_value, |&(n, _)| n) .ok() .map(|i| vals[i].1) } /// Return the table of property values for the given property name. /// /// If the property values data is not available, then an error is returned. fn property_values( canonical_property_name: &'static str, ) -> Result, Error> { #[cfg(not(any( feature = "unicode-age", feature = "unicode-bool", feature = "unicode-gencat", feature = "unicode-perl", feature = "unicode-script", feature = "unicode-segment", )))] fn imp(_: &'static str) -> Result, Error> { Err(Error::PropertyValueNotFound) } #[cfg(any( feature = "unicode-age", feature = "unicode-bool", feature = "unicode-gencat", feature = "unicode-perl", feature = "unicode-script", feature = "unicode-segment", ))] fn imp(name: &'static str) -> Result, Error> { use crate::unicode_tables::property_values::PROPERTY_VALUES; Ok(PROPERTY_VALUES .binary_search_by_key(&name, |&(n, _)| n) .ok() .map(|i| PROPERTY_VALUES[i].1)) } imp(canonical_property_name) } // This is only used in some cases, but small enough to just let it be dead // instead of figuring out (and maintaining) the right set of features. #[allow(dead_code)] fn property_set( name_map: &'static [(&'static str, Range)], canonical: &'static str, ) -> Option { name_map .binary_search_by_key(&canonical, |x| x.0) .ok() .map(|i| name_map[i].1) } /// Returns an iterator over Unicode Age sets. Each item corresponds to a set /// of codepoints that were added in a particular revision of Unicode. The /// iterator yields items in chronological order. /// /// If the given age value isn't valid or if the data isn't available, then an /// error is returned instead. 
fn ages(canonical_age: &str) -> Result, Error> { #[cfg(not(feature = "unicode-age"))] fn imp(_: &str) -> Result, Error> { use core::option::IntoIter; Err::, _>(Error::PropertyNotFound) } #[cfg(feature = "unicode-age")] fn imp(canonical_age: &str) -> Result, Error> { use crate::unicode_tables::age; const AGES: &[(&str, Range)] = &[ ("V1_1", age::V1_1), ("V2_0", age::V2_0), ("V2_1", age::V2_1), ("V3_0", age::V3_0), ("V3_1", age::V3_1), ("V3_2", age::V3_2), ("V4_0", age::V4_0), ("V4_1", age::V4_1), ("V5_0", age::V5_0), ("V5_1", age::V5_1), ("V5_2", age::V5_2), ("V6_0", age::V6_0), ("V6_1", age::V6_1), ("V6_2", age::V6_2), ("V6_3", age::V6_3), ("V7_0", age::V7_0), ("V8_0", age::V8_0), ("V9_0", age::V9_0), ("V10_0", age::V10_0), ("V11_0", age::V11_0), ("V12_0", age::V12_0), ("V12_1", age::V12_1), ("V13_0", age::V13_0), ("V14_0", age::V14_0), ("V15_0", age::V15_0), ]; assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync"); let pos = AGES.iter().position(|&(age, _)| canonical_age == age); match pos { None => Err(Error::PropertyValueNotFound), Some(i) => Ok(AGES[..=i].iter().map(|&(_, classes)| classes)), } } imp(canonical_age) } /// Returns the Unicode HIR class corresponding to the given general category. /// /// Name canonicalization is assumed to be performed by the caller. /// /// If the given general category could not be found, or if the general /// category data is not available, then an error is returned. 
fn gencat(canonical_name: &'static str) -> Result { #[cfg(not(feature = "unicode-gencat"))] fn imp(_: &'static str) -> Result { Err(Error::PropertyNotFound) } #[cfg(feature = "unicode-gencat")] fn imp(name: &'static str) -> Result { use crate::unicode_tables::general_category::BY_NAME; match name { "ASCII" => Ok(hir_class(&[('\0', '\x7F')])), "Any" => Ok(hir_class(&[('\0', '\u{10FFFF}')])), "Assigned" => { let mut cls = gencat("Unassigned")?; cls.negate(); Ok(cls) } name => property_set(BY_NAME, name) .map(hir_class) .ok_or(Error::PropertyValueNotFound), } } match canonical_name { "Decimal_Number" => perl_digit(), name => imp(name), } } /// Returns the Unicode HIR class corresponding to the given script. /// /// Name canonicalization is assumed to be performed by the caller. /// /// If the given script could not be found, or if the script data is not /// available, then an error is returned. fn script(canonical_name: &'static str) -> Result { #[cfg(not(feature = "unicode-script"))] fn imp(_: &'static str) -> Result { Err(Error::PropertyNotFound) } #[cfg(feature = "unicode-script")] fn imp(name: &'static str) -> Result { use crate::unicode_tables::script::BY_NAME; property_set(BY_NAME, name) .map(hir_class) .ok_or(Error::PropertyValueNotFound) } imp(canonical_name) } /// Returns the Unicode HIR class corresponding to the given script extension. /// /// Name canonicalization is assumed to be performed by the caller. /// /// If the given script extension could not be found, or if the script data is /// not available, then an error is returned. 
fn script_extension(
    canonical_name: &'static str,
) -> Result<hir::ClassUnicode, Error> {
    #[cfg(not(feature = "unicode-script"))]
    fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
        Err(Error::PropertyNotFound)
    }

    #[cfg(feature = "unicode-script")]
    fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
        use crate::unicode_tables::script_extension::BY_NAME;
        property_set(BY_NAME, name)
            .map(hir_class)
            .ok_or(Error::PropertyValueNotFound)
    }

    imp(canonical_name)
}

/// Returns the Unicode HIR class corresponding to the given Unicode boolean
/// property.
///
/// Name canonicalization is assumed to be performed by the caller.
///
/// If the given boolean property could not be found, or if the boolean
/// property data is not available, then an error is returned.
fn bool_property(
    canonical_name: &'static str,
) -> Result<hir::ClassUnicode, Error> {
    #[cfg(not(feature = "unicode-bool"))]
    fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
        Err(Error::PropertyNotFound)
    }

    #[cfg(feature = "unicode-bool")]
    fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
        use crate::unicode_tables::property_bool::BY_NAME;
        property_set(BY_NAME, name)
            .map(hir_class)
            .ok_or(Error::PropertyNotFound)
    }

    // These two names are delegated to the Perl class helpers rather than
    // `imp`.
    match canonical_name {
        "Decimal_Number" => perl_digit(),
        "White_Space" => perl_space(),
        name => imp(name),
    }
}

/// Returns the Unicode HIR class corresponding to the given grapheme cluster
/// break property.
///
/// Name canonicalization is assumed to be performed by the caller.
///
/// If the given property could not be found, or if the corresponding data is
/// not available, then an error is returned.
fn gcb(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> {
    #[cfg(not(feature = "unicode-segment"))]
    fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
        Err(Error::PropertyNotFound)
    }

    #[cfg(feature = "unicode-segment")]
    fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
        use crate::unicode_tables::grapheme_cluster_break::BY_NAME;
        property_set(BY_NAME, name)
            .map(hir_class)
            .ok_or(Error::PropertyValueNotFound)
    }

    imp(canonical_name)
}

/// Returns the Unicode HIR class corresponding to the given word break
/// property.
///
/// Name canonicalization is assumed to be performed by the caller.
///
/// If the given property could not be found, or if the corresponding data is
/// not available, then an error is returned.
fn wb(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> {
    #[cfg(not(feature = "unicode-segment"))]
    fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
        Err(Error::PropertyNotFound)
    }

    #[cfg(feature = "unicode-segment")]
    fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
        use crate::unicode_tables::word_break::BY_NAME;
        property_set(BY_NAME, name)
            .map(hir_class)
            .ok_or(Error::PropertyValueNotFound)
    }

    imp(canonical_name)
}

/// Returns the Unicode HIR class corresponding to the given sentence
/// break property.
///
/// Name canonicalization is assumed to be performed by the caller.
///
/// If the given property could not be found, or if the corresponding data is
/// not available, then an error is returned.
fn sb(canonical_name: &'static str) -> Result<hir::ClassUnicode, Error> {
    #[cfg(not(feature = "unicode-segment"))]
    fn imp(_: &'static str) -> Result<hir::ClassUnicode, Error> {
        Err(Error::PropertyNotFound)
    }

    #[cfg(feature = "unicode-segment")]
    fn imp(name: &'static str) -> Result<hir::ClassUnicode, Error> {
        use crate::unicode_tables::sentence_break::BY_NAME;
        property_set(BY_NAME, name)
            .map(hir_class)
            .ok_or(Error::PropertyValueNotFound)
    }

    imp(canonical_name)
}

/// Like symbolic_name_normalize_bytes, but operates on a string.
fn symbolic_name_normalize(x: &str) -> String {
    let mut tmp = x.as_bytes().to_vec();
    // Normalization happens in place and only ever shrinks the input, so the
    // result is the prefix of `tmp` with the returned length.
    let len = symbolic_name_normalize_bytes(&mut tmp).len();
    tmp.truncate(len);
    // This should always succeed because `symbolic_name_normalize_bytes`
    // guarantees that `&tmp[..len]` is always valid UTF-8.
    //
    // N.B. We could avoid the additional UTF-8 check here, but it's unlikely
    // to be worth skipping the additional safety check. A benchmark must
    // justify it first.
    String::from_utf8(tmp).unwrap()
}

/// Normalize the given symbolic name in place according to UAX44-LM3.
///
/// A "symbolic name" typically corresponds to property names and property
/// value aliases.
Note, though, that it should not be applied to property /// string values. /// /// The slice returned is guaranteed to be valid UTF-8 for all possible values /// of `slice`. /// /// See: https://unicode.org/reports/tr44/#UAX44-LM3 fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] { // I couldn't find a place in the standard that specified that property // names/aliases had a particular structure (unlike character names), but // we assume that it's ASCII only and drop anything that isn't ASCII. let mut start = 0; let mut starts_with_is = false; if slice.len() >= 2 { // Ignore any "is" prefix. starts_with_is = slice[0..2] == b"is"[..] || slice[0..2] == b"IS"[..] || slice[0..2] == b"iS"[..] || slice[0..2] == b"Is"[..]; if starts_with_is { start = 2; } } let mut next_write = 0; for i in start..slice.len() { // VALIDITY ARGUMENT: To guarantee that the resulting slice is valid // UTF-8, we ensure that the slice contains only ASCII bytes. In // particular, we drop every non-ASCII byte from the normalized string. let b = slice[i]; if b == b' ' || b == b'_' || b == b'-' { continue; } else if b'A' <= b && b <= b'Z' { slice[next_write] = b + (b'a' - b'A'); next_write += 1; } else if b <= 0x7F { slice[next_write] = b; next_write += 1; } } // Special case: ISO_Comment has a 'isc' abbreviation. Since we generally // ignore 'is' prefixes, the 'isc' abbreviation gets caught in the cross // fire and ends up creating an alias for 'c' to 'ISO_Comment', but it // is actually an alias for the 'Other' general category. 
if starts_with_is && next_write == 1 && slice[0] == b'c' { slice[0] = b'i'; slice[1] = b's'; slice[2] = b'c'; next_write = 3; } &mut slice[..next_write] } #[cfg(test)] mod tests { use super::*; #[cfg(feature = "unicode-case")] fn simple_fold_ok(c: char) -> impl Iterator { SimpleCaseFolder::new().unwrap().mapping(c).iter().copied() } #[cfg(feature = "unicode-case")] fn contains_case_map(start: char, end: char) -> bool { SimpleCaseFolder::new().unwrap().overlaps(start, end) } #[test] #[cfg(feature = "unicode-case")] fn simple_fold_k() { let xs: Vec = simple_fold_ok('k').collect(); assert_eq!(xs, alloc::vec!['K', 'K']); let xs: Vec = simple_fold_ok('K').collect(); assert_eq!(xs, alloc::vec!['k', 'K']); let xs: Vec = simple_fold_ok('K').collect(); assert_eq!(xs, alloc::vec!['K', 'k']); } #[test] #[cfg(feature = "unicode-case")] fn simple_fold_a() { let xs: Vec = simple_fold_ok('a').collect(); assert_eq!(xs, alloc::vec!['A']); let xs: Vec = simple_fold_ok('A').collect(); assert_eq!(xs, alloc::vec!['a']); } #[test] #[cfg(not(feature = "unicode-case"))] fn simple_fold_disabled() { assert!(SimpleCaseFolder::new().is_err()); } #[test] #[cfg(feature = "unicode-case")] fn range_contains() { assert!(contains_case_map('A', 'A')); assert!(contains_case_map('Z', 'Z')); assert!(contains_case_map('A', 'Z')); assert!(contains_case_map('@', 'A')); assert!(contains_case_map('Z', '[')); assert!(contains_case_map('☃', 'Ⰰ')); assert!(!contains_case_map('[', '[')); assert!(!contains_case_map('[', '`')); assert!(!contains_case_map('☃', '☃')); } #[test] #[cfg(feature = "unicode-gencat")] fn regression_466() { use super::{CanonicalClassQuery, ClassQuery}; let q = ClassQuery::OneLetter('C'); assert_eq!( q.canonicalize().unwrap(), CanonicalClassQuery::GeneralCategory("Other") ); } #[test] fn sym_normalize() { let sym_norm = symbolic_name_normalize; assert_eq!(sym_norm("Line_Break"), "linebreak"); assert_eq!(sym_norm("Line-break"), "linebreak"); assert_eq!(sym_norm("linebreak"), "linebreak"); 
assert_eq!(sym_norm("BA"), "ba"); assert_eq!(sym_norm("ba"), "ba"); assert_eq!(sym_norm("Greek"), "greek"); assert_eq!(sym_norm("isGreek"), "greek"); assert_eq!(sym_norm("IS_Greek"), "greek"); assert_eq!(sym_norm("isc"), "isc"); assert_eq!(sym_norm("is c"), "isc"); assert_eq!(sym_norm("is_c"), "isc"); } #[test] fn valid_utf8_symbolic() { let mut x = b"abc\xFFxyz".to_vec(); let y = symbolic_name_normalize_bytes(&mut x); assert_eq!(y, b"abcxyz"); } }