diff options
Diffstat (limited to 'third_party/rust/unic-langid-impl/src/lib.rs')
-rw-r--r-- | third_party/rust/unic-langid-impl/src/lib.rs | 516 |
1 files changed, 516 insertions, 0 deletions
diff --git a/third_party/rust/unic-langid-impl/src/lib.rs b/third_party/rust/unic-langid-impl/src/lib.rs new file mode 100644 index 0000000000..0a6d0b34af --- /dev/null +++ b/third_party/rust/unic-langid-impl/src/lib.rs @@ -0,0 +1,516 @@ +mod errors; +mod layout_table; +#[cfg(feature = "likelysubtags")] +pub mod likelysubtags; +#[doc(hidden)] +pub mod parser; +#[cfg(feature = "serde")] +mod serde; +pub mod subtags; + +pub use crate::errors::LanguageIdentifierError; +use std::fmt::Write; +use std::iter::Peekable; +use std::str::FromStr; + +/// Enum representing available character direction orientations. +#[derive(Debug, PartialEq)] +pub enum CharacterDirection { + /// Right To Left + /// + /// Used in languages such as Arabic, Hebrew, Fula, Kurdish etc. + RTL, + /// Left To Right + /// + /// Used in languages such as French, Spanish, English, German etc. + LTR, +} + +type PartsTuple = ( + subtags::Language, + Option<subtags::Script>, + Option<subtags::Region>, + Vec<subtags::Variant>, +); + +/// `LanguageIdentifier` is a core struct representing a Unicode Language Identifier. +/// +/// # Examples +/// +/// ``` +/// use unic_langid_impl::LanguageIdentifier; +/// +/// let li: LanguageIdentifier = "en-US".parse() +/// .expect("Failed to parse."); +/// +/// assert_eq!(li.language, "en"); +/// assert_eq!(li.script, None); +/// assert_eq!(li.region.as_ref().map(Into::into), Some("US")); +/// assert_eq!(li.variants().len(), 0); +/// ``` +/// +/// # Parsing +/// +/// Unicode recognizes three levels of standard conformance for any language identifier: +/// +/// * *well-formed* - syntactically correct +/// * *valid* - well-formed and only uses registered language subtags, extensions, keywords, types... +/// * *canonical* - valid and no deprecated codes or structure. +/// +/// At the moment parsing normalizes a well-formed language identifier converting +/// `_` separators to `-` and adjusting casing to conform to the Unicode standard. +/// +/// Any bogus subtags will cause the parsing to fail with an error. +/// No subtag validation is performed. +/// +/// # Examples: +/// +/// ``` +/// use unic_langid_impl::LanguageIdentifier; +/// +/// let li: LanguageIdentifier = "eN_latn_Us-Valencia".parse() +/// .expect("Failed to parse."); +/// +/// assert_eq!(li.language, "en"); +/// assert_eq!(li.script.as_ref().map(Into::into), Some("Latn")); +/// assert_eq!(li.region.as_ref().map(Into::into), Some("US")); +/// assert_eq!(li.variants().map(|v| v.as_str()).collect::<Vec<_>>(), &["valencia"]); +/// ``` +#[derive(Default, Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord)] +pub struct LanguageIdentifier { + pub language: subtags::Language, + pub script: Option<subtags::Script>, + pub region: Option<subtags::Region>, + variants: Option<Box<[subtags::Variant]>>, +} + +impl LanguageIdentifier { + /// A constructor which takes a utf8 slice, parses it and + /// produces a well-formed `LanguageIdentifier`. + /// + /// # Examples + /// + /// ``` + /// use unic_langid_impl::LanguageIdentifier; + /// + /// let li = LanguageIdentifier::from_bytes("en-US".as_bytes()) + /// .expect("Parsing failed."); + /// + /// assert_eq!(li.to_string(), "en-US"); + /// ``` + pub fn from_bytes(v: &[u8]) -> Result<Self, LanguageIdentifierError> { + Ok(parser::parse_language_identifier(v)?) + } + + /// A constructor which takes optional subtags as `AsRef<[u8]>`, parses them and + /// produces a well-formed `LanguageIdentifier`. + /// + /// # Examples + /// + /// ``` + /// use unic_langid_impl::LanguageIdentifier; + /// + /// let li = LanguageIdentifier::from_parts( + /// "fr".parse().expect("Parsing failed."), + /// None, + /// Some("CA".parse().expect("Parsing failed.")), + /// &[] + /// ); + /// + /// assert_eq!(li.to_string(), "fr-CA"); + /// ``` + pub fn from_parts( + language: subtags::Language, + script: Option<subtags::Script>, + region: Option<subtags::Region>, + variants: &[subtags::Variant], + ) -> Self { + let variants = if !variants.is_empty() { + let mut v = variants.to_vec(); + v.sort_unstable(); + v.dedup(); + Some(v.into_boxed_slice()) + } else { + None + }; + + Self { + language, + script, + region, + variants, + } + } + + /// # Unchecked + /// + /// This function accepts subtags expecting variants + /// to be deduplicated and ordered. + pub const fn from_raw_parts_unchecked( + language: subtags::Language, + script: Option<subtags::Script>, + region: Option<subtags::Region>, + variants: Option<Box<[subtags::Variant]>>, + ) -> Self { + Self { + language, + script, + region, + variants, + } + } + + #[doc(hidden)] + /// This method is used by `unic-locale` to handle partial + /// subtag iterator. + /// + /// Not stable. + pub fn try_from_iter<'a>( + iter: &mut Peekable<impl Iterator<Item = &'a [u8]>>, + allow_extension: bool, + ) -> Result<LanguageIdentifier, LanguageIdentifierError> { + Ok(parser::parse_language_identifier_from_iter( + iter, + allow_extension, + )?) + } + + /// Consumes `LanguageIdentifier` and produces raw internal representations + /// of all subtags in form of `u64`/`u32`. + /// + /// Primarily used for storing internal representation and restoring via + /// `from_raw_parts_unchecked`. + /// + /// # Examples + /// + /// ``` + /// use unic_langid_impl::LanguageIdentifier; + /// use tinystr::{TinyStr8, TinyStr4}; + /// + /// let li: LanguageIdentifier = "en-US".parse() + /// .expect("Parsing failed."); + /// + /// let (lang, script, region, variants) = li.into_parts(); + /// + /// // let li2 = LanguageIdentifier::from_raw_parts_unchecked( + /// // lang.map(|l| unsafe { TinyStr8::new_unchecked(l) }), + /// // script.map(|s| unsafe { TinyStr4::new_unchecked(s) }), + /// // region.map(|r| unsafe { TinyStr4::new_unchecked(r) }), + /// // variants.map(|v| v.into_iter().map(|v| unsafe { TinyStr8::new_unchecked(*v) }).collect()), + /// //); + /// + /// //assert_eq!(li2.to_string(), "en-US"); + /// ``` + pub fn into_parts(self) -> PartsTuple { + ( + self.language, + self.script, + self.region, + self.variants.map_or_else(Vec::new, |v| v.to_vec()), + ) + } + + /// Compares a `LanguageIdentifier` to another `AsRef<LanguageIdentifier` + /// allowing for either side to use the missing fields as wildcards. + /// + /// This allows for matching between `en` (treated as `en-*-*-*`) and `en-US`. + /// + /// # Examples + /// + /// ``` + /// use unic_langid_impl::LanguageIdentifier; + /// + /// let li1: LanguageIdentifier = "en".parse() + /// .expect("Parsing failed."); + /// + /// let li2: LanguageIdentifier = "en-US".parse() + /// .expect("Parsing failed."); + /// + /// assert_ne!(li1, li2); // "en" != "en-US" + /// assert_ne!(li1.to_string(), li2.to_string()); // "en" != "en-US" + /// + /// assert_eq!(li1.matches(&li2, false, false), false); // "en" != "en-US" + /// assert_eq!(li1.matches(&li2, true, false), true); // "en-*-*-*" == "en-US" + /// assert_eq!(li1.matches(&li2, false, true), false); // "en" != "en-*-US-*" + /// assert_eq!(li1.matches(&li2, true, true), true); // "en-*-*-*" == "en-*-US-*" + /// ``` + pub fn matches<O: AsRef<Self>>( + &self, + other: &O, + self_as_range: bool, + other_as_range: bool, + ) -> bool { + let other = other.as_ref(); + self.language + .matches(&other.language, self_as_range, other_as_range) + && subtag_matches(&self.script, &other.script, self_as_range, other_as_range) + && subtag_matches(&self.region, &other.region, self_as_range, other_as_range) + && subtags_match( + &self.variants, + &other.variants, + self_as_range, + other_as_range, + ) + } + + /// Returns a vector of variants subtags of the `LanguageIdentifier`. + /// + /// # Examples + /// + /// ``` + /// use unic_langid_impl::LanguageIdentifier; + /// + /// let li1: LanguageIdentifier = "ca-ES-valencia".parse() + /// .expect("Parsing failed."); + /// + /// assert_eq!(li1.variants().map(|v| v.as_str()).collect::<Vec<_>>(), &["valencia"]); + /// + /// let li2: LanguageIdentifier = "de".parse() + /// .expect("Parsing failed."); + /// + /// assert_eq!(li2.variants().len(), 0); + /// ``` + pub fn variants(&self) -> impl ExactSizeIterator<Item = &subtags::Variant> { + let variants: &[_] = match self.variants { + Some(ref v) => &**v, + None => &[], + }; + + variants.iter() + } + + /// Sets variant subtags of the `LanguageIdentifier`. + /// + /// # Examples + /// + /// ``` + /// use unic_langid_impl::LanguageIdentifier; + /// + /// let mut li: LanguageIdentifier = "ca-ES".parse() + /// .expect("Parsing failed."); + /// + /// li.set_variants(&["valencia".parse().expect("Parsing failed.")]); + /// + /// assert_eq!(li.to_string(), "ca-ES-valencia"); + /// ``` + pub fn set_variants(&mut self, variants: &[subtags::Variant]) { + let mut v = variants.to_vec(); + + if v.is_empty() { + self.variants = None; + } else { + v.sort_unstable(); + v.dedup(); + self.variants = Some(v.into_boxed_slice()); + } + } + + /// Tests if a variant subtag is present in the `LanguageIdentifier`. + /// + /// # Examples + /// + /// ``` + /// use unic_langid_impl::LanguageIdentifier; + /// + /// let mut li: LanguageIdentifier = "ca-ES-macos".parse() + /// .expect("Parsing failed."); + /// + /// assert_eq!(li.has_variant("valencia".parse().unwrap()), false); + /// assert_eq!(li.has_variant("macos".parse().unwrap()), true); + /// ``` + pub fn has_variant(&self, variant: subtags::Variant) -> bool { + if let Some(variants) = &self.variants { + variants.contains(&variant) + } else { + false + } + } + + /// Clears variant subtags of the `LanguageIdentifier`. + /// + /// # Examples + /// + /// ``` + /// use unic_langid_impl::LanguageIdentifier; + /// + /// let mut li: LanguageIdentifier = "ca-ES-valencia".parse() + /// .expect("Parsing failed."); + /// + /// li.clear_variants(); + /// + /// assert_eq!(li.to_string(), "ca-ES"); + /// ``` + pub fn clear_variants(&mut self) { + self.variants = None; + } + + /// Extends the `LanguageIdentifier` adding likely subtags based + /// on tables provided by CLDR. + /// + /// # Examples + /// + /// ``` + /// use unic_langid_impl::LanguageIdentifier; + /// + /// let mut li: LanguageIdentifier = "en-US".parse() + /// .expect("Parsing failed."); + /// + /// assert_eq!(li.maximize(), true); + /// assert_eq!(li.to_string(), "en-Latn-US"); + /// ``` + #[cfg(feature = "likelysubtags")] + pub fn maximize(&mut self) -> bool { + if let Some(new_li) = likelysubtags::maximize(self.language, self.script, self.region) { + self.language = new_li.0; + self.script = new_li.1; + self.region = new_li.2; + true + } else { + false + } + } + + /// Extends the `LanguageIdentifier` removing likely subtags based + /// on tables provided by CLDR. + /// + /// # Examples + /// + /// ``` + /// use unic_langid_impl::LanguageIdentifier; + /// + /// let mut li: LanguageIdentifier = "en-Latn-US".parse() + /// .expect("Parsing failed."); + /// + /// assert_eq!(li.minimize(), true); + /// assert_eq!(li.to_string(), "en"); + /// ``` + #[cfg(feature = "likelysubtags")] + pub fn minimize(&mut self) -> bool { + if let Some(new_li) = likelysubtags::minimize(self.language, self.script, self.region) { + self.language = new_li.0; + self.script = new_li.1; + self.region = new_li.2; + true + } else { + false + } + } + + /// Returns character direction of the `LanguageIdentifier`. + /// + /// # Examples + /// + /// ``` + /// use unic_langid_impl::{LanguageIdentifier, CharacterDirection}; + /// + /// let li1: LanguageIdentifier = "es-AR".parse() + /// .expect("Parsing failed."); + /// let li2: LanguageIdentifier = "fa".parse() + /// .expect("Parsing failed."); + /// + /// assert_eq!(li1.character_direction(), CharacterDirection::LTR); + /// assert_eq!(li2.character_direction(), CharacterDirection::RTL); + /// ``` + pub fn character_direction(&self) -> CharacterDirection { + match (self.language.into(), self.script) { + (_, Some(script)) + if layout_table::SCRIPTS_CHARACTER_DIRECTION_RTL.contains(&script.into()) => + { + CharacterDirection::RTL + } + (Some(lang), _) if layout_table::LANGS_CHARACTER_DIRECTION_RTL.contains(&lang) => { + CharacterDirection::RTL + } + _ => CharacterDirection::LTR, + } + } +} + +impl FromStr for LanguageIdentifier { + type Err = LanguageIdentifierError; + + fn from_str(source: &str) -> Result<Self, Self::Err> { + Self::from_bytes(source.as_bytes()) + } +} + +impl AsRef<LanguageIdentifier> for LanguageIdentifier { + #[inline(always)] + fn as_ref(&self) -> &LanguageIdentifier { + self + } +} + +impl std::fmt::Display for LanguageIdentifier { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + self.language.fmt(f)?; + if let Some(ref script) = self.script { + f.write_char('-')?; + script.fmt(f)?; + } + if let Some(ref region) = self.region { + f.write_char('-')?; + region.fmt(f)?; + } + if let Some(variants) = &self.variants { + for variant in variants.iter() { + f.write_char('-')?; + variant.fmt(f)?; + } + } + Ok(()) + } +} + +impl PartialEq<&str> for LanguageIdentifier { + fn eq(&self, other: &&str) -> bool { + self.to_string().as_str() == *other + } +} + +fn subtag_matches<P: PartialEq>( + subtag1: &Option<P>, + subtag2: &Option<P>, + as_range1: bool, + as_range2: bool, +) -> bool { + (as_range1 && subtag1.is_none()) || (as_range2 && subtag2.is_none()) || subtag1 == subtag2 +} + +fn is_option_empty<P: PartialEq>(subtag: &Option<Box<[P]>>) -> bool { + subtag.as_ref().map_or(true, |t| t.is_empty()) +} + +fn subtags_match<P: PartialEq>( + subtag1: &Option<Box<[P]>>, + subtag2: &Option<Box<[P]>>, + as_range1: bool, + as_range2: bool, +) -> bool { + // or is some and is empty! + (as_range1 && is_option_empty(subtag1)) + || (as_range2 && is_option_empty(subtag2)) + || subtag1 == subtag2 +} + +/// This is a best-effort operation that performs all available levels of canonicalization. +/// +/// At the moment the operation will normalize casing and the separator, but in the future +/// it may also validate and update from deprecated subtags to canonical ones. +/// +/// # Examples +/// +/// ``` +/// use unic_langid_impl::canonicalize; +/// +/// assert_eq!(canonicalize("pL_latn_pl"), Ok("pl-Latn-PL".to_string())); +/// ``` +pub fn canonicalize<S: AsRef<[u8]>>(input: S) -> Result<String, LanguageIdentifierError> { + let lang_id = LanguageIdentifier::from_bytes(input.as_ref())?; + Ok(lang_id.to_string()) +} + +#[test] +fn invalid_subtag() { + assert!(LanguageIdentifier::from_bytes("en-ÁÁÁÁ".as_bytes()).is_err()); +} |