mod errors; mod layout_table; #[cfg(feature = "likelysubtags")] pub mod likelysubtags; #[doc(hidden)] pub mod parser; #[cfg(feature = "serde")] mod serde; pub mod subtags; pub use crate::errors::LanguageIdentifierError; use std::fmt::Write; use std::iter::Peekable; use std::str::FromStr; /// Enum representing available character direction orientations. #[derive(Debug, PartialEq)] pub enum CharacterDirection { /// Right To Left /// /// Used in languages such as Arabic, Hebrew, Fula, Kurdish etc. RTL, /// Left To Right /// /// Used in languages such as French, Spanish, English, German etc. LTR, } type PartsTuple = ( subtags::Language, Option, Option, Vec, ); /// `LanguageIdentifier` is a core struct representing a Unicode Language Identifier. /// /// # Examples /// /// ``` /// use unic_langid_impl::LanguageIdentifier; /// /// let li: LanguageIdentifier = "en-US".parse() /// .expect("Failed to parse."); /// /// assert_eq!(li.language, "en"); /// assert_eq!(li.script, None); /// assert_eq!(li.region.as_ref().map(Into::into), Some("US")); /// assert_eq!(li.variants().len(), 0); /// ``` /// /// # Parsing /// /// Unicode recognizes three levels of standard conformance for any language identifier: /// /// * *well-formed* - syntactically correct /// * *valid* - well-formed and only uses registered language subtags, extensions, keywords, types... /// * *canonical* - valid and no deprecated codes or structure. /// /// At the moment parsing normalizes a well-formed language identifier converting /// `_` separators to `-` and adjusting casing to conform to the Unicode standard. /// /// Any bogus subtags will cause the parsing to fail with an error. /// No subtag validation is performed. /// /// # Examples: /// /// ``` /// use unic_langid_impl::LanguageIdentifier; /// /// let li: LanguageIdentifier = "eN_latn_Us-Valencia".parse() /// .expect("Failed to parse."); /// /// assert_eq!(li.language, "en"); /// assert_eq!(li.script.as_ref().map(Into::into), Some("Latn")); /// assert_eq!(li.region.as_ref().map(Into::into), Some("US")); /// assert_eq!(li.variants().map(|v| v.as_str()).collect::>(), &["valencia"]); /// ``` #[derive(Default, Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord)] pub struct LanguageIdentifier { pub language: subtags::Language, pub script: Option, pub region: Option, variants: Option>, } impl LanguageIdentifier { /// A constructor which takes a utf8 slice, parses it and /// produces a well-formed `LanguageIdentifier`. /// /// # Examples /// /// ``` /// use unic_langid_impl::LanguageIdentifier; /// /// let li = LanguageIdentifier::from_bytes("en-US".as_bytes()) /// .expect("Parsing failed."); /// /// assert_eq!(li.to_string(), "en-US"); /// ``` pub fn from_bytes(v: &[u8]) -> Result { Ok(parser::parse_language_identifier(v)?) } /// A constructor which takes optional subtags as `AsRef<[u8]>`, parses them and /// produces a well-formed `LanguageIdentifier`. /// /// # Examples /// /// ``` /// use unic_langid_impl::LanguageIdentifier; /// /// let li = LanguageIdentifier::from_parts( /// "fr".parse().expect("Parsing failed."), /// None, /// Some("CA".parse().expect("Parsing failed.")), /// &[] /// ); /// /// assert_eq!(li.to_string(), "fr-CA"); /// ``` pub fn from_parts( language: subtags::Language, script: Option, region: Option, variants: &[subtags::Variant], ) -> Self { let variants = if !variants.is_empty() { let mut v = variants.to_vec(); v.sort_unstable(); v.dedup(); Some(v.into_boxed_slice()) } else { None }; Self { language, script, region, variants, } } /// # Unchecked /// /// This function accepts subtags expecting variants /// to be deduplicated and ordered. pub const fn from_raw_parts_unchecked( language: subtags::Language, script: Option, region: Option, variants: Option>, ) -> Self { Self { language, script, region, variants, } } #[doc(hidden)] /// This method is used by `unic-locale` to handle partial /// subtag iterator. /// /// Not stable. pub fn try_from_iter<'a>( iter: &mut Peekable>, allow_extension: bool, ) -> Result { Ok(parser::parse_language_identifier_from_iter( iter, allow_extension, )?) } /// Consumes `LanguageIdentifier` and produces raw internal representations /// of all subtags in form of `u64`/`u32`. /// /// Primarily used for storing internal representation and restoring via /// `from_raw_parts_unchecked`. /// /// # Examples /// /// ``` /// use unic_langid_impl::LanguageIdentifier; /// use tinystr::{TinyStr8, TinyStr4}; /// /// let li: LanguageIdentifier = "en-US".parse() /// .expect("Parsing failed."); /// /// let (lang, script, region, variants) = li.into_parts(); /// /// // let li2 = LanguageIdentifier::from_raw_parts_unchecked( /// // lang.map(|l| unsafe { TinyStr8::new_unchecked(l) }), /// // script.map(|s| unsafe { TinyStr4::new_unchecked(s) }), /// // region.map(|r| unsafe { TinyStr4::new_unchecked(r) }), /// // variants.map(|v| v.into_iter().map(|v| unsafe { TinyStr8::new_unchecked(*v) }).collect()), /// //); /// /// //assert_eq!(li2.to_string(), "en-US"); /// ``` pub fn into_parts(self) -> PartsTuple { ( self.language, self.script, self.region, self.variants.map_or_else(Vec::new, |v| v.to_vec()), ) } /// Compares a `LanguageIdentifier` to another `AsRef>( &self, other: &O, self_as_range: bool, other_as_range: bool, ) -> bool { let other = other.as_ref(); self.language .matches(&other.language, self_as_range, other_as_range) && subtag_matches(&self.script, &other.script, self_as_range, other_as_range) && subtag_matches(&self.region, &other.region, self_as_range, other_as_range) && subtags_match( &self.variants, &other.variants, self_as_range, other_as_range, ) } /// Returns a vector of variants subtags of the `LanguageIdentifier`. /// /// # Examples /// /// ``` /// use unic_langid_impl::LanguageIdentifier; /// /// let li1: LanguageIdentifier = "ca-ES-valencia".parse() /// .expect("Parsing failed."); /// /// assert_eq!(li1.variants().map(|v| v.as_str()).collect::>(), &["valencia"]); /// /// let li2: LanguageIdentifier = "de".parse() /// .expect("Parsing failed."); /// /// assert_eq!(li2.variants().len(), 0); /// ``` pub fn variants(&self) -> impl ExactSizeIterator { let variants: &[_] = match self.variants { Some(ref v) => &**v, None => &[], }; variants.iter() } /// Sets variant subtags of the `LanguageIdentifier`. /// /// # Examples /// /// ``` /// use unic_langid_impl::LanguageIdentifier; /// /// let mut li: LanguageIdentifier = "ca-ES".parse() /// .expect("Parsing failed."); /// /// li.set_variants(&["valencia".parse().expect("Parsing failed.")]); /// /// assert_eq!(li.to_string(), "ca-ES-valencia"); /// ``` pub fn set_variants(&mut self, variants: &[subtags::Variant]) { let mut v = variants.to_vec(); if v.is_empty() { self.variants = None; } else { v.sort_unstable(); v.dedup(); self.variants = Some(v.into_boxed_slice()); } } /// Tests if a variant subtag is present in the `LanguageIdentifier`. /// /// # Examples /// /// ``` /// use unic_langid_impl::LanguageIdentifier; /// /// let mut li: LanguageIdentifier = "ca-ES-macos".parse() /// .expect("Parsing failed."); /// /// assert_eq!(li.has_variant("valencia".parse().unwrap()), false); /// assert_eq!(li.has_variant("macos".parse().unwrap()), true); /// ``` pub fn has_variant(&self, variant: subtags::Variant) -> bool { if let Some(variants) = &self.variants { variants.contains(&variant) } else { false } } /// Clears variant subtags of the `LanguageIdentifier`. /// /// # Examples /// /// ``` /// use unic_langid_impl::LanguageIdentifier; /// /// let mut li: LanguageIdentifier = "ca-ES-valencia".parse() /// .expect("Parsing failed."); /// /// li.clear_variants(); /// /// assert_eq!(li.to_string(), "ca-ES"); /// ``` pub fn clear_variants(&mut self) { self.variants = None; } /// Extends the `LanguageIdentifier` adding likely subtags based /// on tables provided by CLDR. /// /// # Examples /// /// ``` /// use unic_langid_impl::LanguageIdentifier; /// /// let mut li: LanguageIdentifier = "en-US".parse() /// .expect("Parsing failed."); /// /// assert_eq!(li.maximize(), true); /// assert_eq!(li.to_string(), "en-Latn-US"); /// ``` #[cfg(feature = "likelysubtags")] pub fn maximize(&mut self) -> bool { if let Some(new_li) = likelysubtags::maximize(self.language, self.script, self.region) { self.language = new_li.0; self.script = new_li.1; self.region = new_li.2; true } else { false } } /// Extends the `LanguageIdentifier` removing likely subtags based /// on tables provided by CLDR. /// /// # Examples /// /// ``` /// use unic_langid_impl::LanguageIdentifier; /// /// let mut li: LanguageIdentifier = "en-Latn-US".parse() /// .expect("Parsing failed."); /// /// assert_eq!(li.minimize(), true); /// assert_eq!(li.to_string(), "en"); /// ``` #[cfg(feature = "likelysubtags")] pub fn minimize(&mut self) -> bool { if let Some(new_li) = likelysubtags::minimize(self.language, self.script, self.region) { self.language = new_li.0; self.script = new_li.1; self.region = new_li.2; true } else { false } } /// Returns character direction of the `LanguageIdentifier`. /// /// # Examples /// /// ``` /// use unic_langid_impl::{LanguageIdentifier, CharacterDirection}; /// /// let li1: LanguageIdentifier = "es-AR".parse() /// .expect("Parsing failed."); /// let li2: LanguageIdentifier = "fa".parse() /// .expect("Parsing failed."); /// /// assert_eq!(li1.character_direction(), CharacterDirection::LTR); /// assert_eq!(li2.character_direction(), CharacterDirection::RTL); /// ``` pub fn character_direction(&self) -> CharacterDirection { match (self.language.into(), self.script) { (_, Some(script)) if layout_table::SCRIPTS_CHARACTER_DIRECTION_RTL.contains(&script.into()) => { CharacterDirection::RTL } (Some(lang), _) if layout_table::LANGS_CHARACTER_DIRECTION_RTL.contains(&lang) => { CharacterDirection::RTL } _ => CharacterDirection::LTR, } } } impl FromStr for LanguageIdentifier { type Err = LanguageIdentifierError; fn from_str(source: &str) -> Result { Self::from_bytes(source.as_bytes()) } } impl AsRef for LanguageIdentifier { #[inline(always)] fn as_ref(&self) -> &LanguageIdentifier { self } } impl std::fmt::Display for LanguageIdentifier { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { self.language.fmt(f)?; if let Some(ref script) = self.script { f.write_char('-')?; script.fmt(f)?; } if let Some(ref region) = self.region { f.write_char('-')?; region.fmt(f)?; } if let Some(variants) = &self.variants { for variant in variants.iter() { f.write_char('-')?; variant.fmt(f)?; } } Ok(()) } } impl PartialEq<&str> for LanguageIdentifier { fn eq(&self, other: &&str) -> bool { self.to_string().as_str() == *other } } fn subtag_matches( subtag1: &Option

, subtag2: &Option

, as_range1: bool, as_range2: bool, ) -> bool { (as_range1 && subtag1.is_none()) || (as_range2 && subtag2.is_none()) || subtag1 == subtag2 } fn is_option_empty(subtag: &Option>) -> bool { subtag.as_ref().map_or(true, |t| t.is_empty()) } fn subtags_match( subtag1: &Option>, subtag2: &Option>, as_range1: bool, as_range2: bool, ) -> bool { // or is some and is empty! (as_range1 && is_option_empty(subtag1)) || (as_range2 && is_option_empty(subtag2)) || subtag1 == subtag2 } /// This is a best-effort operation that performs all available levels of canonicalization. /// /// At the moment the operation will normalize casing and the separator, but in the future /// it may also validate and update from deprecated subtags to canonical ones. /// /// # Examples /// /// ``` /// use unic_langid_impl::canonicalize; /// /// assert_eq!(canonicalize("pL_latn_pl"), Ok("pl-Latn-PL".to_string())); /// ``` pub fn canonicalize>(input: S) -> Result { let lang_id = LanguageIdentifier::from_bytes(input.as_ref())?; Ok(lang_id.to_string()) } #[test] fn invalid_subtag() { assert!(LanguageIdentifier::from_bytes("en-ÁÁÁÁ".as_bytes()).is_err()); }