// This file is part of ICU4X. For terms of use, please see the file // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). use core::cmp::Ordering; use core::str::FromStr; use crate::ordering::SubtagOrderingResult; use crate::parser::{ parse_language_identifier, parse_language_identifier_with_single_variant, ParserError, ParserMode, SubtagIterator, }; use crate::subtags; use alloc::string::String; use writeable::Writeable; /// A core struct representing a [`Unicode BCP47 Language Identifier`]. /// /// # Examples /// /// ``` /// use icu::locid::{ /// langid, subtags_language as language, subtags_region as region, /// }; /// /// let li = langid!("en-US"); /// /// assert_eq!(li.language, language!("en")); /// assert_eq!(li.script, None); /// assert_eq!(li.region, Some(region!("US"))); /// assert_eq!(li.variants.len(), 0); /// ``` /// /// # Parsing /// /// Unicode recognizes three levels of standard conformance for any language identifier: /// /// * *well-formed* - syntactically correct /// * *valid* - well-formed and only uses registered language, region, script and variant subtags... /// * *canonical* - valid and no deprecated codes or structure. /// /// At the moment parsing normalizes a well-formed language identifier converting /// `_` separators to `-` and adjusting casing to conform to the Unicode standard. /// /// Any bogus subtags will cause the parsing to fail with an error. /// No subtag validation is performed. /// /// # Examples /// /// ``` /// use icu::locid::{ /// langid, subtags_language as language, subtags_region as region, /// subtags_script as script, subtags_variant as variant, /// }; /// /// let li = langid!("eN_latn_Us-Valencia"); /// /// assert_eq!(li.language, language!("en")); /// assert_eq!(li.script, Some(script!("Latn"))); /// assert_eq!(li.region, Some(region!("US"))); /// assert_eq!(li.variants.get(0), Some(&variant!("valencia"))); /// ``` /// /// [`Unicode BCP47 Language Identifier`]: https://unicode.org/reports/tr35/tr35.html#Unicode_language_identifier #[derive(Default, PartialEq, Eq, Clone, Hash)] #[allow(clippy::exhaustive_structs)] // This struct is stable (and invoked by a macro) pub struct LanguageIdentifier { /// Language subtag of the language identifier. pub language: subtags::Language, /// Script subtag of the language identifier. pub script: Option, /// Region subtag of the language identifier. pub region: Option, /// Variant subtags of the language identifier. pub variants: subtags::Variants, } impl LanguageIdentifier { /// A constructor which takes a utf8 slice, parses it and /// produces a well-formed [`LanguageIdentifier`]. /// /// # Examples /// /// ``` /// use icu::locid::LanguageIdentifier; /// /// LanguageIdentifier::try_from_bytes(b"en-US").expect("Parsing failed"); /// ``` pub fn try_from_bytes(v: &[u8]) -> Result { parse_language_identifier(v, ParserMode::LanguageIdentifier) } #[doc(hidden)] #[allow(clippy::type_complexity)] // The return type should be `Result` once the `const_precise_live_drops` // is stabilized ([rust-lang#73255](https://github.com/rust-lang/rust/issues/73255)). pub const fn try_from_bytes_with_single_variant( v: &[u8], ) -> Result< ( subtags::Language, Option, Option, Option, ), ParserError, > { parse_language_identifier_with_single_variant(v, ParserMode::LanguageIdentifier) } /// A constructor which takes a utf8 slice which may contain extension keys, /// parses it and produces a well-formed [`LanguageIdentifier`]. /// /// # Examples /// /// ``` /// use icu::locid::{langid, LanguageIdentifier}; /// /// let li = LanguageIdentifier::try_from_locale_bytes(b"en-US-x-posix") /// .expect("Parsing failed."); /// /// assert_eq!(li, langid!("en-US")); /// ``` /// /// This method should be used for input that may be a locale identifier. /// All extensions will be lost. pub fn try_from_locale_bytes(v: &[u8]) -> Result { parse_language_identifier(v, ParserMode::Locale) } /// The default undefined language "und". Same as [`default()`](Default::default()). /// /// # Examples /// /// ``` /// use icu::locid::LanguageIdentifier; /// /// assert_eq!(LanguageIdentifier::default(), LanguageIdentifier::UND); /// ``` pub const UND: Self = Self { language: subtags::Language::UND, script: None, region: None, variants: subtags::Variants::new(), }; /// This is a best-effort operation that performs all available levels of canonicalization. /// /// At the moment the operation will normalize casing and the separator, but in the future /// it may also validate and update from deprecated subtags to canonical ones. /// /// # Examples /// /// ``` /// use icu::locid::LanguageIdentifier; /// /// assert_eq!( /// LanguageIdentifier::canonicalize("pL_latn_pl").as_deref(), /// Ok("pl-Latn-PL") /// ); /// ``` pub fn canonicalize>(input: S) -> Result { let lang_id = Self::try_from_bytes(input.as_ref())?; Ok(lang_id.write_to_string().into_owned()) } /// Compare this [`LanguageIdentifier`] with BCP-47 bytes. /// /// The return value is equivalent to what would happen if you first converted this /// [`LanguageIdentifier`] to a BCP-47 string and then performed a byte comparison. /// /// This function is case-sensitive and results in a *total order*, so it is appropriate for /// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`. /// /// # Examples /// /// ``` /// use icu::locid::LanguageIdentifier; /// use std::cmp::Ordering; /// /// let bcp47_strings: &[&str] = &[ /// "pl-Latn-PL", /// "und", /// "und-Adlm", /// "und-GB", /// "und-ZA", /// "und-fonipa", /// "zh", /// ]; /// /// for ab in bcp47_strings.windows(2) { /// let a = ab[0]; /// let b = ab[1]; /// assert!(a.cmp(b) == Ordering::Less); /// let a_langid = a.parse::().unwrap(); /// assert!(a_langid.strict_cmp(a.as_bytes()) == Ordering::Equal); /// assert!(a_langid.strict_cmp(b.as_bytes()) == Ordering::Less); /// } /// ``` pub fn strict_cmp(&self, other: &[u8]) -> Ordering { self.strict_cmp_iter(other.split(|b| *b == b'-')).end() } /// Compare this [`LanguageIdentifier`] with an iterator of BCP-47 subtags. /// /// This function has the same equality semantics as [`LanguageIdentifier::strict_cmp`]. It is intended as /// a more modular version that allows multiple subtag iterators to be chained together. /// /// For an additional example, see [`SubtagOrderingResult`]. /// /// # Examples /// /// ``` /// use icu::locid::LanguageIdentifier; /// use std::cmp::Ordering; /// /// let subtags: &[&[u8]] = &[b"ca", b"ES", b"valencia"]; /// /// let loc = "ca-ES-valencia".parse::().unwrap(); /// assert_eq!( /// Ordering::Equal, /// loc.strict_cmp_iter(subtags.iter().copied()).end() /// ); /// /// let loc = "ca-ES".parse::().unwrap(); /// assert_eq!( /// Ordering::Less, /// loc.strict_cmp_iter(subtags.iter().copied()).end() /// ); /// /// let loc = "ca-ZA".parse::().unwrap(); /// assert_eq!( /// Ordering::Greater, /// loc.strict_cmp_iter(subtags.iter().copied()).end() /// ); /// ``` pub fn strict_cmp_iter<'l, I>(&self, mut subtags: I) -> SubtagOrderingResult where I: Iterator, { let r = self.for_each_subtag_str(&mut |subtag| { if let Some(other) = subtags.next() { match subtag.as_bytes().cmp(other) { Ordering::Equal => Ok(()), not_equal => Err(not_equal), } } else { Err(Ordering::Greater) } }); match r { Ok(_) => SubtagOrderingResult::Subtags(subtags), Err(o) => SubtagOrderingResult::Ordering(o), } } /// Compare this `LanguageIdentifier` with a potentially unnormalized BCP-47 string. /// /// The return value is equivalent to what would happen if you first parsed the /// BCP-47 string to a `LanguageIdentifier` and then performed a structucal comparison. /// /// # Examples /// /// ``` /// use icu::locid::LanguageIdentifier; /// use std::cmp::Ordering; /// /// let bcp47_strings: &[&str] = &[ /// "pl-LaTn-pL", /// "uNd", /// "UnD-adlm", /// "uNd-GB", /// "UND-FONIPA", /// "ZH", /// ]; /// /// for a in bcp47_strings { /// assert!(a.parse::().unwrap().normalizing_eq(a)); /// } /// ``` pub fn normalizing_eq(&self, other: &str) -> bool { macro_rules! subtag_matches { ($T:ty, $iter:ident, $expected:expr) => { $iter .next() .map(|b| <$T>::try_from_bytes(b) == Ok($expected)) .unwrap_or(false) }; } let mut iter = SubtagIterator::new(other.as_bytes()); if !subtag_matches!(subtags::Language, iter, self.language) { return false; } if let Some(ref script) = self.script { if !subtag_matches!(subtags::Script, iter, *script) { return false; } } if let Some(ref region) = self.region { if !subtag_matches!(subtags::Region, iter, *region) { return false; } } for variant in self.variants.iter() { if !subtag_matches!(subtags::Variant, iter, *variant) { return false; } } iter.next() == None } pub(crate) fn for_each_subtag_str(&self, f: &mut F) -> Result<(), E> where F: FnMut(&str) -> Result<(), E>, { f(self.language.as_str())?; if let Some(ref script) = self.script { f(script.as_str())?; } if let Some(ref region) = self.region { f(region.as_str())?; } for variant in self.variants.iter() { f(variant.as_str())?; } Ok(()) } } impl AsRef for LanguageIdentifier { fn as_ref(&self) -> &Self { self } } impl AsMut for LanguageIdentifier { fn as_mut(&mut self) -> &mut Self { self } } impl core::fmt::Debug for LanguageIdentifier { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { core::fmt::Display::fmt(&self, f) } } impl FromStr for LanguageIdentifier { type Err = ParserError; fn from_str(source: &str) -> Result { Self::try_from_bytes(source.as_bytes()) } } impl_writeable_for_each_subtag_str_no_test!(LanguageIdentifier, selff, selff.script.is_none() && selff.region.is_none() && selff.variants.is_empty() => selff.language.write_to_string()); #[test] fn test_writeable() { use writeable::assert_writeable_eq; assert_writeable_eq!(LanguageIdentifier::UND, "und"); assert_writeable_eq!("und-001".parse::().unwrap(), "und-001"); assert_writeable_eq!( "und-Mymr".parse::().unwrap(), "und-Mymr", ); assert_writeable_eq!( "my-Mymr-MM".parse::().unwrap(), "my-Mymr-MM", ); assert_writeable_eq!( "my-Mymr-MM-posix".parse::().unwrap(), "my-Mymr-MM-posix", ); assert_writeable_eq!( "zh-macos-posix".parse::().unwrap(), "zh-macos-posix", ); } /// # Examples /// /// ``` /// use icu::locid::{ /// langid, subtags_language as language, LanguageIdentifier, /// }; /// /// assert_eq!(LanguageIdentifier::from(language!("en")), langid!("en")); /// ``` impl From for LanguageIdentifier { fn from(language: subtags::Language) -> Self { Self { language, ..Default::default() } } } /// # Examples /// /// ``` /// use icu::locid::{langid, subtags_script as script, LanguageIdentifier}; /// /// assert_eq!( /// LanguageIdentifier::from(Some(script!("latn"))), /// langid!("und-Latn") /// ); /// ``` impl From> for LanguageIdentifier { fn from(script: Option) -> Self { Self { script, ..Default::default() } } } /// # Examples /// /// ``` /// use icu::locid::{langid, subtags_region as region, LanguageIdentifier}; /// /// assert_eq!( /// LanguageIdentifier::from(Some(region!("US"))), /// langid!("und-US") /// ); /// ``` impl From> for LanguageIdentifier { fn from(region: Option) -> Self { Self { region, ..Default::default() } } } /// Convert from an LSR tuple to a [`LanguageIdentifier`]. /// /// # Examples /// /// ``` /// use icu::locid::{ /// langid, subtags_language as language, subtags_region as region, /// subtags_script as script, LanguageIdentifier, /// }; /// /// let lang = language!("en"); /// let script = script!("Latn"); /// let region = region!("US"); /// assert_eq!( /// LanguageIdentifier::from((lang, Some(script), Some(region))), /// langid!("en-Latn-US") /// ); /// ``` impl From<( subtags::Language, Option, Option, )> for LanguageIdentifier { fn from( lsr: ( subtags::Language, Option, Option, ), ) -> Self { Self { language: lsr.0, script: lsr.1, region: lsr.2, ..Default::default() } } } /// Convert from a [`LanguageIdentifier`] to an LSR tuple. /// /// # Examples /// /// ``` /// use icu::locid::{ /// langid, subtags_language as language, subtags_region as region, /// subtags_script as script, /// }; /// /// let lid = langid!("en-Latn-US"); /// let (lang, script, region) = (&lid).into(); /// /// assert_eq!(lang, language!("en")); /// assert_eq!(script, Some(script!("Latn"))); /// assert_eq!(region, Some(region!("US"))); /// ``` impl From<&LanguageIdentifier> for ( subtags::Language, Option, Option, ) { fn from(langid: &LanguageIdentifier) -> Self { (langid.language, langid.script, langid.region) } }