diff options
Diffstat (limited to 'vendor/icu_locid/src/locale.rs')
-rw-r--r-- | vendor/icu_locid/src/locale.rs | 528 |
1 files changed, 528 insertions, 0 deletions
diff --git a/vendor/icu_locid/src/locale.rs b/vendor/icu_locid/src/locale.rs new file mode 100644 index 000000000..d7040d31a --- /dev/null +++ b/vendor/icu_locid/src/locale.rs @@ -0,0 +1,528 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use crate::ordering::SubtagOrderingResult; +use crate::parser::{ + get_subtag_iterator, parse_locale, + parse_locale_with_single_variant_single_keyword_unicode_keyword_extension, ParserError, + ParserMode, +}; +use crate::{extensions, subtags, LanguageIdentifier}; +use alloc::string::String; +use alloc::string::ToString; +use core::cmp::Ordering; +use core::str::FromStr; +use tinystr::TinyAsciiStr; + +/// A core struct representing a [`Unicode Locale Identifier`]. +/// +/// A locale is made of two parts: +/// * Unicode Language Identifier +/// * A set of Unicode Extensions +/// +/// [`Locale`] exposes all of the same fields and methods as [`LanguageIdentifier`], and +/// on top of that is able to parse, manipulate and serialize unicode extension fields. +/// +/// +/// # Examples +/// +/// ``` +/// use icu::locid::extensions::unicode::{Key, Value}; +/// use icu::locid::{subtags::*, Locale}; +/// +/// let loc: Locale = "en-US-u-ca-buddhist".parse().expect("Failed to parse."); +/// +/// assert_eq!(loc.id.language, "en".parse::<Language>().unwrap()); +/// assert_eq!(loc.id.script, None); +/// assert_eq!(loc.id.region, "US".parse::<Region>().ok()); +/// assert_eq!(loc.id.variants.len(), 0); +/// assert_eq!(loc.to_string(), "en-US-u-ca-buddhist"); +/// +/// let key: Key = "ca".parse().expect("Parsing key failed."); +/// let value: Value = "buddhist".parse().expect("Parsing value failed."); +/// assert_eq!(loc.extensions.unicode.keywords.get(&key), Some(&value)); +/// ``` +/// +/// # Parsing +/// +/// Unicode recognizes three levels of standard conformance for a locale: +/// +/// * *well-formed* - syntactically correct +/// * *valid* - well-formed and only uses registered language subtags, extensions, keywords, types... +/// * *canonical* - valid and no deprecated codes or structure. +/// +/// At the moment parsing normalizes a well-formed locale identifier converting +/// `_` separators to `-` and adjusting casing to conform to the Unicode standard. +/// +/// Any bogus subtags will cause the parsing to fail with an error. +/// No subtag validation or canonicalization is performed. +/// +/// # Examples +/// +/// ``` +/// use icu::locid::{subtags::*, Locale}; +/// +/// let loc: Locale = "eN_latn_Us-Valencia_u-hC-H12" +/// .parse() +/// .expect("Failed to parse."); +/// +/// assert_eq!(loc.id.language, "en".parse::<Language>().unwrap()); +/// assert_eq!(loc.id.script, "Latn".parse::<Script>().ok()); +/// assert_eq!(loc.id.region, "US".parse::<Region>().ok()); +/// assert_eq!( +/// loc.id.variants.get(0), +/// "valencia".parse::<Variant>().ok().as_ref() +/// ); +/// ``` +/// [`Unicode Locale Identifier`]: https://unicode.org/reports/tr35/tr35.html#Unicode_locale_identifier +#[derive(Default, PartialEq, Eq, Clone, Hash)] +#[allow(clippy::exhaustive_structs)] // This struct is stable (and invoked by a macro) +pub struct Locale { + /// The basic language/script/region components in the locale identifier along with any variants. + pub id: LanguageIdentifier, + /// Any extensions present in the locale identifier. + pub extensions: extensions::Extensions, +} + +#[test] +fn test_sizes() { + assert_eq!(core::mem::size_of::<subtags::Language>(), 3); + assert_eq!(core::mem::size_of::<subtags::Script>(), 4); + assert_eq!(core::mem::size_of::<subtags::Region>(), 3); + assert_eq!(core::mem::size_of::<subtags::Variant>(), 8); + assert_eq!(core::mem::size_of::<subtags::Variants>(), 32); + assert_eq!(core::mem::size_of::<LanguageIdentifier>(), 48); + + assert_eq!(core::mem::size_of::<extensions::transform::Transform>(), 72); + assert_eq!(core::mem::size_of::<Option<LanguageIdentifier>>(), 48); + assert_eq!(core::mem::size_of::<extensions::transform::Fields>(), 24); + + assert_eq!(core::mem::size_of::<extensions::unicode::Attributes>(), 24); + assert_eq!(core::mem::size_of::<extensions::unicode::Keywords>(), 48); + assert_eq!(core::mem::size_of::<Vec<extensions::other::Other>>(), 24); + assert_eq!(core::mem::size_of::<extensions::private::Private>(), 24); + assert_eq!(core::mem::size_of::<extensions::Extensions>(), 192); + + assert_eq!(core::mem::size_of::<Locale>(), 240); +} + +impl Locale { + /// A constructor which takes a utf8 slice, parses it and + /// produces a well-formed [`Locale`]. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::Locale; + /// + /// let loc = Locale::try_from_bytes("en-US-u-hc-h12".as_bytes()) + /// .expect("Parsing failed."); + /// + /// assert_eq!(loc.to_string(), "en-US-u-hc-h12"); + /// ``` + pub fn try_from_bytes(v: &[u8]) -> Result<Self, ParserError> { + parse_locale(v) + } + + /// The default undefined locale "und". Same as [`default()`](Default::default()). + /// + /// # Examples + /// + /// ``` + /// use icu::locid::Locale; + /// + /// assert_eq!(Locale::default(), Locale::UND); + /// assert_eq!("und", Locale::UND.to_string()); + /// ``` + pub const UND: Self = Self { + id: LanguageIdentifier::UND, + extensions: extensions::Extensions::new(), + }; + + /// This is a best-effort operation that performs all available levels of canonicalization. + /// + /// At the moment the operation will normalize casing and the separator, but in the future + /// it may also validate and update from deprecated subtags to canonical ones. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::Locale; + /// + /// assert_eq!( + /// Locale::canonicalize("pL_latn_pl-U-HC-H12"), + /// Ok("pl-Latn-PL-u-hc-h12".to_string()) + /// ); + /// ``` + pub fn canonicalize<S: AsRef<[u8]>>(input: S) -> Result<String, ParserError> { + let locale = Self::try_from_bytes(input.as_ref())?; + Ok(locale.to_string()) + } + + /// Compare this [`Locale`] with BCP-47 bytes. + /// + /// The return value is equivalent to what would happen if you first converted this + /// [`Locale`] to a BCP-47 string and then performed a byte comparison. + /// + /// This function is case-sensitive and results in a *total order*, so it is appropriate for + /// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::Locale; + /// use std::cmp::Ordering; + /// + /// let bcp47_strings: &[&str] = &[ + /// "pl-Latn-PL", + /// "und", + /// "und-fonipa", + /// "und-t-m0-true", + /// "und-u-ca-hebrew", + /// "und-u-ca-japanese", + /// "zh", + /// ]; + /// + /// for ab in bcp47_strings.windows(2) { + /// let a = ab[0]; + /// let b = ab[1]; + /// assert!(a.cmp(b) == Ordering::Less); + /// let a_loc = a.parse::<Locale>().unwrap(); + /// assert_eq!(a, a_loc.to_string()); + /// assert!(a_loc.strict_cmp(a.as_bytes()) == Ordering::Equal); + /// assert!(a_loc.strict_cmp(b.as_bytes()) == Ordering::Less); + /// } + /// ``` + pub fn strict_cmp(&self, other: &[u8]) -> Ordering { + self.strict_cmp_iter(other.split(|b| *b == b'-')).end() + } + + /// Compare this [`Locale`] with an iterator of BCP-47 subtags. + /// + /// This function has the same equality semantics as [`Locale::strict_cmp`]. It is intended as + /// a more modular version that allows multiple subtag iterators to be chained together. + /// + /// For an additional example, see [`SubtagOrderingResult`]. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::locale; + /// use std::cmp::Ordering; + /// + /// let subtags: &[&[u8]] = + /// &[b"ca", b"ES", b"valencia", b"u", b"ca", b"hebrew"]; + /// + /// let loc = locale!("ca-ES-valencia-u-ca-hebrew"); + /// assert_eq!( + /// Ordering::Equal, + /// loc.strict_cmp_iter(subtags.iter().copied()).end() + /// ); + /// + /// let loc = locale!("ca-ES-valencia"); + /// assert_eq!( + /// Ordering::Less, + /// loc.strict_cmp_iter(subtags.iter().copied()).end() + /// ); + /// + /// let loc = locale!("ca-ES-valencia-u-nu-arab"); + /// assert_eq!( + /// Ordering::Greater, + /// loc.strict_cmp_iter(subtags.iter().copied()).end() + /// ); + /// ``` + pub fn strict_cmp_iter<'l, I>(&self, mut subtags: I) -> SubtagOrderingResult<I> + where + I: Iterator<Item = &'l [u8]>, + { + let r = self.for_each_subtag_str(&mut |subtag| { + if let Some(other) = subtags.next() { + match subtag.as_bytes().cmp(other) { + Ordering::Equal => Ok(()), + not_equal => Err(not_equal), + } + } else { + Err(Ordering::Greater) + } + }); + match r { + Ok(_) => SubtagOrderingResult::Subtags(subtags), + Err(o) => SubtagOrderingResult::Ordering(o), + } + } + + /// Compare this `Locale` with a potentially unnormalized BCP-47 string. + /// + /// The return value is equivalent to what would happen if you first parsed the + /// BCP-47 string to a `Locale` and then performed a structucal comparison. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::Locale; + /// use std::cmp::Ordering; + /// + /// let bcp47_strings: &[&str] = &[ + /// "pl-LaTn-pL", + /// "uNd", + /// "UND-FONIPA", + /// "UnD-t-m0-TrUe", + /// "uNd-u-CA-Japanese", + /// "ZH", + /// ]; + /// + /// for a in bcp47_strings { + /// assert!(a.parse::<Locale>().unwrap().normalizing_eq(a)); + /// } + /// ``` + pub fn normalizing_eq(&self, other: &str) -> bool { + macro_rules! subtag_matches { + ($T:ty, $iter:ident, $expected:expr) => { + $iter + .next() + .map(|b| <$T>::try_from_bytes(b) == Ok($expected)) + .unwrap_or(false) + }; + } + + let mut iter = get_subtag_iterator(other.as_bytes()); + if !subtag_matches!(subtags::Language, iter, self.id.language) { + return false; + } + if let Some(ref script) = self.id.script { + if !subtag_matches!(subtags::Script, iter, *script) { + return false; + } + } + if let Some(ref region) = self.id.region { + if !subtag_matches!(subtags::Region, iter, *region) { + return false; + } + } + for variant in self.id.variants.iter() { + if !subtag_matches!(subtags::Variant, iter, *variant) { + return false; + } + } + if !self.extensions.is_empty() { + match extensions::Extensions::try_from_iter(&mut iter) { + Ok(exts) => { + if self.extensions != exts { + return false; + } + } + Err(_) => { + return false; + } + } + } + iter.next() == None + } + + #[doc(hidden)] + #[allow(clippy::type_complexity)] + pub const fn try_from_bytes_with_single_variant_single_keyword_unicode_extension( + v: &[u8], + ) -> Result< + ( + subtags::Language, + Option<subtags::Script>, + Option<subtags::Region>, + Option<subtags::Variant>, + Option<(extensions::unicode::Key, Option<TinyAsciiStr<8>>)>, + ), + ParserError, + > { + parse_locale_with_single_variant_single_keyword_unicode_keyword_extension( + v, + ParserMode::Locale, + ) + } + + pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E> + where + F: FnMut(&str) -> Result<(), E>, + { + self.id.for_each_subtag_str(f)?; + self.extensions.for_each_subtag_str(f)?; + Ok(()) + } +} + +impl FromStr for Locale { + type Err = ParserError; + + fn from_str(source: &str) -> Result<Self, Self::Err> { + Self::try_from_bytes(source.as_bytes()) + } +} + +impl From<LanguageIdentifier> for Locale { + fn from(id: LanguageIdentifier) -> Self { + Self { + id, + extensions: extensions::Extensions::default(), + } + } +} + +impl From<Locale> for LanguageIdentifier { + fn from(loc: Locale) -> Self { + loc.id + } +} + +impl AsRef<LanguageIdentifier> for Locale { + fn as_ref(&self) -> &LanguageIdentifier { + &self.id + } +} + +impl AsMut<LanguageIdentifier> for Locale { + fn as_mut(&mut self) -> &mut LanguageIdentifier { + &mut self.id + } +} + +impl core::fmt::Debug for Locale { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + writeable::Writeable::write_to(self, f) + } +} + +impl_writeable_for_each_subtag_str_no_test!(Locale); + +#[test] +fn test_writeable() { + use writeable::assert_writeable_eq; + assert_writeable_eq!(Locale::UND, "und"); + assert_writeable_eq!("und-001".parse::<Locale>().unwrap(), "und-001"); + assert_writeable_eq!("und-Mymr".parse::<Locale>().unwrap(), "und-Mymr"); + assert_writeable_eq!("my-Mymr-MM".parse::<Locale>().unwrap(), "my-Mymr-MM"); + assert_writeable_eq!( + "my-Mymr-MM-posix".parse::<Locale>().unwrap(), + "my-Mymr-MM-posix", + ); + assert_writeable_eq!( + "zh-macos-posix".parse::<Locale>().unwrap(), + "zh-macos-posix", + ); + assert_writeable_eq!( + "my-t-my-d0-zawgyi".parse::<Locale>().unwrap(), + "my-t-my-d0-zawgyi", + ); + assert_writeable_eq!( + "ar-SA-u-ca-islamic-civil".parse::<Locale>().unwrap(), + "ar-SA-u-ca-islamic-civil", + ); + assert_writeable_eq!( + "en-001-x-foo-bar".parse::<Locale>().unwrap(), + "en-001-x-foo-bar", + ); + assert_writeable_eq!("und-t-m0-true".parse::<Locale>().unwrap(), "und-t-m0-true",); +} + +/// # Examples +/// +/// ``` +/// use icu::locid::subtags_language as language; +/// use icu::locid::Locale; +/// +/// let language = language!("en"); +/// let loc = Locale::from(language); +/// +/// assert_eq!(loc.id.language, language); +/// assert_eq!(loc.to_string(), "en"); +/// ``` +impl From<subtags::Language> for Locale { + fn from(language: subtags::Language) -> Self { + Self { + id: language.into(), + ..Default::default() + } + } +} + +/// # Examples +/// +/// ``` +/// use icu::locid::subtags_script as script; +/// use icu::locid::Locale; +/// +/// let script = script!("latn"); +/// let loc = Locale::from(Some(script)); +/// +/// assert_eq!(loc.id.script.unwrap(), script); +/// assert_eq!(loc.to_string(), "und-Latn"); +/// ``` +impl From<Option<subtags::Script>> for Locale { + fn from(script: Option<subtags::Script>) -> Self { + Self { + id: script.into(), + ..Default::default() + } + } +} + +/// # Examples +/// +/// ``` +/// use icu::locid::subtags_region as region; +/// use icu::locid::Locale; +/// +/// let region = region!("US"); +/// let loc = Locale::from(Some(region)); +/// +/// assert_eq!(loc.id.region.unwrap(), region); +/// assert_eq!(loc.to_string(), "und-US"); +/// ``` +impl From<Option<subtags::Region>> for Locale { + fn from(region: Option<subtags::Region>) -> Self { + Self { + id: region.into(), + ..Default::default() + } + } +} + +/// # Examples +/// +/// ``` +/// use icu::locid::Locale; +/// use icu::locid::{ +/// subtags_language as language, subtags_region as region, +/// subtags_script as script, +/// }; +/// +/// let lang = language!("en"); +/// let script = script!("Latn"); +/// let region = region!("US"); +/// let loc = Locale::from((lang, Some(script), Some(region))); +/// +/// assert_eq!(loc.id.language, lang); +/// assert_eq!(loc.id.script.unwrap(), script); +/// assert_eq!(loc.id.region.unwrap(), region); +/// assert_eq!(loc.id.variants.len(), 0); +/// assert_eq!(loc.to_string(), "en-Latn-US"); +/// ``` +impl + From<( + subtags::Language, + Option<subtags::Script>, + Option<subtags::Region>, + )> for Locale +{ + fn from( + lsr: ( + subtags::Language, + Option<subtags::Script>, + Option<subtags::Region>, + ), + ) -> Self { + Self { + id: lsr.into(), + ..Default::default() + } + } +} |