summary | refs | log | tree | commit | diff | stats
path: root/third_party/rust/unic-langid-impl/src/lib.rs
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/rust/unic-langid-impl/src/lib.rs')
-rw-r--r-- third_party/rust/unic-langid-impl/src/lib.rs 516
1 files changed, 516 insertions, 0 deletions
diff --git a/third_party/rust/unic-langid-impl/src/lib.rs b/third_party/rust/unic-langid-impl/src/lib.rs
new file mode 100644
index 0000000000..0a6d0b34af
--- /dev/null
+++ b/third_party/rust/unic-langid-impl/src/lib.rs
@@ -0,0 +1,516 @@
+mod errors;
+mod layout_table;
+#[cfg(feature = "likelysubtags")]
+pub mod likelysubtags;
+#[doc(hidden)]
+pub mod parser;
+#[cfg(feature = "serde")]
+mod serde;
+pub mod subtags;
+
+pub use crate::errors::LanguageIdentifierError;
+use std::fmt::Write;
+use std::iter::Peekable;
+use std::str::FromStr;
+
+/// The direction in which a language's characters are laid out (see `LanguageIdentifier::character_direction`).
+#[derive(Debug, PartialEq)]
+pub enum CharacterDirection {
+ /// Right To Left
+ ///
+ /// Used in languages such as Arabic, Hebrew, Fula, Kurdish etc.
+ RTL,
+ /// Left To Right
+ ///
+ /// Used in languages such as French, Spanish, English, German etc. Also reported when no RTL match is found.
+ LTR,
+}
+/// Owned subtags returned by `LanguageIdentifier::into_parts`: language, script, region, variants.
+type PartsTuple = (
+ subtags::Language,
+ Option<subtags::Script>,
+ Option<subtags::Region>,
+ Vec<subtags::Variant>,
+);
+
+/// `LanguageIdentifier` is a core struct representing a Unicode Language Identifier.
+///
+/// # Examples
+///
+/// ```
+/// use unic_langid_impl::LanguageIdentifier;
+///
+/// let li: LanguageIdentifier = "en-US".parse()
+/// .expect("Failed to parse.");
+///
+/// assert_eq!(li.language, "en");
+/// assert_eq!(li.script, None);
+/// assert_eq!(li.region.as_ref().map(Into::into), Some("US"));
+/// assert_eq!(li.variants().len(), 0);
+/// ```
+///
+/// # Parsing
+///
+/// Unicode recognizes three levels of standard conformance for any language identifier:
+///
+/// * *well-formed* - syntactically correct
+/// * *valid* - well-formed and only uses registered language subtags, extensions, keywords, types...
+/// * *canonical* - valid and no deprecated codes or structure.
+///
+/// At the moment parsing normalizes a well-formed language identifier converting
+/// `_` separators to `-` and adjusting casing to conform to the Unicode standard.
+///
+/// Any subtag that is not well-formed will cause the parsing to fail with an error.
+/// Well-formed subtags are not validated any further (e.g. against registered subtags).
+///
+/// # Examples
+///
+/// ```
+/// use unic_langid_impl::LanguageIdentifier;
+///
+/// let li: LanguageIdentifier = "eN_latn_Us-Valencia".parse()
+/// .expect("Failed to parse.");
+///
+/// assert_eq!(li.language, "en");
+/// assert_eq!(li.script.as_ref().map(Into::into), Some("Latn"));
+/// assert_eq!(li.region.as_ref().map(Into::into), Some("US"));
+/// assert_eq!(li.variants().map(|v| v.as_str()).collect::<Vec<_>>(), &["valencia"]);
+/// ```
+#[derive(Default, Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord)]
+pub struct LanguageIdentifier {
+ pub language: subtags::Language, // language subtag
+ pub script: Option<subtags::Script>, // script subtag, if any
+ pub region: Option<subtags::Region>, // region subtag, if any
+ variants: Option<Box<[subtags::Variant]>>, // `from_parts`/`set_variants` store these sorted and deduplicated, or `None` when empty
+}
+
+impl LanguageIdentifier {
+ /// A constructor which takes a byte slice (e.g. `"en-US".as_bytes()`), parses it and produces
+ /// a well-formed `LanguageIdentifier`, or a `LanguageIdentifierError` if the parser rejects it.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use unic_langid_impl::LanguageIdentifier;
+ ///
+ /// let li = LanguageIdentifier::from_bytes("en-US".as_bytes())
+ /// .expect("Parsing failed.");
+ ///
+ /// assert_eq!(li.to_string(), "en-US");
+ /// ```
+ pub fn from_bytes(v: &[u8]) -> Result<Self, LanguageIdentifierError> {
+ Ok(parser::parse_language_identifier(v)?)
+ }
+
+ /// A constructor which takes already-parsed subtags and assembles them into a
+ /// `LanguageIdentifier`; `variants` are copied, sorted and deduplicated (an empty slice means none).
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use unic_langid_impl::LanguageIdentifier;
+ ///
+ /// let li = LanguageIdentifier::from_parts(
+ /// "fr".parse().expect("Parsing failed."),
+ /// None,
+ /// Some("CA".parse().expect("Parsing failed.")),
+ /// &[]
+ /// );
+ ///
+ /// assert_eq!(li.to_string(), "fr-CA");
+ /// ```
+ pub fn from_parts(
+ language: subtags::Language,
+ script: Option<subtags::Script>,
+ region: Option<subtags::Region>,
+ variants: &[subtags::Variant],
+ ) -> Self {
+ let variants = if !variants.is_empty() {
+ let mut v = variants.to_vec();
+ v.sort_unstable();
+ v.dedup();
+ Some(v.into_boxed_slice())
+ } else {
+ None // no variants are stored as `None`, not as an empty boxed slice
+ };
+
+ Self {
+ language,
+ script,
+ region,
+ variants,
+ }
+ }
+
+ /// # Unchecked
+ ///
+ /// This function stores its arguments as-is, expecting `variants` to already be
+ /// deduplicated and ordered (unlike `from_parts`, nothing is sorted or deduplicated here).
+ pub const fn from_raw_parts_unchecked(
+ language: subtags::Language,
+ script: Option<subtags::Script>,
+ region: Option<subtags::Region>,
+ variants: Option<Box<[subtags::Variant]>>,
+ ) -> Self {
+ Self {
+ language,
+ script,
+ region,
+ variants,
+ }
+ }
+
+ #[doc(hidden)]
+ /// This method is used by `unic-locale` to handle a partial subtag iterator: `iter` and
+ /// `allow_extension` are passed straight to `parser::parse_language_identifier_from_iter`.
+ ///
+ /// Not stable.
+ pub fn try_from_iter<'a>(
+ iter: &mut Peekable<impl Iterator<Item = &'a [u8]>>,
+ allow_extension: bool,
+ ) -> Result<LanguageIdentifier, LanguageIdentifierError> {
+ Ok(parser::parse_language_identifier_from_iter(
+ iter,
+ allow_extension,
+ )?)
+ }
+
+ /// Consumes `LanguageIdentifier` and returns its subtags as an owned
+ /// `(language, script, region, variants)` tuple.
+ ///
+ /// The variants come back as a `Vec` (empty when there are none) and are moved out of
+ /// the identifier rather than cloned. The parts can be reassembled with `from_parts`.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use unic_langid_impl::LanguageIdentifier;
+ ///
+ /// let li: LanguageIdentifier = "en-US".parse()
+ /// .expect("Parsing failed.");
+ ///
+ /// let (lang, script, region, variants) = li.into_parts();
+ ///
+ /// assert_eq!(lang, "en");
+ /// assert_eq!(script, None);
+ /// assert_eq!(region.as_ref().map(Into::into), Some("US"));
+ /// assert_eq!(variants.len(), 0);
+ ///
+ /// // The parts can be reassembled into an identifier that prints the same.
+ /// let li2 = LanguageIdentifier::from_parts(lang, script, region, &variants);
+ ///
+ /// assert_eq!(li2.to_string(), "en-US");
+ /// ```
+ pub fn into_parts(self) -> PartsTuple {
+ (
+ self.language,
+ self.script,
+ self.region,
+ self.variants.map_or_else(Vec::new, |v| v.into_vec()),
+ )
+ }
+
+ /// Compares a `LanguageIdentifier` to another `AsRef<LanguageIdentifier>`
+ /// allowing for either side to use the missing fields as wildcards (`self_as_range` / `other_as_range`).
+ ///
+ /// This allows for matching between `en` (treated as `en-*-*-*`) and `en-US`.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use unic_langid_impl::LanguageIdentifier;
+ ///
+ /// let li1: LanguageIdentifier = "en".parse()
+ /// .expect("Parsing failed.");
+ ///
+ /// let li2: LanguageIdentifier = "en-US".parse()
+ /// .expect("Parsing failed.");
+ ///
+ /// assert_ne!(li1, li2); // "en" != "en-US"
+ /// assert_ne!(li1.to_string(), li2.to_string()); // "en" != "en-US"
+ ///
+ /// assert_eq!(li1.matches(&li2, false, false), false); // "en" != "en-US"
+ /// assert_eq!(li1.matches(&li2, true, false), true); // "en-*-*-*" == "en-US"
+ /// assert_eq!(li1.matches(&li2, false, true), false); // "en" != "en-*-US-*"
+ /// assert_eq!(li1.matches(&li2, true, true), true); // "en-*-*-*" == "en-*-US-*"
+ /// ```
+ pub fn matches<O: AsRef<Self>>(
+ &self,
+ other: &O,
+ self_as_range: bool,
+ other_as_range: bool,
+ ) -> bool {
+ let other = other.as_ref();
+ self.language // all four parts (language, script, region, variants) must match
+ .matches(&other.language, self_as_range, other_as_range)
+ && subtag_matches(&self.script, &other.script, self_as_range, other_as_range)
+ && subtag_matches(&self.region, &other.region, self_as_range, other_as_range)
+ && subtags_match(
+ &self.variants,
+ &other.variants,
+ self_as_range,
+ other_as_range,
+ )
+ }
+
+ /// Returns an iterator over the variant subtags of the `LanguageIdentifier`.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use unic_langid_impl::LanguageIdentifier;
+ ///
+ /// let li1: LanguageIdentifier = "ca-ES-valencia".parse()
+ /// .expect("Parsing failed.");
+ ///
+ /// assert_eq!(li1.variants().map(|v| v.as_str()).collect::<Vec<_>>(), &["valencia"]);
+ ///
+ /// let li2: LanguageIdentifier = "de".parse()
+ /// .expect("Parsing failed.");
+ ///
+ /// assert_eq!(li2.variants().len(), 0);
+ /// ```
+ pub fn variants(&self) -> impl ExactSizeIterator<Item = &subtags::Variant> {
+ let variants: &[_] = match self.variants {
+ Some(ref v) => &**v,
+ None => &[], // nothing stored: iterate over an empty slice
+ };
+
+ variants.iter()
+ }
+
+ /// Sets variant subtags of the `LanguageIdentifier`, storing them sorted and deduplicated.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use unic_langid_impl::LanguageIdentifier;
+ ///
+ /// let mut li: LanguageIdentifier = "ca-ES".parse()
+ /// .expect("Parsing failed.");
+ ///
+ /// li.set_variants(&["valencia".parse().expect("Parsing failed.")]);
+ ///
+ /// assert_eq!(li.to_string(), "ca-ES-valencia");
+ /// ```
+ pub fn set_variants(&mut self, variants: &[subtags::Variant]) {
+ let mut v = variants.to_vec();
+ // An empty list is stored as `None`, never as an empty boxed slice.
+ if v.is_empty() {
+ self.variants = None;
+ } else {
+ v.sort_unstable();
+ v.dedup();
+ self.variants = Some(v.into_boxed_slice());
+ }
+ }
+
+ /// Tests if a variant subtag is present in the `LanguageIdentifier`.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use unic_langid_impl::LanguageIdentifier;
+ ///
+ /// let li: LanguageIdentifier = "ca-ES-macos".parse()
+ /// .expect("Parsing failed.");
+ ///
+ /// assert_eq!(li.has_variant("valencia".parse().unwrap()), false);
+ /// assert_eq!(li.has_variant("macos".parse().unwrap()), true);
+ /// ```
+ pub fn has_variant(&self, variant: subtags::Variant) -> bool {
+ if let Some(variants) = &self.variants {
+ variants.contains(&variant)
+ } else {
+ false // no variants stored at all
+ }
+ }
+
+ /// Removes all variant subtags from the `LanguageIdentifier`.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use unic_langid_impl::LanguageIdentifier;
+ ///
+ /// let mut li: LanguageIdentifier = "ca-ES-valencia".parse()
+ /// .expect("Parsing failed.");
+ ///
+ /// li.clear_variants();
+ ///
+ /// assert_eq!(li.to_string(), "ca-ES");
+ /// ```
+ pub fn clear_variants(&mut self) {
+ self.variants = None;
+ }
+
+ /// Extends the `LanguageIdentifier` adding likely subtags based on tables provided by CLDR.
+ /// Returns `true` if the language, script and region were replaced, `false` if nothing changed.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use unic_langid_impl::LanguageIdentifier;
+ ///
+ /// let mut li: LanguageIdentifier = "en-US".parse()
+ /// .expect("Parsing failed.");
+ ///
+ /// assert_eq!(li.maximize(), true);
+ /// assert_eq!(li.to_string(), "en-Latn-US");
+ /// ```
+ #[cfg(feature = "likelysubtags")]
+ pub fn maximize(&mut self) -> bool {
+ if let Some(new_li) = likelysubtags::maximize(self.language, self.script, self.region) {
+ self.language = new_li.0;
+ self.script = new_li.1;
+ self.region = new_li.2; // variants are left untouched
+ true
+ } else {
+ false
+ }
+ }
+
+ /// Reduces the `LanguageIdentifier` removing likely subtags based on tables provided by CLDR.
+ /// Returns `true` if the language, script and region were replaced, `false` if nothing changed.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use unic_langid_impl::LanguageIdentifier;
+ ///
+ /// let mut li: LanguageIdentifier = "en-Latn-US".parse()
+ /// .expect("Parsing failed.");
+ ///
+ /// assert_eq!(li.minimize(), true);
+ /// assert_eq!(li.to_string(), "en");
+ /// ```
+ #[cfg(feature = "likelysubtags")]
+ pub fn minimize(&mut self) -> bool {
+ if let Some(new_li) = likelysubtags::minimize(self.language, self.script, self.region) {
+ self.language = new_li.0;
+ self.script = new_li.1;
+ self.region = new_li.2; // variants are left untouched
+ true
+ } else {
+ false
+ }
+ }
+
+ /// Returns character direction of the `LanguageIdentifier`: RTL if its script, or else its language, is listed as RTL; LTR otherwise.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use unic_langid_impl::{LanguageIdentifier, CharacterDirection};
+ ///
+ /// let li1: LanguageIdentifier = "es-AR".parse()
+ /// .expect("Parsing failed.");
+ /// let li2: LanguageIdentifier = "fa".parse()
+ /// .expect("Parsing failed.");
+ ///
+ /// assert_eq!(li1.character_direction(), CharacterDirection::LTR);
+ /// assert_eq!(li2.character_direction(), CharacterDirection::RTL);
+ /// ```
+ pub fn character_direction(&self) -> CharacterDirection {
+ match (self.language.into(), self.script) {
+ (_, Some(script)) // a script found in the RTL script table decides first, whatever the language
+ if layout_table::SCRIPTS_CHARACTER_DIRECTION_RTL.contains(&script.into()) =>
+ {
+ CharacterDirection::RTL
+ }
+ (Some(lang), _) if layout_table::LANGS_CHARACTER_DIRECTION_RTL.contains(&lang) => {
+ CharacterDirection::RTL // NOTE(review): also reached when a script is present but not in the RTL script table - confirm intended
+ }
+ _ => CharacterDirection::LTR, // everything else defaults to LTR
+ }
+ }
+}
+
+impl FromStr for LanguageIdentifier {
+ type Err = LanguageIdentifierError;
+ /// Parses `source` by handing its bytes to `LanguageIdentifier::from_bytes`.
+ fn from_str(source: &str) -> Result<Self, Self::Err> {
+ Self::from_bytes(source.as_bytes())
+ }
+}
+
+impl AsRef<LanguageIdentifier> for LanguageIdentifier {
+ #[inline(always)]
+ fn as_ref(&self) -> &LanguageIdentifier {
+ self // identity conversion: lets `matches` accept a plain `&LanguageIdentifier`
+ }
+}
+
+impl std::fmt::Display for LanguageIdentifier {
+ fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { // renders `language[-script][-region][-variant...]`
+ self.language.fmt(f)?;
+ if let Some(ref script) = self.script {
+ f.write_char('-')?;
+ script.fmt(f)?;
+ }
+ if let Some(ref region) = self.region {
+ f.write_char('-')?;
+ region.fmt(f)?;
+ }
+ if let Some(variants) = &self.variants {
+ for variant in variants.iter() { // written in stored order, each preceded by `-`
+ f.write_char('-')?;
+ variant.fmt(f)?;
+ }
+ }
+ Ok(())
+ }
+}
+
+impl PartialEq<&str> for LanguageIdentifier {
+ fn eq(&self, other: &&str) -> bool {
+ self.to_string().as_str() == *other // exact comparison with the `Display` output; `other` is not parsed or normalized
+ }
+}
+/// Returns `true` if the subtags are equal, or if a side flagged as a range has no subtag (wildcard).
+fn subtag_matches<P: PartialEq>(
+ subtag1: &Option<P>,
+ subtag2: &Option<P>,
+ as_range1: bool,
+ as_range2: bool,
+) -> bool {
+ (as_range1 && subtag1.is_none()) || (as_range2 && subtag2.is_none()) || subtag1 == subtag2
+}
+/// Returns `true` when the list is `None` or holds no elements.
+fn is_option_empty<P: PartialEq>(subtag: &Option<Box<[P]>>) -> bool {
+ subtag.as_ref().map_or(true, |t| t.is_empty())
+}
+/// List counterpart of `subtag_matches`: equal lists match, and so does an empty list on a side flagged as a range.
+fn subtags_match<P: PartialEq>(
+ subtag1: &Option<Box<[P]>>,
+ subtag2: &Option<Box<[P]>>,
+ as_range1: bool,
+ as_range2: bool,
+) -> bool {
+ // On a range side, a missing list and a present-but-empty list both act as a wildcard.
+ (as_range1 && is_option_empty(subtag1))
+ || (as_range2 && is_option_empty(subtag2))
+ || subtag1 == subtag2
+}
+
+/// Parses `input` and returns its best-effort canonical string form, or an error if parsing fails.
+///
+/// At the moment the operation will normalize casing and the separator, but in the future
+/// it may also validate and update from deprecated subtags to canonical ones.
+///
+/// # Examples
+///
+/// ```
+/// use unic_langid_impl::canonicalize;
+///
+/// assert_eq!(canonicalize("pL_latn_pl"), Ok("pl-Latn-PL".to_string()));
+/// ```
+pub fn canonicalize<S: AsRef<[u8]>>(input: S) -> Result<String, LanguageIdentifierError> {
+ let lang_id = LanguageIdentifier::from_bytes(input.as_ref())?;
+ Ok(lang_id.to_string())
+}
+// A subtag made of non-ASCII characters must be rejected with an error.
+#[test]
+fn invalid_subtag() {
+ assert!(LanguageIdentifier::from_bytes("en-ÁÁÁÁ".as_bytes()).is_err());
+}