1 files changed, 516 insertions, 0 deletions
diff --git a/third_party/rust/unic-langid-impl/src/lib.rs b/third_party/rust/unic-langid-impl/src/lib.rs
new file mode 100644
index 0000000000..0a6d0b34af
--- /dev/null
+++ b/third_party/rust/unic-langid-impl/src/lib.rs
@@ -0,0 +1,516 @@
+mod errors;
+mod layout_table;
+#[cfg(feature = "likelysubtags")]
+pub mod likelysubtags;
+#[doc(hidden)]
+pub mod parser;
+#[cfg(feature = "serde")]
+mod serde;
+pub mod subtags;
+
+pub use crate::errors::LanguageIdentifierError;
+use std::fmt::Write;
+use std::iter::Peekable;
+use std::str::FromStr;
+
+/// Character direction of a locale, as returned by `LanguageIdentifier::character_direction`.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+pub enum CharacterDirection {
+    /// Right To Left
+    ///
+    /// Used in languages such as Arabic, Hebrew, Fula, Kurdish etc.
+    RTL,
+    /// Left To Right
+    ///
+    /// Used in languages such as French, Spanish, English, German etc.
+    LTR,
+}
+
+type PartsTuple = ( // Owned subtags, in the order produced by `LanguageIdentifier::into_parts`.
+    subtags::Language, // language subtag
+    Option<subtags::Script>, // script subtag, if present
+    Option<subtags::Region>, // region subtag, if present
+    Vec<subtags::Variant>, // variant subtags; empty when the identifier has none
+);
+
+/// `LanguageIdentifier` is a core struct representing a Unicode Language Identifier.
+///
+/// # Examples
+///
+/// ```
+/// use unic_langid_impl::LanguageIdentifier;
+///
+/// let li: LanguageIdentifier = "en-US".parse().expect("Failed to parse.");
+///
+/// assert_eq!(li.language, "en");
+/// assert_eq!(li.script, None);
+/// assert_eq!(li.region.as_ref().map(Into::into), Some("US"));
+/// assert_eq!(li.variants().len(), 0);
+/// ```
+///
+/// # Parsing
+///
+/// Unicode recognizes three levels of standard conformance for any language identifier:
+///
+///  * *well-formed* - syntactically correct
+///  * *valid* - well-formed and only uses registered language subtags, extensions, keywords, types...
+///  * *canonical* - valid and no deprecated codes or structure.
+///
+/// At the moment parsing only normalizes a well-formed language identifier: `_` separators
+/// become `-` and casing is adjusted to conform to the Unicode standard. Any bogus subtag
+/// makes parsing fail with an error, but no further subtag validation is performed.
+///
+/// A mixed-case identifier with mixed separators is normalized while parsing:
+///
+/// ```
+/// use unic_langid_impl::LanguageIdentifier;
+///
+/// let li: LanguageIdentifier = "eN_latn_Us-Valencia".parse().expect("Failed to parse.");
+///
+/// assert_eq!(li.language, "en");
+/// assert_eq!(li.script.as_ref().map(Into::into), Some("Latn"));
+/// assert_eq!(li.region.as_ref().map(Into::into), Some("US"));
+/// assert_eq!(li.variants().map(|v| v.as_str()).collect::<Vec<_>>(), &["valencia"]);
+/// ```
+#[derive(Default, Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord)]
+pub struct LanguageIdentifier {
+    /// Language subtag.
+    pub language: subtags::Language,
+    /// Script subtag, if any.
+    pub script: Option<subtags::Script>,
+    /// Region subtag, if any.
+    pub region: Option<subtags::Region>,
+    /// Variant subtags; `from_parts`/`set_variants` keep them sorted and deduplicated, `None` if empty.
+    variants: Option<Box<[subtags::Variant]>>,
+}
+
+impl LanguageIdentifier {
+    /// Parses a UTF-8 byte slice into a well-formed `LanguageIdentifier`, returning the
+    /// parser's error (converted into `LanguageIdentifierError`) when parsing fails.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use unic_langid_impl::LanguageIdentifier;
+    ///
+    /// let li = LanguageIdentifier::from_bytes("en-US".as_bytes())
+    ///     .expect("Parsing failed.");
+    ///
+    /// assert_eq!(li.to_string(), "en-US");
+    /// ```
+    pub fn from_bytes(v: &[u8]) -> Result<Self, LanguageIdentifierError> {
+        parser::parse_language_identifier(v).map_err(Into::into)
+    }
+
+    /// Builds a `LanguageIdentifier` from already-parsed subtags; nothing is parsed here.
+    /// The `variants` are copied, sorted and deduplicated (an empty slice stores none).
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use unic_langid_impl::LanguageIdentifier;
+    ///
+    /// let li = LanguageIdentifier::from_parts(
+    ///     "fr".parse().expect("Parsing failed."),
+    ///     None,
+    ///     Some("CA".parse().expect("Parsing failed.")),
+    ///     &[]
+    /// );
+    ///
+    /// assert_eq!(li.to_string(), "fr-CA");
+    /// ```
+    pub fn from_parts(
+        language: subtags::Language,
+        script: Option<subtags::Script>,
+        region: Option<subtags::Region>,
+        variants: &[subtags::Variant],
+    ) -> Self {
+        let variants = match variants {
+            [] => None,
+            given => {
+                let mut sorted = given.to_vec();
+                sorted.sort_unstable();
+                sorted.dedup();
+                Some(sorted.into_boxed_slice())
+            }
+        };
+        Self {
+            language,
+            script,
+            region,
+            variants,
+        }
+    }
+
+    /// Assembles a `LanguageIdentifier` from its parts without any checks.
+    ///
+    /// The subtags are stored verbatim: `variants` is neither sorted nor deduplicated
+    /// here, so the caller is expected to pass it already deduplicated and ordered.
+    pub const fn from_raw_parts_unchecked(
+        language: subtags::Language,
+        script: Option<subtags::Script>,
+        region: Option<subtags::Region>,
+        variants: Option<Box<[subtags::Variant]>>,
+    ) -> Self {
+        Self {
+            language,
+            script,
+            region,
+            variants,
+        }
+    }
+
+    #[doc(hidden)]
+    /// Parses a `LanguageIdentifier` from a peekable iterator of byte-slice subtags.
+    ///
+    /// This method is used by `unic-locale` to handle a partially consumed subtag
+    /// iterator; `allow_extension` is forwarded unchanged to the parser.
+    ///
+    /// Not stable.
+    pub fn try_from_iter<'a>(
+        iter: &mut Peekable<impl Iterator<Item = &'a [u8]>>,
+        allow_extension: bool,
+    ) -> Result<LanguageIdentifier, LanguageIdentifierError> {
+        parser::parse_language_identifier_from_iter(iter, allow_extension)
+            .map_err(Into::into)
+    }
+
+    /// Consumes the `LanguageIdentifier` and returns its subtags as a tuple of
+    /// language, optional script, optional region and a `Vec` of variants.
+    ///
+    /// The vector is empty when the identifier has no variants; otherwise the stored
+    /// boxed slice is turned into the vector directly, without copying the subtags.
+    ///
+    /// The parts can be passed to `from_parts` to build an identifier again.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use unic_langid_impl::LanguageIdentifier;
+    ///
+    /// let li: LanguageIdentifier = "en-US".parse()
+    ///     .expect("Parsing failed.");
+    ///
+    /// let (lang, script, region, variants) = li.into_parts();
+    ///
+    /// assert_eq!(lang, "en");
+    /// assert_eq!(script, None);
+    /// assert!(variants.is_empty());
+    ///
+    /// let li2 = LanguageIdentifier::from_parts(lang, script, region, &variants);
+    ///
+    /// assert_eq!(li2.to_string(), "en-US");
+    /// ```
+    pub fn into_parts(self) -> PartsTuple {
+        (
+            self.language,
+            self.script,
+            self.region,
+            self.variants.map_or_else(Vec::new, |boxed| boxed.into_vec()),
+        )
+    }
+
+    /// Compares a `LanguageIdentifier` to another `AsRef<LanguageIdentifier>`,
+    /// optionally letting either side use its missing fields as wildcards.
+    ///
+    /// With `self_as_range` set, a script, region or variant list missing from `self`
+    /// matches anything; `other_as_range` does the same for `other`. The language
+    /// subtags are compared by `Language::matches`, which receives the same flags.
+    ///
+    /// This allows for matching between `en` (treated as `en-*-*-*`) and `en-US`.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use unic_langid_impl::LanguageIdentifier;
+    ///
+    /// let li1: LanguageIdentifier = "en".parse()
+    ///     .expect("Parsing failed.");
+    ///
+    /// let li2: LanguageIdentifier = "en-US".parse()
+    ///     .expect("Parsing failed.");
+    ///
+    /// assert_ne!(li1, li2); // "en" != "en-US"
+    /// assert_ne!(li1.to_string(), li2.to_string()); // "en" != "en-US"
+    ///
+    /// assert_eq!(li1.matches(&li2, false, false), false); // "en" != "en-US"
+    /// assert_eq!(li1.matches(&li2, true, false), true); // "en-*-*-*" == "en-US"
+    /// assert_eq!(li1.matches(&li2, false, true), false); // "en" != "en-*-US-*"
+    /// assert_eq!(li1.matches(&li2, true, true), true); // "en-*-*-*" == "en-*-US-*"
+    /// ```
+    pub fn matches<O: AsRef<Self>>(
+        &self,
+        other: &O,
+        self_as_range: bool,
+        other_as_range: bool,
+    ) -> bool {
+        let rhs = other.as_ref();
+        let (lhs_range, rhs_range) = (self_as_range, other_as_range);
+
+        self.language.matches(&rhs.language, lhs_range, rhs_range)
+            && subtag_matches(&self.script, &rhs.script, lhs_range, rhs_range)
+            && subtag_matches(&self.region, &rhs.region, lhs_range, rhs_range)
+            && subtags_match(&self.variants, &rhs.variants, lhs_range, rhs_range)
+    }
+
+    /// Returns an iterator over the variant subtags (empty if the identifier has none).
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use unic_langid_impl::LanguageIdentifier;
+    ///
+    /// let li1: LanguageIdentifier = "ca-ES-valencia".parse()
+    ///     .expect("Parsing failed.");
+    ///
+    /// assert_eq!(li1.variants().map(|v| v.as_str()).collect::<Vec<_>>(), &["valencia"]);
+    ///
+    /// let li2: LanguageIdentifier = "de".parse()
+    ///     .expect("Parsing failed.");
+    ///
+    /// assert_eq!(li2.variants().len(), 0);
+    /// ```
+    pub fn variants(&self) -> impl ExactSizeIterator<Item = &subtags::Variant> {
+        let stored: &[subtags::Variant] = if let Some(boxed) = &self.variants {
+            &boxed[..]
+        } else {
+            &[]
+        };
+        stored.iter()
+    }
+
+    /// Replaces the variant subtags with a sorted, deduplicated copy of `variants`;
+    /// an empty slice removes all variants.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use unic_langid_impl::LanguageIdentifier;
+    ///
+    /// let mut li: LanguageIdentifier = "ca-ES".parse().expect("Parsing failed.");
+    ///
+    /// li.set_variants(&["valencia".parse().expect("Parsing failed.")]);
+    ///
+    /// assert_eq!(li.to_string(), "ca-ES-valencia");
+    /// ```
+    pub fn set_variants(&mut self, variants: &[subtags::Variant]) {
+        let mut sorted = variants.to_vec();
+        sorted.sort_unstable();
+        sorted.dedup();
+
+        self.variants = if sorted.is_empty() {
+            None
+        } else {
+            Some(sorted.into_boxed_slice())
+        };
+    }
+
+    /// Tests if a variant subtag is present in the `LanguageIdentifier`.
+    ///
+    /// Always returns `false` for an identifier without variants.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use unic_langid_impl::LanguageIdentifier;
+    ///
+    /// let li: LanguageIdentifier = "ca-ES-macos".parse()
+    ///     .expect("Parsing failed.");
+    ///
+    /// assert_eq!(li.has_variant("valencia".parse().unwrap()), false);
+    /// assert_eq!(li.has_variant("macos".parse().unwrap()), true);
+    /// ```
+    pub fn has_variant(&self, variant: subtags::Variant) -> bool {
+        self.variants
+            .as_ref()
+            .map_or(false, |present| present.contains(&variant))
+    }
+
+    /// Removes all variant subtags from the `LanguageIdentifier`.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use unic_langid_impl::LanguageIdentifier;
+    ///
+    /// let mut li: LanguageIdentifier = "ca-ES-valencia".parse()
+    ///     .expect("Parsing failed.");
+    ///
+    /// li.clear_variants();
+    ///
+    /// assert_eq!(li.to_string(), "ca-ES");
+    /// ```
+    pub fn clear_variants(&mut self) {
+        self.variants = None; // same representation `set_variants(&[])` produces
+    }
+
+    /// Extends the `LanguageIdentifier` in place with likely subtags from the CLDR-based
+    /// tables; returns `false`, changing nothing, when the lookup yields no result.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use unic_langid_impl::LanguageIdentifier;
+    ///
+    /// let mut li: LanguageIdentifier = "en-US".parse().expect("Parsing failed.");
+    ///
+    /// assert_eq!(li.maximize(), true);
+    /// assert_eq!(li.to_string(), "en-Latn-US");
+    /// ```
+    #[cfg(feature = "likelysubtags")]
+    pub fn maximize(&mut self) -> bool {
+        match likelysubtags::maximize(self.language, self.script, self.region) {
+            Some(maximized) => {
+                self.language = maximized.0;
+                self.script = maximized.1;
+                self.region = maximized.2;
+                true
+            }
+            None => false,
+        }
+    }
+
+    /// Shrinks the `LanguageIdentifier` in place by removing likely subtags per the CLDR-based
+    /// tables; returns `false`, changing nothing, when the lookup yields no result.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use unic_langid_impl::LanguageIdentifier;
+    ///
+    /// let mut li: LanguageIdentifier = "en-Latn-US".parse().expect("Parsing failed.");
+    ///
+    /// assert_eq!(li.minimize(), true);
+    /// assert_eq!(li.to_string(), "en");
+    /// ```
+    #[cfg(feature = "likelysubtags")]
+    pub fn minimize(&mut self) -> bool {
+        match likelysubtags::minimize(self.language, self.script, self.region) {
+            Some(minimized) => {
+                self.language = minimized.0;
+                self.script = minimized.1;
+                self.region = minimized.2;
+                true
+            }
+            None => false,
+        }
+    }
+
+    /// Returns the character direction of the `LanguageIdentifier`.
+    ///
+    /// RTL if the script, or else the language, is in the RTL layout tables; LTR otherwise.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use unic_langid_impl::{LanguageIdentifier, CharacterDirection};
+    ///
+    /// let li1: LanguageIdentifier = "es-AR".parse().expect("Parsing failed.");
+    /// let li2: LanguageIdentifier = "fa".parse().expect("Parsing failed.");
+    ///
+    /// assert_eq!(li1.character_direction(), CharacterDirection::LTR);
+    /// assert_eq!(li2.character_direction(), CharacterDirection::RTL);
+    /// ```
+    pub fn character_direction(&self) -> CharacterDirection {
+        match (self.language.into(), self.script) {
+            (_, Some(script)) // an explicit script is checked first
+                if layout_table::SCRIPTS_CHARACTER_DIRECTION_RTL.contains(&script.into()) =>
+            {
+                CharacterDirection::RTL
+            }
+            (Some(lang), _) if layout_table::LANGS_CHARACTER_DIRECTION_RTL.contains(&lang) => {
+                CharacterDirection::RTL
+            }
+            _ => CharacterDirection::LTR, // neither table lists this identifier
+        }
+    }
+}
+
+impl FromStr for LanguageIdentifier {
+    type Err = LanguageIdentifierError;
+    /// Parses `source` by handing its UTF-8 bytes to `LanguageIdentifier::from_bytes`.
+    fn from_str(source: &str) -> Result<Self, Self::Err> {
+        Self::from_bytes(source.as_bytes())
+    }
+}
+
+impl AsRef<LanguageIdentifier> for LanguageIdentifier { // lets `matches` take a plain identifier
+    #[inline(always)]
+    fn as_ref(&self) -> &LanguageIdentifier {
+        self // identity conversion
+    }
+}
+
+/// Writes the subtags as `language[-script][-region][-variant]*`, joined by `-`.
+impl std::fmt::Display for LanguageIdentifier {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        // Each subtag formats itself; this impl only decides order and separators.
+        self.language.fmt(f)?;
+        if let Some(script) = &self.script {
+            f.write_char('-')?;
+            script.fmt(f)?;
+        }
+        if let Some(region) = &self.region {
+            f.write_char('-')?;
+            region.fmt(f)?;
+        }
+        for variant in self.variants() {
+            f.write_char('-')?;
+            variant.fmt(f)?;
+        }
+        Ok(())
+    }
+}
+
+impl PartialEq<&str> for LanguageIdentifier {
+    fn eq(&self, other: &&str) -> bool {
+        self.to_string() == *other // compares against the `Display` form, e.g. "en-US"
+    }
+}
+
+fn subtag_matches<P: PartialEq>(
+    subtag1: &Option<P>,
+    subtag2: &Option<P>,
+    as_range1: bool, // a missing `subtag1` acts as a wildcard
+    as_range2: bool, // a missing `subtag2` acts as a wildcard
+) -> bool {
+    subtag1 == subtag2 || (as_range1 && subtag1.is_none()) || (as_range2 && subtag2.is_none())
+}
+
+fn is_option_empty<P: PartialEq>(subtag: &Option<Box<[P]>>) -> bool {
+    subtag.iter().all(|items| items.is_empty()) // `None` counts as empty
+}
+
+fn subtags_match<P: PartialEq>(
+    subtag1: &Option<Box<[P]>>,
+    subtag2: &Option<Box<[P]>>,
+    as_range1: bool,
+    as_range2: bool,
+) -> bool {
+    // A side used as a range matches anything when its list is `None` or empty.
+    subtag1 == subtag2
+        || (as_range1 && is_option_empty(subtag1))
+        || (as_range2 && is_option_empty(subtag2))
+}
+
+/// This is a best-effort operation that performs all available levels of canonicalization.
+///
+/// At the moment the operation will normalize casing and the separator, but in the future
+/// it may also validate and update from deprecated subtags to canonical ones.
+///
+/// # Examples
+///
+/// ```
+/// use unic_langid_impl::canonicalize;
+///
+/// assert_eq!(canonicalize("pL_latn_pl"), Ok("pl-Latn-PL".to_string()));
+/// ```
+pub fn canonicalize<S: AsRef<[u8]>>(input: S) -> Result<String, LanguageIdentifierError> {
+    let lang_id = LanguageIdentifier::from_bytes(input.as_ref())?;
+    Ok(lang_id.to_string())
+}
+
+#[test]
+fn invalid_subtag() { // a subtag made of non-ASCII letters must fail to parse
+    assert!(LanguageIdentifier::from_bytes("en-ÁÁÁÁ".as_bytes()).is_err());
+}