#![doc = include_str!("../README.md")] #![deny(unsafe_code)] #[cfg(feature = "serde")] use serde::{Deserialize, Deserializer, Serialize, Serializer}; use std::borrow::{Borrow, Cow}; use std::cmp::Ordering; use std::error::Error; use std::fmt; use std::hash::{Hash, Hasher}; use std::iter::once; use std::ops::Deref; use std::str::{FromStr, Split}; /// A [RFC 5646](https://tools.ietf.org/html/rfc5646) language tag. /// /// ``` /// use oxilangtag::LanguageTag; /// /// let language_tag = LanguageTag::parse("en-us").unwrap(); /// assert_eq!(language_tag.into_inner(), "en-us") /// ``` #[derive(Copy, Clone)] pub struct LanguageTag { tag: T, positions: TagElementsPositions, } impl> LanguageTag { /// Parses a language tag acccording to [RFC 5646](https://tools.ietf.org/html/rfc5646). /// and checks if the tag is ["well-formed"](https://tools.ietf.org/html/rfc5646#section-2.2.9). /// /// This operation keeps internally the `tag` parameter and does not allocate on the heap. /// /// ``` /// use oxilangtag::LanguageTag; /// /// let language_tag = LanguageTag::parse("en-us").unwrap(); /// assert_eq!(language_tag.into_inner(), "en-us") /// ``` pub fn parse(tag: T) -> Result { let positions = parse_language_tag(&tag, &mut VoidOutputBuffer::default())?; Ok(Self { tag, positions }) } /// Returns the underlying language tag representation. #[inline] pub fn as_str(&self) -> &str { &self.tag } /// Returns the underlying language tag representation. #[inline] pub fn into_inner(self) -> T { self.tag } /// Returns the [primary language subtag](https://tools.ietf.org/html/rfc5646#section-2.2.1). /// /// ``` /// use oxilangtag::LanguageTag; /// /// let language_tag = LanguageTag::parse("zh-cmn-Hans-CN").unwrap(); /// assert_eq!(language_tag.primary_language(), "zh"); /// ``` #[inline] pub fn primary_language(&self) -> &str { &self.tag[..self.positions.language_end] } /// Returns the [extended language subtags](https://tools.ietf.org/html/rfc5646#section-2.2.2). 
/// /// Valid language tags have at most one extended language. /// /// ``` /// use oxilangtag::LanguageTag; /// /// let language_tag = LanguageTag::parse("zh-cmn-Hans-CN").unwrap(); /// assert_eq!(language_tag.extended_language(), Some("cmn")); /// ``` #[inline] pub fn extended_language(&self) -> Option<&str> { if self.positions.language_end == self.positions.extlang_end { None } else { Some(&self.tag[self.positions.language_end + 1..self.positions.extlang_end]) } } /// Iterates on the [extended language subtags](https://tools.ietf.org/html/rfc5646#section-2.2.2). /// /// Valid language tags have at most one extended language. /// /// ``` /// use oxilangtag::LanguageTag; /// /// let language_tag = LanguageTag::parse("zh-cmn-Hans-CN").unwrap(); /// assert_eq!(language_tag.extended_language_subtags().collect::>(), vec!["cmn"]); /// ``` #[inline] pub fn extended_language_subtags(&self) -> impl Iterator { self.extended_language().unwrap_or("").split_terminator('-') } /// Returns the [primary language subtag](https://tools.ietf.org/html/rfc5646#section-2.2.1) /// and its [extended language subtags](https://tools.ietf.org/html/rfc5646#section-2.2.2). /// /// ``` /// use oxilangtag::LanguageTag; /// /// let language_tag = LanguageTag::parse("zh-cmn-Hans-CN").unwrap(); /// assert_eq!(language_tag.full_language(), "zh-cmn"); /// ``` #[inline] pub fn full_language(&self) -> &str { &self.tag[..self.positions.extlang_end] } /// Returns the [script subtag](https://tools.ietf.org/html/rfc5646#section-2.2.3). /// /// ``` /// use oxilangtag::LanguageTag; /// /// let language_tag = LanguageTag::parse("zh-cmn-Hans-CN").unwrap(); /// assert_eq!(language_tag.script(), Some("Hans")); /// ``` #[inline] pub fn script(&self) -> Option<&str> { if self.positions.extlang_end == self.positions.script_end { None } else { Some(&self.tag[self.positions.extlang_end + 1..self.positions.script_end]) } } /// Returns the [region subtag](https://tools.ietf.org/html/rfc5646#section-2.2.4). 
/// /// ``` /// use oxilangtag::LanguageTag; /// /// let language_tag = LanguageTag::parse("zh-cmn-Hans-CN").unwrap(); /// assert_eq!(language_tag.region(), Some("CN")); /// ``` #[inline] pub fn region(&self) -> Option<&str> { if self.positions.script_end == self.positions.region_end { None } else { Some(&self.tag[self.positions.script_end + 1..self.positions.region_end]) } } /// Returns the [variant subtags](https://tools.ietf.org/html/rfc5646#section-2.2.5). /// /// ``` /// use oxilangtag::LanguageTag; /// /// let language_tag = LanguageTag::parse("zh-Latn-TW-pinyin").unwrap(); /// assert_eq!(language_tag.variant(), Some("pinyin")); /// ``` #[inline] pub fn variant(&self) -> Option<&str> { if self.positions.region_end == self.positions.variant_end { None } else { Some(&self.tag[self.positions.region_end + 1..self.positions.variant_end]) } } /// Iterates on the [variant subtags](https://tools.ietf.org/html/rfc5646#section-2.2.5). /// /// ``` /// use oxilangtag::LanguageTag; /// /// let language_tag = LanguageTag::parse("zh-Latn-TW-pinyin").unwrap(); /// assert_eq!(language_tag.variant_subtags().collect::>(), vec!["pinyin"]); /// ``` #[inline] pub fn variant_subtags(&self) -> impl Iterator { self.variant().unwrap_or("").split_terminator('-') } /// Returns the [extension subtags](https://tools.ietf.org/html/rfc5646#section-2.2.6). /// /// ``` /// use oxilangtag::LanguageTag; /// /// let language_tag = LanguageTag::parse("de-DE-u-co-phonebk").unwrap(); /// assert_eq!(language_tag.extension(), Some("u-co-phonebk")); /// ``` #[inline] pub fn extension(&self) -> Option<&str> { if self.positions.variant_end == self.positions.extension_end { None } else { Some(&self.tag[self.positions.variant_end + 1..self.positions.extension_end]) } } /// Iterates on the [extension subtags](https://tools.ietf.org/html/rfc5646#section-2.2.6). 
/// /// ``` /// use oxilangtag::LanguageTag; /// /// let language_tag = LanguageTag::parse("de-DE-u-co-phonebk").unwrap(); /// assert_eq!(language_tag.extension_subtags().collect::>(), vec![('u', "co-phonebk")]); /// ``` #[inline] pub fn extension_subtags(&self) -> impl Iterator { match self.extension() { Some(parts) => ExtensionsIterator::new(parts), None => ExtensionsIterator::new(""), } } /// Returns the [private use subtags](https://tools.ietf.org/html/rfc5646#section-2.2.7). /// /// ``` /// use oxilangtag::LanguageTag; /// /// let language_tag = LanguageTag::parse("de-x-foo-bar").unwrap(); /// assert_eq!(language_tag.private_use(), Some("x-foo-bar")); /// ``` #[inline] pub fn private_use(&self) -> Option<&str> { if self.tag.starts_with("x-") { Some(&self.tag) } else if self.positions.extension_end == self.tag.len() { None } else { Some(&self.tag[self.positions.extension_end + 1..]) } } /// Iterates on the [private use subtags](https://tools.ietf.org/html/rfc5646#section-2.2.7). /// /// ``` /// use oxilangtag::LanguageTag; /// /// let language_tag = LanguageTag::parse("de-x-foo-bar").unwrap(); /// assert_eq!(language_tag.private_use_subtags().collect::>(), vec!["foo", "bar"]); /// ``` #[inline] pub fn private_use_subtags(&self) -> impl Iterator { self.private_use() .map(|part| &part[2..]) .unwrap_or("") .split_terminator('-') } } impl LanguageTag { /// Parses a language tag acccording to [RFC 5646](https://tools.ietf.org/html/rfc5646) /// and normalizes its case. /// /// This parser accepts the language tags that are "well-formed" according to /// [RFC 5646](https://tools.ietf.org/html/rfc5646#section-2.2.9). /// /// This operation does heap allocation. 
/// /// ``` /// use oxilangtag::LanguageTag; /// /// let language_tag = LanguageTag::parse_and_normalize("en-us").unwrap(); /// assert_eq!(language_tag.into_inner(), "en-US") /// ``` pub fn parse_and_normalize(tag: &str) -> Result { let mut output_buffer = String::with_capacity(tag.len()); let positions = parse_language_tag(tag, &mut output_buffer)?; Ok(Self { tag: output_buffer, positions, }) } } impl, Rhs> PartialEq> for LanguageTag { #[inline] fn eq(&self, other: &LanguageTag) -> bool { self.tag.eq(&other.tag) } } impl> PartialEq for LanguageTag { #[inline] fn eq(&self, other: &str) -> bool { self.tag.eq(other) } } impl<'a, T: PartialEq<&'a str>> PartialEq<&'a str> for LanguageTag { #[inline] fn eq(&self, other: &&'a str) -> bool { self.tag.eq(other) } } impl> PartialEq for LanguageTag { #[inline] fn eq(&self, other: &String) -> bool { self.tag.eq(other) } } impl<'a, T: PartialEq>> PartialEq> for LanguageTag { #[inline] fn eq(&self, other: &Cow<'a, str>) -> bool { self.tag.eq(other) } } impl> PartialEq> for str { #[inline] fn eq(&self, other: &LanguageTag) -> bool { other.tag.eq(self) } } impl<'a, T: PartialEq<&'a str>> PartialEq> for &'a str { #[inline] fn eq(&self, other: &LanguageTag) -> bool { other.tag.eq(self) } } impl> PartialEq> for String { #[inline] fn eq(&self, other: &LanguageTag) -> bool { other.tag.eq(self) } } impl<'a, T: PartialEq>> PartialEq> for Cow<'a, str> { #[inline] fn eq(&self, other: &LanguageTag) -> bool { other.tag.eq(self) } } impl Eq for LanguageTag {} impl Hash for LanguageTag { #[inline] fn hash(&self, state: &mut H) { self.tag.hash(state) } } impl PartialOrd for LanguageTag { #[inline] fn partial_cmp(&self, other: &Self) -> Option { self.tag.partial_cmp(&other.tag) } } impl Ord for LanguageTag { #[inline] fn cmp(&self, other: &Self) -> Ordering { self.tag.cmp(&other.tag) } } impl> Deref for LanguageTag { type Target = str; #[inline] fn deref(&self) -> &str { self.tag.deref() } } impl> AsRef for LanguageTag { #[inline] fn 
as_ref(&self) -> &str { self.tag.as_ref() } } impl> Borrow for LanguageTag { #[inline] fn borrow(&self) -> &str { self.tag.borrow() } } impl fmt::Debug for LanguageTag { #[inline] fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.tag.fmt(f) } } impl fmt::Display for LanguageTag { #[inline] fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.tag.fmt(f) } } impl FromStr for LanguageTag { type Err = LanguageTagParseError; #[inline] fn from_str(tag: &str) -> Result { Self::parse_and_normalize(tag) } } impl<'a> From> for LanguageTag { #[inline] fn from(tag: LanguageTag<&'a str>) -> Self { Self { tag: tag.tag.into(), positions: tag.positions, } } } impl<'a> From>> for LanguageTag { #[inline] fn from(tag: LanguageTag>) -> Self { Self { tag: tag.tag.into(), positions: tag.positions, } } } impl From>> for LanguageTag { #[inline] fn from(tag: LanguageTag>) -> Self { Self { tag: tag.tag.into(), positions: tag.positions, } } } impl<'a> From> for LanguageTag> { #[inline] fn from(tag: LanguageTag<&'a str>) -> Self { Self { tag: tag.tag.into(), positions: tag.positions, } } } impl<'a> From> for LanguageTag> { #[inline] fn from(tag: LanguageTag) -> Self { Self { tag: tag.tag.into(), positions: tag.positions, } } } #[cfg(feature = "serde")] impl Serialize for LanguageTag { fn serialize(&self, serializer: S) -> Result { self.tag.serialize(serializer) } } #[cfg(feature = "serde")] impl<'de, T: Deref + Deserialize<'de>> Deserialize<'de> for LanguageTag { fn deserialize>(deserializer: D) -> Result, D::Error> { use serde::de::Error; Self::parse(T::deserialize(deserializer)?).map_err(D::Error::custom) } } /// An error raised during [`LanguageTag`](struct.LanguageTag.html) validation. 
///
/// The precise cause is kept private; it is reported through the `Display` message.
#[derive(Debug)]
pub struct LanguageTagParseError {
    kind: TagParseErrorKind,
}

impl LanguageTagParseError {
    /// Wraps an error kind into the public error type.
    #[inline]
    fn new(kind: TagParseErrorKind) -> Self {
        Self { kind }
    }
}

impl fmt::Display for LanguageTagParseError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(match self.kind {
            TagParseErrorKind::EmptyExtension => {
                "If an extension subtag is present, it must not be empty"
            }
            TagParseErrorKind::EmptyPrivateUse => {
                "If the `x` subtag is present, it must not be empty"
            }
            TagParseErrorKind::ForbiddenChar => "The langtag contains a char not allowed",
            TagParseErrorKind::InvalidSubtag => {
                "A subtag fails to parse, it does not match any other subtags"
            }
            TagParseErrorKind::InvalidLanguage => "The given language subtag is invalid",
            TagParseErrorKind::SubtagTooLong => {
                "A subtag may be eight characters in length at maximum"
            }
            TagParseErrorKind::EmptySubtag => "A subtag should not be empty",
            TagParseErrorKind::TooManyExtlangs => "At maximum three extlangs are allowed",
        })
    }
}

impl Error for LanguageTagParseError {}

#[derive(Debug)]
enum TagParseErrorKind {
    /// If an extension subtag is present, it must not be empty.
    EmptyExtension,
    /// If the `x` subtag is present, it must not be empty.
    EmptyPrivateUse,
    /// The langtag contains a char that is not A-Z, a-z, 0-9 or the dash.
    ForbiddenChar,
    /// A subtag fails to parse, it does not match any other subtags.
    InvalidSubtag,
    /// The given language subtag is invalid.
    InvalidLanguage,
    /// A subtag may be eight characters in length at maximum.
    SubtagTooLong,
    /// A subtag should not be empty.
    EmptySubtag,
    /// At maximum three extlangs are allowed, but zero to one extlangs are preferred.
    TooManyExtlangs,
}

/// Byte offsets (exclusive ends) of the successive components of a tag.
///
/// The offsets never decrease from one field to the next. An absent component
/// has the same end as the component before it.
#[derive(Copy, Clone, Debug)]
struct TagElementsPositions {
    language_end: usize,
    extlang_end: usize,
    script_end: usize,
    region_end: usize,
    variant_end: usize,
    extension_end: usize,
}

/// Builds the positions of a tag that is handled as a single opaque unit
/// (grandfathered tags and tags made only of private use subtags).
#[inline]
fn opaque_tag_positions(len: usize) -> TagElementsPositions {
    TagElementsPositions {
        language_end: len,
        extlang_end: len,
        script_end: len,
        region_end: len,
        variant_end: len,
        extension_end: len,
    }
}

/// Sink receiving the case-normalized tag while it is parsed.
trait OutputBuffer: Extend<char> {
    fn push(&mut self, c: char);

    fn push_str(&mut self, s: &str);
}

/// An [`OutputBuffer`] that discards everything: used when only validation is needed.
#[derive(Default)]
struct VoidOutputBuffer {}

impl OutputBuffer for VoidOutputBuffer {
    #[inline]
    fn push(&mut self, _: char) {}

    #[inline]
    fn push_str(&mut self, _: &str) {}
}

impl Extend<char> for VoidOutputBuffer {
    #[inline]
    fn extend<T: IntoIterator<Item = char>>(&mut self, _: T) {}
}

impl OutputBuffer for String {
    #[inline]
    fn push(&mut self, c: char) {
        // Explicit path to the inherent method, to make clear this is not a recursive call.
        String::push(self, c);
    }

    #[inline]
    fn push_str(&mut self, s: &str) {
        String::push_str(self, s);
    }
}

/// Parses language tag following [the RFC5646 grammar](https://tools.ietf.org/html/rfc5646#section-2.1)
///
/// The case-normalized tag is written to `output` and the offsets of the tag
/// components are returned. Grandfathered tags and tags made only of private
/// use subtags are treated as a single opaque unit.
fn parse_language_tag(
    input: &str,
    output: &mut impl OutputBuffer,
) -> Result<TagElementsPositions, LanguageTagParseError> {
    if let Some(tag) = GRANDFATHEREDS
        .iter()
        .find(|record| record.eq_ignore_ascii_case(input))
    {
        // Grandfathered tags: the canonical casing is the one of the table.
        output.push_str(tag);
        Ok(opaque_tag_positions(tag.len()))
    } else if input.starts_with("x-") || input.starts_with("X-") {
        parse_private_use_tag(input, output)
    } else {
        parse_langtag(input, output)
    }
}

/// Handles tags made only of private use subtags (`x-...`).
///
/// `input` must start with `x-` or `X-`.
fn parse_private_use_tag(
    input: &str,
    output: &mut impl OutputBuffer,
) -> Result<TagElementsPositions, LanguageTagParseError> {
    if !is_alphanumeric_or_dash(input) {
        return Err(LanguageTagParseError::new(TagParseErrorKind::ForbiddenChar));
    }
    if input.len() == 2 {
        return Err(LanguageTagParseError::new(
            TagParseErrorKind::EmptyPrivateUse,
        ));
    }
    // Each private use subtag must be 1 to 8 characters long, exactly like the
    // private use subtags found at the end of a regular tag.
    // The first two bytes are the ASCII "x-" prefix so slicing at 2 is safe.
    for subtag in input[2..].split('-') {
        if subtag.is_empty() {
            return Err(LanguageTagParseError::new(TagParseErrorKind::EmptySubtag));
        }
        if subtag.len() > 8 {
            return Err(LanguageTagParseError::new(TagParseErrorKind::SubtagTooLong));
        }
    }
    output.extend(to_lowercase(input));
    Ok(opaque_tag_positions(input.len()))
}

/// Handles normal tags.
///
/// The tag is consumed subtag by subtag with a small state machine following
/// the order imposed by the grammar:
/// language, extlangs, script, region, variants, extensions, private use.
fn parse_langtag(
    input: &str,
    output: &mut impl OutputBuffer,
) -> Result<TagElementsPositions, LanguageTagParseError> {
    #[derive(PartialEq, Eq)]
    enum State {
        Start,
        AfterLanguage,
        AfterExtLang,
        AfterScript,
        AfterRegion,
        // `expected` is true just after the singleton, when at least one more subtag is required.
        InExtension { expected: bool },
        InPrivateUse { expected: bool },
    }

    let mut state = State::Start;
    let mut language_end = 0;
    let mut extlang_end = 0;
    let mut script_end = 0;
    let mut region_end = 0;
    let mut variant_end = 0;
    let mut extension_end = 0;
    let mut extlangs_count = 0;
    for (subtag, end) in SubTagIterator::new(input) {
        if subtag.is_empty() {
            return Err(LanguageTagParseError::new(TagParseErrorKind::EmptySubtag));
        }
        if subtag.len() > 8 {
            return Err(LanguageTagParseError::new(TagParseErrorKind::SubtagTooLong));
        }
        if state == State::Start {
            // Primary language
            if subtag.len() < 2 || !is_alphabetic(subtag) {
                return Err(LanguageTagParseError::new(
                    TagParseErrorKind::InvalidLanguage,
                ));
            }
            language_end = end;
            output.extend(to_lowercase(subtag));
            // extlangs are only allowed for short language tags
            state = if subtag.len() < 4 {
                State::AfterLanguage
            } else {
                State::AfterExtLang
            };
        } else if let State::InPrivateUse { .. } = state {
            // Everything after the `x` singleton is private use.
            if !is_alphanumeric(subtag) {
                return Err(LanguageTagParseError::new(TagParseErrorKind::InvalidSubtag));
            }
            output.push('-');
            output.extend(to_lowercase(subtag));
            state = State::InPrivateUse { expected: false };
        } else if subtag == "x" || subtag == "X" {
            // We make sure the previous extension is not empty
            if let State::InExtension { expected: true } = state {
                return Err(LanguageTagParseError::new(
                    TagParseErrorKind::EmptyExtension,
                ));
            }
            output.push('-');
            output.push('x');
            state = State::InPrivateUse { expected: true };
        } else if subtag.len() == 1 && is_alphanumeric(subtag) {
            // Extension singleton.
            // We make sure the previous extension is not empty
            if let State::InExtension { expected: true } = state {
                return Err(LanguageTagParseError::new(
                    TagParseErrorKind::EmptyExtension,
                ));
            }
            output.push('-');
            output.extend(to_lowercase(subtag));
            state = State::InExtension { expected: true };
        } else if let State::InExtension { .. } = state {
            if !is_alphanumeric(subtag) {
                return Err(LanguageTagParseError::new(TagParseErrorKind::InvalidSubtag));
            }
            extension_end = end;
            output.push('-');
            output.extend(to_lowercase(subtag));
            state = State::InExtension { expected: false };
        } else if state == State::AfterLanguage && subtag.len() == 3 && is_alphabetic(subtag) {
            // Extlang: the state is kept so that more extlangs may follow.
            extlangs_count += 1;
            if extlangs_count > 3 {
                return Err(LanguageTagParseError::new(
                    TagParseErrorKind::TooManyExtlangs,
                ));
            }
            extlang_end = end;
            output.push('-');
            output.extend(to_lowercase(subtag));
        } else if matches!(state, State::AfterLanguage | State::AfterExtLang)
            && subtag.len() == 4
            && is_alphabetic(subtag)
        {
            // Script
            script_end = end;
            output.push('-');
            output.extend(to_uppercase_first(subtag));
            state = State::AfterScript;
        } else if matches!(
            state,
            State::AfterLanguage | State::AfterExtLang | State::AfterScript
        ) && (subtag.len() == 2 && is_alphabetic(subtag)
            || subtag.len() == 3 && is_numeric(subtag))
        {
            // Region
            region_end = end;
            output.push('-');
            output.extend(to_uppercase(subtag));
            state = State::AfterRegion;
        } else if matches!(
            state,
            State::AfterLanguage | State::AfterExtLang | State::AfterScript | State::AfterRegion
        ) && is_alphanumeric(subtag)
            // The subtag is pure ASCII at this point so slicing its first byte is safe.
            && (subtag.len() >= 5 && is_alphabetic(&subtag[0..1])
                || subtag.len() >= 4 && is_numeric(&subtag[0..1]))
        {
            // Variant: several of them may follow each other.
            variant_end = end;
            output.push('-');
            output.extend(to_lowercase(subtag));
            state = State::AfterRegion;
        } else {
            return Err(LanguageTagParseError::new(TagParseErrorKind::InvalidSubtag));
        }
    }

    // We make sure we are in a correct final state
    if let State::InExtension { expected: true } = state {
        return Err(LanguageTagParseError::new(
            TagParseErrorKind::EmptyExtension,
        ));
    }
    if let State::InPrivateUse { expected: true } = state {
        return Err(LanguageTagParseError::new(
            TagParseErrorKind::EmptyPrivateUse,
        ));
    }

    // Absent components get the end of the component before them.
    let extlang_end = extlang_end.max(language_end);
    let script_end = script_end.max(extlang_end);
    let region_end = region_end.max(script_end);
    let variant_end = variant_end.max(region_end);
    let extension_end = extension_end.max(variant_end);

    Ok(TagElementsPositions {
        language_end,
        extlang_end,
        script_end,
        region_end,
        variant_end,
        extension_end,
    })
}

/// Iterates on the extensions of a tag: yields each singleton with the text of its subtags.
///
/// `input` is expected to be the extension part of a validated tag (e.g. `u-co-phonebk`).
struct ExtensionsIterator<'a> {
    input: &'a str,
}

impl<'a> ExtensionsIterator<'a> {
    fn new(input: &'a str) -> Self {
        Self { input }
    }
}

impl<'a> Iterator for ExtensionsIterator<'a> {
    type Item = (char, &'a str);

    fn next(&mut self) -> Option<(char, &'a str)> {
        let mut parts_iterator = self.input.split_terminator('-');
        let singleton = parts_iterator.next()?.chars().next()?;
        // Offset just after the '-' following the last seen subtag.
        // It starts after the singleton and its '-'.
        let mut content_size: usize = 2;
        for part in parts_iterator {
            if part.len() == 1 {
                // A new singleton: the current extension stops before the previous '-'.
                let content = &self.input[2..content_size - 1];
                self.input = &self.input[content_size..];
                return Some((singleton, content));
            } else {
                content_size += part.len() + 1;
            }
        }
        // Last extension: everything after the singleton belongs to it.
        let result = self.input.get(2..).map(|content| (singleton, content));
        self.input = "";
        result
    }
}

/// Iterates on the '-' separated subtags of a tag with the byte offset of their end.
struct SubTagIterator<'a> {
    split: Split<'a, char>,
    position: usize,
}

impl<'a> SubTagIterator<'a> {
    #[inline]
    fn new(input: &'a str) -> Self {
        Self {
            split: input.split('-'),
            position: 0,
        }
    }
}

impl<'a> Iterator for SubTagIterator<'a> {
    type Item = (&'a str, usize);

    #[inline]
    fn next(&mut self) -> Option<(&'a str, usize)> {
        let tag = self.split.next()?;
        let tag_end = self.position + tag.len();
        self.position = tag_end + 1; // skips the '-'
        Some((tag, tag_end))
    }
}

#[inline]
fn is_alphabetic(s: &str) -> bool {
    s.chars().all(|x| x.is_ascii_alphabetic())
}

#[inline]
fn is_numeric(s: &str) -> bool {
    s.chars().all(|x| x.is_ascii_digit())
}

#[inline]
fn is_alphanumeric(s: &str) -> bool {
    s.chars().all(|x| x.is_ascii_alphanumeric())
}

#[inline]
fn is_alphanumeric_or_dash(s: &str) -> bool {
    s.chars().all(|x| x.is_ascii_alphanumeric() || x == '-')
}

#[inline]
fn to_uppercase(s: &str) -> impl Iterator<Item = char> + '_ {
    s.chars().map(|c| c.to_ascii_uppercase())
}

// Beware: panics if s.len() == 0 (should never happen in our code)
#[inline]
fn to_uppercase_first(s: &str) -> impl Iterator<Item = char> + '_ {
    let mut chars = s.chars();
    once(chars.next().unwrap().to_ascii_uppercase()).chain(chars.map(|c| c.to_ascii_lowercase()))
}

#[inline]
fn to_lowercase(s: &str) -> impl Iterator<Item = char> + '_ {
    s.chars().map(|c| c.to_ascii_lowercase())
}

/// The grandfathered tags, in their canonical casing.
const GRANDFATHEREDS: [&str; 26] = [
    "art-lojban",
    "cel-gaulish",
    "en-GB-oed",
    "i-ami",
    "i-bnn",
    "i-default",
    "i-enochian",
    "i-hak",
    "i-klingon",
    "i-lux",
    "i-mingo",
    "i-navajo",
    "i-pwn",
    "i-tao",
    "i-tay",
    "i-tsu",
    "no-bok",
    "no-nyn",
    "sgn-BE-FR",
    "sgn-BE-NL",
    "sgn-CH-DE",
    "zh-guoyu",
    "zh-hakka",
    "zh-min",
    "zh-min-nan",
    "zh-xiang",
];