diff options
Diffstat (limited to 'vendor/icu_locid/src/parser')
-rw-r--r-- | vendor/icu_locid/src/parser/errors.rs | 16 | ||||
-rw-r--r-- | vendor/icu_locid/src/parser/langid.rs | 47 | ||||
-rw-r--r-- | vendor/icu_locid/src/parser/locale.rs | 6 | ||||
-rw-r--r-- | vendor/icu_locid/src/parser/mod.rs | 231 |
4 files changed, 229 insertions, 71 deletions
diff --git a/vendor/icu_locid/src/parser/errors.rs b/vendor/icu_locid/src/parser/errors.rs index a989bcc60..5cbbb2bd4 100644 --- a/vendor/icu_locid/src/parser/errors.rs +++ b/vendor/icu_locid/src/parser/errors.rs @@ -48,6 +48,22 @@ pub enum ParserError { /// ``` #[displaydoc("Invalid extension")] InvalidExtension, + + /// Duplicated extension. + /// + /// # Examples + /// + /// ``` + /// use icu::locid::Locale; + /// use icu::locid::ParserError; + /// + /// assert_eq!( + /// "und-u-hc-h12-u-ca-calendar".parse::<Locale>(), + /// Err(ParserError::DuplicatedExtension) + /// ); + /// ``` + #[displaydoc("Duplicated extension")] + DuplicatedExtension, } #[cfg(feature = "std")] diff --git a/vendor/icu_locid/src/parser/langid.rs b/vendor/icu_locid/src/parser/langid.rs index 9efa078ac..653ca7e6e 100644 --- a/vendor/icu_locid/src/parser/langid.rs +++ b/vendor/icu_locid/src/parser/langid.rs @@ -5,7 +5,7 @@ pub use super::errors::ParserError; use crate::extensions::unicode::{Attribute, Key, Value}; use crate::extensions::ExtensionType; -use crate::parser::{get_subtag_iterator, SubtagIterator}; +use crate::parser::SubtagIterator; use crate::LanguageIdentifier; use crate::{extensions, subtags}; use alloc::vec::Vec; @@ -103,7 +103,7 @@ pub fn parse_language_identifier( t: &[u8], mode: ParserMode, ) -> Result<LanguageIdentifier, ParserError> { - let mut iter = get_subtag_iterator(t); + let mut iter = SubtagIterator::new(t); parse_language_identifier_from_iter(&mut iter, mode) } @@ -127,9 +127,9 @@ pub const fn parse_locale_with_single_variant_single_keyword_unicode_extension_f let mut variant = None; let mut keyword = None; - if let (i, Some((t, start, end))) = iter.next_manual() { + if let (i, Some((start, end))) = iter.next_manual() { iter = i; - match subtags::Language::try_from_bytes_manual_slice(t, start, end) { + match subtags::Language::try_from_bytes_manual_slice(iter.slice, start, end) { Ok(l) => language = l, Err(e) => return Err(e), } @@ -139,19 +139,23 @@ pub const fn parse_locale_with_single_variant_single_keyword_unicode_extension_f let mut position = ParserPosition::Script; - while let Some((t, start, end)) = iter.peek_manual() { + while let Some((start, end)) = iter.peek_manual() { if !matches!(mode, ParserMode::LanguageIdentifier) && end - start == 1 { break; } if matches!(position, ParserPosition::Script) { - if let Ok(s) = subtags::Script::try_from_bytes_manual_slice(t, start, end) { + if let Ok(s) = subtags::Script::try_from_bytes_manual_slice(iter.slice, start, end) { script = Some(s); position = ParserPosition::Region; - } else if let Ok(r) = subtags::Region::try_from_bytes_manual_slice(t, start, end) { + } else if let Ok(r) = + subtags::Region::try_from_bytes_manual_slice(iter.slice, start, end) + { region = Some(r); position = ParserPosition::Variant; - } else if let Ok(v) = subtags::Variant::try_from_bytes_manual_slice(t, start, end) { + } else if let Ok(v) = + subtags::Variant::try_from_bytes_manual_slice(iter.slice, start, end) + { // We cannot handle multiple variants in a const context debug_assert!(variant.is_none()); variant = Some(v); @@ -162,10 +166,12 @@ pub const fn parse_locale_with_single_variant_single_keyword_unicode_extension_f return Err(ParserError::InvalidSubtag); } } else if matches!(position, ParserPosition::Region) { - if let Ok(s) = subtags::Region::try_from_bytes_manual_slice(t, start, end) { + if let Ok(s) = subtags::Region::try_from_bytes_manual_slice(iter.slice, start, end) { region = Some(s); position = ParserPosition::Variant; - } else if let Ok(v) = subtags::Variant::try_from_bytes_manual_slice(t, start, end) { + } else if let Ok(v) = + subtags::Variant::try_from_bytes_manual_slice(iter.slice, start, end) + { // We cannot handle multiple variants in a const context debug_assert!(variant.is_none()); variant = Some(v); @@ -175,7 +181,8 @@ pub const fn parse_locale_with_single_variant_single_keyword_unicode_extension_f } else { return Err(ParserError::InvalidSubtag); } - } else if let Ok(v) = subtags::Variant::try_from_bytes_manual_slice(t, start, end) { + } else if let Ok(v) = subtags::Variant::try_from_bytes_manual_slice(iter.slice, start, end) + { debug_assert!(matches!(position, ParserPosition::Variant)); if variant.is_some() { // We cannot handle multiple variants in a const context @@ -192,12 +199,12 @@ pub const fn parse_locale_with_single_variant_single_keyword_unicode_extension_f } if matches!(mode, ParserMode::Locale) { - if let Some((bytes, start, end)) = iter.peek_manual() { - match ExtensionType::try_from_bytes_manual_slice(bytes, start, end) { + if let Some((start, end)) = iter.peek_manual() { + match ExtensionType::try_from_bytes_manual_slice(iter.slice, start, end) { Ok(ExtensionType::Unicode) => { iter = iter.next_manual().0; - if let Some((bytes, start, end)) = iter.peek_manual() { - if Attribute::try_from_bytes_manual_slice(bytes, start, end).is_ok() { + if let Some((start, end)) = iter.peek_manual() { + if Attribute::try_from_bytes_manual_slice(iter.slice, start, end).is_ok() { // We cannot handle Attributes in a const context return Err(ParserError::InvalidSubtag); } @@ -206,19 +213,21 @@ pub const fn parse_locale_with_single_variant_single_keyword_unicode_extension_f let mut key = None; let mut current_type = None; - while let Some((bytes, start, end)) = iter.peek_manual() { + while let Some((start, end)) = iter.peek_manual() { let slen = end - start; if slen == 2 { if key.is_some() { // We cannot handle more than one Key in a const context return Err(ParserError::InvalidSubtag); } - match Key::try_from_bytes_manual_slice(bytes, start, end) { + match Key::try_from_bytes_manual_slice(iter.slice, start, end) { Ok(k) => key = Some(k), Err(e) => return Err(e), }; } else if key.is_some() { - match Value::parse_subtag_from_bytes_manual_slice(bytes, start, end) { + match Value::parse_subtag_from_bytes_manual_slice( + iter.slice, start, end, + ) { Ok(Some(t)) => { if current_type.is_some() { // We cannot handle more than one type in a const context @@ -261,7 +270,7 @@ pub const fn parse_language_identifier_with_single_variant( ), ParserError, > { - let iter = get_subtag_iterator(t); + let iter = SubtagIterator::new(t); match parse_locale_with_single_variant_single_keyword_unicode_extension_from_iter(iter, mode) { Ok((l, s, r, v, _)) => Ok((l, s, r, v)), Err(e) => Err(e), diff --git a/vendor/icu_locid/src/parser/locale.rs b/vendor/icu_locid/src/parser/locale.rs index 805b6c290..175fd3a05 100644 --- a/vendor/icu_locid/src/parser/locale.rs +++ b/vendor/icu_locid/src/parser/locale.rs @@ -6,13 +6,13 @@ use tinystr::TinyAsciiStr; use crate::extensions::{self, Extensions}; use crate::parser::errors::ParserError; -use crate::parser::{get_subtag_iterator, parse_language_identifier_from_iter, ParserMode}; +use crate::parser::{parse_language_identifier_from_iter, ParserMode, SubtagIterator}; use crate::{subtags, Locale}; use super::parse_locale_with_single_variant_single_keyword_unicode_extension_from_iter; pub fn parse_locale(t: &[u8]) -> Result<Locale, ParserError> { - let mut iter = get_subtag_iterator(t); + let mut iter = SubtagIterator::new(t); let id = parse_language_identifier_from_iter(&mut iter, ParserMode::Locale)?; let extensions = if iter.peek().is_some() { @@ -37,6 +37,6 @@ pub const fn parse_locale_with_single_variant_single_keyword_unicode_keyword_ext ), ParserError, > { - let iter = get_subtag_iterator(t); + let iter = SubtagIterator::new(t); parse_locale_with_single_variant_single_keyword_unicode_extension_from_iter(iter, mode) } diff --git a/vendor/icu_locid/src/parser/mod.rs b/vendor/icu_locid/src/parser/mod.rs index fef10b0ab..4b02f71c9 100644 --- a/vendor/icu_locid/src/parser/mod.rs +++ b/vendor/icu_locid/src/parser/mod.rs @@ -17,72 +17,93 @@ pub use locale::{ parse_locale, parse_locale_with_single_variant_single_keyword_unicode_keyword_extension, }; -pub const fn get_subtag_iterator(slice: &[u8]) -> SubtagIterator { - let mut current_start = 0; +#[inline] +const fn is_separator(slice: &[u8], idx: usize) -> bool { #[allow(clippy::indexing_slicing)] - while current_start < slice.len() - && (slice[current_start] == b'-' || slice[current_start] == b'_') - { - current_start += 1; - } - let mut current_end = current_start; - #[allow(clippy::indexing_slicing)] - while current_end < slice.len() && slice[current_end] != b'-' && slice[current_end] != b'_' { - current_end += 1; - } - SubtagIterator { - slice, - current_start, - current_end, + let b = slice[idx]; + b == b'-' || b == b'_' +} + +const fn get_current_subtag(slice: &[u8], idx: usize) -> (usize, usize) { + debug_assert!(idx < slice.len()); + + // This function is called only on the idx == 0 or on a separator. + let (start, mut end) = if is_separator(slice, idx) { + // If it's a separator, set the start to idx+1 and advance the idx to the next char. + (idx + 1, idx + 1) + } else { + // If it's idx=0, start is 0 and end is set to 1 + debug_assert!(idx == 0); + (0, 1) + }; + + while end < slice.len() && !is_separator(slice, end) { + // Advance until we reach end of slice or a separator. + end += 1; } + // Notice: this slice may be empty (start == end) for cases like `"en-"` or `"en--US"` + (start, end) } +// `SubtagIterator` is a helper iterator for [`LanguageIdentifier`] and [`Locale`] parsing. +// +// It is quite extraordinary due to focus on performance and Rust limitations for `const` +// functions. +// +// The iterator is eager and fallible allowing it to reject invalid slices such as `"-"`, `"-en"`, +// `"en-"` etc. +// +// The iterator provides methods available for static users - `next_manual` and `peek_manual`, +// as well as typical `Peekable` iterator APIs - `next` and `peek`. +// +// All methods return an `Option` of a `Result`. #[derive(Copy, Clone, Debug)] pub struct SubtagIterator<'a> { - slice: &'a [u8], - current_start: usize, - current_end: usize, + pub slice: &'a [u8], + done: bool, + // done + subtag is faster than Option<(usize, usize)> + // at the time of writing. + subtag: (usize, usize), } -pub type ManualSlice<'a> = (&'a [u8], usize, usize); - impl<'a> SubtagIterator<'a> { - pub const fn next_manual(mut self) -> (Self, Option<ManualSlice<'a>>) { - if self.current_start == self.current_end { - (self, None) + pub const fn new(slice: &'a [u8]) -> Self { + let subtag = if slice.is_empty() || is_separator(slice, 0) { + // This returns (0, 0) which returns Some(b"") for slices like `"-en"` or `"-"` + (0, 0) } else { - let r = (self.slice, self.current_start, self.current_end); - self.current_start = self.current_end; - #[allow(clippy::indexing_slicing)] - while self.current_start < self.slice.len() - && (self.slice[self.current_start] == b'-' - || self.slice[self.current_start] == b'_') - { - self.current_start += 1; - } - self.current_end = self.current_start; - #[allow(clippy::indexing_slicing)] - while self.current_end < self.slice.len() - && self.slice[self.current_end] != b'-' - && self.slice[self.current_end] != b'_' - { - self.current_end += 1; - } - (self, Some(r)) + get_current_subtag(slice, 0) + }; + Self { + slice, + done: false, + subtag, } } - pub const fn peek_manual(&self) -> Option<ManualSlice<'a>> { - if self.current_start == self.current_end { - None + pub const fn next_manual(mut self) -> (Self, Option<(usize, usize)>) { + if self.done { + return (self, None); + } + let result = self.subtag; + if result.1 < self.slice.len() { + self.subtag = get_current_subtag(self.slice, result.1); } else { - Some((self.slice, self.current_start, self.current_end)) + self.done = true; } + (self, Some(result)) + } + + pub const fn peek_manual(&self) -> Option<(usize, usize)> { + if self.done { + return None; + } + Some(self.subtag) } pub fn peek(&self) -> Option<&'a [u8]> { #[allow(clippy::indexing_slicing)] // peek_manual returns valid indices - self.peek_manual().map(|(t, s, e)| &t[s..e]) + self.peek_manual().map(|(s, e)| &self.slice[s..e]) } } @@ -91,8 +112,120 @@ impl<'a> Iterator for SubtagIterator<'a> { fn next(&mut self) -> Option<Self::Item> { let (s, res) = self.next_manual(); - self.clone_from(&s); + *self = s; #[allow(clippy::indexing_slicing)] // next_manual returns valid indices - res.map(|(t, s, e)| &t[s..e]) + res.map(|(s, e)| &self.slice[s..e]) + } +} + +#[cfg(test)] +mod test { + use super::*; + + fn slice_to_str(input: &[u8]) -> &str { + std::str::from_utf8(input).unwrap() + } + + #[test] + fn subtag_iterator_peek_test() { + let slice = "de_at-u-ca-foobar"; + let mut si = SubtagIterator::new(slice.as_bytes()); + + assert_eq!(si.peek().map(slice_to_str), Some("de")); + assert_eq!(si.peek().map(slice_to_str), Some("de")); + assert_eq!(si.next().map(slice_to_str), Some("de")); + + assert_eq!(si.peek().map(slice_to_str), Some("at")); + assert_eq!(si.peek().map(slice_to_str), Some("at")); + assert_eq!(si.next().map(slice_to_str), Some("at")); + } + + #[test] + fn subtag_iterator_test() { + let slice = ""; + let mut si = SubtagIterator::new(slice.as_bytes()); + assert_eq!(si.next().map(slice_to_str), Some("")); + + let slice = "-"; + let mut si = SubtagIterator::new(slice.as_bytes()); + assert_eq!(si.next().map(slice_to_str), Some("")); + + let slice = "-en"; + let mut si = SubtagIterator::new(slice.as_bytes()); + assert_eq!(si.next().map(slice_to_str), Some("")); + assert_eq!(si.next().map(slice_to_str), Some("en")); + assert_eq!(si.next(), None); + + let slice = "en"; + let si = SubtagIterator::new(slice.as_bytes()); + assert_eq!(si.map(slice_to_str).collect::<Vec<_>>(), vec!["en",]); + + let slice = "en-"; + let si = SubtagIterator::new(slice.as_bytes()); + assert_eq!(si.map(slice_to_str).collect::<Vec<_>>(), vec!["en", "",]); + + let slice = "--"; + let mut si = SubtagIterator::new(slice.as_bytes()); + assert_eq!(si.next().map(slice_to_str), Some("")); + assert_eq!(si.next().map(slice_to_str), Some("")); + assert_eq!(si.next().map(slice_to_str), Some("")); + assert_eq!(si.next(), None); + + let slice = "-en-"; + let mut si = SubtagIterator::new(slice.as_bytes()); + assert_eq!(si.next().map(slice_to_str), Some("")); + assert_eq!(si.next().map(slice_to_str), Some("en")); + assert_eq!(si.next().map(slice_to_str), Some("")); + assert_eq!(si.next(), None); + + let slice = "de_at-u-ca-foobar"; + let si = SubtagIterator::new(slice.as_bytes()); + assert_eq!( + si.map(slice_to_str).collect::<Vec<_>>(), + vec!["de", "at", "u", "ca", "foobar",] + ); + } + + #[test] + fn get_current_subtag_test() { + let slice = "-"; + let current = get_current_subtag(slice.as_bytes(), 0); + assert_eq!(current, (1, 1)); + + let slice = "-en"; + let current = get_current_subtag(slice.as_bytes(), 0); + assert_eq!(current, (1, 3)); + + let slice = "-en-"; + let current = get_current_subtag(slice.as_bytes(), 3); + assert_eq!(current, (4, 4)); + + let slice = "en-"; + let current = get_current_subtag(slice.as_bytes(), 0); + assert_eq!(current, (0, 2)); + + let current = get_current_subtag(slice.as_bytes(), 2); + assert_eq!(current, (3, 3)); + + let slice = "en--US"; + let current = get_current_subtag(slice.as_bytes(), 0); + assert_eq!(current, (0, 2)); + + let current = get_current_subtag(slice.as_bytes(), 2); + assert_eq!(current, (3, 3)); + + let current = get_current_subtag(slice.as_bytes(), 3); + assert_eq!(current, (4, 6)); + + let slice = "--"; + let current = get_current_subtag(slice.as_bytes(), 0); + assert_eq!(current, (1, 1)); + + let current = get_current_subtag(slice.as_bytes(), 1); + assert_eq!(current, (2, 2)); + + let slice = "-"; + let current = get_current_subtag(slice.as_bytes(), 0); + assert_eq!(current, (1, 1)); } } |