diff options
Diffstat (limited to 'vendor/icu_locid/src/parser/mod.rs')
-rw-r--r-- | vendor/icu_locid/src/parser/mod.rs | 231 |
1 files changed, 182 insertions, 49 deletions
diff --git a/vendor/icu_locid/src/parser/mod.rs b/vendor/icu_locid/src/parser/mod.rs index fef10b0ab..4b02f71c9 100644 --- a/vendor/icu_locid/src/parser/mod.rs +++ b/vendor/icu_locid/src/parser/mod.rs @@ -17,72 +17,93 @@ pub use locale::{ parse_locale, parse_locale_with_single_variant_single_keyword_unicode_keyword_extension, }; -pub const fn get_subtag_iterator(slice: &[u8]) -> SubtagIterator { - let mut current_start = 0; +#[inline] +const fn is_separator(slice: &[u8], idx: usize) -> bool { #[allow(clippy::indexing_slicing)] - while current_start < slice.len() - && (slice[current_start] == b'-' || slice[current_start] == b'_') - { - current_start += 1; - } - let mut current_end = current_start; - #[allow(clippy::indexing_slicing)] - while current_end < slice.len() && slice[current_end] != b'-' && slice[current_end] != b'_' { - current_end += 1; - } - SubtagIterator { - slice, - current_start, - current_end, + let b = slice[idx]; + b == b'-' || b == b'_' +} + +const fn get_current_subtag(slice: &[u8], idx: usize) -> (usize, usize) { + debug_assert!(idx < slice.len()); + + // This function is called only on the idx == 0 or on a separator. + let (start, mut end) = if is_separator(slice, idx) { + // If it's a separator, set the start to idx+1 and advance the idx to the next char. + (idx + 1, idx + 1) + } else { + // If it's idx=0, start is 0 and end is set to 1 + debug_assert!(idx == 0); + (0, 1) + }; + + while end < slice.len() && !is_separator(slice, end) { + // Advance until we reach end of slice or a separator. + end += 1; } + // Notice: this slice may be empty (start == end) for cases like `"en-"` or `"en--US"` + (start, end) } +// `SubtagIterator` is a helper iterator for [`LanguageIdentifier`] and [`Locale`] parsing. +// +// It is quite extraordinary due to focus on performance and Rust limitations for `const` +// functions. +// +// The iterator is eager and fallible allowing it to reject invalid slices such as `"-"`, `"-en"`, +// `"en-"` etc. +// +// The iterator provides methods available for static users - `next_manual` and `peek_manual`, +// as well as typical `Peekable` iterator APIs - `next` and `peek`. +// +// All methods return an `Option` of a `Result`. #[derive(Copy, Clone, Debug)] pub struct SubtagIterator<'a> { - slice: &'a [u8], - current_start: usize, - current_end: usize, + pub slice: &'a [u8], + done: bool, + // done + subtag is faster than Option<(usize, usize)> + // at the time of writing. + subtag: (usize, usize), } -pub type ManualSlice<'a> = (&'a [u8], usize, usize); - impl<'a> SubtagIterator<'a> { - pub const fn next_manual(mut self) -> (Self, Option<ManualSlice<'a>>) { - if self.current_start == self.current_end { - (self, None) + pub const fn new(slice: &'a [u8]) -> Self { + let subtag = if slice.is_empty() || is_separator(slice, 0) { + // This returns (0, 0) which returns Some(b"") for slices like `"-en"` or `"-"` + (0, 0) } else { - let r = (self.slice, self.current_start, self.current_end); - self.current_start = self.current_end; - #[allow(clippy::indexing_slicing)] - while self.current_start < self.slice.len() - && (self.slice[self.current_start] == b'-' - || self.slice[self.current_start] == b'_') - { - self.current_start += 1; - } - self.current_end = self.current_start; - #[allow(clippy::indexing_slicing)] - while self.current_end < self.slice.len() - && self.slice[self.current_end] != b'-' - && self.slice[self.current_end] != b'_' - { - self.current_end += 1; - } - (self, Some(r)) + get_current_subtag(slice, 0) + }; + Self { + slice, + done: false, + subtag, } } - pub const fn peek_manual(&self) -> Option<ManualSlice<'a>> { - if self.current_start == self.current_end { - None + pub const fn next_manual(mut self) -> (Self, Option<(usize, usize)>) { + if self.done { + return (self, None); + } + let result = self.subtag; + if result.1 < self.slice.len() { + self.subtag = get_current_subtag(self.slice, result.1); } else { - Some((self.slice, self.current_start, self.current_end)) + self.done = true; } + (self, Some(result)) + } + + pub const fn peek_manual(&self) -> Option<(usize, usize)> { + if self.done { + return None; + } + Some(self.subtag) } pub fn peek(&self) -> Option<&'a [u8]> { #[allow(clippy::indexing_slicing)] // peek_manual returns valid indices - self.peek_manual().map(|(t, s, e)| &t[s..e]) + self.peek_manual().map(|(s, e)| &self.slice[s..e]) } } @@ -91,8 +112,120 @@ impl<'a> Iterator for SubtagIterator<'a> { fn next(&mut self) -> Option<Self::Item> { let (s, res) = self.next_manual(); - self.clone_from(&s); + *self = s; #[allow(clippy::indexing_slicing)] // next_manual returns valid indices - res.map(|(t, s, e)| &t[s..e]) + res.map(|(s, e)| &self.slice[s..e]) + } +} + +#[cfg(test)] +mod test { + use super::*; + + fn slice_to_str(input: &[u8]) -> &str { + std::str::from_utf8(input).unwrap() + } + + #[test] + fn subtag_iterator_peek_test() { + let slice = "de_at-u-ca-foobar"; + let mut si = SubtagIterator::new(slice.as_bytes()); + + assert_eq!(si.peek().map(slice_to_str), Some("de")); + assert_eq!(si.peek().map(slice_to_str), Some("de")); + assert_eq!(si.next().map(slice_to_str), Some("de")); + + assert_eq!(si.peek().map(slice_to_str), Some("at")); + assert_eq!(si.peek().map(slice_to_str), Some("at")); + assert_eq!(si.next().map(slice_to_str), Some("at")); + } + + #[test] + fn subtag_iterator_test() { + let slice = ""; + let mut si = SubtagIterator::new(slice.as_bytes()); + assert_eq!(si.next().map(slice_to_str), Some("")); + + let slice = "-"; + let mut si = SubtagIterator::new(slice.as_bytes()); + assert_eq!(si.next().map(slice_to_str), Some("")); + + let slice = "-en"; + let mut si = SubtagIterator::new(slice.as_bytes()); + assert_eq!(si.next().map(slice_to_str), Some("")); + assert_eq!(si.next().map(slice_to_str), Some("en")); + assert_eq!(si.next(), None); + + let slice = "en"; + let si = SubtagIterator::new(slice.as_bytes()); + assert_eq!(si.map(slice_to_str).collect::<Vec<_>>(), vec!["en",]); + + let slice = "en-"; + let si = SubtagIterator::new(slice.as_bytes()); + assert_eq!(si.map(slice_to_str).collect::<Vec<_>>(), vec!["en", "",]); + + let slice = "--"; + let mut si = SubtagIterator::new(slice.as_bytes()); + assert_eq!(si.next().map(slice_to_str), Some("")); + assert_eq!(si.next().map(slice_to_str), Some("")); + assert_eq!(si.next().map(slice_to_str), Some("")); + assert_eq!(si.next(), None); + + let slice = "-en-"; + let mut si = SubtagIterator::new(slice.as_bytes()); + assert_eq!(si.next().map(slice_to_str), Some("")); + assert_eq!(si.next().map(slice_to_str), Some("en")); + assert_eq!(si.next().map(slice_to_str), Some("")); + assert_eq!(si.next(), None); + + let slice = "de_at-u-ca-foobar"; + let si = SubtagIterator::new(slice.as_bytes()); + assert_eq!( + si.map(slice_to_str).collect::<Vec<_>>(), + vec!["de", "at", "u", "ca", "foobar",] + ); + } + + #[test] + fn get_current_subtag_test() { + let slice = "-"; + let current = get_current_subtag(slice.as_bytes(), 0); + assert_eq!(current, (1, 1)); + + let slice = "-en"; + let current = get_current_subtag(slice.as_bytes(), 0); + assert_eq!(current, (1, 3)); + + let slice = "-en-"; + let current = get_current_subtag(slice.as_bytes(), 3); + assert_eq!(current, (4, 4)); + + let slice = "en-"; + let current = get_current_subtag(slice.as_bytes(), 0); + assert_eq!(current, (0, 2)); + + let current = get_current_subtag(slice.as_bytes(), 2); + assert_eq!(current, (3, 3)); + + let slice = "en--US"; + let current = get_current_subtag(slice.as_bytes(), 0); + assert_eq!(current, (0, 2)); + + let current = get_current_subtag(slice.as_bytes(), 2); + assert_eq!(current, (3, 3)); + + let current = get_current_subtag(slice.as_bytes(), 3); + assert_eq!(current, (4, 6)); + + let slice = "--"; + let current = get_current_subtag(slice.as_bytes(), 0); + assert_eq!(current, (1, 1)); + + let current = get_current_subtag(slice.as_bytes(), 1); + assert_eq!(current, (2, 2)); + + let slice = "-"; + let current = get_current_subtag(slice.as_bytes(), 0); + assert_eq!(current, (1, 1)); } } |