summaryrefslogtreecommitdiffstats
path: root/vendor/icu_locid/src/parser/mod.rs
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/icu_locid/src/parser/mod.rs')
-rw-r--r--vendor/icu_locid/src/parser/mod.rs231
1 files changed, 182 insertions, 49 deletions
diff --git a/vendor/icu_locid/src/parser/mod.rs b/vendor/icu_locid/src/parser/mod.rs
index fef10b0ab..4b02f71c9 100644
--- a/vendor/icu_locid/src/parser/mod.rs
+++ b/vendor/icu_locid/src/parser/mod.rs
@@ -17,72 +17,93 @@ pub use locale::{
parse_locale, parse_locale_with_single_variant_single_keyword_unicode_keyword_extension,
};
-pub const fn get_subtag_iterator(slice: &[u8]) -> SubtagIterator {
- let mut current_start = 0;
+#[inline]
+const fn is_separator(slice: &[u8], idx: usize) -> bool {
#[allow(clippy::indexing_slicing)]
- while current_start < slice.len()
- && (slice[current_start] == b'-' || slice[current_start] == b'_')
- {
- current_start += 1;
- }
- let mut current_end = current_start;
- #[allow(clippy::indexing_slicing)]
- while current_end < slice.len() && slice[current_end] != b'-' && slice[current_end] != b'_' {
- current_end += 1;
- }
- SubtagIterator {
- slice,
- current_start,
- current_end,
+ let b = slice[idx];
+ b == b'-' || b == b'_'
+}
+
+const fn get_current_subtag(slice: &[u8], idx: usize) -> (usize, usize) {
+ debug_assert!(idx < slice.len());
+
+ // This function is called only on the idx == 0 or on a separator.
+ let (start, mut end) = if is_separator(slice, idx) {
+ // If it's a separator, set the start to idx+1 and advance the idx to the next char.
+ (idx + 1, idx + 1)
+ } else {
+ // If it's idx=0, start is 0 and end is set to 1
+ debug_assert!(idx == 0);
+ (0, 1)
+ };
+
+ while end < slice.len() && !is_separator(slice, end) {
+ // Advance until we reach end of slice or a separator.
+ end += 1;
}
+ // Notice: this slice may be empty (start == end) for cases like `"en-"` or `"en--US"`
+ (start, end)
}
+// `SubtagIterator` is a helper iterator for [`LanguageIdentifier`] and [`Locale`] parsing.
+//
+// It is quite extraordinary due to focus on performance and Rust limitations for `const`
+// functions.
+//
+// The iterator is eager and fallible allowing it to reject invalid slices such as `"-"`, `"-en"`,
+// `"en-"` etc.
+//
+// The iterator provides methods available for static users - `next_manual` and `peek_manual`,
+// as well as typical `Peekable` iterator APIs - `next` and `peek`.
+//
+// All methods return an `Option` of a `Result`.
#[derive(Copy, Clone, Debug)]
pub struct SubtagIterator<'a> {
- slice: &'a [u8],
- current_start: usize,
- current_end: usize,
+ pub slice: &'a [u8],
+ done: bool,
+ // done + subtag is faster than Option<(usize, usize)>
+ // at the time of writing.
+ subtag: (usize, usize),
}
-pub type ManualSlice<'a> = (&'a [u8], usize, usize);
-
impl<'a> SubtagIterator<'a> {
- pub const fn next_manual(mut self) -> (Self, Option<ManualSlice<'a>>) {
- if self.current_start == self.current_end {
- (self, None)
+ pub const fn new(slice: &'a [u8]) -> Self {
+ let subtag = if slice.is_empty() || is_separator(slice, 0) {
+ // This returns (0, 0) which returns Some(b"") for slices like `"-en"` or `"-"`
+ (0, 0)
} else {
- let r = (self.slice, self.current_start, self.current_end);
- self.current_start = self.current_end;
- #[allow(clippy::indexing_slicing)]
- while self.current_start < self.slice.len()
- && (self.slice[self.current_start] == b'-'
- || self.slice[self.current_start] == b'_')
- {
- self.current_start += 1;
- }
- self.current_end = self.current_start;
- #[allow(clippy::indexing_slicing)]
- while self.current_end < self.slice.len()
- && self.slice[self.current_end] != b'-'
- && self.slice[self.current_end] != b'_'
- {
- self.current_end += 1;
- }
- (self, Some(r))
+ get_current_subtag(slice, 0)
+ };
+ Self {
+ slice,
+ done: false,
+ subtag,
}
}
- pub const fn peek_manual(&self) -> Option<ManualSlice<'a>> {
- if self.current_start == self.current_end {
- None
+ pub const fn next_manual(mut self) -> (Self, Option<(usize, usize)>) {
+ if self.done {
+ return (self, None);
+ }
+ let result = self.subtag;
+ if result.1 < self.slice.len() {
+ self.subtag = get_current_subtag(self.slice, result.1);
} else {
- Some((self.slice, self.current_start, self.current_end))
+ self.done = true;
}
+ (self, Some(result))
+ }
+
+ pub const fn peek_manual(&self) -> Option<(usize, usize)> {
+ if self.done {
+ return None;
+ }
+ Some(self.subtag)
}
pub fn peek(&self) -> Option<&'a [u8]> {
#[allow(clippy::indexing_slicing)] // peek_manual returns valid indices
- self.peek_manual().map(|(t, s, e)| &t[s..e])
+ self.peek_manual().map(|(s, e)| &self.slice[s..e])
}
}
@@ -91,8 +112,120 @@ impl<'a> Iterator for SubtagIterator<'a> {
fn next(&mut self) -> Option<Self::Item> {
let (s, res) = self.next_manual();
- self.clone_from(&s);
+ *self = s;
#[allow(clippy::indexing_slicing)] // next_manual returns valid indices
- res.map(|(t, s, e)| &t[s..e])
+ res.map(|(s, e)| &self.slice[s..e])
+ }
+}
+
+#[cfg(test)]
+mod test {
+ use super::*;
+
+ fn slice_to_str(input: &[u8]) -> &str {
+ std::str::from_utf8(input).unwrap()
+ }
+
+ #[test]
+ fn subtag_iterator_peek_test() {
+ let slice = "de_at-u-ca-foobar";
+ let mut si = SubtagIterator::new(slice.as_bytes());
+
+ assert_eq!(si.peek().map(slice_to_str), Some("de"));
+ assert_eq!(si.peek().map(slice_to_str), Some("de"));
+ assert_eq!(si.next().map(slice_to_str), Some("de"));
+
+ assert_eq!(si.peek().map(slice_to_str), Some("at"));
+ assert_eq!(si.peek().map(slice_to_str), Some("at"));
+ assert_eq!(si.next().map(slice_to_str), Some("at"));
+ }
+
+ #[test]
+ fn subtag_iterator_test() {
+ let slice = "";
+ let mut si = SubtagIterator::new(slice.as_bytes());
+ assert_eq!(si.next().map(slice_to_str), Some(""));
+
+ let slice = "-";
+ let mut si = SubtagIterator::new(slice.as_bytes());
+ assert_eq!(si.next().map(slice_to_str), Some(""));
+
+ let slice = "-en";
+ let mut si = SubtagIterator::new(slice.as_bytes());
+ assert_eq!(si.next().map(slice_to_str), Some(""));
+ assert_eq!(si.next().map(slice_to_str), Some("en"));
+ assert_eq!(si.next(), None);
+
+ let slice = "en";
+ let si = SubtagIterator::new(slice.as_bytes());
+ assert_eq!(si.map(slice_to_str).collect::<Vec<_>>(), vec!["en",]);
+
+ let slice = "en-";
+ let si = SubtagIterator::new(slice.as_bytes());
+ assert_eq!(si.map(slice_to_str).collect::<Vec<_>>(), vec!["en", "",]);
+
+ let slice = "--";
+ let mut si = SubtagIterator::new(slice.as_bytes());
+ assert_eq!(si.next().map(slice_to_str), Some(""));
+ assert_eq!(si.next().map(slice_to_str), Some(""));
+ assert_eq!(si.next().map(slice_to_str), Some(""));
+ assert_eq!(si.next(), None);
+
+ let slice = "-en-";
+ let mut si = SubtagIterator::new(slice.as_bytes());
+ assert_eq!(si.next().map(slice_to_str), Some(""));
+ assert_eq!(si.next().map(slice_to_str), Some("en"));
+ assert_eq!(si.next().map(slice_to_str), Some(""));
+ assert_eq!(si.next(), None);
+
+ let slice = "de_at-u-ca-foobar";
+ let si = SubtagIterator::new(slice.as_bytes());
+ assert_eq!(
+ si.map(slice_to_str).collect::<Vec<_>>(),
+ vec!["de", "at", "u", "ca", "foobar",]
+ );
+ }
+
+ #[test]
+ fn get_current_subtag_test() {
+ let slice = "-";
+ let current = get_current_subtag(slice.as_bytes(), 0);
+ assert_eq!(current, (1, 1));
+
+ let slice = "-en";
+ let current = get_current_subtag(slice.as_bytes(), 0);
+ assert_eq!(current, (1, 3));
+
+ let slice = "-en-";
+ let current = get_current_subtag(slice.as_bytes(), 3);
+ assert_eq!(current, (4, 4));
+
+ let slice = "en-";
+ let current = get_current_subtag(slice.as_bytes(), 0);
+ assert_eq!(current, (0, 2));
+
+ let current = get_current_subtag(slice.as_bytes(), 2);
+ assert_eq!(current, (3, 3));
+
+ let slice = "en--US";
+ let current = get_current_subtag(slice.as_bytes(), 0);
+ assert_eq!(current, (0, 2));
+
+ let current = get_current_subtag(slice.as_bytes(), 2);
+ assert_eq!(current, (3, 3));
+
+ let current = get_current_subtag(slice.as_bytes(), 3);
+ assert_eq!(current, (4, 6));
+
+ let slice = "--";
+ let current = get_current_subtag(slice.as_bytes(), 0);
+ assert_eq!(current, (1, 1));
+
+ let current = get_current_subtag(slice.as_bytes(), 1);
+ assert_eq!(current, (2, 2));
+
+ let slice = "-";
+ let current = get_current_subtag(slice.as_bytes(), 0);
+ assert_eq!(current, (1, 1));
}
}