// This file is part of ICU4X. For terms of use, please see the file // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). pub mod errors; mod langid; mod locale; pub use errors::ParserError; pub use langid::{ parse_language_identifier, parse_language_identifier_from_iter, parse_language_identifier_with_single_variant, parse_locale_with_single_variant_single_keyword_unicode_extension_from_iter, ParserMode, }; pub use locale::{ parse_locale, parse_locale_with_single_variant_single_keyword_unicode_keyword_extension, }; #[inline] const fn is_separator(slice: &[u8], idx: usize) -> bool { #[allow(clippy::indexing_slicing)] let b = slice[idx]; b == b'-' || b == b'_' } const fn get_current_subtag(slice: &[u8], idx: usize) -> (usize, usize) { debug_assert!(idx < slice.len()); // This function is called only on the idx == 0 or on a separator. let (start, mut end) = if is_separator(slice, idx) { // If it's a separator, set the start to idx+1 and advance the idx to the next char. (idx + 1, idx + 1) } else { // If it's idx=0, start is 0 and end is set to 1 debug_assert!(idx == 0); (0, 1) }; while end < slice.len() && !is_separator(slice, end) { // Advance until we reach end of slice or a separator. end += 1; } // Notice: this slice may be empty (start == end) for cases like `"en-"` or `"en--US"` (start, end) } // `SubtagIterator` is a helper iterator for [`LanguageIdentifier`] and [`Locale`] parsing. // // It is quite extraordinary due to focus on performance and Rust limitations for `const` // functions. // // The iterator is eager and fallible allowing it to reject invalid slices such as `"-"`, `"-en"`, // `"en-"` etc. // // The iterator provides methods available for static users - `next_manual` and `peek_manual`, // as well as typical `Peekable` iterator APIs - `next` and `peek`. // // All methods return an `Option` of a `Result`. #[derive(Copy, Clone, Debug)] pub struct SubtagIterator<'a> { pub slice: &'a [u8], done: bool, // done + subtag is faster than Option<(usize, usize)> // at the time of writing. subtag: (usize, usize), } impl<'a> SubtagIterator<'a> { pub const fn new(slice: &'a [u8]) -> Self { let subtag = if slice.is_empty() || is_separator(slice, 0) { // This returns (0, 0) which returns Some(b"") for slices like `"-en"` or `"-"` (0, 0) } else { get_current_subtag(slice, 0) }; Self { slice, done: false, subtag, } } pub const fn next_manual(mut self) -> (Self, Option<(usize, usize)>) { if self.done { return (self, None); } let result = self.subtag; if result.1 < self.slice.len() { self.subtag = get_current_subtag(self.slice, result.1); } else { self.done = true; } (self, Some(result)) } pub const fn peek_manual(&self) -> Option<(usize, usize)> { if self.done { return None; } Some(self.subtag) } pub fn peek(&self) -> Option<&'a [u8]> { #[allow(clippy::indexing_slicing)] // peek_manual returns valid indices self.peek_manual().map(|(s, e)| &self.slice[s..e]) } } impl<'a> Iterator for SubtagIterator<'a> { type Item = &'a [u8]; fn next(&mut self) -> Option { let (s, res) = self.next_manual(); *self = s; #[allow(clippy::indexing_slicing)] // next_manual returns valid indices res.map(|(s, e)| &self.slice[s..e]) } } #[cfg(test)] mod test { use super::*; fn slice_to_str(input: &[u8]) -> &str { std::str::from_utf8(input).unwrap() } #[test] fn subtag_iterator_peek_test() { let slice = "de_at-u-ca-foobar"; let mut si = SubtagIterator::new(slice.as_bytes()); assert_eq!(si.peek().map(slice_to_str), Some("de")); assert_eq!(si.peek().map(slice_to_str), Some("de")); assert_eq!(si.next().map(slice_to_str), Some("de")); assert_eq!(si.peek().map(slice_to_str), Some("at")); assert_eq!(si.peek().map(slice_to_str), Some("at")); assert_eq!(si.next().map(slice_to_str), Some("at")); } #[test] fn subtag_iterator_test() { let slice = ""; let mut si = SubtagIterator::new(slice.as_bytes()); assert_eq!(si.next().map(slice_to_str), Some("")); let slice = "-"; let mut si = SubtagIterator::new(slice.as_bytes()); assert_eq!(si.next().map(slice_to_str), Some("")); let slice = "-en"; let mut si = SubtagIterator::new(slice.as_bytes()); assert_eq!(si.next().map(slice_to_str), Some("")); assert_eq!(si.next().map(slice_to_str), Some("en")); assert_eq!(si.next(), None); let slice = "en"; let si = SubtagIterator::new(slice.as_bytes()); assert_eq!(si.map(slice_to_str).collect::>(), vec!["en",]); let slice = "en-"; let si = SubtagIterator::new(slice.as_bytes()); assert_eq!(si.map(slice_to_str).collect::>(), vec!["en", "",]); let slice = "--"; let mut si = SubtagIterator::new(slice.as_bytes()); assert_eq!(si.next().map(slice_to_str), Some("")); assert_eq!(si.next().map(slice_to_str), Some("")); assert_eq!(si.next().map(slice_to_str), Some("")); assert_eq!(si.next(), None); let slice = "-en-"; let mut si = SubtagIterator::new(slice.as_bytes()); assert_eq!(si.next().map(slice_to_str), Some("")); assert_eq!(si.next().map(slice_to_str), Some("en")); assert_eq!(si.next().map(slice_to_str), Some("")); assert_eq!(si.next(), None); let slice = "de_at-u-ca-foobar"; let si = SubtagIterator::new(slice.as_bytes()); assert_eq!( si.map(slice_to_str).collect::>(), vec!["de", "at", "u", "ca", "foobar",] ); } #[test] fn get_current_subtag_test() { let slice = "-"; let current = get_current_subtag(slice.as_bytes(), 0); assert_eq!(current, (1, 1)); let slice = "-en"; let current = get_current_subtag(slice.as_bytes(), 0); assert_eq!(current, (1, 3)); let slice = "-en-"; let current = get_current_subtag(slice.as_bytes(), 3); assert_eq!(current, (4, 4)); let slice = "en-"; let current = get_current_subtag(slice.as_bytes(), 0); assert_eq!(current, (0, 2)); let current = get_current_subtag(slice.as_bytes(), 2); assert_eq!(current, (3, 3)); let slice = "en--US"; let current = get_current_subtag(slice.as_bytes(), 0); assert_eq!(current, (0, 2)); let current = get_current_subtag(slice.as_bytes(), 2); assert_eq!(current, (3, 3)); let current = get_current_subtag(slice.as_bytes(), 3); assert_eq!(current, (4, 6)); let slice = "--"; let current = get_current_subtag(slice.as_bytes(), 0); assert_eq!(current, (1, 1)); let current = get_current_subtag(slice.as_bytes(), 1); assert_eq!(current, (2, 2)); let slice = "-"; let current = get_current_subtag(slice.as_bytes(), 0); assert_eq!(current, (1, 1)); } }