summary | refs | log | tree | commit | diff | stats
path: root/vendor/icu_locid/src/parser
diff options
context:
space:
mode:
author Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-17 12:19:50 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-17 12:19:50 +0000
commit 2e00214b3efbdfeefaa0fe9e8b8fd519de7adc35 (patch)
tree d325add32978dbdc1db975a438b3a77d571b1ab8 /vendor/icu_locid/src/parser
parent Releasing progress-linux version 1.68.2+dfsg1-1~progress7.99u1. (diff)
download rustc-2e00214b3efbdfeefaa0fe9e8b8fd519de7adc35.tar.xz
rustc-2e00214b3efbdfeefaa0fe9e8b8fd519de7adc35.zip
Merging upstream version 1.69.0+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/icu_locid/src/parser')
-rw-r--r-- vendor/icu_locid/src/parser/errors.rs 16
-rw-r--r-- vendor/icu_locid/src/parser/langid.rs 47
-rw-r--r-- vendor/icu_locid/src/parser/locale.rs 6
-rw-r--r-- vendor/icu_locid/src/parser/mod.rs 231
4 files changed, 229 insertions, 71 deletions
diff --git a/vendor/icu_locid/src/parser/errors.rs b/vendor/icu_locid/src/parser/errors.rs
index a989bcc60..5cbbb2bd4 100644
--- a/vendor/icu_locid/src/parser/errors.rs
+++ b/vendor/icu_locid/src/parser/errors.rs
@@ -48,6 +48,22 @@ pub enum ParserError {
/// ```
#[displaydoc("Invalid extension")]
InvalidExtension,
+
+ /// Duplicated extension.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// use icu::locid::Locale;
+ /// use icu::locid::ParserError;
+ ///
+ /// assert_eq!(
+ /// "und-u-hc-h12-u-ca-calendar".parse::<Locale>(),
+ /// Err(ParserError::DuplicatedExtension)
+ /// );
+ /// ```
+ #[displaydoc("Duplicated extension")]
+ DuplicatedExtension,
}
#[cfg(feature = "std")]
diff --git a/vendor/icu_locid/src/parser/langid.rs b/vendor/icu_locid/src/parser/langid.rs
index 9efa078ac..653ca7e6e 100644
--- a/vendor/icu_locid/src/parser/langid.rs
+++ b/vendor/icu_locid/src/parser/langid.rs
@@ -5,7 +5,7 @@
pub use super::errors::ParserError;
use crate::extensions::unicode::{Attribute, Key, Value};
use crate::extensions::ExtensionType;
-use crate::parser::{get_subtag_iterator, SubtagIterator};
+use crate::parser::SubtagIterator;
use crate::LanguageIdentifier;
use crate::{extensions, subtags};
use alloc::vec::Vec;
@@ -103,7 +103,7 @@ pub fn parse_language_identifier(
t: &[u8],
mode: ParserMode,
) -> Result<LanguageIdentifier, ParserError> {
- let mut iter = get_subtag_iterator(t);
+ let mut iter = SubtagIterator::new(t);
parse_language_identifier_from_iter(&mut iter, mode)
}
@@ -127,9 +127,9 @@ pub const fn parse_locale_with_single_variant_single_keyword_unicode_extension_f
let mut variant = None;
let mut keyword = None;
- if let (i, Some((t, start, end))) = iter.next_manual() {
+ if let (i, Some((start, end))) = iter.next_manual() {
iter = i;
- match subtags::Language::try_from_bytes_manual_slice(t, start, end) {
+ match subtags::Language::try_from_bytes_manual_slice(iter.slice, start, end) {
Ok(l) => language = l,
Err(e) => return Err(e),
}
@@ -139,19 +139,23 @@ pub const fn parse_locale_with_single_variant_single_keyword_unicode_extension_f
let mut position = ParserPosition::Script;
- while let Some((t, start, end)) = iter.peek_manual() {
+ while let Some((start, end)) = iter.peek_manual() {
if !matches!(mode, ParserMode::LanguageIdentifier) && end - start == 1 {
break;
}
if matches!(position, ParserPosition::Script) {
- if let Ok(s) = subtags::Script::try_from_bytes_manual_slice(t, start, end) {
+ if let Ok(s) = subtags::Script::try_from_bytes_manual_slice(iter.slice, start, end) {
script = Some(s);
position = ParserPosition::Region;
- } else if let Ok(r) = subtags::Region::try_from_bytes_manual_slice(t, start, end) {
+ } else if let Ok(r) =
+ subtags::Region::try_from_bytes_manual_slice(iter.slice, start, end)
+ {
region = Some(r);
position = ParserPosition::Variant;
- } else if let Ok(v) = subtags::Variant::try_from_bytes_manual_slice(t, start, end) {
+ } else if let Ok(v) =
+ subtags::Variant::try_from_bytes_manual_slice(iter.slice, start, end)
+ {
// We cannot handle multiple variants in a const context
debug_assert!(variant.is_none());
variant = Some(v);
@@ -162,10 +166,12 @@ pub const fn parse_locale_with_single_variant_single_keyword_unicode_extension_f
return Err(ParserError::InvalidSubtag);
}
} else if matches!(position, ParserPosition::Region) {
- if let Ok(s) = subtags::Region::try_from_bytes_manual_slice(t, start, end) {
+ if let Ok(s) = subtags::Region::try_from_bytes_manual_slice(iter.slice, start, end) {
region = Some(s);
position = ParserPosition::Variant;
- } else if let Ok(v) = subtags::Variant::try_from_bytes_manual_slice(t, start, end) {
+ } else if let Ok(v) =
+ subtags::Variant::try_from_bytes_manual_slice(iter.slice, start, end)
+ {
// We cannot handle multiple variants in a const context
debug_assert!(variant.is_none());
variant = Some(v);
@@ -175,7 +181,8 @@ pub const fn parse_locale_with_single_variant_single_keyword_unicode_extension_f
} else {
return Err(ParserError::InvalidSubtag);
}
- } else if let Ok(v) = subtags::Variant::try_from_bytes_manual_slice(t, start, end) {
+ } else if let Ok(v) = subtags::Variant::try_from_bytes_manual_slice(iter.slice, start, end)
+ {
debug_assert!(matches!(position, ParserPosition::Variant));
if variant.is_some() {
// We cannot handle multiple variants in a const context
@@ -192,12 +199,12 @@ pub const fn parse_locale_with_single_variant_single_keyword_unicode_extension_f
}
if matches!(mode, ParserMode::Locale) {
- if let Some((bytes, start, end)) = iter.peek_manual() {
- match ExtensionType::try_from_bytes_manual_slice(bytes, start, end) {
+ if let Some((start, end)) = iter.peek_manual() {
+ match ExtensionType::try_from_bytes_manual_slice(iter.slice, start, end) {
Ok(ExtensionType::Unicode) => {
iter = iter.next_manual().0;
- if let Some((bytes, start, end)) = iter.peek_manual() {
- if Attribute::try_from_bytes_manual_slice(bytes, start, end).is_ok() {
+ if let Some((start, end)) = iter.peek_manual() {
+ if Attribute::try_from_bytes_manual_slice(iter.slice, start, end).is_ok() {
// We cannot handle Attributes in a const context
return Err(ParserError::InvalidSubtag);
}
@@ -206,19 +213,21 @@ pub const fn parse_locale_with_single_variant_single_keyword_unicode_extension_f
let mut key = None;
let mut current_type = None;
- while let Some((bytes, start, end)) = iter.peek_manual() {
+ while let Some((start, end)) = iter.peek_manual() {
let slen = end - start;
if slen == 2 {
if key.is_some() {
// We cannot handle more than one Key in a const context
return Err(ParserError::InvalidSubtag);
}
- match Key::try_from_bytes_manual_slice(bytes, start, end) {
+ match Key::try_from_bytes_manual_slice(iter.slice, start, end) {
Ok(k) => key = Some(k),
Err(e) => return Err(e),
};
} else if key.is_some() {
- match Value::parse_subtag_from_bytes_manual_slice(bytes, start, end) {
+ match Value::parse_subtag_from_bytes_manual_slice(
+ iter.slice, start, end,
+ ) {
Ok(Some(t)) => {
if current_type.is_some() {
// We cannot handle more than one type in a const context
@@ -261,7 +270,7 @@ pub const fn parse_language_identifier_with_single_variant(
),
ParserError,
> {
- let iter = get_subtag_iterator(t);
+ let iter = SubtagIterator::new(t);
match parse_locale_with_single_variant_single_keyword_unicode_extension_from_iter(iter, mode) {
Ok((l, s, r, v, _)) => Ok((l, s, r, v)),
Err(e) => Err(e),
diff --git a/vendor/icu_locid/src/parser/locale.rs b/vendor/icu_locid/src/parser/locale.rs
index 805b6c290..175fd3a05 100644
--- a/vendor/icu_locid/src/parser/locale.rs
+++ b/vendor/icu_locid/src/parser/locale.rs
@@ -6,13 +6,13 @@ use tinystr::TinyAsciiStr;
use crate::extensions::{self, Extensions};
use crate::parser::errors::ParserError;
-use crate::parser::{get_subtag_iterator, parse_language_identifier_from_iter, ParserMode};
+use crate::parser::{parse_language_identifier_from_iter, ParserMode, SubtagIterator};
use crate::{subtags, Locale};
use super::parse_locale_with_single_variant_single_keyword_unicode_extension_from_iter;
pub fn parse_locale(t: &[u8]) -> Result<Locale, ParserError> {
- let mut iter = get_subtag_iterator(t);
+ let mut iter = SubtagIterator::new(t);
let id = parse_language_identifier_from_iter(&mut iter, ParserMode::Locale)?;
let extensions = if iter.peek().is_some() {
@@ -37,6 +37,6 @@ pub const fn parse_locale_with_single_variant_single_keyword_unicode_keyword_ext
),
ParserError,
> {
- let iter = get_subtag_iterator(t);
+ let iter = SubtagIterator::new(t);
parse_locale_with_single_variant_single_keyword_unicode_extension_from_iter(iter, mode)
}
diff --git a/vendor/icu_locid/src/parser/mod.rs b/vendor/icu_locid/src/parser/mod.rs
index fef10b0ab..4b02f71c9 100644
--- a/vendor/icu_locid/src/parser/mod.rs
+++ b/vendor/icu_locid/src/parser/mod.rs
@@ -17,72 +17,93 @@ pub use locale::{
parse_locale, parse_locale_with_single_variant_single_keyword_unicode_keyword_extension,
};
-pub const fn get_subtag_iterator(slice: &[u8]) -> SubtagIterator {
- let mut current_start = 0;
+#[inline]
+const fn is_separator(slice: &[u8], idx: usize) -> bool {
#[allow(clippy::indexing_slicing)]
- while current_start < slice.len()
- && (slice[current_start] == b'-' || slice[current_start] == b'_')
- {
- current_start += 1;
- }
- let mut current_end = current_start;
- #[allow(clippy::indexing_slicing)]
- while current_end < slice.len() && slice[current_end] != b'-' && slice[current_end] != b'_' {
- current_end += 1;
- }
- SubtagIterator {
- slice,
- current_start,
- current_end,
+ let b = slice[idx];
+ b == b'-' || b == b'_'
+}
+
+const fn get_current_subtag(slice: &[u8], idx: usize) -> (usize, usize) {
+ debug_assert!(idx < slice.len());
+
+ // This function is called only with idx == 0 or with idx pointing at a separator.
+ let (start, mut end) = if is_separator(slice, idx) {
+ // If it's a separator, set the start to idx+1 and advance the idx to the next char.
+ (idx + 1, idx + 1)
+ } else {
+ // If it's idx=0, start is 0 and end is set to 1
+ debug_assert!(idx == 0);
+ (0, 1)
+ };
+
+ while end < slice.len() && !is_separator(slice, end) {
+ // Advance until we reach end of slice or a separator.
+ end += 1;
}
+ // Notice: this slice may be empty (start == end) for cases like `"en-"` or `"en--US"`
+ (start, end)
}
+// `SubtagIterator` is a helper iterator for [`LanguageIdentifier`] and [`Locale`] parsing.
+//
+// It is somewhat unusual because of its focus on performance and Rust's limitations on `const`
+// functions.
+//
+// The iterator is eager and fallible allowing it to reject invalid slices such as `"-"`, `"-en"`,
+// `"en-"` etc.
+//
+// The iterator provides methods available for static users - `next_manual` and `peek_manual`,
+// as well as typical `Peekable` iterator APIs - `next` and `peek`.
+//
+// All methods report the subtag as an `Option`, which is `None` once the iterator is exhausted.
#[derive(Copy, Clone, Debug)]
pub struct SubtagIterator<'a> {
- slice: &'a [u8],
- current_start: usize,
- current_end: usize,
+ pub slice: &'a [u8],
+ done: bool,
+ // done + subtag is faster than Option<(usize, usize)>
+ // at the time of writing.
+ subtag: (usize, usize),
}
-pub type ManualSlice<'a> = (&'a [u8], usize, usize);
-
impl<'a> SubtagIterator<'a> {
- pub const fn next_manual(mut self) -> (Self, Option<ManualSlice<'a>>) {
- if self.current_start == self.current_end {
- (self, None)
+ pub const fn new(slice: &'a [u8]) -> Self {
+ let subtag = if slice.is_empty() || is_separator(slice, 0) {
+ // Start with the empty subtag (0, 0), so the first `next` yields Some(b"") for slices like `""`, `"-en"` or `"-"`
+ (0, 0)
} else {
- let r = (self.slice, self.current_start, self.current_end);
- self.current_start = self.current_end;
- #[allow(clippy::indexing_slicing)]
- while self.current_start < self.slice.len()
- && (self.slice[self.current_start] == b'-'
- || self.slice[self.current_start] == b'_')
- {
- self.current_start += 1;
- }
- self.current_end = self.current_start;
- #[allow(clippy::indexing_slicing)]
- while self.current_end < self.slice.len()
- && self.slice[self.current_end] != b'-'
- && self.slice[self.current_end] != b'_'
- {
- self.current_end += 1;
- }
- (self, Some(r))
+ get_current_subtag(slice, 0)
+ };
+ Self {
+ slice,
+ done: false,
+ subtag,
}
}
- pub const fn peek_manual(&self) -> Option<ManualSlice<'a>> {
- if self.current_start == self.current_end {
- None
+ pub const fn next_manual(mut self) -> (Self, Option<(usize, usize)>) {
+ if self.done {
+ return (self, None);
+ }
+ let result = self.subtag;
+ if result.1 < self.slice.len() {
+ self.subtag = get_current_subtag(self.slice, result.1);
} else {
- Some((self.slice, self.current_start, self.current_end))
+ self.done = true;
}
+ (self, Some(result))
+ }
+
+ pub const fn peek_manual(&self) -> Option<(usize, usize)> {
+ if self.done {
+ return None;
+ }
+ Some(self.subtag)
}
pub fn peek(&self) -> Option<&'a [u8]> {
#[allow(clippy::indexing_slicing)] // peek_manual returns valid indices
- self.peek_manual().map(|(t, s, e)| &t[s..e])
+ self.peek_manual().map(|(s, e)| &self.slice[s..e])
}
}
@@ -91,8 +112,120 @@ impl<'a> Iterator for SubtagIterator<'a> {
fn next(&mut self) -> Option<Self::Item> {
let (s, res) = self.next_manual();
- self.clone_from(&s);
+ *self = s;
#[allow(clippy::indexing_slicing)] // next_manual returns valid indices
- res.map(|(t, s, e)| &t[s..e])
+ res.map(|(s, e)| &self.slice[s..e])
+ }
+}
+
+#[cfg(test)]
+mod test {
+ use super::*;
+
+ fn slice_to_str(input: &[u8]) -> &str {
+ std::str::from_utf8(input).unwrap()
+ }
+
+ #[test]
+ fn subtag_iterator_peek_test() {
+ let slice = "de_at-u-ca-foobar";
+ let mut si = SubtagIterator::new(slice.as_bytes());
+
+ assert_eq!(si.peek().map(slice_to_str), Some("de"));
+ assert_eq!(si.peek().map(slice_to_str), Some("de"));
+ assert_eq!(si.next().map(slice_to_str), Some("de"));
+
+ assert_eq!(si.peek().map(slice_to_str), Some("at"));
+ assert_eq!(si.peek().map(slice_to_str), Some("at"));
+ assert_eq!(si.next().map(slice_to_str), Some("at"));
+ }
+
+ #[test]
+ fn subtag_iterator_test() {
+ let slice = "";
+ let mut si = SubtagIterator::new(slice.as_bytes());
+ assert_eq!(si.next().map(slice_to_str), Some(""));
+
+ let slice = "-";
+ let mut si = SubtagIterator::new(slice.as_bytes());
+ assert_eq!(si.next().map(slice_to_str), Some(""));
+
+ let slice = "-en";
+ let mut si = SubtagIterator::new(slice.as_bytes());
+ assert_eq!(si.next().map(slice_to_str), Some(""));
+ assert_eq!(si.next().map(slice_to_str), Some("en"));
+ assert_eq!(si.next(), None);
+
+ let slice = "en";
+ let si = SubtagIterator::new(slice.as_bytes());
+ assert_eq!(si.map(slice_to_str).collect::<Vec<_>>(), vec!["en",]);
+
+ let slice = "en-";
+ let si = SubtagIterator::new(slice.as_bytes());
+ assert_eq!(si.map(slice_to_str).collect::<Vec<_>>(), vec!["en", "",]);
+
+ let slice = "--";
+ let mut si = SubtagIterator::new(slice.as_bytes());
+ assert_eq!(si.next().map(slice_to_str), Some(""));
+ assert_eq!(si.next().map(slice_to_str), Some(""));
+ assert_eq!(si.next().map(slice_to_str), Some(""));
+ assert_eq!(si.next(), None);
+
+ let slice = "-en-";
+ let mut si = SubtagIterator::new(slice.as_bytes());
+ assert_eq!(si.next().map(slice_to_str), Some(""));
+ assert_eq!(si.next().map(slice_to_str), Some("en"));
+ assert_eq!(si.next().map(slice_to_str), Some(""));
+ assert_eq!(si.next(), None);
+
+ let slice = "de_at-u-ca-foobar";
+ let si = SubtagIterator::new(slice.as_bytes());
+ assert_eq!(
+ si.map(slice_to_str).collect::<Vec<_>>(),
+ vec!["de", "at", "u", "ca", "foobar",]
+ );
+ }
+
+ #[test]
+ fn get_current_subtag_test() {
+ let slice = "-";
+ let current = get_current_subtag(slice.as_bytes(), 0);
+ assert_eq!(current, (1, 1));
+
+ let slice = "-en";
+ let current = get_current_subtag(slice.as_bytes(), 0);
+ assert_eq!(current, (1, 3));
+
+ let slice = "-en-";
+ let current = get_current_subtag(slice.as_bytes(), 3);
+ assert_eq!(current, (4, 4));
+
+ let slice = "en-";
+ let current = get_current_subtag(slice.as_bytes(), 0);
+ assert_eq!(current, (0, 2));
+
+ let current = get_current_subtag(slice.as_bytes(), 2);
+ assert_eq!(current, (3, 3));
+
+ let slice = "en--US";
+ let current = get_current_subtag(slice.as_bytes(), 0);
+ assert_eq!(current, (0, 2));
+
+ let current = get_current_subtag(slice.as_bytes(), 2);
+ assert_eq!(current, (3, 3));
+
+ let current = get_current_subtag(slice.as_bytes(), 3);
+ assert_eq!(current, (4, 6));
+
+ let slice = "--";
+ let current = get_current_subtag(slice.as_bytes(), 0);
+ assert_eq!(current, (1, 1));
+
+ let current = get_current_subtag(slice.as_bytes(), 1);
+ assert_eq!(current, (2, 2));
+
+ let slice = "-";
+ let current = get_current_subtag(slice.as_bytes(), 0);
+ assert_eq!(current, (1, 1));
}
}