summary | refs | log | tree | commit | diff | stats
path: root/vendor/icu_list/src
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/icu_list/src')
-rw-r--r-- vendor/icu_list/src/lazy_automaton.rs 79
-rw-r--r-- vendor/icu_list/src/lib.rs 3
-rw-r--r-- vendor/icu_list/src/list_formatter.rs 43
-rw-r--r-- vendor/icu_list/src/patterns.rs 283
-rw-r--r-- vendor/icu_list/src/provider.rs 465
-rw-r--r-- vendor/icu_list/src/provider/mod.rs 261
-rw-r--r-- vendor/icu_list/src/provider/serde_dfa.rs 244
-rw-r--r-- vendor/icu_list/src/string_matcher.rs 213
8 files changed, 909 insertions, 682 deletions
diff --git a/vendor/icu_list/src/lazy_automaton.rs b/vendor/icu_list/src/lazy_automaton.rs
new file mode 100644
index 000000000..3431b3c9d
--- /dev/null
+++ b/vendor/icu_list/src/lazy_automaton.rs
@@ -0,0 +1,79 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+use regex_automata::dfa::sparse::DFA;
+use regex_automata::dfa::Automaton;
+use regex_automata::util::id::StateID;
+use writeable::Writeable;
+
+pub trait LazyAutomaton: Automaton {
+ // Like Automaton::find_earliest_fwd, but only reports whether a match exists and takes the haystack as a Writeable instead of a materialized string.
+ fn matches_earliest_fwd_lazy<S: Writeable + ?Sized>(&self, haystack: &S) -> bool;
+}
+
+impl<T: AsRef<[u8]>> LazyAutomaton for DFA<T> {
+ fn matches_earliest_fwd_lazy<S: Writeable + ?Sized>(&self, haystack: &S) -> bool {
+ struct DFAStepper<'a> { // fmt::Write sink that steps the DFA over every byte written to it
+ dfa: &'a DFA<&'a [u8]>,
+ state: StateID,
+ }
+
+ impl core::fmt::Write for DFAStepper<'_> {
+ fn write_str(&mut self, s: &str) -> core::fmt::Result {
+ for &byte in s.as_bytes() {
+ self.state = self.dfa.next_state(self.state, byte);
+ if self.dfa.is_match_state(self.state) || self.dfa.is_dead_state(self.state) {
+ // We matched or are in a no-match-cycle: the answer is known, so abort the write early via Err
+ return Err(core::fmt::Error);
+ }
+ }
+ Ok(())
+ }
+ }
+
+ let mut stepper = DFAStepper {
+ // If start == 0 the start state does not depend on the actual string, so
+ // we can just pass an empty slice.
+ state: self.start_state_forward(None, &[], 0, 0),
+ dfa: &self.as_ref(),
+ };
+
+ if haystack.write_to(&mut stepper).is_ok() {
+ stepper.state = self.next_eoi_state(stepper.state); // no early exit happened, so apply the end-of-input transition
+ }
+
+ self.is_match_state(stepper.state) // after an early exit this inspects the state that triggered it (match or dead)
+ }
+}
+
+#[cfg(test)]
+#[test]
+fn test() {
+ use crate::provider::SerdeDFA;
+ use alloc::borrow::Cow;
+
+ let matcher = SerdeDFA::new(Cow::Borrowed("11(000)*$")).unwrap();
+
+ for writeable in [1i32, 11, 110, 11000, 211000] { // lazy matching must agree with find_earliest_fwd on the materialized string
+ assert_eq!(
+ matcher
+ .deref()
+ .find_earliest_fwd(writeable.write_to_string().as_bytes())
+ .unwrap()
+ .is_some(),
+ matcher.deref().matches_earliest_fwd_lazy(&writeable)
+ );
+ }
+
+ struct ExitEarlyTest; // Writeable that panics if writing continues after its first write_str
+
+ impl writeable::Writeable for ExitEarlyTest {
+ fn write_to<W: core::fmt::Write + ?Sized>(&self, sink: &mut W) -> core::fmt::Result {
+ sink.write_str("12")?; // must return Err (early exit), otherwise the next line panics
+ unreachable!()
+ }
+ }
+
+ assert!(!matcher.deref().matches_earliest_fwd_lazy(&ExitEarlyTest));
+}
diff --git a/vendor/icu_list/src/lib.rs b/vendor/icu_list/src/lib.rs
index 18f2156a6..61aec0fa3 100644
--- a/vendor/icu_list/src/lib.rs
+++ b/vendor/icu_list/src/lib.rs
@@ -93,8 +93,9 @@
extern crate alloc;
mod error;
+mod lazy_automaton;
mod list_formatter;
-mod string_matcher;
+mod patterns;
pub mod provider;
diff --git a/vendor/icu_list/src/list_formatter.rs b/vendor/icu_list/src/list_formatter.rs
index 36f5fbb7b..93f035eab 100644
--- a/vendor/icu_list/src/list_formatter.rs
+++ b/vendor/icu_list/src/list_formatter.rs
@@ -72,8 +72,39 @@ impl ListFormatter {
);
/// Returns a [`Writeable`] composed of the input [`Writeable`]s and the language-dependent
- /// formatting. The first layer of parts contains [`parts::ELEMENT`] for input
- /// elements, and [`parts::LITERAL`] for list literals.
+ /// formatting.
+ ///
+ /// The [`Writeable`] is annotated with [`parts::ELEMENT`] for input elements,
+ /// and [`parts::LITERAL`] for list literals.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use icu::list::*;
+ /// # use icu::locid::locale;
+ /// # use writeable::*;
+ /// let formatteur = ListFormatter::try_new_and_with_length_unstable(
+ /// &icu_testdata::unstable(),
+ /// &locale!("fr").into(),
+ /// ListLength::Wide,
+ /// )
+ /// .unwrap();
+ /// let pays = ["Italie", "France", "Espagne", "Allemagne"];
+ ///
+ /// assert_writeable_parts_eq!(
+ /// formatteur.format(pays.iter()),
+ /// "Italie, France, Espagne et Allemagne",
+ /// [
+ /// (0, 6, parts::ELEMENT),
+ /// (6, 8, parts::LITERAL),
+ /// (8, 14, parts::ELEMENT),
+ /// (14, 16, parts::LITERAL),
+ /// (16, 23, parts::ELEMENT),
+ /// (23, 27, parts::LITERAL),
+ /// (27, 36, parts::ELEMENT),
+ /// ]
+ /// );
+ /// ```
pub fn format<'a, W: Writeable + 'a, I: Iterator<Item = W> + Clone + 'a>(
&'a self,
values: I,
@@ -99,6 +130,9 @@ pub mod parts {
use writeable::Part;
/// The [`Part`] used by [`FormattedList`](super::FormattedList) to mark the part of the string that is an element.
+ ///
+ /// * `category`: `"list"`
+ /// * `value`: `"element"`
pub const ELEMENT: Part = Part {
category: "list",
value: "element",
@@ -106,6 +140,9 @@ pub mod parts {
/// The [`Part`] used by [`FormattedList`](super::FormattedList) to mark the part of the string that is a list literal,
/// such as ", " or " and ".
+ ///
+ /// * `category`: `"list"`
+ /// * `value`: `"literal"`
pub const LITERAL: Part = Part {
category: "list",
value: "literal",
@@ -234,7 +271,7 @@ mod tests {
fn formatter(length: ListLength) -> ListFormatter {
ListFormatter {
- data: DataPayload::from_owned(crate::provider::test::test_patterns()),
+ data: DataPayload::from_owned(crate::patterns::test::test_patterns()),
length,
}
}
diff --git a/vendor/icu_list/src/patterns.rs b/vendor/icu_list/src/patterns.rs
new file mode 100644
index 000000000..8cfcb98c1
--- /dev/null
+++ b/vendor/icu_list/src/patterns.rs
@@ -0,0 +1,283 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+use crate::lazy_automaton::LazyAutomaton;
+use crate::provider::*;
+use crate::ListLength;
+#[cfg(feature = "datagen")]
+use alloc::borrow::Cow;
+#[cfg(feature = "datagen")]
+use icu_provider::DataError;
+use writeable::{LengthHint, Writeable};
+
+impl<'data> ListFormatterPatternsV1<'data> {
+ /// Creates a new [`ListFormatterPatternsV1`] from the given patterns. Fails if any pattern is invalid.
+ ///
+ /// See [`ListJoinerPattern::from_str`]. `allow_prefix` will be true for `start` and `pair` patterns,
+ /// `allow_suffix` for `end` and `pair` patterns.
+ #[cfg(feature = "datagen")]
+ pub fn try_new(
+ [start, middle, end, pair, short_start, short_middle, short_end, short_pair, narrow_start, narrow_middle, narrow_end, narrow_pair]: [&str; 12],
+ ) -> Result<Self, DataError> {
+ Ok(Self([
+ ListJoinerPattern::from_str(start, true, false)?.into(),
+ ListJoinerPattern::from_str(middle, false, false)?.into(),
+ ListJoinerPattern::from_str(end, false, true)?.into(),
+ ListJoinerPattern::from_str(pair, true, true)?.into(),
+ ListJoinerPattern::from_str(short_start, true, false)?.into(),
+ ListJoinerPattern::from_str(short_middle, false, false)?.into(),
+ ListJoinerPattern::from_str(short_end, false, true)?.into(),
+ ListJoinerPattern::from_str(short_pair, true, true)?.into(),
+ ListJoinerPattern::from_str(narrow_start, true, false)?.into(),
+ ListJoinerPattern::from_str(narrow_middle, false, false)?.into(),
+ ListJoinerPattern::from_str(narrow_end, false, true)?.into(),
+ ListJoinerPattern::from_str(narrow_pair, true, true)?.into(),
+ ]))
+ }
+
+ /// Adds a special case to all `pattern`s that will evaluate to
+ /// `alternative_pattern` when `regex` matches the following element.
+ /// The regex is interpreted case-insensitively and anchored to the beginning, but
+ /// to improve efficiency does not search for full matches. If a full match is
+ /// required, use `$`.
+ #[cfg(feature = "datagen")]
+ pub fn make_conditional(
+ &mut self,
+ pattern: &str,
+ regex: &SerdeDFA<'static>,
+ alternative_pattern: &str,
+ ) -> Result<(), DataError> {
+ let old = ListJoinerPattern::from_str(pattern, true, true)?;
+ for i in 0..12 {
+ #[allow(clippy::indexing_slicing)] // self.0 is &[_; 12]
+ if self.0[i].default == old {
+ self.0[i].special_case = Some(SpecialCasePattern {
+ condition: regex.clone(),
+ pattern: ListJoinerPattern::from_str(
+ alternative_pattern,
+ i % 4 == 0 || i % 4 == 3, // allow_prefix = start or pair
+ i % 4 == 2 || i % 4 == 3, // allow_suffix = end or pair
+ )?,
+ });
+ }
+ }
+ Ok(())
+ }
+
+ /// The range of the number of bytes required by the list literals to join a
+ /// list of length `len`. If none of the patterns are conditional, this is exact.
+ pub(crate) fn size_hint(&self, style: ListLength, len: usize) -> LengthHint {
+ match len {
+ 0 | 1 => LengthHint::exact(0),
+ 2 => self.pair(style).size_hint(),
+ n => { // n >= 3: one start, n - 3 middle and one end joiner
+ self.start(style).size_hint()
+ + self.middle(style).size_hint() * (n - 3)
+ + self.end(style).size_hint()
+ }
+ }
+ }
+}
+
+type PatternParts<'a> = (&'a str, &'a str, &'a str); // (text before {0}, text between {0} and {1}, text after {1})
+
+impl<'a> ConditionalListJoinerPattern<'a> {
+ pub(crate) fn parts<'b, W: Writeable + ?Sized>(
+ &'a self,
+ following_value: &'b W,
+ ) -> PatternParts<'a> {
+ match &self.special_case {
+ Some(SpecialCasePattern { condition, pattern })
+ if condition.deref().matches_earliest_fwd_lazy(following_value) =>
+ {
+ pattern.borrow_tuple() // the condition matched the following element: use the special-case pattern
+ }
+ _ => self.default.borrow_tuple(), // no special case, or its condition did not match
+ }
+ }
+
+ /// The expected length of this pattern; the special case's length, if any, is merged in with `|=`
+ fn size_hint(&'a self) -> LengthHint {
+ let mut hint = self.default.size_hint();
+ if let Some(special_case) = &self.special_case {
+ hint |= special_case.pattern.size_hint()
+ }
+ hint
+ }
+}
+
+impl<'data> ListJoinerPattern<'data> {
+ /// Construct the pattern from a CLDR pattern string containing `{0}` before `{1}`; text before `{0}` / after `{1}` is only accepted if `allow_prefix` / `allow_suffix` is set
+ #[cfg(feature = "datagen")]
+ pub fn from_str(
+ pattern: &str,
+ allow_prefix: bool,
+ allow_suffix: bool,
+ ) -> Result<Self, DataError> {
+ match (pattern.find("{0}"), pattern.find("{1}")) {
+ (Some(index_0), Some(index_1))
+ if index_0 < index_1
+ && (allow_prefix || index_0 == 0)
+ && (allow_suffix || index_1 == pattern.len() - 3) =>
+ {
+ if (index_0 > 0 && !cfg!(test)) || index_1 - 3 >= 256 { // a prefix is only accepted in test builds; index_1 - 3 must fit into the u8 field
+ return Err(DataError::custom(
+ "Found valid pattern that cannot be stored in ListFormatterPatternsV1",
+ )
+ .with_debug_context(pattern));
+ }
+ #[allow(clippy::indexing_slicing)] // all indices are derived from the `find` results above
+ Ok(ListJoinerPattern {
+ string: Cow::Owned(alloc::format!(
+ "{}{}{}",
+ &pattern[0..index_0],
+ &pattern[index_0 + 3..index_1],
+ &pattern[index_1 + 3..]
+ )),
+ index_0: index_0 as u8,
+ index_1: (index_1 - 3) as u8, // position of {1} once the three bytes of "{0}" are removed
+ })
+ }
+ _ => Err(DataError::custom("Invalid list pattern").with_debug_context(pattern)),
+ }
+ }
+
+ fn borrow_tuple(&'data self) -> PatternParts<'data> { // splits the stored string at index_0 and index_1
+ #![allow(clippy::indexing_slicing)] // by invariant
+ let index_0 = self.index_0 as usize;
+ let index_1 = self.index_1 as usize;
+ (
+ &self.string[0..index_0],
+ &self.string[index_0..index_1],
+ &self.string[index_1..],
+ )
+ }
+
+ fn size_hint(&self) -> LengthHint {
+ LengthHint::exact(self.string.len())
+ }
+}
+
+#[cfg(feature = "datagen")]
+impl<'data> From<ListJoinerPattern<'data>> for ConditionalListJoinerPattern<'data> {
+ fn from(default: ListJoinerPattern<'data>) -> Self {
+ Self {
+ default,
+ special_case: None, // a plain pattern starts without a conditional alternative
+ }
+ }
+}
+
+#[cfg(all(test, feature = "datagen"))]
+pub mod test {
+ use super::*;
+
+ pub fn test_patterns() -> ListFormatterPatternsV1<'static> {
+ let mut patterns = ListFormatterPatternsV1::try_new([
+ // Wide: general
+ "@{0}:{1}",
+ "{0},{1}",
+ "{0}.{1}!",
+ "${0};{1}+",
+ // Short: different pattern lengths
+ "{0}1{1}",
+ "{0}12{1}",
+ "{0}12{1}34",
+ "{0}123{1}456",
+ // Narrow: conditionals
+ "{0}: {1}",
+ "{0}, {1}",
+ "{0}. {1}",
+ "{0}. {1}",
+ ])
+ .unwrap();
+ patterns
+ .make_conditional( // affects narrow end and narrow pair, whose default is "{0}. {1}"
+ "{0}. {1}",
+ &SerdeDFA::new(Cow::Borrowed("A")).unwrap(),
+ "{0} :o {1}",
+ )
+ .unwrap();
+ patterns
+ }
+
+ #[test]
+ fn rejects_bad_patterns() {
+ assert!(ListJoinerPattern::from_str("{0} and", true, true).is_err());
+ assert!(ListJoinerPattern::from_str("and {1}", true, true).is_err());
+ assert!(ListJoinerPattern::from_str("{1} and {0}", true, true).is_err());
+ assert!(ListJoinerPattern::from_str("{1{0}}", true, true).is_err());
+ assert!(ListJoinerPattern::from_str("{0\u{202e}} and {1}", true, true).is_err());
+ assert!(ListJoinerPattern::from_str("{{0}} {{1}}", true, true).is_ok()); // "{0}" and "{1}" are still found inside the extra braces
+
+ assert!(ListJoinerPattern::from_str("{0} and {1} ", true, true).is_ok());
+ assert!(ListJoinerPattern::from_str("{0} and {1} ", true, false).is_err());
+ assert!(ListJoinerPattern::from_str(" {0} and {1}", true, true).is_ok());
+ assert!(ListJoinerPattern::from_str(" {0} and {1}", false, true).is_err());
+ }
+
+ #[test]
+ fn produces_correct_parts() {
+ assert_eq!(
+ test_patterns().pair(ListLength::Wide).parts(""),
+ ("$", ";", "+")
+ );
+ }
+
+ #[test]
+ fn produces_correct_parts_conditionally() {
+ assert_eq!(
+ test_patterns().end(ListLength::Narrow).parts("A"),
+ ("", " :o ", "")
+ );
+ assert_eq!(
+ test_patterns().end(ListLength::Narrow).parts("a"), // lower case also triggers the special case
+ ("", " :o ", "")
+ );
+ assert_eq!(
+ test_patterns().end(ListLength::Narrow).parts("ab"), // a matching prefix is enough, no full match needed
+ ("", " :o ", "")
+ );
+ assert_eq!(
+ test_patterns().end(ListLength::Narrow).parts("B"),
+ ("", ". ", "")
+ );
+ assert_eq!(
+ test_patterns().end(ListLength::Narrow).parts("BA"), // an "A" that is not at the beginning does not trigger it
+ ("", ". ", "")
+ );
+ }
+
+ #[test]
+ fn size_hint_works() {
+ let pattern = test_patterns();
+
+ assert_eq!(
+ pattern.size_hint(ListLength::Short, 0),
+ LengthHint::exact(0)
+ );
+ assert_eq!(
+ pattern.size_hint(ListLength::Short, 1),
+ LengthHint::exact(0)
+ );
+
+ // pair pattern "{0}123{1}456"
+ assert_eq!(
+ pattern.size_hint(ListLength::Short, 2),
+ LengthHint::exact(6)
+ );
+
+ // patterns "{0}1{1}", "{0}12{1}" (x197), and "{0}12{1}34"
+ assert_eq!(
+ pattern.size_hint(ListLength::Short, 200),
+ LengthHint::exact(1 + 2 * 197 + 4)
+ );
+
+ // patterns "{0}: {1}", "{0}, {1}" (x197), and "{0} :o {1}" or "{0}. {1}"
+ assert_eq!(
+ pattern.size_hint(ListLength::Narrow, 200),
+ LengthHint::exact(2 + 197 * 2) + LengthHint::between(2, 4)
+ );
+ }
+}
diff --git a/vendor/icu_list/src/provider.rs b/vendor/icu_list/src/provider.rs
deleted file mode 100644
index 27f3e4fec..000000000
--- a/vendor/icu_list/src/provider.rs
+++ /dev/null
@@ -1,465 +0,0 @@
-// This file is part of ICU4X. For terms of use, please see the file
-// called LICENSE at the top level of the ICU4X source tree
-// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
-
-// Provider structs must be stable
-#![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)]
-
-//! Data provider struct definitions for this ICU4X component.
-//!
-//! Read more about data providers: [`icu_provider`]
-
-use crate::ListLength;
-use alloc::borrow::Cow;
-use icu_provider::DataMarker;
-use icu_provider::{yoke, zerofrom};
-use writeable::{LengthHint, Writeable};
-
-pub use crate::string_matcher::StringMatcher;
-
-/// Symbols and metadata required for [`ListFormatter`](crate::ListFormatter).
-#[icu_provider::data_struct(
- AndListV1Marker = "list/and@1",
- OrListV1Marker = "list/or@1",
- UnitListV1Marker = "list/unit@1"
-)]
-#[derive(Clone, Debug)]
-#[cfg_attr(
- feature = "datagen",
- derive(serde::Serialize, databake::Bake),
- databake(path = icu_list::provider),
-)]
-pub struct ListFormatterPatternsV1<'data>(
- #[cfg_attr(feature = "datagen", serde(with = "deduplicating_array"))]
- /// The patterns in the order start, middle, end, pair, short_start, short_middle,
- /// short_end, short_pair, narrow_start, narrow_middle, narrow_end, narrow_pair,
- pub [ConditionalListJoinerPattern<'data>; 12],
-);
-
-#[cfg(feature = "serde")]
-impl<'de> serde::Deserialize<'de> for ListFormatterPatternsV1<'de> {
- fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
- where
- D: serde::de::Deserializer<'de>,
- {
- #[cfg(not(feature = "serde_human"))]
- if deserializer.is_human_readable() {
- use serde::de::Error;
- return Err(D::Error::custom(
- "Deserializing human-readable ListFormatter data requires the 'serde_human' feature",
- ));
- }
-
- Ok(ListFormatterPatternsV1(deduplicating_array::deserialize(
- deserializer,
- )?))
- }
-}
-
-pub(crate) struct ErasedListV1Marker;
-
-impl DataMarker for ErasedListV1Marker {
- type Yokeable = ListFormatterPatternsV1<'static>;
-}
-
-impl<'data> ListFormatterPatternsV1<'data> {
- pub(crate) fn start(&self, style: ListLength) -> &ConditionalListJoinerPattern<'data> {
- #![allow(clippy::indexing_slicing)] // style as usize < 3
- &self.0[4 * (style as usize)]
- }
-
- pub(crate) fn middle(&self, style: ListLength) -> &ConditionalListJoinerPattern<'data> {
- #![allow(clippy::indexing_slicing)] // style as usize < 3
- &self.0[4 * (style as usize) + 1]
- }
-
- pub(crate) fn end(&self, style: ListLength) -> &ConditionalListJoinerPattern<'data> {
- #![allow(clippy::indexing_slicing)] // style as usize < 3
- &self.0[4 * (style as usize) + 2]
- }
-
- pub(crate) fn pair(&self, style: ListLength) -> &ConditionalListJoinerPattern<'data> {
- #![allow(clippy::indexing_slicing)] // style as usize < 3
- &self.0[4 * (style as usize) + 3]
- }
-
- /// The range of the number of bytes required by the list literals to join a
- /// list of length `len`. If none of the patterns are conditional, this is exact.
- pub(crate) fn size_hint(&self, style: ListLength, len: usize) -> LengthHint {
- match len {
- 0 | 1 => LengthHint::exact(0),
- 2 => self.pair(style).size_hint(),
- n => {
- self.start(style).size_hint()
- + self.middle(style).size_hint() * (n - 3)
- + self.end(style).size_hint()
- }
- }
- }
-}
-
-/// A pattern that can behave conditionally on the next element.
-#[derive(Clone, Debug, PartialEq, yoke::Yokeable, zerofrom::ZeroFrom)]
-#[cfg_attr(
- feature = "datagen",
- derive(serde::Serialize, databake::Bake),
- databake(path = icu_list::provider),
-)]
-#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
-pub struct ConditionalListJoinerPattern<'data> {
- /// The default pattern
- #[cfg_attr(feature = "serde", serde(borrow))]
- pub default: ListJoinerPattern<'data>,
- /// And optional special case
- #[cfg_attr(feature = "serde", serde(borrow))]
- pub special_case: Option<SpecialCasePattern<'data>>,
-}
-
-/// The special case of a [`ConditionalListJoinerPattern`]
-#[derive(Clone, Debug, PartialEq, yoke::Yokeable, zerofrom::ZeroFrom)]
-#[cfg_attr(
- feature = "datagen",
- derive(serde::Serialize, databake::Bake),
- databake(path = icu_list::provider),
-)]
-#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
-pub struct SpecialCasePattern<'data> {
- /// The condition on the following element
- #[cfg_attr(feature = "serde", serde(borrow))]
- pub condition: StringMatcher<'data>,
- /// The pattern if the condition matches
- #[cfg_attr(feature = "serde", serde(borrow))]
- pub pattern: ListJoinerPattern<'data>,
-}
-
-/// A pattern containing two numeric placeholders ("{0}, and {1}.")
-#[derive(Clone, Debug, PartialEq, yoke::Yokeable, zerofrom::ZeroFrom)]
-#[cfg_attr(feature = "datagen", derive(serde::Serialize))]
-pub struct ListJoinerPattern<'data> {
- /// The pattern string without the placeholders
- string: Cow<'data, str>,
- /// The index of the first placeholder. Always <= index_1.
- // Always 0 for CLDR data, so we don't need to serialize it.
- // In-memory we have free space for it as index_1 doesn't
- // fill a word.
- #[cfg_attr(feature = "datagen", serde(skip))]
- index_0: u8,
- /// The index of the second placeholder. Always < string.len().
- index_1: u8,
-}
-
-#[cfg(feature = "serde")]
-impl<'de: 'data, 'data> serde::Deserialize<'de> for ListJoinerPattern<'data> {
- fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
- where
- D: serde::Deserializer<'de>,
- {
- #[derive(serde::Deserialize)]
- struct Dummy<'data> {
- #[cfg_attr(feature = "serde", serde(borrow))]
- string: Cow<'data, str>,
- index_1: u8,
- }
- let Dummy { string, index_1 } = Dummy::deserialize(deserializer)?;
-
- if index_1 as usize > string.len() {
- use serde::de::Error;
- Err(D::Error::custom("invalid index_1"))
- } else {
- Ok(ListJoinerPattern {
- string,
- index_0: 0,
- index_1,
- })
- }
- }
-}
-
-impl<'a> ListJoinerPattern<'a> {
- /// Constructs a [`ListJoinerPattern`] from raw parts. Used by databake.
- ///
- /// # Safety
- /// index_1 may be at most string.len()
- pub const unsafe fn from_parts_unchecked(string: &'a str, index_1: u8) -> Self {
- Self {
- string: Cow::Borrowed(string),
- index_0: 0,
- index_1,
- }
- }
-}
-
-pub(crate) type PatternParts<'a> = (&'a str, &'a str, &'a str);
-
-impl<'a> ConditionalListJoinerPattern<'a> {
- pub(crate) fn parts<'b, W: Writeable + ?Sized>(
- &'a self,
- following_value: &'b W,
- ) -> PatternParts<'a> {
- match &self.special_case {
- Some(SpecialCasePattern { condition, pattern })
- // TODO: Implement lookahead instead of materializing here.
- if condition.test(&*following_value.write_to_string()) =>
- {
- pattern.borrow_tuple()
- }
- _ => self.default.borrow_tuple(),
- }
- }
-
- /// The expected length of this pattern
- pub fn size_hint(&'a self) -> LengthHint {
- let mut hint = self.default.size_hint();
- if let Some(special_case) = &self.special_case {
- hint |= special_case.pattern.size_hint()
- }
- hint
- }
-}
-
-impl<'data> ListJoinerPattern<'data> {
- fn borrow_tuple(&'data self) -> PatternParts<'data> {
- #![allow(clippy::indexing_slicing)] // by invariant
- let index_0 = self.index_0 as usize;
- let index_1 = self.index_1 as usize;
- (
- &self.string[0..index_0],
- &self.string[index_0..index_1],
- &self.string[index_1..],
- )
- }
-
- fn size_hint(&self) -> LengthHint {
- LengthHint::exact(self.string.len())
- }
-}
-
-#[cfg(feature = "datagen")]
-mod datagen {
- #![allow(clippy::indexing_slicing)] // datagen
-
- use super::*;
- use icu_provider::DataError;
-
- impl<'data> ListFormatterPatternsV1<'data> {
- /// The patterns in the order start, middle, end, pair, short_start, short_middle,
- /// short_end, short_pair, narrow_start, narrow_middle, narrow_end, narrow_pair,
- pub fn try_new(patterns: [&str; 12]) -> Result<Self, DataError> {
- Ok(Self([
- ListJoinerPattern::from_str(patterns[0], true, false)?.into(),
- ListJoinerPattern::from_str(patterns[1], false, false)?.into(),
- ListJoinerPattern::from_str(patterns[2], false, true)?.into(),
- ListJoinerPattern::from_str(patterns[3], true, true)?.into(),
- ListJoinerPattern::from_str(patterns[4], true, false)?.into(),
- ListJoinerPattern::from_str(patterns[5], false, false)?.into(),
- ListJoinerPattern::from_str(patterns[6], false, true)?.into(),
- ListJoinerPattern::from_str(patterns[7], true, true)?.into(),
- ListJoinerPattern::from_str(patterns[8], true, false)?.into(),
- ListJoinerPattern::from_str(patterns[9], false, false)?.into(),
- ListJoinerPattern::from_str(patterns[10], false, true)?.into(),
- ListJoinerPattern::from_str(patterns[11], true, true)?.into(),
- ]))
- }
-
- /// Adds a special case to all `pattern`s that will evaluate to
- /// `alternative_pattern` when `regex` matches the following element.
- /// The regex is interpreted case-insensitive and anchored to the beginning, but
- /// to improve efficiency does not search for full matches. If a full match is
- /// required, use `$`.
- pub fn make_conditional(
- &mut self,
- pattern: &str,
- regex: &StringMatcher<'static>,
- alternative_pattern: &str,
- ) -> Result<(), DataError> {
- let old = ListJoinerPattern::from_str(pattern, true, true)?;
- for i in 0..12 {
- if self.0[i].default == old {
- self.0[i].special_case = Some(SpecialCasePattern {
- condition: regex.clone(),
- pattern: ListJoinerPattern::from_str(
- alternative_pattern,
- i % 4 == 0 || i % 4 == 3, // allow_prefix = start or pair
- i % 4 == 2 || i % 4 == 3, // allow_suffix = end or pair
- )?,
- });
- }
- }
- Ok(())
- }
- }
-
- impl<'data> ListJoinerPattern<'data> {
- /// Construct the pattern from a CLDR pattern string
- pub fn from_str(
- pattern: &str,
- allow_prefix: bool,
- allow_suffix: bool,
- ) -> Result<Self, DataError> {
- match (pattern.find("{0}"), pattern.find("{1}")) {
- (Some(index_0), Some(index_1))
- if index_0 < index_1
- && (allow_prefix || index_0 == 0)
- && (allow_suffix || index_1 == pattern.len() - 3) =>
- {
- if (index_0 > 0 && !cfg!(test)) || index_1 - 3 >= 256 {
- return Err(DataError::custom(
- "Found valid pattern that cannot be stored in ListFormatterPatternsV1",
- )
- .with_debug_context(pattern));
- }
- Ok(ListJoinerPattern {
- string: Cow::Owned(alloc::format!(
- "{}{}{}",
- &pattern[0..index_0],
- &pattern[index_0 + 3..index_1],
- &pattern[index_1 + 3..]
- )),
- index_0: index_0 as u8,
- index_1: (index_1 - 3) as u8,
- })
- }
- _ => Err(DataError::custom("Invalid list pattern").with_debug_context(pattern)),
- }
- }
- }
-
- impl<'data> From<ListJoinerPattern<'data>> for ConditionalListJoinerPattern<'data> {
- fn from(default: ListJoinerPattern<'data>) -> Self {
- Self {
- default,
- special_case: None,
- }
- }
- }
-
- impl databake::Bake for ListJoinerPattern<'_> {
- fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
- env.insert("icu_list");
- let string = (&*self.string).bake(env);
- let index_1 = self.index_1.bake(env);
- // Safe because our own data is safe
- databake::quote! { unsafe {
- ::icu_list::provider::ListJoinerPattern::from_parts_unchecked(#string, #index_1)
- }}
- }
- }
-}
-
-#[cfg(all(test, feature = "datagen"))]
-pub(crate) mod test {
- use super::*;
-
- pub fn test_patterns() -> ListFormatterPatternsV1<'static> {
- let mut patterns = ListFormatterPatternsV1::try_new([
- // Wide: general
- "@{0}:{1}",
- "{0},{1}",
- "{0}.{1}!",
- "${0};{1}+",
- // Short: different pattern lengths
- "{0}1{1}",
- "{0}12{1}",
- "{0}12{1}34",
- "{0}123{1}456",
- // Narrow: conditionals
- "{0}: {1}",
- "{0}, {1}",
- "{0}. {1}",
- "{0}. {1}",
- ])
- .unwrap();
- patterns
- .make_conditional("{0}. {1}", &StringMatcher::new("A").unwrap(), "{0} :o {1}")
- .unwrap();
- patterns
- }
-
- #[test]
- fn rejects_bad_patterns() {
- assert!(ListJoinerPattern::from_str("{0} and", true, true).is_err());
- assert!(ListJoinerPattern::from_str("and {1}", true, true).is_err());
- assert!(ListJoinerPattern::from_str("{1} and {0}", true, true).is_err());
- assert!(ListJoinerPattern::from_str("{1{0}}", true, true).is_err());
- assert!(ListJoinerPattern::from_str("{0\u{202e}} and {1}", true, true).is_err());
- assert!(ListJoinerPattern::from_str("{{0}} {{1}}", true, true).is_ok());
-
- assert!(ListJoinerPattern::from_str("{0} and {1} ", true, true).is_ok());
- assert!(ListJoinerPattern::from_str("{0} and {1} ", true, false).is_err());
- assert!(ListJoinerPattern::from_str(" {0} and {1}", true, true).is_ok());
- assert!(ListJoinerPattern::from_str(" {0} and {1}", false, true).is_err());
- }
-
- #[test]
- fn produces_correct_parts() {
- assert_eq!(
- test_patterns().pair(ListLength::Wide).parts(""),
- ("$", ";", "+")
- );
- }
-
- #[test]
- fn produces_correct_parts_conditionally() {
- assert_eq!(
- test_patterns().end(ListLength::Narrow).parts("A"),
- ("", " :o ", "")
- );
- assert_eq!(
- test_patterns().end(ListLength::Narrow).parts("a"),
- ("", " :o ", "")
- );
- assert_eq!(
- test_patterns().end(ListLength::Narrow).parts("ab"),
- ("", " :o ", "")
- );
- assert_eq!(
- test_patterns().end(ListLength::Narrow).parts("B"),
- ("", ". ", "")
- );
- assert_eq!(
- test_patterns().end(ListLength::Narrow).parts("BA"),
- ("", ". ", "")
- );
- }
-
- #[test]
- fn size_hint_works() {
- let pattern = test_patterns();
-
- assert_eq!(
- pattern.size_hint(ListLength::Short, 0),
- LengthHint::exact(0)
- );
- assert_eq!(
- pattern.size_hint(ListLength::Short, 1),
- LengthHint::exact(0)
- );
-
- // pair pattern "{0}123{1}456"
- assert_eq!(
- pattern.size_hint(ListLength::Short, 2),
- LengthHint::exact(6)
- );
-
- // patterns "{0}1{1}", "{0}12{1}" (x197), and "{0}12{1}34"
- assert_eq!(
- pattern.size_hint(ListLength::Short, 200),
- LengthHint::exact(1 + 2 * 197 + 4)
- );
-
- // patterns "{0}: {1}", "{0}, {1}" (x197), and "{0} :o {1}" or "{0}. {1}"
- assert_eq!(
- pattern.size_hint(ListLength::Narrow, 200),
- LengthHint::exact(2 + 197 * 2) + LengthHint::between(2, 4)
- );
- }
-
- #[test]
- fn databake() {
- databake::test_bake!(
- ListJoinerPattern,
- const: unsafe { crate::provider::ListJoinerPattern::from_parts_unchecked(", ", 2u8) },
- icu_list
- );
- }
-}
diff --git a/vendor/icu_list/src/provider/mod.rs b/vendor/icu_list/src/provider/mod.rs
new file mode 100644
index 000000000..efab7c8bc
--- /dev/null
+++ b/vendor/icu_list/src/provider/mod.rs
@@ -0,0 +1,261 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+// Provider structs must be stable
+#![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)]
+
+//! 🚧 \[Unstable\] Data provider struct definitions for this ICU4X component.
+//!
+//! <div class="stab unstable">
+//! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
+//! including in SemVer minor releases. While the serde representation of data structs is guaranteed
+//! to be stable, their Rust representation might not be. Use with caution.
+//! </div>
+//!
+//! Read more about data providers: [`icu_provider`]
+
+use crate::ListLength;
+use alloc::borrow::Cow;
+use icu_provider::DataMarker;
+use icu_provider::{yoke, zerofrom};
+
+mod serde_dfa;
+pub use serde_dfa::SerdeDFA;
+
+/// Symbols and metadata required for [`ListFormatter`](crate::ListFormatter).
+///
+/// <div class="stab unstable">
+/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
+/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
+/// to be stable, their Rust representation might not be. Use with caution.
+/// </div>
+#[icu_provider::data_struct(
+ AndListV1Marker = "list/and@1",
+ OrListV1Marker = "list/or@1",
+ UnitListV1Marker = "list/unit@1"
+)]
+#[derive(Clone, Debug)]
+#[cfg_attr(
+ feature = "datagen",
+ derive(serde::Serialize, databake::Bake),
+ databake(path = icu_list::provider),
+)]
+pub struct ListFormatterPatternsV1<'data>(
+ #[cfg_attr(feature = "datagen", serde(with = "deduplicating_array"))]
+ /// The 12 patterns, in the order start, middle, end, pair, short_start, short_middle,
+ /// short_end, short_pair, narrow_start, narrow_middle, narrow_end, narrow_pair.
+ pub [ConditionalListJoinerPattern<'data>; 12],
+);
+
+#[cfg(feature = "serde")]
+impl<'de> serde::Deserialize<'de> for ListFormatterPatternsV1<'de> {
+ fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+ where
+ D: serde::de::Deserializer<'de>,
+ {
+ #[cfg(not(feature = "serde_human"))]
+ if deserializer.is_human_readable() {
+ use serde::de::Error;
+ return Err(D::Error::custom(
+ "Deserializing human-readable ListFormatter data requires the 'serde_human' feature",
+ ));
+ }
+
+ Ok(ListFormatterPatternsV1(deduplicating_array::deserialize(
+ deserializer,
+ )?))
+ }
+}
+
+pub(crate) struct ErasedListV1Marker;
+
+impl DataMarker for ErasedListV1Marker {
+ type Yokeable = ListFormatterPatternsV1<'static>;
+}
+
+impl<'data> ListFormatterPatternsV1<'data> {
+ pub(crate) fn start(&self, style: ListLength) -> &ConditionalListJoinerPattern<'data> {
+ #![allow(clippy::indexing_slicing)] // style as usize < 3, so the index is at most 8 (< 12)
+ &self.0[4 * (style as usize)]
+ }
+
+ pub(crate) fn middle(&self, style: ListLength) -> &ConditionalListJoinerPattern<'data> {
+ #![allow(clippy::indexing_slicing)] // style as usize < 3, so the index is at most 9 (< 12)
+ &self.0[4 * (style as usize) + 1]
+ }
+
+ pub(crate) fn end(&self, style: ListLength) -> &ConditionalListJoinerPattern<'data> {
+ #![allow(clippy::indexing_slicing)] // style as usize < 3, so the index is at most 10 (< 12)
+ &self.0[4 * (style as usize) + 2]
+ }
+
+ pub(crate) fn pair(&self, style: ListLength) -> &ConditionalListJoinerPattern<'data> {
+ #![allow(clippy::indexing_slicing)] // style as usize < 3, so the index is at most 11 (< 12)
+ &self.0[4 * (style as usize) + 3]
+ }
+}
+
+/// A pattern that can behave conditionally on the next element.
+///
+/// <div class="stab unstable">
+/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
+/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
+/// to be stable, their Rust representation might not be. Use with caution.
+/// </div>
+#[derive(Clone, Debug, yoke::Yokeable, zerofrom::ZeroFrom)]
+#[cfg_attr(
+ feature = "datagen",
+ derive(PartialEq, serde::Serialize, databake::Bake),
+ databake(path = icu_list::provider),
+)]
+#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
+pub struct ConditionalListJoinerPattern<'data> {
+ /// The default pattern
+ #[cfg_attr(feature = "serde", serde(borrow))]
+ pub default: ListJoinerPattern<'data>,
+ /// An optional special case
+ #[cfg_attr(
+ feature = "serde",
+ serde(borrow, deserialize_with = "SpecialCasePattern::deserialize_option")
+ )]
+ pub special_case: Option<SpecialCasePattern<'data>>,
+}
+
+/// The special case of a [`ConditionalListJoinerPattern`]
+///
+/// <div class="stab unstable">
+/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
+/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
+/// to be stable, their Rust representation might not be. Use with caution.
+/// </div>
+#[derive(Clone, Debug, yoke::Yokeable, zerofrom::ZeroFrom)]
+#[cfg_attr(
+ feature = "datagen",
+ derive(PartialEq, serde::Serialize, databake::Bake),
+ databake(path = icu_list::provider),
+)]
+pub struct SpecialCasePattern<'data> {
+ /// The condition on the following element
+ pub condition: SerdeDFA<'data>,
+ /// The pattern if the condition matches
+ pub pattern: ListJoinerPattern<'data>,
+}
+
+#[cfg(feature = "serde")]
+impl<'data> SpecialCasePattern<'data> {
+ // Yields `None` if the special case is absent or if its condition deserializes to `None`
+ fn deserialize_option<'de: 'data, D>(deserializer: D) -> Result<Option<Self>, D::Error>
+ where
+ D: serde::de::Deserializer<'de>,
+ {
+ use serde::Deserialize;
+
+ #[derive(Deserialize)]
+ struct SpecialCasePatternOptionalDfa<'data> {
+ #[cfg_attr(
+ feature = "serde",
+ serde(borrow, deserialize_with = "SerdeDFA::maybe_deserialize")
+ )]
+ pub condition: Option<SerdeDFA<'data>>,
+ #[cfg_attr(feature = "serde", serde(borrow))]
+ pub pattern: ListJoinerPattern<'data>,
+ }
+
+ Ok(
+ match Option::<SpecialCasePatternOptionalDfa<'data>>::deserialize(deserializer)? {
+ Some(SpecialCasePatternOptionalDfa {
+ condition: Some(condition),
+ pattern,
+ }) => Some(SpecialCasePattern { condition, pattern }),
+ _ => None,
+ },
+ )
+ }
+}
+
+/// A pattern containing two numeric placeholders ("{0}, and {1}.")
+///
+/// <div class="stab unstable">
+/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
+/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
+/// to be stable, their Rust representation might not be. Use with caution.
+/// </div>
+#[derive(Clone, Debug, PartialEq, yoke::Yokeable, zerofrom::ZeroFrom)]
+#[cfg_attr(feature = "datagen", derive(serde::Serialize))]
+pub struct ListJoinerPattern<'data> {
+ /// The pattern string without the placeholders
+ pub(crate) string: Cow<'data, str>,
+ /// The index of the first placeholder. Always <= index_1.
+ // Always 0 for CLDR data, so we don't need to serialize it.
+ // In-memory we have free space for it as index_1 doesn't
+ // fill a word.
+ #[cfg_attr(feature = "datagen", serde(skip))]
+ pub(crate) index_0: u8,
+ /// The index of the second placeholder. Always <= string.len().
+ pub(crate) index_1: u8,
+}
+
+#[cfg(feature = "serde")]
+impl<'de: 'data, 'data> serde::Deserialize<'de> for ListJoinerPattern<'data> {
+ /// Deserializes `string` and `index_1` (`index_0` is not serialized and is always 0),
+ /// returning an "invalid index_1" error if `index_1` is not a valid split point of `string`.
+ fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
+ // Wire representation: only the fields that are actually serialized.
+ #[derive(serde::Deserialize)]
+ struct Dummy<'data> {
+ #[cfg_attr(feature = "serde", serde(borrow))]
+ string: Cow<'data, str>,
+ index_1: u8,
+ }
+ let Dummy { string, index_1 } = Dummy::deserialize(deserializer)?;
+ // `is_char_boundary` is false both past the end of `string` and inside a multi-byte
+ // character; slicing `string` at such an index would panic, so reject it up front.
+ if !string.is_char_boundary(index_1 as usize) {
+ Err(<D::Error as serde::de::Error>::custom("invalid index_1"))
+ } else {
+ Ok(ListJoinerPattern {
+ string,
+ index_0: 0,
+ index_1,
+ })
+ }
+ }
+}
+
+impl<'a> ListJoinerPattern<'a> {
+ /// Constructs a [`ListJoinerPattern`] from raw parts. Used by databake.
+ ///
+ /// # Safety
+ /// index_1 may be at most string.len()
+ pub const unsafe fn from_parts_unchecked(string: &'a str, index_1: u8) -> Self {
+ Self {
+ string: Cow::Borrowed(string),
+ index_0: 0,
+ index_1,
+ }
+ }
+}
+
+#[cfg(feature = "datagen")]
+impl databake::Bake for ListJoinerPattern<'_> {
+ fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
+ env.insert("icu_list");
+ let string = (&*self.string).bake(env);
+ let index_1 = self.index_1.bake(env);
+ // SAFETY: `string` and `index_1` are copied from `self`, which already upholds the invariant
+ databake::quote! { unsafe {
+ ::icu_list::provider::ListJoinerPattern::from_parts_unchecked(#string, #index_1)
+ }}
+ }
+}
+
+#[cfg(all(test, feature = "datagen"))]
+#[test]
+fn databake() {
+ databake::test_bake!(
+ ListJoinerPattern,
+ const: unsafe { crate::provider::ListJoinerPattern::from_parts_unchecked(", ", 2u8) },
+ icu_list
+ );
+}
diff --git a/vendor/icu_list/src/provider/serde_dfa.rs b/vendor/icu_list/src/provider/serde_dfa.rs
new file mode 100644
index 000000000..e2424e1e9
--- /dev/null
+++ b/vendor/icu_list/src/provider/serde_dfa.rs
@@ -0,0 +1,244 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+use alloc::borrow::Cow;
+use icu_provider::{yoke, zerofrom};
+use regex_automata::dfa::sparse::DFA;
+
+/// A serde-compatible version of [`regex_automata::dfa::sparse::DFA`]. This does not implement
+/// [`serde::Deserialize`] directly, as binary deserialization is not supported on big-endian
+/// platforms. `Self::maybe_deserialize` can be used to deserialize to `Option<SerdeDFA>`.
+///
+/// <div class="stab unstable">
+/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
+/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
+/// to be stable, their Rust representation might not be. Use with caution.
+/// </div>
+#[derive(Clone, Debug, yoke::Yokeable, zerofrom::ZeroFrom)]
+pub struct SerdeDFA<'data> {
+ // Safety: These always represent a valid DFA (DFA::from_bytes(dfa_bytes).is_ok())
+ dfa_bytes: Cow<'data, [u8]>,
+ pattern: Option<Cow<'data, str>>, // source regex; `None` when constructed from raw DFA bytes
+}
+
+#[cfg(feature = "datagen")]
+impl PartialEq for SerdeDFA<'_> {
+ fn eq(&self, other: &Self) -> bool {
+ self.dfa_bytes == other.dfa_bytes
+ }
+}
+
+#[cfg(feature = "datagen")]
+impl databake::Bake for SerdeDFA<'_> {
+ fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
+ env.insert("icu_list");
+ let le_bytes = self.deref().to_bytes_little_endian().as_slice().bake(env);
+ let be_bytes = self.deref().to_bytes_big_endian().as_slice().bake(env);
+ // SAFETY: both arrays come from `to_bytes_little_endian`/`to_bytes_big_endian` on our own DFA.
+ databake::quote! {
+ unsafe {
+ ::icu_list::provider::SerdeDFA::from_dfa_bytes_unchecked(
+ if cfg!(target_endian = "little") {
+ &#le_bytes
+ } else {
+ &#be_bytes
+ }
+ )
+ }
+ }
+ }
+}
+
+#[cfg(feature = "datagen")]
+impl serde::Serialize for SerdeDFA<'_> {
+ fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+ where
+ S: serde::ser::Serializer,
+ {
+ if serializer.is_human_readable() { // human-readable formats store the source regex
+ self.pattern
+ .as_ref()
+ .map(|pattern| pattern.serialize(serializer))
+ .unwrap_or_else(|| {
+ use serde::ser::Error;
+ Err(S::Error::custom(
+ "cannot serialize a deserialized bincode SerdeDFA to JSON",
+ ))
+ })
+ } else { // other formats always store the little-endian DFA bytes
+ self.deref().to_bytes_little_endian().serialize(serializer)
+ }
+ }
+}
+
+#[cfg(feature = "serde")]
+impl<'data> SerdeDFA<'data> {
+ /// Deserializes to `Option<Self>`. Will return `None` for non-human-readable serialization
+ /// formats on big-endian systems, as `regex_automata` serialization is endian-sensitive.
+ pub fn maybe_deserialize<'de: 'data, D>(deserializer: D) -> Result<Option<Self>, D::Error>
+ where
+ D: serde::de::Deserializer<'de>,
+ {
+ use icu_provider::serde::borrow_de_utils::CowBytesWrap;
+ use serde::Deserialize;
+
+ #[cfg(feature = "serde_human")]
+ if deserializer.is_human_readable() { // the payload is the regex source; compile it
+ #[cfg(not(feature = "std"))]
+ use alloc::string::ToString;
+ use serde::de::Error;
+ return SerdeDFA::new(Cow::<str>::deserialize(deserializer)?)
+ .map(Some)
+ .map_err(|e| D::Error::custom(e.to_string()));
+ }
+
+ let dfa_bytes = <CowBytesWrap<'de>>::deserialize(deserializer)?.0;
+
+ if cfg!(target_endian = "big") { // the bytes were read above but are not used here
+ return Ok(None);
+ }
+
+ // Verify the safety invariant of `SerdeDFA`: the bytes must parse as a valid DFA
+ DFA::from_bytes(&dfa_bytes).map_err(|e| {
+ use serde::de::Error;
+ D::Error::custom(alloc::format!("Invalid DFA bytes: {}", e))
+ })?;
+
+ Ok(Some(SerdeDFA {
+ dfa_bytes,
+ pattern: None,
+ }))
+ }
+}
+
+impl<'data> SerdeDFA<'data> {
+ /// Creates a `SerdeDFA` from raw bytes. Used internally by databake.
+ ///
+ /// # Safety
+ ///
+ /// `dfa_bytes` has to be a valid DFA (regex_automata::dfa::sparse::DFA::from_bytes(dfa_bytes).is_ok())
+ pub const unsafe fn from_dfa_bytes_unchecked(dfa_bytes: &'data [u8]) -> Self {
+ Self {
+ dfa_bytes: Cow::Borrowed(dfa_bytes),
+ pattern: None,
+ }
+ }
+
+ /// Creates a `SerdeDFA` from a regex, built as an anchored, case-insensitive, minimized DFA.
+ #[cfg(any(feature = "datagen", feature = "serde_human",))]
+ pub fn new(pattern: Cow<'data, str>) -> Result<Self, icu_provider::DataError> {
+ use regex_automata::{
+ dfa::dense::{Builder, Config},
+ SyntaxConfig,
+ };
+
+ let mut builder = Builder::new();
+ let dfa = builder
+ .syntax(SyntaxConfig::new().case_insensitive(true))
+ .configure(Config::new().anchored(true).minimize(true))
+ .build(&pattern)
+ .map_err(|_| {
+ icu_provider::DataError::custom("Cannot build DFA").with_display_context(&pattern)
+ })?
+ .to_sparse()
+ .map_err(|_| {
+ icu_provider::DataError::custom("Cannot sparsify DFA")
+ .with_display_context(&pattern)
+ })?;
+
+ Ok(Self {
+ dfa_bytes: dfa.to_bytes_native_endian().into(),
+ pattern: Some(pattern), // kept so that human-readable serialization can emit the regex
+ })
+ }
+
+ /// Returns the represented [`DFA`], borrowing the stored bytes.
+ #[allow(clippy::unwrap_used)] // by invariant
+ pub fn deref(&'data self) -> DFA<&'data [u8]> {
+ // SAFETY: by the struct invariant, `dfa_bytes` always holds a valid serialized DFA.
+ unsafe { DFA::from_bytes_unchecked(&self.dfa_bytes).unwrap().0 }
+ }
+}
+
+#[cfg(all(test, feature = "datagen"))]
+mod test {
+ use super::*;
+
+ #[test]
+ fn test_serde_dfa() {
+ use regex_automata::dfa::Automaton;
+
+ let matcher = SerdeDFA::new(Cow::Borrowed("abc")).unwrap();
+
+ assert!(matcher.deref().find_earliest_fwd(b"ab").unwrap().is_none());
+ assert!(matcher.deref().find_earliest_fwd(b"abc").unwrap().is_some());
+ assert!(matcher
+ .deref()
+ .find_earliest_fwd(b"abcde")
+ .unwrap()
+ .is_some());
+ assert!(matcher
+ .deref()
+ .find_earliest_fwd(b" abcde")
+ .unwrap()
+ .is_none());
+ }
+
+ #[derive(serde::Deserialize)]
+ struct OptionSerdeDFA<'data>(
+ #[serde(borrow, deserialize_with = "SerdeDFA::maybe_deserialize")] Option<SerdeDFA<'data>>,
+ );
+
+ #[test]
+ #[cfg(target_endian = "little")]
+ fn test_postcard_serialization() {
+ let matcher = SerdeDFA::new(Cow::Borrowed("abc*")).unwrap();
+
+ let mut bytes = postcard::to_stdvec(&matcher).unwrap();
+ assert_eq!(
+ postcard::from_bytes::<OptionSerdeDFA>(&bytes).unwrap().0,
+ Some(matcher)
+ );
+
+ // A corrupted byte leads to an error
+ bytes[17] ^= 255;
+ assert!(postcard::from_bytes::<OptionSerdeDFA>(&bytes).is_err());
+ bytes[17] ^= 255;
+
+ // An extra byte leads to an error
+ bytes.insert(123, 40);
+ assert!(postcard::from_bytes::<OptionSerdeDFA>(&bytes).is_err());
+ bytes.remove(123);
+
+ // Missing bytes lead to an error
+ assert!(postcard::from_bytes::<OptionSerdeDFA>(&bytes[0..bytes.len() - 5]).is_err());
+ }
+
+ #[test]
+ #[cfg(feature = "serde_human")]
+ fn test_json_serialization() {
+ let matcher = SerdeDFA::new(Cow::Borrowed("abc*")).unwrap();
+
+ let json = serde_json::to_string(&matcher).unwrap();
+ assert_eq!(
+ serde_json::from_str::<OptionSerdeDFA>(&json).unwrap().0,
+ Some(matcher)
+ );
+ assert!(serde_json::from_str::<OptionSerdeDFA>(".*[").is_err());
+ }
+
+ #[test]
+ #[ignore] // https://github.com/rust-lang/rust/issues/98906
+ fn databake() {
+ databake::test_bake!(
+ SerdeDFA,
+ const: unsafe { crate::provider::SerdeDFA::from_dfa_bytes_unchecked(if cfg!(target_endian = "little") {
+ &[1] // TODO: set this when activating the test
+ } else {
+ &[2] // TODO: set this when activating the test
+ })},
+ icu_list
+ );
+ }
+}
diff --git a/vendor/icu_list/src/string_matcher.rs b/vendor/icu_list/src/string_matcher.rs
deleted file mode 100644
index ba4833605..000000000
--- a/vendor/icu_list/src/string_matcher.rs
+++ /dev/null
@@ -1,213 +0,0 @@
-// This file is part of ICU4X. For terms of use, please see the file
-// called LICENSE at the top level of the ICU4X source tree
-// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
-
-use alloc::borrow::Cow;
-#[cfg(any(feature = "serde_human", feature = "datagen"))]
-use alloc::string::ToString;
-use icu_provider::{yoke, zerofrom};
-use regex_automata::dfa::sparse::DFA;
-use regex_automata::dfa::Automaton;
-
-/// A precompiled regex
-#[derive(Clone, Debug, yoke::Yokeable, zerofrom::ZeroFrom)]
-#[allow(clippy::exhaustive_structs)] // not a public API
-pub struct StringMatcher<'data> {
- // Safety: These always represent a valid DFA (DFA::from_bytes(dfa_bytes).is_ok())
- dfa_bytes: Cow<'data, [u8]>,
- pattern: Option<Cow<'data, str>>,
-}
-
-impl PartialEq for StringMatcher<'_> {
- fn eq(&self, other: &Self) -> bool {
- self.dfa_bytes == other.dfa_bytes
- }
-}
-
-#[cfg(feature = "datagen")]
-impl databake::Bake for StringMatcher<'_> {
- fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
- env.insert("icu_list");
- let bytes = (&&*self.dfa_bytes).bake(env);
- // Safe because our own data is safe
- databake::quote! {
- unsafe { ::icu_list::provider::StringMatcher::from_dfa_bytes_unchecked(#bytes) }
- }
- }
-}
-
-#[cfg(feature = "datagen")]
-impl serde::Serialize for StringMatcher<'_> {
- fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
- where
- S: serde::ser::Serializer,
- {
- if serializer.is_human_readable() {
- self.pattern
- .as_ref()
- .map(|pattern| pattern.serialize(serializer))
- .unwrap_or_else(|| {
- use serde::ser::Error;
- Err(S::Error::custom(
- "cannot serialize a deserialized bincode StringMatcher to JSON",
- ))
- })
- } else {
- self.dfa_bytes.serialize(serializer)
- }
- }
-}
-
-#[cfg(feature = "serde")]
-impl<'de: 'data, 'data> serde::Deserialize<'de> for StringMatcher<'data> {
- fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
- where
- D: serde::de::Deserializer<'de>,
- {
- use icu_provider::serde::borrow_de_utils::CowBytesWrap;
-
- #[cfg(feature = "serde_human")]
- if deserializer.is_human_readable() {
- use serde::de::Error;
- return StringMatcher::new(<&str>::deserialize(deserializer)?)
- .map_err(|e| D::Error::custom(e.to_string()));
- }
-
- if cfg!(target_endian = "big") {
- // TODO: Convert LE to BE. For now we just behave like the
- // accept-nothing DFA on BE systems.
- return Ok(StringMatcher {
- dfa_bytes: Cow::Borrowed(&[]),
- pattern: None,
- });
- }
-
- let dfa_bytes = <CowBytesWrap<'de>>::deserialize(deserializer)?.0;
-
- // Verify safety invariant
- DFA::from_bytes(&dfa_bytes).map_err(|e| {
- use serde::de::Error;
- D::Error::custom(alloc::format!("Invalid DFA bytes: {}", e))
- })?;
-
- Ok(StringMatcher {
- dfa_bytes,
- pattern: None,
- })
- }
-}
-
-impl<'data> StringMatcher<'data> {
- /// Creates a `StringMatcher` from a serialized DFA. Used internally by databake.
- ///
- /// # Safety
- ///
- /// `dfa_bytes` has to be a valid DFA (regex_automata::dfa::sparse::DFA::from_bytes(dfa_bytes).is_ok())
- pub const unsafe fn from_dfa_bytes_unchecked(dfa_bytes: &'data [u8]) -> Self {
- Self {
- dfa_bytes: Cow::Borrowed(dfa_bytes),
- pattern: None,
- }
- }
-
- /// Creates a `StringMatcher` from regex.
- #[cfg(any(feature = "datagen", feature = "serde_human",))]
- pub fn new(pattern: &str) -> Result<Self, icu_provider::DataError> {
- use regex_automata::{
- dfa::dense::{Builder, Config},
- SyntaxConfig,
- };
-
- let mut builder = Builder::new();
- let dfa = builder
- .syntax(SyntaxConfig::new().case_insensitive(true))
- .configure(Config::new().anchored(true).minimize(true))
- .build(pattern)
- .map_err(|_| {
- icu_provider::DataError::custom("Cannot build DFA").with_display_context(&pattern)
- })?
- .to_sparse()
- .map_err(|_| {
- icu_provider::DataError::custom("Cannot sparsify DFA")
- .with_display_context(&pattern)
- })?;
-
- Ok(Self {
- dfa_bytes: dfa.to_bytes_little_endian().into(),
- pattern: Some(pattern.to_string().into()),
- })
- }
-
- #[allow(clippy::unwrap_used)] // by invariant
- pub(crate) fn test(&self, string: &str) -> bool {
- cfg!(target_endian = "little")
- && matches!(
- // Safe due to struct invariant.
- unsafe { DFA::from_bytes_unchecked(&self.dfa_bytes).unwrap().0 }
- .find_earliest_fwd(string.as_bytes()),
- Ok(Some(_))
- )
- }
-}
-
-#[cfg(all(test, feature = "datagen"))]
-mod test {
- use super::*;
-
- #[test]
- fn test_string_matcher() {
- let matcher = StringMatcher::new("abc.*").unwrap();
- assert!(!matcher.test("ab"));
- assert!(matcher.test("abc"));
- assert!(matcher.test("abcde"));
- }
-
- #[test]
- fn test_postcard_serialization() {
- let matcher = StringMatcher::new("abc*").unwrap();
-
- let mut bytes = postcard::to_stdvec(&matcher).unwrap();
- assert_eq!(
- postcard::from_bytes::<StringMatcher>(&bytes).unwrap(),
- matcher
- );
-
- // A corrupted byte leads to an error
- bytes[17] ^= 255;
- assert!(postcard::from_bytes::<StringMatcher>(&bytes).is_err());
- bytes[17] ^= 255;
-
- // An extra byte leads to an error
- bytes.insert(123, 40);
- assert!(postcard::from_bytes::<StringMatcher>(&bytes).is_err());
- bytes.remove(123);
-
- // Missing bytes lead to an error
- assert!(postcard::from_bytes::<StringMatcher>(&bytes[0..bytes.len() - 5]).is_err());
- }
-
- #[test]
- #[cfg(feature = "serde_human")]
- fn test_json_serialization() {
- let matcher = StringMatcher::new("abc*").unwrap();
-
- let json = serde_json::to_string(&matcher).unwrap();
- assert_eq!(
- serde_json::from_str::<StringMatcher>(&json).unwrap(),
- matcher
- );
- assert!(serde_json::from_str::<StringMatcher>(".*[").is_err());
- }
-
- #[test]
- #[ignore] // https://github.com/rust-lang/rust/issues/98906
- fn databake() {
- databake::test_bake!(
- StringMatcher,
- const: unsafe {
- crate::provider::StringMatcher::from_dfa_bytes_unchecked(&[49u8, 50u8, 51u8, ])
- },
- icu_list
- );
- }
-}