From 2e00214b3efbdfeefaa0fe9e8b8fd519de7adc35 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 17 Apr 2024 14:19:50 +0200 Subject: Merging upstream version 1.69.0+dfsg1. Signed-off-by: Daniel Baumann --- vendor/icu_list/src/lazy_automaton.rs | 79 +++++ vendor/icu_list/src/lib.rs | 3 +- vendor/icu_list/src/list_formatter.rs | 43 ++- vendor/icu_list/src/patterns.rs | 283 ++++++++++++++++++ vendor/icu_list/src/provider.rs | 465 ------------------------------ vendor/icu_list/src/provider/mod.rs | 261 +++++++++++++++++ vendor/icu_list/src/provider/serde_dfa.rs | 244 ++++++++++++++++ vendor/icu_list/src/string_matcher.rs | 213 -------------- 8 files changed, 909 insertions(+), 682 deletions(-) create mode 100644 vendor/icu_list/src/lazy_automaton.rs create mode 100644 vendor/icu_list/src/patterns.rs delete mode 100644 vendor/icu_list/src/provider.rs create mode 100644 vendor/icu_list/src/provider/mod.rs create mode 100644 vendor/icu_list/src/provider/serde_dfa.rs delete mode 100644 vendor/icu_list/src/string_matcher.rs (limited to 'vendor/icu_list/src') diff --git a/vendor/icu_list/src/lazy_automaton.rs b/vendor/icu_list/src/lazy_automaton.rs new file mode 100644 index 000000000..3431b3c9d --- /dev/null +++ b/vendor/icu_list/src/lazy_automaton.rs @@ -0,0 +1,79 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use regex_automata::dfa::sparse::DFA; +use regex_automata::dfa::Automaton; +use regex_automata::util::id::StateID; +use writeable::Writeable; + +pub trait LazyAutomaton: Automaton { + // Like Automaton::find_earliest_fwd, but doesn't require a materialized string. + fn matches_earliest_fwd_lazy(&self, haystack: &S) -> bool; +} + +impl> LazyAutomaton for DFA { + fn matches_earliest_fwd_lazy(&self, haystack: &S) -> bool { + struct DFAStepper<'a> { + dfa: &'a DFA<&'a [u8]>, + state: StateID, + } + + impl core::fmt::Write for DFAStepper<'_> { + fn write_str(&mut self, s: &str) -> core::fmt::Result { + for &byte in s.as_bytes() { + self.state = self.dfa.next_state(self.state, byte); + if self.dfa.is_match_state(self.state) || self.dfa.is_dead_state(self.state) { + // We matched or are in a no-match-cycle, return early + return Err(core::fmt::Error); + } + } + Ok(()) + } + } + + let mut stepper = DFAStepper { + // If start == 0 the start state does not depend on the actual string, so + // we can just pass an empty slice. + state: self.start_state_forward(None, &[], 0, 0), + dfa: &self.as_ref(), + }; + + if haystack.write_to(&mut stepper).is_ok() { + stepper.state = self.next_eoi_state(stepper.state); + } + + self.is_match_state(stepper.state) + } +} + +#[cfg(test)] +#[test] +fn test() { + use crate::provider::SerdeDFA; + use alloc::borrow::Cow; + + let matcher = SerdeDFA::new(Cow::Borrowed("11(000)*$")).unwrap(); + + for writeable in [1i32, 11, 110, 11000, 211000] { + assert_eq!( + matcher + .deref() + .find_earliest_fwd(writeable.write_to_string().as_bytes()) + .unwrap() + .is_some(), + matcher.deref().matches_earliest_fwd_lazy(&writeable) + ); + } + + struct ExitEarlyTest; + + impl writeable::Writeable for ExitEarlyTest { + fn write_to(&self, sink: &mut W) -> core::fmt::Result { + sink.write_str("12")?; + unreachable!() + } + } + + assert!(!matcher.deref().matches_earliest_fwd_lazy(&ExitEarlyTest)); +} diff --git a/vendor/icu_list/src/lib.rs b/vendor/icu_list/src/lib.rs index 18f2156a6..61aec0fa3 100644 --- a/vendor/icu_list/src/lib.rs +++ b/vendor/icu_list/src/lib.rs @@ -93,8 +93,9 @@ extern crate alloc; mod error; +mod lazy_automaton; mod list_formatter; -mod string_matcher; +mod patterns; pub mod provider; diff --git a/vendor/icu_list/src/list_formatter.rs b/vendor/icu_list/src/list_formatter.rs index 36f5fbb7b..93f035eab 100644 --- a/vendor/icu_list/src/list_formatter.rs +++ b/vendor/icu_list/src/list_formatter.rs @@ -72,8 +72,39 @@ impl ListFormatter { ); /// Returns a [`Writeable`] composed of the input [`Writeable`]s and the language-dependent - /// formatting. The first layer of parts contains [`parts::ELEMENT`] for input - /// elements, and [`parts::LITERAL`] for list literals. + /// formatting. + /// + /// The [`Writeable`] is annotated with [`parts::ELEMENT`] for input elements, + /// and [`parts::LITERAL`] for list literals. + /// + /// # Example + /// + /// ``` + /// use icu::list::*; + /// # use icu::locid::locale; + /// # use writeable::*; + /// let formatteur = ListFormatter::try_new_and_with_length_unstable( + /// &icu_testdata::unstable(), + /// &locale!("fr").into(), + /// ListLength::Wide, + /// ) + /// .unwrap(); + /// let pays = ["Italie", "France", "Espagne", "Allemagne"]; + /// + /// assert_writeable_parts_eq!( + /// formatteur.format(pays.iter()), + /// "Italie, France, Espagne et Allemagne", + /// [ + /// (0, 6, parts::ELEMENT), + /// (6, 8, parts::LITERAL), + /// (8, 14, parts::ELEMENT), + /// (14, 16, parts::LITERAL), + /// (16, 23, parts::ELEMENT), + /// (23, 27, parts::LITERAL), + /// (27, 36, parts::ELEMENT), + /// ] + /// ); + /// ``` pub fn format<'a, W: Writeable + 'a, I: Iterator + Clone + 'a>( &'a self, values: I, @@ -99,6 +130,9 @@ pub mod parts { use writeable::Part; /// The [`Part`] used by [`FormattedList`](super::FormattedList) to mark the part of the string that is an element. + /// + /// * `category`: `"list"` + /// * `value`: `"element"` pub const ELEMENT: Part = Part { category: "list", value: "element", @@ -106,6 +140,9 @@ pub mod parts { /// The [`Part`] used by [`FormattedList`](super::FormattedList) to mark the part of the string that is a list literal, /// such as ", " or " and ". + /// + /// * `category`: `"list"` + /// * `value`: `"literal"` pub const LITERAL: Part = Part { category: "list", value: "literal", @@ -234,7 +271,7 @@ mod tests { fn formatter(length: ListLength) -> ListFormatter { ListFormatter { - data: DataPayload::from_owned(crate::provider::test::test_patterns()), + data: DataPayload::from_owned(crate::patterns::test::test_patterns()), length, } } diff --git a/vendor/icu_list/src/patterns.rs b/vendor/icu_list/src/patterns.rs new file mode 100644 index 000000000..8cfcb98c1 --- /dev/null +++ b/vendor/icu_list/src/patterns.rs @@ -0,0 +1,283 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use crate::lazy_automaton::LazyAutomaton; +use crate::provider::*; +use crate::ListLength; +#[cfg(feature = "datagen")] +use alloc::borrow::Cow; +#[cfg(feature = "datagen")] +use icu_provider::DataError; +use writeable::{LengthHint, Writeable}; + +impl<'data> ListFormatterPatternsV1<'data> { + /// Creates a new [`ListFormatterPatternsV1`] from the given patterns. Fails if any pattern is invalid. + /// + /// See [`ListJoinerPattern::from_str`]. `allow_prefix` will be true for `pair` and `end` patterns, + /// `allow_suffix` for `start` and `pair` patterns. + #[cfg(feature = "datagen")] + pub fn try_new( + [start, middle, end, pair, short_start, short_middle, short_end, short_pair, narrow_start, narrow_middle, narrow_end, narrow_pair]: [&str; 12], + ) -> Result { + Ok(Self([ + ListJoinerPattern::from_str(start, true, false)?.into(), + ListJoinerPattern::from_str(middle, false, false)?.into(), + ListJoinerPattern::from_str(end, false, true)?.into(), + ListJoinerPattern::from_str(pair, true, true)?.into(), + ListJoinerPattern::from_str(short_start, true, false)?.into(), + ListJoinerPattern::from_str(short_middle, false, false)?.into(), + ListJoinerPattern::from_str(short_end, false, true)?.into(), + ListJoinerPattern::from_str(short_pair, true, true)?.into(), + ListJoinerPattern::from_str(narrow_start, true, false)?.into(), + ListJoinerPattern::from_str(narrow_middle, false, false)?.into(), + ListJoinerPattern::from_str(narrow_end, false, true)?.into(), + ListJoinerPattern::from_str(narrow_pair, true, true)?.into(), + ])) + } + + /// Adds a special case to all `pattern`s that will evaluate to + /// `alternative_pattern` when `regex` matches the following element. + /// The regex is interpreted case-insensitive and anchored to the beginning, but + /// to improve efficiency does not search for full matches. If a full match is + /// required, use `$`. + #[cfg(feature = "datagen")] + pub fn make_conditional( + &mut self, + pattern: &str, + regex: &SerdeDFA<'static>, + alternative_pattern: &str, + ) -> Result<(), DataError> { + let old = ListJoinerPattern::from_str(pattern, true, true)?; + for i in 0..12 { + #[allow(clippy::indexing_slicing)] // self.0 is &[_; 12] + if self.0[i].default == old { + self.0[i].special_case = Some(SpecialCasePattern { + condition: regex.clone(), + pattern: ListJoinerPattern::from_str( + alternative_pattern, + i % 4 == 0 || i % 4 == 3, // allow_prefix = start or pair + i % 4 == 2 || i % 4 == 3, // allow_suffix = end or pair + )?, + }); + } + } + Ok(()) + } + + /// The range of the number of bytes required by the list literals to join a + /// list of length `len`. If none of the patterns are conditional, this is exact. + pub(crate) fn size_hint(&self, style: ListLength, len: usize) -> LengthHint { + match len { + 0 | 1 => LengthHint::exact(0), + 2 => self.pair(style).size_hint(), + n => { + self.start(style).size_hint() + + self.middle(style).size_hint() * (n - 3) + + self.end(style).size_hint() + } + } + } +} + +type PatternParts<'a> = (&'a str, &'a str, &'a str); + +impl<'a> ConditionalListJoinerPattern<'a> { + pub(crate) fn parts<'b, W: Writeable + ?Sized>( + &'a self, + following_value: &'b W, + ) -> PatternParts<'a> { + match &self.special_case { + Some(SpecialCasePattern { condition, pattern }) + if condition.deref().matches_earliest_fwd_lazy(following_value) => + { + pattern.borrow_tuple() + } + _ => self.default.borrow_tuple(), + } + } + + /// The expected length of this pattern + fn size_hint(&'a self) -> LengthHint { + let mut hint = self.default.size_hint(); + if let Some(special_case) = &self.special_case { + hint |= special_case.pattern.size_hint() + } + hint + } +} + +impl<'data> ListJoinerPattern<'data> { + /// Construct the pattern from a CLDR pattern string + #[cfg(feature = "datagen")] + pub fn from_str( + pattern: &str, + allow_prefix: bool, + allow_suffix: bool, + ) -> Result { + match (pattern.find("{0}"), pattern.find("{1}")) { + (Some(index_0), Some(index_1)) + if index_0 < index_1 + && (allow_prefix || index_0 == 0) + && (allow_suffix || index_1 == pattern.len() - 3) => + { + if (index_0 > 0 && !cfg!(test)) || index_1 - 3 >= 256 { + return Err(DataError::custom( + "Found valid pattern that cannot be stored in ListFormatterPatternsV1", + ) + .with_debug_context(pattern)); + } + #[allow(clippy::indexing_slicing)] // find + Ok(ListJoinerPattern { + string: Cow::Owned(alloc::format!( + "{}{}{}", + &pattern[0..index_0], + &pattern[index_0 + 3..index_1], + &pattern[index_1 + 3..] + )), + index_0: index_0 as u8, + index_1: (index_1 - 3) as u8, + }) + } + _ => Err(DataError::custom("Invalid list pattern").with_debug_context(pattern)), + } + } + + fn borrow_tuple(&'data self) -> PatternParts<'data> { + #![allow(clippy::indexing_slicing)] // by invariant + let index_0 = self.index_0 as usize; + let index_1 = self.index_1 as usize; + ( + &self.string[0..index_0], + &self.string[index_0..index_1], + &self.string[index_1..], + ) + } + + fn size_hint(&self) -> LengthHint { + LengthHint::exact(self.string.len()) + } +} + +#[cfg(feature = "datagen")] +impl<'data> From> for ConditionalListJoinerPattern<'data> { + fn from(default: ListJoinerPattern<'data>) -> Self { + Self { + default, + special_case: None, + } + } +} + +#[cfg(all(test, feature = "datagen"))] +pub mod test { + use super::*; + + pub fn test_patterns() -> ListFormatterPatternsV1<'static> { + let mut patterns = ListFormatterPatternsV1::try_new([ + // Wide: general + "@{0}:{1}", + "{0},{1}", + "{0}.{1}!", + "${0};{1}+", + // Short: different pattern lengths + "{0}1{1}", + "{0}12{1}", + "{0}12{1}34", + "{0}123{1}456", + // Narrow: conditionals + "{0}: {1}", + "{0}, {1}", + "{0}. {1}", + "{0}. {1}", + ]) + .unwrap(); + patterns + .make_conditional( + "{0}. {1}", + &SerdeDFA::new(Cow::Borrowed("A")).unwrap(), + "{0} :o {1}", + ) + .unwrap(); + patterns + } + + #[test] + fn rejects_bad_patterns() { + assert!(ListJoinerPattern::from_str("{0} and", true, true).is_err()); + assert!(ListJoinerPattern::from_str("and {1}", true, true).is_err()); + assert!(ListJoinerPattern::from_str("{1} and {0}", true, true).is_err()); + assert!(ListJoinerPattern::from_str("{1{0}}", true, true).is_err()); + assert!(ListJoinerPattern::from_str("{0\u{202e}} and {1}", true, true).is_err()); + assert!(ListJoinerPattern::from_str("{{0}} {{1}}", true, true).is_ok()); + + assert!(ListJoinerPattern::from_str("{0} and {1} ", true, true).is_ok()); + assert!(ListJoinerPattern::from_str("{0} and {1} ", true, false).is_err()); + assert!(ListJoinerPattern::from_str(" {0} and {1}", true, true).is_ok()); + assert!(ListJoinerPattern::from_str(" {0} and {1}", false, true).is_err()); + } + + #[test] + fn produces_correct_parts() { + assert_eq!( + test_patterns().pair(ListLength::Wide).parts(""), + ("$", ";", "+") + ); + } + + #[test] + fn produces_correct_parts_conditionally() { + assert_eq!( + test_patterns().end(ListLength::Narrow).parts("A"), + ("", " :o ", "") + ); + assert_eq!( + test_patterns().end(ListLength::Narrow).parts("a"), + ("", " :o ", "") + ); + assert_eq!( + test_patterns().end(ListLength::Narrow).parts("ab"), + ("", " :o ", "") + ); + assert_eq!( + test_patterns().end(ListLength::Narrow).parts("B"), + ("", ". ", "") + ); + assert_eq!( + test_patterns().end(ListLength::Narrow).parts("BA"), + ("", ". ", "") + ); + } + + #[test] + fn size_hint_works() { + let pattern = test_patterns(); + + assert_eq!( + pattern.size_hint(ListLength::Short, 0), + LengthHint::exact(0) + ); + assert_eq!( + pattern.size_hint(ListLength::Short, 1), + LengthHint::exact(0) + ); + + // pair pattern "{0}123{1}456" + assert_eq!( + pattern.size_hint(ListLength::Short, 2), + LengthHint::exact(6) + ); + + // patterns "{0}1{1}", "{0}12{1}" (x197), and "{0}12{1}34" + assert_eq!( + pattern.size_hint(ListLength::Short, 200), + LengthHint::exact(1 + 2 * 197 + 4) + ); + + // patterns "{0}: {1}", "{0}, {1}" (x197), and "{0} :o {1}" or "{0}. {1}" + assert_eq!( + pattern.size_hint(ListLength::Narrow, 200), + LengthHint::exact(2 + 197 * 2) + LengthHint::between(2, 4) + ); + } +} diff --git a/vendor/icu_list/src/provider.rs b/vendor/icu_list/src/provider.rs deleted file mode 100644 index 27f3e4fec..000000000 --- a/vendor/icu_list/src/provider.rs +++ /dev/null @@ -1,465 +0,0 @@ -// This file is part of ICU4X. For terms of use, please see the file -// called LICENSE at the top level of the ICU4X source tree -// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). - -// Provider structs must be stable -#![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)] - -//! Data provider struct definitions for this ICU4X component. -//! -//! Read more about data providers: [`icu_provider`] - -use crate::ListLength; -use alloc::borrow::Cow; -use icu_provider::DataMarker; -use icu_provider::{yoke, zerofrom}; -use writeable::{LengthHint, Writeable}; - -pub use crate::string_matcher::StringMatcher; - -/// Symbols and metadata required for [`ListFormatter`](crate::ListFormatter). -#[icu_provider::data_struct( - AndListV1Marker = "list/and@1", - OrListV1Marker = "list/or@1", - UnitListV1Marker = "list/unit@1" -)] -#[derive(Clone, Debug)] -#[cfg_attr( - feature = "datagen", - derive(serde::Serialize, databake::Bake), - databake(path = icu_list::provider), -)] -pub struct ListFormatterPatternsV1<'data>( - #[cfg_attr(feature = "datagen", serde(with = "deduplicating_array"))] - /// The patterns in the order start, middle, end, pair, short_start, short_middle, - /// short_end, short_pair, narrow_start, narrow_middle, narrow_end, narrow_pair, - pub [ConditionalListJoinerPattern<'data>; 12], -); - -#[cfg(feature = "serde")] -impl<'de> serde::Deserialize<'de> for ListFormatterPatternsV1<'de> { - fn deserialize(deserializer: D) -> Result - where - D: serde::de::Deserializer<'de>, - { - #[cfg(not(feature = "serde_human"))] - if deserializer.is_human_readable() { - use serde::de::Error; - return Err(D::Error::custom( - "Deserializing human-readable ListFormatter data requires the 'serde_human' feature", - )); - } - - Ok(ListFormatterPatternsV1(deduplicating_array::deserialize( - deserializer, - )?)) - } -} - -pub(crate) struct ErasedListV1Marker; - -impl DataMarker for ErasedListV1Marker { - type Yokeable = ListFormatterPatternsV1<'static>; -} - -impl<'data> ListFormatterPatternsV1<'data> { - pub(crate) fn start(&self, style: ListLength) -> &ConditionalListJoinerPattern<'data> { - #![allow(clippy::indexing_slicing)] // style as usize < 3 - &self.0[4 * (style as usize)] - } - - pub(crate) fn middle(&self, style: ListLength) -> &ConditionalListJoinerPattern<'data> { - #![allow(clippy::indexing_slicing)] // style as usize < 3 - &self.0[4 * (style as usize) + 1] - } - - pub(crate) fn end(&self, style: ListLength) -> &ConditionalListJoinerPattern<'data> { - #![allow(clippy::indexing_slicing)] // style as usize < 3 - &self.0[4 * (style as usize) + 2] - } - - pub(crate) fn pair(&self, style: ListLength) -> &ConditionalListJoinerPattern<'data> { - #![allow(clippy::indexing_slicing)] // style as usize < 3 - &self.0[4 * (style as usize) + 3] - } - - /// The range of the number of bytes required by the list literals to join a - /// list of length `len`. If none of the patterns are conditional, this is exact. - pub(crate) fn size_hint(&self, style: ListLength, len: usize) -> LengthHint { - match len { - 0 | 1 => LengthHint::exact(0), - 2 => self.pair(style).size_hint(), - n => { - self.start(style).size_hint() - + self.middle(style).size_hint() * (n - 3) - + self.end(style).size_hint() - } - } - } -} - -/// A pattern that can behave conditionally on the next element. -#[derive(Clone, Debug, PartialEq, yoke::Yokeable, zerofrom::ZeroFrom)] -#[cfg_attr( - feature = "datagen", - derive(serde::Serialize, databake::Bake), - databake(path = icu_list::provider), -)] -#[cfg_attr(feature = "serde", derive(serde::Deserialize))] -pub struct ConditionalListJoinerPattern<'data> { - /// The default pattern - #[cfg_attr(feature = "serde", serde(borrow))] - pub default: ListJoinerPattern<'data>, - /// And optional special case - #[cfg_attr(feature = "serde", serde(borrow))] - pub special_case: Option>, -} - -/// The special case of a [`ConditionalListJoinerPattern`] -#[derive(Clone, Debug, PartialEq, yoke::Yokeable, zerofrom::ZeroFrom)] -#[cfg_attr( - feature = "datagen", - derive(serde::Serialize, databake::Bake), - databake(path = icu_list::provider), -)] -#[cfg_attr(feature = "serde", derive(serde::Deserialize))] -pub struct SpecialCasePattern<'data> { - /// The condition on the following element - #[cfg_attr(feature = "serde", serde(borrow))] - pub condition: StringMatcher<'data>, - /// The pattern if the condition matches - #[cfg_attr(feature = "serde", serde(borrow))] - pub pattern: ListJoinerPattern<'data>, -} - -/// A pattern containing two numeric placeholders ("{0}, and {1}.") -#[derive(Clone, Debug, PartialEq, yoke::Yokeable, zerofrom::ZeroFrom)] -#[cfg_attr(feature = "datagen", derive(serde::Serialize))] -pub struct ListJoinerPattern<'data> { - /// The pattern string without the placeholders - string: Cow<'data, str>, - /// The index of the first placeholder. Always <= index_1. - // Always 0 for CLDR data, so we don't need to serialize it. - // In-memory we have free space for it as index_1 doesn't - // fill a word. - #[cfg_attr(feature = "datagen", serde(skip))] - index_0: u8, - /// The index of the second placeholder. Always < string.len(). - index_1: u8, -} - -#[cfg(feature = "serde")] -impl<'de: 'data, 'data> serde::Deserialize<'de> for ListJoinerPattern<'data> { - fn deserialize(deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - #[derive(serde::Deserialize)] - struct Dummy<'data> { - #[cfg_attr(feature = "serde", serde(borrow))] - string: Cow<'data, str>, - index_1: u8, - } - let Dummy { string, index_1 } = Dummy::deserialize(deserializer)?; - - if index_1 as usize > string.len() { - use serde::de::Error; - Err(D::Error::custom("invalid index_1")) - } else { - Ok(ListJoinerPattern { - string, - index_0: 0, - index_1, - }) - } - } -} - -impl<'a> ListJoinerPattern<'a> { - /// Constructs a [`ListJoinerPattern`] from raw parts. Used by databake. - /// - /// # Safety - /// index_1 may be at most string.len() - pub const unsafe fn from_parts_unchecked(string: &'a str, index_1: u8) -> Self { - Self { - string: Cow::Borrowed(string), - index_0: 0, - index_1, - } - } -} - -pub(crate) type PatternParts<'a> = (&'a str, &'a str, &'a str); - -impl<'a> ConditionalListJoinerPattern<'a> { - pub(crate) fn parts<'b, W: Writeable + ?Sized>( - &'a self, - following_value: &'b W, - ) -> PatternParts<'a> { - match &self.special_case { - Some(SpecialCasePattern { condition, pattern }) - // TODO: Implement lookahead instead of materializing here. - if condition.test(&*following_value.write_to_string()) => - { - pattern.borrow_tuple() - } - _ => self.default.borrow_tuple(), - } - } - - /// The expected length of this pattern - pub fn size_hint(&'a self) -> LengthHint { - let mut hint = self.default.size_hint(); - if let Some(special_case) = &self.special_case { - hint |= special_case.pattern.size_hint() - } - hint - } -} - -impl<'data> ListJoinerPattern<'data> { - fn borrow_tuple(&'data self) -> PatternParts<'data> { - #![allow(clippy::indexing_slicing)] // by invariant - let index_0 = self.index_0 as usize; - let index_1 = self.index_1 as usize; - ( - &self.string[0..index_0], - &self.string[index_0..index_1], - &self.string[index_1..], - ) - } - - fn size_hint(&self) -> LengthHint { - LengthHint::exact(self.string.len()) - } -} - -#[cfg(feature = "datagen")] -mod datagen { - #![allow(clippy::indexing_slicing)] // datagen - - use super::*; - use icu_provider::DataError; - - impl<'data> ListFormatterPatternsV1<'data> { - /// The patterns in the order start, middle, end, pair, short_start, short_middle, - /// short_end, short_pair, narrow_start, narrow_middle, narrow_end, narrow_pair, - pub fn try_new(patterns: [&str; 12]) -> Result { - Ok(Self([ - ListJoinerPattern::from_str(patterns[0], true, false)?.into(), - ListJoinerPattern::from_str(patterns[1], false, false)?.into(), - ListJoinerPattern::from_str(patterns[2], false, true)?.into(), - ListJoinerPattern::from_str(patterns[3], true, true)?.into(), - ListJoinerPattern::from_str(patterns[4], true, false)?.into(), - ListJoinerPattern::from_str(patterns[5], false, false)?.into(), - ListJoinerPattern::from_str(patterns[6], false, true)?.into(), - ListJoinerPattern::from_str(patterns[7], true, true)?.into(), - ListJoinerPattern::from_str(patterns[8], true, false)?.into(), - ListJoinerPattern::from_str(patterns[9], false, false)?.into(), - ListJoinerPattern::from_str(patterns[10], false, true)?.into(), - ListJoinerPattern::from_str(patterns[11], true, true)?.into(), - ])) - } - - /// Adds a special case to all `pattern`s that will evaluate to - /// `alternative_pattern` when `regex` matches the following element. - /// The regex is interpreted case-insensitive and anchored to the beginning, but - /// to improve efficiency does not search for full matches. If a full match is - /// required, use `$`. - pub fn make_conditional( - &mut self, - pattern: &str, - regex: &StringMatcher<'static>, - alternative_pattern: &str, - ) -> Result<(), DataError> { - let old = ListJoinerPattern::from_str(pattern, true, true)?; - for i in 0..12 { - if self.0[i].default == old { - self.0[i].special_case = Some(SpecialCasePattern { - condition: regex.clone(), - pattern: ListJoinerPattern::from_str( - alternative_pattern, - i % 4 == 0 || i % 4 == 3, // allow_prefix = start or pair - i % 4 == 2 || i % 4 == 3, // allow_suffix = end or pair - )?, - }); - } - } - Ok(()) - } - } - - impl<'data> ListJoinerPattern<'data> { - /// Construct the pattern from a CLDR pattern string - pub fn from_str( - pattern: &str, - allow_prefix: bool, - allow_suffix: bool, - ) -> Result { - match (pattern.find("{0}"), pattern.find("{1}")) { - (Some(index_0), Some(index_1)) - if index_0 < index_1 - && (allow_prefix || index_0 == 0) - && (allow_suffix || index_1 == pattern.len() - 3) => - { - if (index_0 > 0 && !cfg!(test)) || index_1 - 3 >= 256 { - return Err(DataError::custom( - "Found valid pattern that cannot be stored in ListFormatterPatternsV1", - ) - .with_debug_context(pattern)); - } - Ok(ListJoinerPattern { - string: Cow::Owned(alloc::format!( - "{}{}{}", - &pattern[0..index_0], - &pattern[index_0 + 3..index_1], - &pattern[index_1 + 3..] - )), - index_0: index_0 as u8, - index_1: (index_1 - 3) as u8, - }) - } - _ => Err(DataError::custom("Invalid list pattern").with_debug_context(pattern)), - } - } - } - - impl<'data> From> for ConditionalListJoinerPattern<'data> { - fn from(default: ListJoinerPattern<'data>) -> Self { - Self { - default, - special_case: None, - } - } - } - - impl databake::Bake for ListJoinerPattern<'_> { - fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream { - env.insert("icu_list"); - let string = (&*self.string).bake(env); - let index_1 = self.index_1.bake(env); - // Safe because our own data is safe - databake::quote! { unsafe { - ::icu_list::provider::ListJoinerPattern::from_parts_unchecked(#string, #index_1) - }} - } - } -} - -#[cfg(all(test, feature = "datagen"))] -pub(crate) mod test { - use super::*; - - pub fn test_patterns() -> ListFormatterPatternsV1<'static> { - let mut patterns = ListFormatterPatternsV1::try_new([ - // Wide: general - "@{0}:{1}", - "{0},{1}", - "{0}.{1}!", - "${0};{1}+", - // Short: different pattern lengths - "{0}1{1}", - "{0}12{1}", - "{0}12{1}34", - "{0}123{1}456", - // Narrow: conditionals - "{0}: {1}", - "{0}, {1}", - "{0}. {1}", - "{0}. {1}", - ]) - .unwrap(); - patterns - .make_conditional("{0}. {1}", &StringMatcher::new("A").unwrap(), "{0} :o {1}") - .unwrap(); - patterns - } - - #[test] - fn rejects_bad_patterns() { - assert!(ListJoinerPattern::from_str("{0} and", true, true).is_err()); - assert!(ListJoinerPattern::from_str("and {1}", true, true).is_err()); - assert!(ListJoinerPattern::from_str("{1} and {0}", true, true).is_err()); - assert!(ListJoinerPattern::from_str("{1{0}}", true, true).is_err()); - assert!(ListJoinerPattern::from_str("{0\u{202e}} and {1}", true, true).is_err()); - assert!(ListJoinerPattern::from_str("{{0}} {{1}}", true, true).is_ok()); - - assert!(ListJoinerPattern::from_str("{0} and {1} ", true, true).is_ok()); - assert!(ListJoinerPattern::from_str("{0} and {1} ", true, false).is_err()); - assert!(ListJoinerPattern::from_str(" {0} and {1}", true, true).is_ok()); - assert!(ListJoinerPattern::from_str(" {0} and {1}", false, true).is_err()); - } - - #[test] - fn produces_correct_parts() { - assert_eq!( - test_patterns().pair(ListLength::Wide).parts(""), - ("$", ";", "+") - ); - } - - #[test] - fn produces_correct_parts_conditionally() { - assert_eq!( - test_patterns().end(ListLength::Narrow).parts("A"), - ("", " :o ", "") - ); - assert_eq!( - test_patterns().end(ListLength::Narrow).parts("a"), - ("", " :o ", "") - ); - assert_eq!( - test_patterns().end(ListLength::Narrow).parts("ab"), - ("", " :o ", "") - ); - assert_eq!( - test_patterns().end(ListLength::Narrow).parts("B"), - ("", ". ", "") - ); - assert_eq!( - test_patterns().end(ListLength::Narrow).parts("BA"), - ("", ". ", "") - ); - } - - #[test] - fn size_hint_works() { - let pattern = test_patterns(); - - assert_eq!( - pattern.size_hint(ListLength::Short, 0), - LengthHint::exact(0) - ); - assert_eq!( - pattern.size_hint(ListLength::Short, 1), - LengthHint::exact(0) - ); - - // pair pattern "{0}123{1}456" - assert_eq!( - pattern.size_hint(ListLength::Short, 2), - LengthHint::exact(6) - ); - - // patterns "{0}1{1}", "{0}12{1}" (x197), and "{0}12{1}34" - assert_eq!( - pattern.size_hint(ListLength::Short, 200), - LengthHint::exact(1 + 2 * 197 + 4) - ); - - // patterns "{0}: {1}", "{0}, {1}" (x197), and "{0} :o {1}" or "{0}. {1}" - assert_eq!( - pattern.size_hint(ListLength::Narrow, 200), - LengthHint::exact(2 + 197 * 2) + LengthHint::between(2, 4) - ); - } - - #[test] - fn databake() { - databake::test_bake!( - ListJoinerPattern, - const: unsafe { crate::provider::ListJoinerPattern::from_parts_unchecked(", ", 2u8) }, - icu_list - ); - } -} diff --git a/vendor/icu_list/src/provider/mod.rs b/vendor/icu_list/src/provider/mod.rs new file mode 100644 index 000000000..efab7c8bc --- /dev/null +++ b/vendor/icu_list/src/provider/mod.rs @@ -0,0 +1,261 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +// Provider structs must be stable +#![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)] + +//! 🚧 \[Unstable\] Data provider struct definitions for this ICU4X component. +//! +//!
+//! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +//! including in SemVer minor releases. While the serde representation of data structs is guaranteed +//! to be stable, their Rust representation might not be. Use with caution. +//!
+//! +//! Read more about data providers: [`icu_provider`] + +use crate::ListLength; +use alloc::borrow::Cow; +use icu_provider::DataMarker; +use icu_provider::{yoke, zerofrom}; + +mod serde_dfa; +pub use serde_dfa::SerdeDFA; + +/// Symbols and metadata required for [`ListFormatter`](crate::ListFormatter). +/// +///
+/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +/// including in SemVer minor releases. While the serde representation of data structs is guaranteed +/// to be stable, their Rust representation might not be. Use with caution. +///
+#[icu_provider::data_struct( + AndListV1Marker = "list/and@1", + OrListV1Marker = "list/or@1", + UnitListV1Marker = "list/unit@1" +)] +#[derive(Clone, Debug)] +#[cfg_attr( + feature = "datagen", + derive(serde::Serialize, databake::Bake), + databake(path = icu_list::provider), +)] +pub struct ListFormatterPatternsV1<'data>( + #[cfg_attr(feature = "datagen", serde(with = "deduplicating_array"))] + /// The patterns in the order start, middle, end, pair, short_start, short_middle, + /// short_end, short_pair, narrow_start, narrow_middle, narrow_end, narrow_pair, + pub [ConditionalListJoinerPattern<'data>; 12], +); + +#[cfg(feature = "serde")] +impl<'de> serde::Deserialize<'de> for ListFormatterPatternsV1<'de> { + fn deserialize(deserializer: D) -> Result + where + D: serde::de::Deserializer<'de>, + { + #[cfg(not(feature = "serde_human"))] + if deserializer.is_human_readable() { + use serde::de::Error; + return Err(D::Error::custom( + "Deserializing human-readable ListFormatter data requires the 'serde_human' feature", + )); + } + + Ok(ListFormatterPatternsV1(deduplicating_array::deserialize( + deserializer, + )?)) + } +} + +pub(crate) struct ErasedListV1Marker; + +impl DataMarker for ErasedListV1Marker { + type Yokeable = ListFormatterPatternsV1<'static>; +} + +impl<'data> ListFormatterPatternsV1<'data> { + pub(crate) fn start(&self, style: ListLength) -> &ConditionalListJoinerPattern<'data> { + #![allow(clippy::indexing_slicing)] // style as usize < 3 + &self.0[4 * (style as usize)] + } + + pub(crate) fn middle(&self, style: ListLength) -> &ConditionalListJoinerPattern<'data> { + #![allow(clippy::indexing_slicing)] // style as usize < 3 + &self.0[4 * (style as usize) + 1] + } + + pub(crate) fn end(&self, style: ListLength) -> &ConditionalListJoinerPattern<'data> { + #![allow(clippy::indexing_slicing)] // style as usize < 3 + &self.0[4 * (style as usize) + 2] + } + + pub(crate) fn pair(&self, style: ListLength) -> &ConditionalListJoinerPattern<'data> { + #![allow(clippy::indexing_slicing)] // style as usize < 3 + &self.0[4 * (style as usize) + 3] + } +} + +/// A pattern that can behave conditionally on the next element. +/// +///
+/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +/// including in SemVer minor releases. While the serde representation of data structs is guaranteed +/// to be stable, their Rust representation might not be. Use with caution. +///
+#[derive(Clone, Debug, yoke::Yokeable, zerofrom::ZeroFrom)] +#[cfg_attr( + feature = "datagen", + derive(PartialEq, serde::Serialize, databake::Bake), + databake(path = icu_list::provider), +)] +#[cfg_attr(feature = "serde", derive(serde::Deserialize))] +pub struct ConditionalListJoinerPattern<'data> { + /// The default pattern + #[cfg_attr(feature = "serde", serde(borrow))] + pub default: ListJoinerPattern<'data>, + /// And optional special case + #[cfg_attr( + feature = "serde", + serde(borrow, deserialize_with = "SpecialCasePattern::deserialize_option") + )] + pub special_case: Option>, +} + +/// The special case of a [`ConditionalListJoinerPattern`] +/// +///
+/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +/// including in SemVer minor releases. While the serde representation of data structs is guaranteed +/// to be stable, their Rust representation might not be. Use with caution. +///
+#[derive(Clone, Debug, yoke::Yokeable, zerofrom::ZeroFrom)] +#[cfg_attr( + feature = "datagen", + derive(PartialEq, serde::Serialize, databake::Bake), + databake(path = icu_list::provider), +)] +pub struct SpecialCasePattern<'data> { + /// The condition on the following element + pub condition: SerdeDFA<'data>, + /// The pattern if the condition matches + pub pattern: ListJoinerPattern<'data>, +} + +#[cfg(feature = "serde")] +impl<'data> SpecialCasePattern<'data> { + // If the condition doesn't deserialize, the whole special case becomes `None` + fn deserialize_option<'de: 'data, D>(deserializer: D) -> Result, D::Error> + where + D: serde::de::Deserializer<'de>, + { + use serde::Deserialize; + + #[derive(Deserialize)] + struct SpecialCasePatternOptionalDfa<'data> { + #[cfg_attr( + feature = "serde", + serde(borrow, deserialize_with = "SerdeDFA::maybe_deserialize") + )] + pub condition: Option>, + #[cfg_attr(feature = "serde", serde(borrow))] + pub pattern: ListJoinerPattern<'data>, + } + + Ok( + match Option::>::deserialize(deserializer)? { + Some(SpecialCasePatternOptionalDfa { + condition: Some(condition), + pattern, + }) => Some(SpecialCasePattern { condition, pattern }), + _ => None, + }, + ) + } +} + +/// A pattern containing two numeric placeholders ("{0}, and {1}.") +/// +///
+/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +/// including in SemVer minor releases. While the serde representation of data structs is guaranteed +/// to be stable, their Rust representation might not be. Use with caution. +///
+#[derive(Clone, Debug, PartialEq, yoke::Yokeable, zerofrom::ZeroFrom)] +#[cfg_attr(feature = "datagen", derive(serde::Serialize))] +pub struct ListJoinerPattern<'data> { + /// The pattern string without the placeholders + pub(crate) string: Cow<'data, str>, + /// The index of the first placeholder. Always <= index_1. + // Always 0 for CLDR data, so we don't need to serialize it. + // In-memory we have free space for it as index_1 doesn't + // fill a word. + #[cfg_attr(feature = "datagen", serde(skip))] + pub(crate) index_0: u8, + /// The index of the second placeholder. Always < string.len(). + pub(crate) index_1: u8, +} + +#[cfg(feature = "serde")] +impl<'de: 'data, 'data> serde::Deserialize<'de> for ListJoinerPattern<'data> { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + #[derive(serde::Deserialize)] + struct Dummy<'data> { + #[cfg_attr(feature = "serde", serde(borrow))] + string: Cow<'data, str>, + index_1: u8, + } + let Dummy { string, index_1 } = Dummy::deserialize(deserializer)?; + + if index_1 as usize > string.len() { + use serde::de::Error; + Err(D::Error::custom("invalid index_1")) + } else { + Ok(ListJoinerPattern { + string, + index_0: 0, + index_1, + }) + } + } +} + +impl<'a> ListJoinerPattern<'a> { + /// Constructs a [`ListJoinerPattern`] from raw parts. Used by databake. + /// + /// # Safety + /// index_1 may be at most string.len() + pub const unsafe fn from_parts_unchecked(string: &'a str, index_1: u8) -> Self { + Self { + string: Cow::Borrowed(string), + index_0: 0, + index_1, + } + } +} + +#[cfg(feature = "datagen")] +impl databake::Bake for ListJoinerPattern<'_> { + fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream { + env.insert("icu_list"); + let string = (&*self.string).bake(env); + let index_1 = self.index_1.bake(env); + // Safe because our own data is safe + databake::quote! { unsafe { + ::icu_list::provider::ListJoinerPattern::from_parts_unchecked(#string, #index_1) + }} + } +} + +#[cfg(all(test, feature = "datagen"))] +#[test] +fn databake() { + databake::test_bake!( + ListJoinerPattern, + const: unsafe { crate::provider::ListJoinerPattern::from_parts_unchecked(", ", 2u8) }, + icu_list + ); +} diff --git a/vendor/icu_list/src/provider/serde_dfa.rs b/vendor/icu_list/src/provider/serde_dfa.rs new file mode 100644 index 000000000..e2424e1e9 --- /dev/null +++ b/vendor/icu_list/src/provider/serde_dfa.rs @@ -0,0 +1,244 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use alloc::borrow::Cow; +use icu_provider::{yoke, zerofrom}; +use regex_automata::dfa::sparse::DFA; + +/// A serde-compatible version of [regex_automata::dfa::sparse::DFA]. This does not implement +/// [`serde::Deserialize`] directly, as binary deserialization is not supported in big-endian +/// platforms. `Self::maybe_deserialize` can be used to deserialize to `Option`. +/// +///
+/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +/// including in SemVer minor releases. While the serde representation of data structs is guaranteed +/// to be stable, their Rust representation might not be. Use with caution. +///
+#[derive(Clone, Debug, yoke::Yokeable, zerofrom::ZeroFrom)] +pub struct SerdeDFA<'data> { + // Safety: These always represent a valid DFA (DFA::from_bytes(dfa_bytes).is_ok()) + dfa_bytes: Cow<'data, [u8]>, + pattern: Option>, +} + +#[cfg(feature = "datagen")] +impl PartialEq for SerdeDFA<'_> { + fn eq(&self, other: &Self) -> bool { + self.dfa_bytes == other.dfa_bytes + } +} + +#[cfg(feature = "datagen")] +impl databake::Bake for SerdeDFA<'_> { + fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream { + env.insert("icu_list"); + let le_bytes = self.deref().to_bytes_little_endian().as_slice().bake(env); + let be_bytes = self.deref().to_bytes_big_endian().as_slice().bake(env); + // Safe because of `to_bytes_little_endian`/`to_bytes_big_endian`'s invariant. + databake::quote! { + unsafe { + ::icu_list::provider::SerdeDFA::from_dfa_bytes_unchecked( + if cfg!(target_endian = "little") { + &#le_bytes + } else { + &#be_bytes + } + ) + } + } + } +} + +#[cfg(feature = "datagen")] +impl serde::Serialize for SerdeDFA<'_> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::ser::Serializer, + { + if serializer.is_human_readable() { + self.pattern + .as_ref() + .map(|pattern| pattern.serialize(serializer)) + .unwrap_or_else(|| { + use serde::ser::Error; + Err(S::Error::custom( + "cannot serialize a deserialized bincode SerdeDFA to JSON", + )) + }) + } else { + self.deref().to_bytes_little_endian().serialize(serializer) + } + } +} + +#[cfg(feature = "serde")] +impl<'data> SerdeDFA<'data> { + /// Deserializes to `Option`. Will return `None` for non-human-readable serialization + /// formats on big-endian systems, as `regex_automata` serialization is endian-sensitive. + pub fn maybe_deserialize<'de: 'data, D>(deserializer: D) -> Result, D::Error> + where + D: serde::de::Deserializer<'de>, + { + use icu_provider::serde::borrow_de_utils::CowBytesWrap; + use serde::Deserialize; + + #[cfg(feature = "serde_human")] + if deserializer.is_human_readable() { + #[cfg(not(feature = "std"))] + use alloc::string::ToString; + use serde::de::Error; + return SerdeDFA::new(Cow::::deserialize(deserializer)?) + .map(Some) + .map_err(|e| D::Error::custom(e.to_string())); + } + + let dfa_bytes = >::deserialize(deserializer)?.0; + + if cfg!(target_endian = "big") { + return Ok(None); + } + + // Verify safety invariant + DFA::from_bytes(&dfa_bytes).map_err(|e| { + use serde::de::Error; + D::Error::custom(alloc::format!("Invalid DFA bytes: {}", e)) + })?; + + Ok(Some(SerdeDFA { + dfa_bytes, + pattern: None, + })) + } +} + +impl<'data> SerdeDFA<'data> { + /// Creates a `SerdeDFA` from raw bytes. Used internally by databake. + /// + /// # Safety + /// + /// `dfa_bytes` has to be a valid DFA (regex_automata::dfa::sparse::DFA::from_bytes(dfa_bytes).is_ok()) + pub const unsafe fn from_dfa_bytes_unchecked(dfa_bytes: &'data [u8]) -> Self { + Self { + dfa_bytes: Cow::Borrowed(dfa_bytes), + pattern: None, + } + } + + /// Creates a `SerdeDFA` from a regex. + #[cfg(any(feature = "datagen", feature = "serde_human",))] + pub fn new(pattern: Cow<'data, str>) -> Result { + use regex_automata::{ + dfa::dense::{Builder, Config}, + SyntaxConfig, + }; + + let mut builder = Builder::new(); + let dfa = builder + .syntax(SyntaxConfig::new().case_insensitive(true)) + .configure(Config::new().anchored(true).minimize(true)) + .build(&pattern) + .map_err(|_| { + icu_provider::DataError::custom("Cannot build DFA").with_display_context(&pattern) + })? + .to_sparse() + .map_err(|_| { + icu_provider::DataError::custom("Cannot sparsify DFA") + .with_display_context(&pattern) + })?; + + Ok(Self { + dfa_bytes: dfa.to_bytes_native_endian().into(), + pattern: Some(pattern), + }) + } + + /// Returns the represented [`DFA`] + #[allow(clippy::unwrap_used)] // by invariant + pub fn deref(&'data self) -> DFA<&'data [u8]> { + // Safe due to struct invariant. + unsafe { DFA::from_bytes_unchecked(&self.dfa_bytes).unwrap().0 } + } +} + +#[cfg(all(test, feature = "datagen"))] +mod test { + use super::*; + + #[test] + fn test_serde_dfa() { + use regex_automata::dfa::Automaton; + + let matcher = SerdeDFA::new(Cow::Borrowed("abc")).unwrap(); + + assert!(matcher.deref().find_earliest_fwd(b"ab").unwrap().is_none()); + assert!(matcher.deref().find_earliest_fwd(b"abc").unwrap().is_some()); + assert!(matcher + .deref() + .find_earliest_fwd(b"abcde") + .unwrap() + .is_some()); + assert!(matcher + .deref() + .find_earliest_fwd(b" abcde") + .unwrap() + .is_none()); + } + + #[derive(serde::Deserialize)] + struct OptionSerdeDFA<'data>( + #[serde(borrow, deserialize_with = "SerdeDFA::maybe_deserialize")] Option>, + ); + + #[test] + #[cfg(target_endian = "little")] + fn test_postcard_serialization() { + let matcher = SerdeDFA::new(Cow::Borrowed("abc*")).unwrap(); + + let mut bytes = postcard::to_stdvec(&matcher).unwrap(); + assert_eq!( + postcard::from_bytes::(&bytes).unwrap().0, + Some(matcher) + ); + + // A corrupted byte leads to an error + bytes[17] ^= 255; + assert!(postcard::from_bytes::(&bytes).is_err()); + bytes[17] ^= 255; + + // An extra byte leads to an error + bytes.insert(123, 40); + assert!(postcard::from_bytes::(&bytes).is_err()); + bytes.remove(123); + + // Missing bytes lead to an error + assert!(postcard::from_bytes::(&bytes[0..bytes.len() - 5]).is_err()); + } + + #[test] + #[cfg(feature = "serde_human")] + fn test_json_serialization() { + let matcher = SerdeDFA::new(Cow::Borrowed("abc*")).unwrap(); + + let json = serde_json::to_string(&matcher).unwrap(); + assert_eq!( + serde_json::from_str::(&json).unwrap().0, + Some(matcher) + ); + assert!(serde_json::from_str::(".*[").is_err()); + } + + #[test] + #[ignore] // https://github.com/rust-lang/rust/issues/98906 + fn databake() { + databake::test_bake!( + SerdeDFA, + const: unsafe { crate::provider::SerdeDFA::from_dfa_bytes_unchecked(if cfg!(target_endian = "little") { + &[1] // TODO: set this when activating the test + } else { + &[2] // TODO: set this when activating the test + })}, + icu_list + ); + } +} diff --git a/vendor/icu_list/src/string_matcher.rs b/vendor/icu_list/src/string_matcher.rs deleted file mode 100644 index ba4833605..000000000 --- a/vendor/icu_list/src/string_matcher.rs +++ /dev/null @@ -1,213 +0,0 @@ -// This file is part of ICU4X. For terms of use, please see the file -// called LICENSE at the top level of the ICU4X source tree -// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). - -use alloc::borrow::Cow; -#[cfg(any(feature = "serde_human", feature = "datagen"))] -use alloc::string::ToString; -use icu_provider::{yoke, zerofrom}; -use regex_automata::dfa::sparse::DFA; -use regex_automata::dfa::Automaton; - -/// A precompiled regex -#[derive(Clone, Debug, yoke::Yokeable, zerofrom::ZeroFrom)] -#[allow(clippy::exhaustive_structs)] // not a public API -pub struct StringMatcher<'data> { - // Safety: These always represent a valid DFA (DFA::from_bytes(dfa_bytes).is_ok()) - dfa_bytes: Cow<'data, [u8]>, - pattern: Option>, -} - -impl PartialEq for StringMatcher<'_> { - fn eq(&self, other: &Self) -> bool { - self.dfa_bytes == other.dfa_bytes - } -} - -#[cfg(feature = "datagen")] -impl databake::Bake for StringMatcher<'_> { - fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream { - env.insert("icu_list"); - let bytes = (&&*self.dfa_bytes).bake(env); - // Safe because our own data is safe - databake::quote! { - unsafe { ::icu_list::provider::StringMatcher::from_dfa_bytes_unchecked(#bytes) } - } - } -} - -#[cfg(feature = "datagen")] -impl serde::Serialize for StringMatcher<'_> { - fn serialize(&self, serializer: S) -> Result - where - S: serde::ser::Serializer, - { - if serializer.is_human_readable() { - self.pattern - .as_ref() - .map(|pattern| pattern.serialize(serializer)) - .unwrap_or_else(|| { - use serde::ser::Error; - Err(S::Error::custom( - "cannot serialize a deserialized bincode StringMatcher to JSON", - )) - }) - } else { - self.dfa_bytes.serialize(serializer) - } - } -} - -#[cfg(feature = "serde")] -impl<'de: 'data, 'data> serde::Deserialize<'de> for StringMatcher<'data> { - fn deserialize(deserializer: D) -> Result - where - D: serde::de::Deserializer<'de>, - { - use icu_provider::serde::borrow_de_utils::CowBytesWrap; - - #[cfg(feature = "serde_human")] - if deserializer.is_human_readable() { - use serde::de::Error; - return StringMatcher::new(<&str>::deserialize(deserializer)?) - .map_err(|e| D::Error::custom(e.to_string())); - } - - if cfg!(target_endian = "big") { - // TODO: Convert LE to BE. For now we just behave like the - // accept-nothing DFA on BE systems. - return Ok(StringMatcher { - dfa_bytes: Cow::Borrowed(&[]), - pattern: None, - }); - } - - let dfa_bytes = >::deserialize(deserializer)?.0; - - // Verify safety invariant - DFA::from_bytes(&dfa_bytes).map_err(|e| { - use serde::de::Error; - D::Error::custom(alloc::format!("Invalid DFA bytes: {}", e)) - })?; - - Ok(StringMatcher { - dfa_bytes, - pattern: None, - }) - } -} - -impl<'data> StringMatcher<'data> { - /// Creates a `StringMatcher` from a serialized DFA. Used internally by databake. - /// - /// # Safety - /// - /// `dfa_bytes` has to be a valid DFA (regex_automata::dfa::sparse::DFA::from_bytes(dfa_bytes).is_ok()) - pub const unsafe fn from_dfa_bytes_unchecked(dfa_bytes: &'data [u8]) -> Self { - Self { - dfa_bytes: Cow::Borrowed(dfa_bytes), - pattern: None, - } - } - - /// Creates a `StringMatcher` from regex. - #[cfg(any(feature = "datagen", feature = "serde_human",))] - pub fn new(pattern: &str) -> Result { - use regex_automata::{ - dfa::dense::{Builder, Config}, - SyntaxConfig, - }; - - let mut builder = Builder::new(); - let dfa = builder - .syntax(SyntaxConfig::new().case_insensitive(true)) - .configure(Config::new().anchored(true).minimize(true)) - .build(pattern) - .map_err(|_| { - icu_provider::DataError::custom("Cannot build DFA").with_display_context(&pattern) - })? - .to_sparse() - .map_err(|_| { - icu_provider::DataError::custom("Cannot sparsify DFA") - .with_display_context(&pattern) - })?; - - Ok(Self { - dfa_bytes: dfa.to_bytes_little_endian().into(), - pattern: Some(pattern.to_string().into()), - }) - } - - #[allow(clippy::unwrap_used)] // by invariant - pub(crate) fn test(&self, string: &str) -> bool { - cfg!(target_endian = "little") - && matches!( - // Safe due to struct invariant. - unsafe { DFA::from_bytes_unchecked(&self.dfa_bytes).unwrap().0 } - .find_earliest_fwd(string.as_bytes()), - Ok(Some(_)) - ) - } -} - -#[cfg(all(test, feature = "datagen"))] -mod test { - use super::*; - - #[test] - fn test_string_matcher() { - let matcher = StringMatcher::new("abc.*").unwrap(); - assert!(!matcher.test("ab")); - assert!(matcher.test("abc")); - assert!(matcher.test("abcde")); - } - - #[test] - fn test_postcard_serialization() { - let matcher = StringMatcher::new("abc*").unwrap(); - - let mut bytes = postcard::to_stdvec(&matcher).unwrap(); - assert_eq!( - postcard::from_bytes::(&bytes).unwrap(), - matcher - ); - - // A corrupted byte leads to an error - bytes[17] ^= 255; - assert!(postcard::from_bytes::(&bytes).is_err()); - bytes[17] ^= 255; - - // An extra byte leads to an error - bytes.insert(123, 40); - assert!(postcard::from_bytes::(&bytes).is_err()); - bytes.remove(123); - - // Missing bytes lead to an error - assert!(postcard::from_bytes::(&bytes[0..bytes.len() - 5]).is_err()); - } - - #[test] - #[cfg(feature = "serde_human")] - fn test_json_serialization() { - let matcher = StringMatcher::new("abc*").unwrap(); - - let json = serde_json::to_string(&matcher).unwrap(); - assert_eq!( - serde_json::from_str::(&json).unwrap(), - matcher - ); - assert!(serde_json::from_str::(".*[").is_err()); - } - - #[test] - #[ignore] // https://github.com/rust-lang/rust/issues/98906 - fn databake() { - databake::test_bake!( - StringMatcher, - const: unsafe { - crate::provider::StringMatcher::from_dfa_bytes_unchecked(&[49u8, 50u8, 51u8, ]) - }, - icu_list - ); - } -} -- cgit v1.2.3