diff options
Diffstat (limited to '')
-rw-r--r-- | vendor/icu_list/src/provider.rs | 465 | ||||
-rw-r--r-- | vendor/icu_list/src/provider/mod.rs | 261 | ||||
-rw-r--r-- | vendor/icu_list/src/provider/serde_dfa.rs | 244 |
3 files changed, 505 insertions, 465 deletions
diff --git a/vendor/icu_list/src/provider.rs b/vendor/icu_list/src/provider.rs deleted file mode 100644 index 27f3e4fec..000000000 --- a/vendor/icu_list/src/provider.rs +++ /dev/null @@ -1,465 +0,0 @@ -// This file is part of ICU4X. For terms of use, please see the file -// called LICENSE at the top level of the ICU4X source tree -// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). - -// Provider structs must be stable -#![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)] - -//! Data provider struct definitions for this ICU4X component. -//! -//! Read more about data providers: [`icu_provider`] - -use crate::ListLength; -use alloc::borrow::Cow; -use icu_provider::DataMarker; -use icu_provider::{yoke, zerofrom}; -use writeable::{LengthHint, Writeable}; - -pub use crate::string_matcher::StringMatcher; - -/// Symbols and metadata required for [`ListFormatter`](crate::ListFormatter). -#[icu_provider::data_struct( - AndListV1Marker = "list/and@1", - OrListV1Marker = "list/or@1", - UnitListV1Marker = "list/unit@1" -)] -#[derive(Clone, Debug)] -#[cfg_attr( - feature = "datagen", - derive(serde::Serialize, databake::Bake), - databake(path = icu_list::provider), -)] -pub struct ListFormatterPatternsV1<'data>( - #[cfg_attr(feature = "datagen", serde(with = "deduplicating_array"))] - /// The patterns in the order start, middle, end, pair, short_start, short_middle, - /// short_end, short_pair, narrow_start, narrow_middle, narrow_end, narrow_pair, - pub [ConditionalListJoinerPattern<'data>; 12], -); - -#[cfg(feature = "serde")] -impl<'de> serde::Deserialize<'de> for ListFormatterPatternsV1<'de> { - fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> - where - D: serde::de::Deserializer<'de>, - { - #[cfg(not(feature = "serde_human"))] - if deserializer.is_human_readable() { - use serde::de::Error; - return Err(D::Error::custom( - "Deserializing human-readable ListFormatter data requires the 'serde_human' feature", - )); - } - - Ok(ListFormatterPatternsV1(deduplicating_array::deserialize( - deserializer, - )?)) - } -} - -pub(crate) struct ErasedListV1Marker; - -impl DataMarker for ErasedListV1Marker { - type Yokeable = ListFormatterPatternsV1<'static>; -} - -impl<'data> ListFormatterPatternsV1<'data> { - pub(crate) fn start(&self, style: ListLength) -> &ConditionalListJoinerPattern<'data> { - #![allow(clippy::indexing_slicing)] // style as usize < 3 - &self.0[4 * (style as usize)] - } - - pub(crate) fn middle(&self, style: ListLength) -> &ConditionalListJoinerPattern<'data> { - #![allow(clippy::indexing_slicing)] // style as usize < 3 - &self.0[4 * (style as usize) + 1] - } - - pub(crate) fn end(&self, style: ListLength) -> &ConditionalListJoinerPattern<'data> { - #![allow(clippy::indexing_slicing)] // style as usize < 3 - &self.0[4 * (style as usize) + 2] - } - - pub(crate) fn pair(&self, style: ListLength) -> &ConditionalListJoinerPattern<'data> { - #![allow(clippy::indexing_slicing)] // style as usize < 3 - &self.0[4 * (style as usize) + 3] - } - - /// The range of the number of bytes required by the list literals to join a - /// list of length `len`. If none of the patterns are conditional, this is exact. - pub(crate) fn size_hint(&self, style: ListLength, len: usize) -> LengthHint { - match len { - 0 | 1 => LengthHint::exact(0), - 2 => self.pair(style).size_hint(), - n => { - self.start(style).size_hint() - + self.middle(style).size_hint() * (n - 3) - + self.end(style).size_hint() - } - } - } -} - -/// A pattern that can behave conditionally on the next element. -#[derive(Clone, Debug, PartialEq, yoke::Yokeable, zerofrom::ZeroFrom)] -#[cfg_attr( - feature = "datagen", - derive(serde::Serialize, databake::Bake), - databake(path = icu_list::provider), -)] -#[cfg_attr(feature = "serde", derive(serde::Deserialize))] -pub struct ConditionalListJoinerPattern<'data> { - /// The default pattern - #[cfg_attr(feature = "serde", serde(borrow))] - pub default: ListJoinerPattern<'data>, - /// And optional special case - #[cfg_attr(feature = "serde", serde(borrow))] - pub special_case: Option<SpecialCasePattern<'data>>, -} - -/// The special case of a [`ConditionalListJoinerPattern`] -#[derive(Clone, Debug, PartialEq, yoke::Yokeable, zerofrom::ZeroFrom)] -#[cfg_attr( - feature = "datagen", - derive(serde::Serialize, databake::Bake), - databake(path = icu_list::provider), -)] -#[cfg_attr(feature = "serde", derive(serde::Deserialize))] -pub struct SpecialCasePattern<'data> { - /// The condition on the following element - #[cfg_attr(feature = "serde", serde(borrow))] - pub condition: StringMatcher<'data>, - /// The pattern if the condition matches - #[cfg_attr(feature = "serde", serde(borrow))] - pub pattern: ListJoinerPattern<'data>, -} - -/// A pattern containing two numeric placeholders ("{0}, and {1}.") -#[derive(Clone, Debug, PartialEq, yoke::Yokeable, zerofrom::ZeroFrom)] -#[cfg_attr(feature = "datagen", derive(serde::Serialize))] -pub struct ListJoinerPattern<'data> { - /// The pattern string without the placeholders - string: Cow<'data, str>, - /// The index of the first placeholder. Always <= index_1. - // Always 0 for CLDR data, so we don't need to serialize it. - // In-memory we have free space for it as index_1 doesn't - // fill a word. - #[cfg_attr(feature = "datagen", serde(skip))] - index_0: u8, - /// The index of the second placeholder. Always < string.len(). - index_1: u8, -} - -#[cfg(feature = "serde")] -impl<'de: 'data, 'data> serde::Deserialize<'de> for ListJoinerPattern<'data> { - fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> - where - D: serde::Deserializer<'de>, - { - #[derive(serde::Deserialize)] - struct Dummy<'data> { - #[cfg_attr(feature = "serde", serde(borrow))] - string: Cow<'data, str>, - index_1: u8, - } - let Dummy { string, index_1 } = Dummy::deserialize(deserializer)?; - - if index_1 as usize > string.len() { - use serde::de::Error; - Err(D::Error::custom("invalid index_1")) - } else { - Ok(ListJoinerPattern { - string, - index_0: 0, - index_1, - }) - } - } -} - -impl<'a> ListJoinerPattern<'a> { - /// Constructs a [`ListJoinerPattern`] from raw parts. Used by databake. - /// - /// # Safety - /// index_1 may be at most string.len() - pub const unsafe fn from_parts_unchecked(string: &'a str, index_1: u8) -> Self { - Self { - string: Cow::Borrowed(string), - index_0: 0, - index_1, - } - } -} - -pub(crate) type PatternParts<'a> = (&'a str, &'a str, &'a str); - -impl<'a> ConditionalListJoinerPattern<'a> { - pub(crate) fn parts<'b, W: Writeable + ?Sized>( - &'a self, - following_value: &'b W, - ) -> PatternParts<'a> { - match &self.special_case { - Some(SpecialCasePattern { condition, pattern }) - // TODO: Implement lookahead instead of materializing here. - if condition.test(&*following_value.write_to_string()) => - { - pattern.borrow_tuple() - } - _ => self.default.borrow_tuple(), - } - } - - /// The expected length of this pattern - pub fn size_hint(&'a self) -> LengthHint { - let mut hint = self.default.size_hint(); - if let Some(special_case) = &self.special_case { - hint |= special_case.pattern.size_hint() - } - hint - } -} - -impl<'data> ListJoinerPattern<'data> { - fn borrow_tuple(&'data self) -> PatternParts<'data> { - #![allow(clippy::indexing_slicing)] // by invariant - let index_0 = self.index_0 as usize; - let index_1 = self.index_1 as usize; - ( - &self.string[0..index_0], - &self.string[index_0..index_1], - &self.string[index_1..], - ) - } - - fn size_hint(&self) -> LengthHint { - LengthHint::exact(self.string.len()) - } -} - -#[cfg(feature = "datagen")] -mod datagen { - #![allow(clippy::indexing_slicing)] // datagen - - use super::*; - use icu_provider::DataError; - - impl<'data> ListFormatterPatternsV1<'data> { - /// The patterns in the order start, middle, end, pair, short_start, short_middle, - /// short_end, short_pair, narrow_start, narrow_middle, narrow_end, narrow_pair, - pub fn try_new(patterns: [&str; 12]) -> Result<Self, DataError> { - Ok(Self([ - ListJoinerPattern::from_str(patterns[0], true, false)?.into(), - ListJoinerPattern::from_str(patterns[1], false, false)?.into(), - ListJoinerPattern::from_str(patterns[2], false, true)?.into(), - ListJoinerPattern::from_str(patterns[3], true, true)?.into(), - ListJoinerPattern::from_str(patterns[4], true, false)?.into(), - ListJoinerPattern::from_str(patterns[5], false, false)?.into(), - ListJoinerPattern::from_str(patterns[6], false, true)?.into(), - ListJoinerPattern::from_str(patterns[7], true, true)?.into(), - ListJoinerPattern::from_str(patterns[8], true, false)?.into(), - ListJoinerPattern::from_str(patterns[9], false, false)?.into(), - ListJoinerPattern::from_str(patterns[10], false, true)?.into(), - ListJoinerPattern::from_str(patterns[11], true, true)?.into(), - ])) - } - - /// Adds a special case to all `pattern`s that will evaluate to - /// `alternative_pattern` when `regex` matches the following element. - /// The regex is interpreted case-insensitive and anchored to the beginning, but - /// to improve efficiency does not search for full matches. If a full match is - /// required, use `$`. - pub fn make_conditional( - &mut self, - pattern: &str, - regex: &StringMatcher<'static>, - alternative_pattern: &str, - ) -> Result<(), DataError> { - let old = ListJoinerPattern::from_str(pattern, true, true)?; - for i in 0..12 { - if self.0[i].default == old { - self.0[i].special_case = Some(SpecialCasePattern { - condition: regex.clone(), - pattern: ListJoinerPattern::from_str( - alternative_pattern, - i % 4 == 0 || i % 4 == 3, // allow_prefix = start or pair - i % 4 == 2 || i % 4 == 3, // allow_suffix = end or pair - )?, - }); - } - } - Ok(()) - } - } - - impl<'data> ListJoinerPattern<'data> { - /// Construct the pattern from a CLDR pattern string - pub fn from_str( - pattern: &str, - allow_prefix: bool, - allow_suffix: bool, - ) -> Result<Self, DataError> { - match (pattern.find("{0}"), pattern.find("{1}")) { - (Some(index_0), Some(index_1)) - if index_0 < index_1 - && (allow_prefix || index_0 == 0) - && (allow_suffix || index_1 == pattern.len() - 3) => - { - if (index_0 > 0 && !cfg!(test)) || index_1 - 3 >= 256 { - return Err(DataError::custom( - "Found valid pattern that cannot be stored in ListFormatterPatternsV1", - ) - .with_debug_context(pattern)); - } - Ok(ListJoinerPattern { - string: Cow::Owned(alloc::format!( - "{}{}{}", - &pattern[0..index_0], - &pattern[index_0 + 3..index_1], - &pattern[index_1 + 3..] - )), - index_0: index_0 as u8, - index_1: (index_1 - 3) as u8, - }) - } - _ => Err(DataError::custom("Invalid list pattern").with_debug_context(pattern)), - } - } - } - - impl<'data> From<ListJoinerPattern<'data>> for ConditionalListJoinerPattern<'data> { - fn from(default: ListJoinerPattern<'data>) -> Self { - Self { - default, - special_case: None, - } - } - } - - impl databake::Bake for ListJoinerPattern<'_> { - fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream { - env.insert("icu_list"); - let string = (&*self.string).bake(env); - let index_1 = self.index_1.bake(env); - // Safe because our own data is safe - databake::quote! { unsafe { - ::icu_list::provider::ListJoinerPattern::from_parts_unchecked(#string, #index_1) - }} - } - } -} - -#[cfg(all(test, feature = "datagen"))] -pub(crate) mod test { - use super::*; - - pub fn test_patterns() -> ListFormatterPatternsV1<'static> { - let mut patterns = ListFormatterPatternsV1::try_new([ - // Wide: general - "@{0}:{1}", - "{0},{1}", - "{0}.{1}!", - "${0};{1}+", - // Short: different pattern lengths - "{0}1{1}", - "{0}12{1}", - "{0}12{1}34", - "{0}123{1}456", - // Narrow: conditionals - "{0}: {1}", - "{0}, {1}", - "{0}. {1}", - "{0}. {1}", - ]) - .unwrap(); - patterns - .make_conditional("{0}. {1}", &StringMatcher::new("A").unwrap(), "{0} :o {1}") - .unwrap(); - patterns - } - - #[test] - fn rejects_bad_patterns() { - assert!(ListJoinerPattern::from_str("{0} and", true, true).is_err()); - assert!(ListJoinerPattern::from_str("and {1}", true, true).is_err()); - assert!(ListJoinerPattern::from_str("{1} and {0}", true, true).is_err()); - assert!(ListJoinerPattern::from_str("{1{0}}", true, true).is_err()); - assert!(ListJoinerPattern::from_str("{0\u{202e}} and {1}", true, true).is_err()); - assert!(ListJoinerPattern::from_str("{{0}} {{1}}", true, true).is_ok()); - - assert!(ListJoinerPattern::from_str("{0} and {1} ", true, true).is_ok()); - assert!(ListJoinerPattern::from_str("{0} and {1} ", true, false).is_err()); - assert!(ListJoinerPattern::from_str(" {0} and {1}", true, true).is_ok()); - assert!(ListJoinerPattern::from_str(" {0} and {1}", false, true).is_err()); - } - - #[test] - fn produces_correct_parts() { - assert_eq!( - test_patterns().pair(ListLength::Wide).parts(""), - ("$", ";", "+") - ); - } - - #[test] - fn produces_correct_parts_conditionally() { - assert_eq!( - test_patterns().end(ListLength::Narrow).parts("A"), - ("", " :o ", "") - ); - assert_eq!( - test_patterns().end(ListLength::Narrow).parts("a"), - ("", " :o ", "") - ); - assert_eq!( - test_patterns().end(ListLength::Narrow).parts("ab"), - ("", " :o ", "") - ); - assert_eq!( - test_patterns().end(ListLength::Narrow).parts("B"), - ("", ". ", "") - ); - assert_eq!( - test_patterns().end(ListLength::Narrow).parts("BA"), - ("", ". ", "") - ); - } - - #[test] - fn size_hint_works() { - let pattern = test_patterns(); - - assert_eq!( - pattern.size_hint(ListLength::Short, 0), - LengthHint::exact(0) - ); - assert_eq!( - pattern.size_hint(ListLength::Short, 1), - LengthHint::exact(0) - ); - - // pair pattern "{0}123{1}456" - assert_eq!( - pattern.size_hint(ListLength::Short, 2), - LengthHint::exact(6) - ); - - // patterns "{0}1{1}", "{0}12{1}" (x197), and "{0}12{1}34" - assert_eq!( - pattern.size_hint(ListLength::Short, 200), - LengthHint::exact(1 + 2 * 197 + 4) - ); - - // patterns "{0}: {1}", "{0}, {1}" (x197), and "{0} :o {1}" or "{0}. {1}" - assert_eq!( - pattern.size_hint(ListLength::Narrow, 200), - LengthHint::exact(2 + 197 * 2) + LengthHint::between(2, 4) - ); - } - - #[test] - fn databake() { - databake::test_bake!( - ListJoinerPattern, - const: unsafe { crate::provider::ListJoinerPattern::from_parts_unchecked(", ", 2u8) }, - icu_list - ); - } -} diff --git a/vendor/icu_list/src/provider/mod.rs b/vendor/icu_list/src/provider/mod.rs new file mode 100644 index 000000000..efab7c8bc --- /dev/null +++ b/vendor/icu_list/src/provider/mod.rs @@ -0,0 +1,261 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +// Provider structs must be stable +#![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)] + +//! 🚧 \[Unstable\] Data provider struct definitions for this ICU4X component. +//! +//! <div class="stab unstable"> +//! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +//! including in SemVer minor releases. While the serde representation of data structs is guaranteed +//! to be stable, their Rust representation might not be. Use with caution. +//! </div> +//! +//! Read more about data providers: [`icu_provider`] + +use crate::ListLength; +use alloc::borrow::Cow; +use icu_provider::DataMarker; +use icu_provider::{yoke, zerofrom}; + +mod serde_dfa; +pub use serde_dfa::SerdeDFA; + +/// Symbols and metadata required for [`ListFormatter`](crate::ListFormatter). +/// +/// <div class="stab unstable"> +/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +/// including in SemVer minor releases. While the serde representation of data structs is guaranteed +/// to be stable, their Rust representation might not be. Use with caution. +/// </div> +#[icu_provider::data_struct( + AndListV1Marker = "list/and@1", + OrListV1Marker = "list/or@1", + UnitListV1Marker = "list/unit@1" +)] +#[derive(Clone, Debug)] +#[cfg_attr( + feature = "datagen", + derive(serde::Serialize, databake::Bake), + databake(path = icu_list::provider), +)] +pub struct ListFormatterPatternsV1<'data>( + #[cfg_attr(feature = "datagen", serde(with = "deduplicating_array"))] + /// The patterns in the order start, middle, end, pair, short_start, short_middle, + /// short_end, short_pair, narrow_start, narrow_middle, narrow_end, narrow_pair, + pub [ConditionalListJoinerPattern<'data>; 12], +); + +#[cfg(feature = "serde")] +impl<'de> serde::Deserialize<'de> for ListFormatterPatternsV1<'de> { + fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> + where + D: serde::de::Deserializer<'de>, + { + #[cfg(not(feature = "serde_human"))] + if deserializer.is_human_readable() { + use serde::de::Error; + return Err(D::Error::custom( + "Deserializing human-readable ListFormatter data requires the 'serde_human' feature", + )); + } + + Ok(ListFormatterPatternsV1(deduplicating_array::deserialize( + deserializer, + )?)) + } +} + +pub(crate) struct ErasedListV1Marker; + +impl DataMarker for ErasedListV1Marker { + type Yokeable = ListFormatterPatternsV1<'static>; +} + +impl<'data> ListFormatterPatternsV1<'data> { + pub(crate) fn start(&self, style: ListLength) -> &ConditionalListJoinerPattern<'data> { + #![allow(clippy::indexing_slicing)] // style as usize < 3 + &self.0[4 * (style as usize)] + } + + pub(crate) fn middle(&self, style: ListLength) -> &ConditionalListJoinerPattern<'data> { + #![allow(clippy::indexing_slicing)] // style as usize < 3 + &self.0[4 * (style as usize) + 1] + } + + pub(crate) fn end(&self, style: ListLength) -> &ConditionalListJoinerPattern<'data> { + #![allow(clippy::indexing_slicing)] // style as usize < 3 + &self.0[4 * (style as usize) + 2] + } + + pub(crate) fn pair(&self, style: ListLength) -> &ConditionalListJoinerPattern<'data> { + #![allow(clippy::indexing_slicing)] // style as usize < 3 + &self.0[4 * (style as usize) + 3] + } +} + +/// A pattern that can behave conditionally on the next element. +/// +/// <div class="stab unstable"> +/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +/// including in SemVer minor releases. While the serde representation of data structs is guaranteed +/// to be stable, their Rust representation might not be. Use with caution. +/// </div> +#[derive(Clone, Debug, yoke::Yokeable, zerofrom::ZeroFrom)] +#[cfg_attr( + feature = "datagen", + derive(PartialEq, serde::Serialize, databake::Bake), + databake(path = icu_list::provider), +)] +#[cfg_attr(feature = "serde", derive(serde::Deserialize))] +pub struct ConditionalListJoinerPattern<'data> { + /// The default pattern + #[cfg_attr(feature = "serde", serde(borrow))] + pub default: ListJoinerPattern<'data>, + /// And optional special case + #[cfg_attr( + feature = "serde", + serde(borrow, deserialize_with = "SpecialCasePattern::deserialize_option") + )] + pub special_case: Option<SpecialCasePattern<'data>>, +} + +/// The special case of a [`ConditionalListJoinerPattern`] +/// +/// <div class="stab unstable"> +/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +/// including in SemVer minor releases. While the serde representation of data structs is guaranteed +/// to be stable, their Rust representation might not be. Use with caution. +/// </div> +#[derive(Clone, Debug, yoke::Yokeable, zerofrom::ZeroFrom)] +#[cfg_attr( + feature = "datagen", + derive(PartialEq, serde::Serialize, databake::Bake), + databake(path = icu_list::provider), +)] +pub struct SpecialCasePattern<'data> { + /// The condition on the following element + pub condition: SerdeDFA<'data>, + /// The pattern if the condition matches + pub pattern: ListJoinerPattern<'data>, +} + +#[cfg(feature = "serde")] +impl<'data> SpecialCasePattern<'data> { + // If the condition doesn't deserialize, the whole special case becomes `None` + fn deserialize_option<'de: 'data, D>(deserializer: D) -> Result<Option<Self>, D::Error> + where + D: serde::de::Deserializer<'de>, + { + use serde::Deserialize; + + #[derive(Deserialize)] + struct SpecialCasePatternOptionalDfa<'data> { + #[cfg_attr( + feature = "serde", + serde(borrow, deserialize_with = "SerdeDFA::maybe_deserialize") + )] + pub condition: Option<SerdeDFA<'data>>, + #[cfg_attr(feature = "serde", serde(borrow))] + pub pattern: ListJoinerPattern<'data>, + } + + Ok( + match Option::<SpecialCasePatternOptionalDfa<'data>>::deserialize(deserializer)? { + Some(SpecialCasePatternOptionalDfa { + condition: Some(condition), + pattern, + }) => Some(SpecialCasePattern { condition, pattern }), + _ => None, + }, + ) + } +} + +/// A pattern containing two numeric placeholders ("{0}, and {1}.") +/// +/// <div class="stab unstable"> +/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +/// including in SemVer minor releases. While the serde representation of data structs is guaranteed +/// to be stable, their Rust representation might not be. Use with caution. +/// </div> +#[derive(Clone, Debug, PartialEq, yoke::Yokeable, zerofrom::ZeroFrom)] +#[cfg_attr(feature = "datagen", derive(serde::Serialize))] +pub struct ListJoinerPattern<'data> { + /// The pattern string without the placeholders + pub(crate) string: Cow<'data, str>, + /// The index of the first placeholder. Always <= index_1. + // Always 0 for CLDR data, so we don't need to serialize it. + // In-memory we have free space for it as index_1 doesn't + // fill a word. + #[cfg_attr(feature = "datagen", serde(skip))] + pub(crate) index_0: u8, + /// The index of the second placeholder. Always < string.len(). + pub(crate) index_1: u8, +} + +#[cfg(feature = "serde")] +impl<'de: 'data, 'data> serde::Deserialize<'de> for ListJoinerPattern<'data> { + fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> + where + D: serde::Deserializer<'de>, + { + #[derive(serde::Deserialize)] + struct Dummy<'data> { + #[cfg_attr(feature = "serde", serde(borrow))] + string: Cow<'data, str>, + index_1: u8, + } + let Dummy { string, index_1 } = Dummy::deserialize(deserializer)?; + + if index_1 as usize > string.len() { + use serde::de::Error; + Err(D::Error::custom("invalid index_1")) + } else { + Ok(ListJoinerPattern { + string, + index_0: 0, + index_1, + }) + } + } +} + +impl<'a> ListJoinerPattern<'a> { + /// Constructs a [`ListJoinerPattern`] from raw parts. Used by databake. + /// + /// # Safety + /// index_1 may be at most string.len() + pub const unsafe fn from_parts_unchecked(string: &'a str, index_1: u8) -> Self { + Self { + string: Cow::Borrowed(string), + index_0: 0, + index_1, + } + } +} + +#[cfg(feature = "datagen")] +impl databake::Bake for ListJoinerPattern<'_> { + fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream { + env.insert("icu_list"); + let string = (&*self.string).bake(env); + let index_1 = self.index_1.bake(env); + // Safe because our own data is safe + databake::quote! { unsafe { + ::icu_list::provider::ListJoinerPattern::from_parts_unchecked(#string, #index_1) + }} + } +} + +#[cfg(all(test, feature = "datagen"))] +#[test] +fn databake() { + databake::test_bake!( + ListJoinerPattern, + const: unsafe { crate::provider::ListJoinerPattern::from_parts_unchecked(", ", 2u8) }, + icu_list + ); +} diff --git a/vendor/icu_list/src/provider/serde_dfa.rs b/vendor/icu_list/src/provider/serde_dfa.rs new file mode 100644 index 000000000..e2424e1e9 --- /dev/null +++ b/vendor/icu_list/src/provider/serde_dfa.rs @@ -0,0 +1,244 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use alloc::borrow::Cow; +use icu_provider::{yoke, zerofrom}; +use regex_automata::dfa::sparse::DFA; + +/// A serde-compatible version of [regex_automata::dfa::sparse::DFA]. This does not implement +/// [`serde::Deserialize`] directly, as binary deserialization is not supported in big-endian +/// platforms. `Self::maybe_deserialize` can be used to deserialize to `Option<SerdeDFA>`. +/// +/// <div class="stab unstable"> +/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +/// including in SemVer minor releases. While the serde representation of data structs is guaranteed +/// to be stable, their Rust representation might not be. Use with caution. +/// </div> +#[derive(Clone, Debug, yoke::Yokeable, zerofrom::ZeroFrom)] +pub struct SerdeDFA<'data> { + // Safety: These always represent a valid DFA (DFA::from_bytes(dfa_bytes).is_ok()) + dfa_bytes: Cow<'data, [u8]>, + pattern: Option<Cow<'data, str>>, +} + +#[cfg(feature = "datagen")] +impl PartialEq for SerdeDFA<'_> { + fn eq(&self, other: &Self) -> bool { + self.dfa_bytes == other.dfa_bytes + } +} + +#[cfg(feature = "datagen")] +impl databake::Bake for SerdeDFA<'_> { + fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream { + env.insert("icu_list"); + let le_bytes = self.deref().to_bytes_little_endian().as_slice().bake(env); + let be_bytes = self.deref().to_bytes_big_endian().as_slice().bake(env); + // Safe because of `to_bytes_little_endian`/`to_bytes_big_endian`'s invariant. + databake::quote! { + unsafe { + ::icu_list::provider::SerdeDFA::from_dfa_bytes_unchecked( + if cfg!(target_endian = "little") { + &#le_bytes + } else { + &#be_bytes + } + ) + } + } + } +} + +#[cfg(feature = "datagen")] +impl serde::Serialize for SerdeDFA<'_> { + fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> + where + S: serde::ser::Serializer, + { + if serializer.is_human_readable() { + self.pattern + .as_ref() + .map(|pattern| pattern.serialize(serializer)) + .unwrap_or_else(|| { + use serde::ser::Error; + Err(S::Error::custom( + "cannot serialize a deserialized bincode SerdeDFA to JSON", + )) + }) + } else { + self.deref().to_bytes_little_endian().serialize(serializer) + } + } +} + +#[cfg(feature = "serde")] +impl<'data> SerdeDFA<'data> { + /// Deserializes to `Option<Self>`. Will return `None` for non-human-readable serialization + /// formats on big-endian systems, as `regex_automata` serialization is endian-sensitive. + pub fn maybe_deserialize<'de: 'data, D>(deserializer: D) -> Result<Option<Self>, D::Error> + where + D: serde::de::Deserializer<'de>, + { + use icu_provider::serde::borrow_de_utils::CowBytesWrap; + use serde::Deserialize; + + #[cfg(feature = "serde_human")] + if deserializer.is_human_readable() { + #[cfg(not(feature = "std"))] + use alloc::string::ToString; + use serde::de::Error; + return SerdeDFA::new(Cow::<str>::deserialize(deserializer)?) + .map(Some) + .map_err(|e| D::Error::custom(e.to_string())); + } + + let dfa_bytes = <CowBytesWrap<'de>>::deserialize(deserializer)?.0; + + if cfg!(target_endian = "big") { + return Ok(None); + } + + // Verify safety invariant + DFA::from_bytes(&dfa_bytes).map_err(|e| { + use serde::de::Error; + D::Error::custom(alloc::format!("Invalid DFA bytes: {}", e)) + })?; + + Ok(Some(SerdeDFA { + dfa_bytes, + pattern: None, + })) + } +} + +impl<'data> SerdeDFA<'data> { + /// Creates a `SerdeDFA` from raw bytes. Used internally by databake. + /// + /// # Safety + /// + /// `dfa_bytes` has to be a valid DFA (regex_automata::dfa::sparse::DFA::from_bytes(dfa_bytes).is_ok()) + pub const unsafe fn from_dfa_bytes_unchecked(dfa_bytes: &'data [u8]) -> Self { + Self { + dfa_bytes: Cow::Borrowed(dfa_bytes), + pattern: None, + } + } + + /// Creates a `SerdeDFA` from a regex. + #[cfg(any(feature = "datagen", feature = "serde_human",))] + pub fn new(pattern: Cow<'data, str>) -> Result<Self, icu_provider::DataError> { + use regex_automata::{ + dfa::dense::{Builder, Config}, + SyntaxConfig, + }; + + let mut builder = Builder::new(); + let dfa = builder + .syntax(SyntaxConfig::new().case_insensitive(true)) + .configure(Config::new().anchored(true).minimize(true)) + .build(&pattern) + .map_err(|_| { + icu_provider::DataError::custom("Cannot build DFA").with_display_context(&pattern) + })? + .to_sparse() + .map_err(|_| { + icu_provider::DataError::custom("Cannot sparsify DFA") + .with_display_context(&pattern) + })?; + + Ok(Self { + dfa_bytes: dfa.to_bytes_native_endian().into(), + pattern: Some(pattern), + }) + } + + /// Returns the represented [`DFA`] + #[allow(clippy::unwrap_used)] // by invariant + pub fn deref(&'data self) -> DFA<&'data [u8]> { + // Safe due to struct invariant. + unsafe { DFA::from_bytes_unchecked(&self.dfa_bytes).unwrap().0 } + } +} + +#[cfg(all(test, feature = "datagen"))] +mod test { + use super::*; + + #[test] + fn test_serde_dfa() { + use regex_automata::dfa::Automaton; + + let matcher = SerdeDFA::new(Cow::Borrowed("abc")).unwrap(); + + assert!(matcher.deref().find_earliest_fwd(b"ab").unwrap().is_none()); + assert!(matcher.deref().find_earliest_fwd(b"abc").unwrap().is_some()); + assert!(matcher + .deref() + .find_earliest_fwd(b"abcde") + .unwrap() + .is_some()); + assert!(matcher + .deref() + .find_earliest_fwd(b" abcde") + .unwrap() + .is_none()); + } + + #[derive(serde::Deserialize)] + struct OptionSerdeDFA<'data>( + #[serde(borrow, deserialize_with = "SerdeDFA::maybe_deserialize")] Option<SerdeDFA<'data>>, + ); + + #[test] + #[cfg(target_endian = "little")] + fn test_postcard_serialization() { + let matcher = SerdeDFA::new(Cow::Borrowed("abc*")).unwrap(); + + let mut bytes = postcard::to_stdvec(&matcher).unwrap(); + assert_eq!( + postcard::from_bytes::<OptionSerdeDFA>(&bytes).unwrap().0, + Some(matcher) + ); + + // A corrupted byte leads to an error + bytes[17] ^= 255; + assert!(postcard::from_bytes::<OptionSerdeDFA>(&bytes).is_err()); + bytes[17] ^= 255; + + // An extra byte leads to an error + bytes.insert(123, 40); + assert!(postcard::from_bytes::<OptionSerdeDFA>(&bytes).is_err()); + bytes.remove(123); + + // Missing bytes lead to an error + assert!(postcard::from_bytes::<OptionSerdeDFA>(&bytes[0..bytes.len() - 5]).is_err()); + } + + #[test] + #[cfg(feature = "serde_human")] + fn test_json_serialization() { + let matcher = SerdeDFA::new(Cow::Borrowed("abc*")).unwrap(); + + let json = serde_json::to_string(&matcher).unwrap(); + assert_eq!( + serde_json::from_str::<OptionSerdeDFA>(&json).unwrap().0, + Some(matcher) + ); + assert!(serde_json::from_str::<OptionSerdeDFA>(".*[").is_err()); + } + + #[test] + #[ignore] // https://github.com/rust-lang/rust/issues/98906 + fn databake() { + databake::test_bake!( + SerdeDFA, + const: unsafe { crate::provider::SerdeDFA::from_dfa_bytes_unchecked(if cfg!(target_endian = "little") { + &[1] // TODO: set this when activating the test + } else { + &[2] // TODO: set this when activating the test + })}, + icu_list + ); + } +} |