// This file is part of ICU4X. For terms of use, please see the file // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). use alloc::borrow::Cow; use icu_provider::{yoke, zerofrom}; use regex_automata::dfa::sparse::DFA; /// A serde-compatible version of [regex_automata::dfa::sparse::DFA]. This does not implement /// [`serde::Deserialize`] directly, as binary deserialization is not supported in big-endian /// platforms. `Self::maybe_deserialize` can be used to deserialize to `Option`. /// ///
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, /// including in SemVer minor releases. While the serde representation of data structs is guaranteed /// to be stable, their Rust representation might not be. Use with caution. ///
#[derive(Clone, Debug, yoke::Yokeable, zerofrom::ZeroFrom)] pub struct SerdeDFA<'data> { // Safety: These always represent a valid DFA (DFA::from_bytes(dfa_bytes).is_ok()) dfa_bytes: Cow<'data, [u8]>, pattern: Option>, } #[cfg(feature = "datagen")] impl PartialEq for SerdeDFA<'_> { fn eq(&self, other: &Self) -> bool { self.dfa_bytes == other.dfa_bytes } } #[cfg(feature = "datagen")] impl databake::Bake for SerdeDFA<'_> { fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream { env.insert("icu_list"); let le_bytes = self.deref().to_bytes_little_endian().as_slice().bake(env); let be_bytes = self.deref().to_bytes_big_endian().as_slice().bake(env); // Safe because of `to_bytes_little_endian`/`to_bytes_big_endian`'s invariant. databake::quote! { unsafe { ::icu_list::provider::SerdeDFA::from_dfa_bytes_unchecked( if cfg!(target_endian = "little") { &#le_bytes } else { &#be_bytes } ) } } } } #[cfg(feature = "datagen")] impl serde::Serialize for SerdeDFA<'_> { fn serialize(&self, serializer: S) -> Result where S: serde::ser::Serializer, { if serializer.is_human_readable() { self.pattern .as_ref() .map(|pattern| pattern.serialize(serializer)) .unwrap_or_else(|| { use serde::ser::Error; Err(S::Error::custom( "cannot serialize a deserialized bincode SerdeDFA to JSON", )) }) } else { self.deref().to_bytes_little_endian().serialize(serializer) } } } #[cfg(feature = "serde")] impl<'data> SerdeDFA<'data> { /// Deserializes to `Option`. Will return `None` for non-human-readable serialization /// formats on big-endian systems, as `regex_automata` serialization is endian-sensitive. pub fn maybe_deserialize<'de: 'data, D>(deserializer: D) -> Result, D::Error> where D: serde::de::Deserializer<'de>, { use icu_provider::serde::borrow_de_utils::CowBytesWrap; use serde::Deserialize; #[cfg(feature = "serde_human")] if deserializer.is_human_readable() { #[cfg(not(feature = "std"))] use alloc::string::ToString; use serde::de::Error; return SerdeDFA::new(Cow::::deserialize(deserializer)?) .map(Some) .map_err(|e| D::Error::custom(e.to_string())); } let dfa_bytes = >::deserialize(deserializer)?.0; if cfg!(target_endian = "big") { return Ok(None); } // Verify safety invariant DFA::from_bytes(&dfa_bytes).map_err(|e| { use serde::de::Error; D::Error::custom(alloc::format!("Invalid DFA bytes: {}", e)) })?; Ok(Some(SerdeDFA { dfa_bytes, pattern: None, })) } } impl<'data> SerdeDFA<'data> { /// Creates a `SerdeDFA` from raw bytes. Used internally by databake. /// /// # Safety /// /// `dfa_bytes` has to be a valid DFA (regex_automata::dfa::sparse::DFA::from_bytes(dfa_bytes).is_ok()) pub const unsafe fn from_dfa_bytes_unchecked(dfa_bytes: &'data [u8]) -> Self { Self { dfa_bytes: Cow::Borrowed(dfa_bytes), pattern: None, } } /// Creates a `SerdeDFA` from a regex. #[cfg(any(feature = "datagen", feature = "serde_human",))] pub fn new(pattern: Cow<'data, str>) -> Result { use regex_automata::{ dfa::dense::{Builder, Config}, SyntaxConfig, }; let mut builder = Builder::new(); let dfa = builder .syntax(SyntaxConfig::new().case_insensitive(true)) .configure(Config::new().anchored(true).minimize(true)) .build(&pattern) .map_err(|_| { icu_provider::DataError::custom("Cannot build DFA").with_display_context(&pattern) })? .to_sparse() .map_err(|_| { icu_provider::DataError::custom("Cannot sparsify DFA") .with_display_context(&pattern) })?; Ok(Self { dfa_bytes: dfa.to_bytes_native_endian().into(), pattern: Some(pattern), }) } /// Returns the represented [`DFA`] #[allow(clippy::unwrap_used)] // by invariant pub fn deref(&'data self) -> DFA<&'data [u8]> { // Safe due to struct invariant. unsafe { DFA::from_bytes_unchecked(&self.dfa_bytes).unwrap().0 } } } #[cfg(all(test, feature = "datagen"))] mod test { use super::*; #[test] fn test_serde_dfa() { use regex_automata::dfa::Automaton; let matcher = SerdeDFA::new(Cow::Borrowed("abc")).unwrap(); assert!(matcher.deref().find_earliest_fwd(b"ab").unwrap().is_none()); assert!(matcher.deref().find_earliest_fwd(b"abc").unwrap().is_some()); assert!(matcher .deref() .find_earliest_fwd(b"abcde") .unwrap() .is_some()); assert!(matcher .deref() .find_earliest_fwd(b" abcde") .unwrap() .is_none()); } #[derive(serde::Deserialize)] struct OptionSerdeDFA<'data>( #[serde(borrow, deserialize_with = "SerdeDFA::maybe_deserialize")] Option>, ); #[test] #[cfg(target_endian = "little")] fn test_postcard_serialization() { let matcher = SerdeDFA::new(Cow::Borrowed("abc*")).unwrap(); let mut bytes = postcard::to_stdvec(&matcher).unwrap(); assert_eq!( postcard::from_bytes::(&bytes).unwrap().0, Some(matcher) ); // A corrupted byte leads to an error bytes[17] ^= 255; assert!(postcard::from_bytes::(&bytes).is_err()); bytes[17] ^= 255; // An extra byte leads to an error bytes.insert(123, 40); assert!(postcard::from_bytes::(&bytes).is_err()); bytes.remove(123); // Missing bytes lead to an error assert!(postcard::from_bytes::(&bytes[0..bytes.len() - 5]).is_err()); } #[test] #[cfg(feature = "serde_human")] fn test_json_serialization() { let matcher = SerdeDFA::new(Cow::Borrowed("abc*")).unwrap(); let json = serde_json::to_string(&matcher).unwrap(); assert_eq!( serde_json::from_str::(&json).unwrap().0, Some(matcher) ); assert!(serde_json::from_str::(".*[").is_err()); } #[test] #[ignore] // https://github.com/rust-lang/rust/issues/98906 fn databake() { databake::test_bake!( SerdeDFA, const: unsafe { crate::provider::SerdeDFA::from_dfa_bytes_unchecked(if cfg!(target_endian = "little") { &[1] // TODO: set this when activating the test } else { &[2] // TODO: set this when activating the test })}, icu_list ); } }