// This file is part of ICU4X. For terms of use, please see the file // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). use alloc::borrow::Cow; #[cfg(any(feature = "serde_human", feature = "datagen"))] use alloc::string::ToString; use icu_provider::{yoke, zerofrom}; use regex_automata::dfa::sparse::DFA; use regex_automata::dfa::Automaton; /// A precompiled regex #[derive(Clone, Debug, yoke::Yokeable, zerofrom::ZeroFrom)] #[allow(clippy::exhaustive_structs)] // not a public API pub struct StringMatcher<'data> { // Safety: These always represent a valid DFA (DFA::from_bytes(dfa_bytes).is_ok()) dfa_bytes: Cow<'data, [u8]>, pattern: Option>, } impl PartialEq for StringMatcher<'_> { fn eq(&self, other: &Self) -> bool { self.dfa_bytes == other.dfa_bytes } } #[cfg(feature = "datagen")] impl databake::Bake for StringMatcher<'_> { fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream { env.insert("icu_list"); let bytes = (&&*self.dfa_bytes).bake(env); // Safe because our own data is safe databake::quote! { unsafe { ::icu_list::provider::StringMatcher::from_dfa_bytes_unchecked(#bytes) } } } } #[cfg(feature = "datagen")] impl serde::Serialize for StringMatcher<'_> { fn serialize(&self, serializer: S) -> Result where S: serde::ser::Serializer, { if serializer.is_human_readable() { self.pattern .as_ref() .map(|pattern| pattern.serialize(serializer)) .unwrap_or_else(|| { use serde::ser::Error; Err(S::Error::custom( "cannot serialize a deserialized bincode StringMatcher to JSON", )) }) } else { self.dfa_bytes.serialize(serializer) } } } #[cfg(feature = "serde")] impl<'de: 'data, 'data> serde::Deserialize<'de> for StringMatcher<'data> { fn deserialize(deserializer: D) -> Result where D: serde::de::Deserializer<'de>, { use icu_provider::serde::borrow_de_utils::CowBytesWrap; #[cfg(feature = "serde_human")] if deserializer.is_human_readable() { use serde::de::Error; return StringMatcher::new(<&str>::deserialize(deserializer)?) .map_err(|e| D::Error::custom(e.to_string())); } if cfg!(target_endian = "big") { // TODO: Convert LE to BE. For now we just behave like the // accept-nothing DFA on BE systems. return Ok(StringMatcher { dfa_bytes: Cow::Borrowed(&[]), pattern: None, }); } let dfa_bytes = >::deserialize(deserializer)?.0; // Verify safety invariant DFA::from_bytes(&dfa_bytes).map_err(|e| { use serde::de::Error; D::Error::custom(alloc::format!("Invalid DFA bytes: {}", e)) })?; Ok(StringMatcher { dfa_bytes, pattern: None, }) } } impl<'data> StringMatcher<'data> { /// Creates a `StringMatcher` from a serialized DFA. Used internally by databake. /// /// # Safety /// /// `dfa_bytes` has to be a valid DFA (regex_automata::dfa::sparse::DFA::from_bytes(dfa_bytes).is_ok()) pub const unsafe fn from_dfa_bytes_unchecked(dfa_bytes: &'data [u8]) -> Self { Self { dfa_bytes: Cow::Borrowed(dfa_bytes), pattern: None, } } /// Creates a `StringMatcher` from regex. #[cfg(any(feature = "datagen", feature = "serde_human",))] pub fn new(pattern: &str) -> Result { use regex_automata::{ dfa::dense::{Builder, Config}, SyntaxConfig, }; let mut builder = Builder::new(); let dfa = builder .syntax(SyntaxConfig::new().case_insensitive(true)) .configure(Config::new().anchored(true).minimize(true)) .build(pattern) .map_err(|_| { icu_provider::DataError::custom("Cannot build DFA").with_display_context(&pattern) })? .to_sparse() .map_err(|_| { icu_provider::DataError::custom("Cannot sparsify DFA") .with_display_context(&pattern) })?; Ok(Self { dfa_bytes: dfa.to_bytes_little_endian().into(), pattern: Some(pattern.to_string().into()), }) } #[allow(clippy::unwrap_used)] // by invariant pub(crate) fn test(&self, string: &str) -> bool { cfg!(target_endian = "little") && matches!( // Safe due to struct invariant. unsafe { DFA::from_bytes_unchecked(&self.dfa_bytes).unwrap().0 } .find_earliest_fwd(string.as_bytes()), Ok(Some(_)) ) } } #[cfg(all(test, feature = "datagen"))] mod test { use super::*; #[test] fn test_string_matcher() { let matcher = StringMatcher::new("abc.*").unwrap(); assert!(!matcher.test("ab")); assert!(matcher.test("abc")); assert!(matcher.test("abcde")); } #[test] fn test_postcard_serialization() { let matcher = StringMatcher::new("abc*").unwrap(); let mut bytes = postcard::to_stdvec(&matcher).unwrap(); assert_eq!( postcard::from_bytes::(&bytes).unwrap(), matcher ); // A corrupted byte leads to an error bytes[17] ^= 255; assert!(postcard::from_bytes::(&bytes).is_err()); bytes[17] ^= 255; // An extra byte leads to an error bytes.insert(123, 40); assert!(postcard::from_bytes::(&bytes).is_err()); bytes.remove(123); // Missing bytes lead to an error assert!(postcard::from_bytes::(&bytes[0..bytes.len() - 5]).is_err()); } #[test] #[cfg(feature = "serde_human")] fn test_json_serialization() { let matcher = StringMatcher::new("abc*").unwrap(); let json = serde_json::to_string(&matcher).unwrap(); assert_eq!( serde_json::from_str::(&json).unwrap(), matcher ); assert!(serde_json::from_str::(".*[").is_err()); } #[test] #[ignore] // https://github.com/rust-lang/rust/issues/98906 fn databake() { databake::test_bake!( StringMatcher, const: unsafe { crate::provider::StringMatcher::from_dfa_bytes_unchecked(&[49u8, 50u8, 51u8, ]) }, icu_list ); } }