summaryrefslogtreecommitdiffstats
path: root/vendor/icu_list/src/provider/serde_dfa.rs
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/icu_list/src/provider/serde_dfa.rs')
-rw-r--r--vendor/icu_list/src/provider/serde_dfa.rs244
1 files changed, 244 insertions, 0 deletions
diff --git a/vendor/icu_list/src/provider/serde_dfa.rs b/vendor/icu_list/src/provider/serde_dfa.rs
new file mode 100644
index 000000000..e2424e1e9
--- /dev/null
+++ b/vendor/icu_list/src/provider/serde_dfa.rs
@@ -0,0 +1,244 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+use alloc::borrow::Cow;
+use icu_provider::{yoke, zerofrom};
+use regex_automata::dfa::sparse::DFA;
+
+/// A serde-compatible version of [regex_automata::dfa::sparse::DFA]. This does not implement
+/// [`serde::Deserialize`] directly, as binary deserialization is not supported in big-endian
+/// platforms. `Self::maybe_deserialize` can be used to deserialize to `Option<SerdeDFA>`.
+///
+/// <div class="stab unstable">
+/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
+/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
+/// to be stable, their Rust representation might not be. Use with caution.
+/// </div>
+#[derive(Clone, Debug, yoke::Yokeable, zerofrom::ZeroFrom)]
+pub struct SerdeDFA<'data> {
+ // Safety: These always represent a valid DFA (DFA::from_bytes(dfa_bytes).is_ok())
+ dfa_bytes: Cow<'data, [u8]>,
+ pattern: Option<Cow<'data, str>>,
+}
+
+#[cfg(feature = "datagen")]
+impl PartialEq for SerdeDFA<'_> {
+ fn eq(&self, other: &Self) -> bool {
+ self.dfa_bytes == other.dfa_bytes
+ }
+}
+
+#[cfg(feature = "datagen")]
+impl databake::Bake for SerdeDFA<'_> {
+ fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
+ env.insert("icu_list");
+ let le_bytes = self.deref().to_bytes_little_endian().as_slice().bake(env);
+ let be_bytes = self.deref().to_bytes_big_endian().as_slice().bake(env);
+ // Safe because of `to_bytes_little_endian`/`to_bytes_big_endian`'s invariant.
+ databake::quote! {
+ unsafe {
+ ::icu_list::provider::SerdeDFA::from_dfa_bytes_unchecked(
+ if cfg!(target_endian = "little") {
+ &#le_bytes
+ } else {
+ &#be_bytes
+ }
+ )
+ }
+ }
+ }
+}
+
+#[cfg(feature = "datagen")]
+impl serde::Serialize for SerdeDFA<'_> {
+ fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+ where
+ S: serde::ser::Serializer,
+ {
+ if serializer.is_human_readable() {
+ self.pattern
+ .as_ref()
+ .map(|pattern| pattern.serialize(serializer))
+ .unwrap_or_else(|| {
+ use serde::ser::Error;
+ Err(S::Error::custom(
+ "cannot serialize a deserialized bincode SerdeDFA to JSON",
+ ))
+ })
+ } else {
+ self.deref().to_bytes_little_endian().serialize(serializer)
+ }
+ }
+}
+
+#[cfg(feature = "serde")]
+impl<'data> SerdeDFA<'data> {
+ /// Deserializes to `Option<Self>`. Will return `None` for non-human-readable serialization
+ /// formats on big-endian systems, as `regex_automata` serialization is endian-sensitive.
+ pub fn maybe_deserialize<'de: 'data, D>(deserializer: D) -> Result<Option<Self>, D::Error>
+ where
+ D: serde::de::Deserializer<'de>,
+ {
+ use icu_provider::serde::borrow_de_utils::CowBytesWrap;
+ use serde::Deserialize;
+
+ #[cfg(feature = "serde_human")]
+ if deserializer.is_human_readable() {
+ #[cfg(not(feature = "std"))]
+ use alloc::string::ToString;
+ use serde::de::Error;
+ return SerdeDFA::new(Cow::<str>::deserialize(deserializer)?)
+ .map(Some)
+ .map_err(|e| D::Error::custom(e.to_string()));
+ }
+
+ let dfa_bytes = <CowBytesWrap<'de>>::deserialize(deserializer)?.0;
+
+ if cfg!(target_endian = "big") {
+ return Ok(None);
+ }
+
+ // Verify safety invariant
+ DFA::from_bytes(&dfa_bytes).map_err(|e| {
+ use serde::de::Error;
+ D::Error::custom(alloc::format!("Invalid DFA bytes: {}", e))
+ })?;
+
+ Ok(Some(SerdeDFA {
+ dfa_bytes,
+ pattern: None,
+ }))
+ }
+}
+
+impl<'data> SerdeDFA<'data> {
+ /// Creates a `SerdeDFA` from raw bytes. Used internally by databake.
+ ///
+ /// # Safety
+ ///
+ /// `dfa_bytes` has to be a valid DFA (regex_automata::dfa::sparse::DFA::from_bytes(dfa_bytes).is_ok())
+ pub const unsafe fn from_dfa_bytes_unchecked(dfa_bytes: &'data [u8]) -> Self {
+ Self {
+ dfa_bytes: Cow::Borrowed(dfa_bytes),
+ pattern: None,
+ }
+ }
+
+ /// Creates a `SerdeDFA` from a regex.
+ #[cfg(any(feature = "datagen", feature = "serde_human",))]
+ pub fn new(pattern: Cow<'data, str>) -> Result<Self, icu_provider::DataError> {
+ use regex_automata::{
+ dfa::dense::{Builder, Config},
+ SyntaxConfig,
+ };
+
+ let mut builder = Builder::new();
+ let dfa = builder
+ .syntax(SyntaxConfig::new().case_insensitive(true))
+ .configure(Config::new().anchored(true).minimize(true))
+ .build(&pattern)
+ .map_err(|_| {
+ icu_provider::DataError::custom("Cannot build DFA").with_display_context(&pattern)
+ })?
+ .to_sparse()
+ .map_err(|_| {
+ icu_provider::DataError::custom("Cannot sparsify DFA")
+ .with_display_context(&pattern)
+ })?;
+
+ Ok(Self {
+ dfa_bytes: dfa.to_bytes_native_endian().into(),
+ pattern: Some(pattern),
+ })
+ }
+
+ /// Returns the represented [`DFA`]
+ #[allow(clippy::unwrap_used)] // by invariant
+ pub fn deref(&'data self) -> DFA<&'data [u8]> {
+ // Safe due to struct invariant.
+ unsafe { DFA::from_bytes_unchecked(&self.dfa_bytes).unwrap().0 }
+ }
+}
+
+#[cfg(all(test, feature = "datagen"))]
+mod test {
+ use super::*;
+
+ #[test]
+ fn test_serde_dfa() {
+ use regex_automata::dfa::Automaton;
+
+ let matcher = SerdeDFA::new(Cow::Borrowed("abc")).unwrap();
+
+ assert!(matcher.deref().find_earliest_fwd(b"ab").unwrap().is_none());
+ assert!(matcher.deref().find_earliest_fwd(b"abc").unwrap().is_some());
+ assert!(matcher
+ .deref()
+ .find_earliest_fwd(b"abcde")
+ .unwrap()
+ .is_some());
+ assert!(matcher
+ .deref()
+ .find_earliest_fwd(b" abcde")
+ .unwrap()
+ .is_none());
+ }
+
+ #[derive(serde::Deserialize)]
+ struct OptionSerdeDFA<'data>(
+ #[serde(borrow, deserialize_with = "SerdeDFA::maybe_deserialize")] Option<SerdeDFA<'data>>,
+ );
+
+ #[test]
+ #[cfg(target_endian = "little")]
+ fn test_postcard_serialization() {
+ let matcher = SerdeDFA::new(Cow::Borrowed("abc*")).unwrap();
+
+ let mut bytes = postcard::to_stdvec(&matcher).unwrap();
+ assert_eq!(
+ postcard::from_bytes::<OptionSerdeDFA>(&bytes).unwrap().0,
+ Some(matcher)
+ );
+
+ // A corrupted byte leads to an error
+ bytes[17] ^= 255;
+ assert!(postcard::from_bytes::<OptionSerdeDFA>(&bytes).is_err());
+ bytes[17] ^= 255;
+
+ // An extra byte leads to an error
+ bytes.insert(123, 40);
+ assert!(postcard::from_bytes::<OptionSerdeDFA>(&bytes).is_err());
+ bytes.remove(123);
+
+ // Missing bytes lead to an error
+ assert!(postcard::from_bytes::<OptionSerdeDFA>(&bytes[0..bytes.len() - 5]).is_err());
+ }
+
+ #[test]
+ #[cfg(feature = "serde_human")]
+ fn test_json_serialization() {
+ let matcher = SerdeDFA::new(Cow::Borrowed("abc*")).unwrap();
+
+ let json = serde_json::to_string(&matcher).unwrap();
+ assert_eq!(
+ serde_json::from_str::<OptionSerdeDFA>(&json).unwrap().0,
+ Some(matcher)
+ );
+ assert!(serde_json::from_str::<OptionSerdeDFA>(".*[").is_err());
+ }
+
+ #[test]
+ #[ignore] // https://github.com/rust-lang/rust/issues/98906
+ fn databake() {
+ databake::test_bake!(
+ SerdeDFA,
+ const: unsafe { crate::provider::SerdeDFA::from_dfa_bytes_unchecked(if cfg!(target_endian = "little") {
+ &[1] // TODO: set this when activating the test
+ } else {
+ &[2] // TODO: set this when activating the test
+ })},
+ icu_list
+ );
+ }
+}