14 files changed, 191 insertions, 0 deletions
diff --git a/third_party/rust/regex-automata/tests/gen/README.md b/third_party/rust/regex-automata/tests/gen/README.md
new file mode 100644
index 0000000000..59439a11fd
--- /dev/null
+++ b/third_party/rust/regex-automata/tests/gen/README.md
@@ -0,0 +1,65 @@
+This directory contains tests for serialized objects from the regex-automata
+crate. Currently, only two kinds of such objects are supported: dense and
+sparse DFAs.
+
+The idea behind these tests is to commit some serialized objects and run some
+basic tests by deserializing them and running searches and ensuring they are
+correct. We also make sure these are run under Miri, since deserialization is
+one of the biggest places where undefined behavior might occur in this crate
+(at the time of writing).
+
+The main thing we're testing is that the *current* code can still deserialize
+*old* objects correctly. Generally speaking, compatibility extends to
+semver-compatible releases of this crate. Beyond that, no promises are made,
+although in practice callers can at least depend on errors occurring. (The
+serialized format always includes a version number, and incompatible changes
+increment that version number such that an error will occur if an unsupported
+version is detected.)
+
+To generate the dense DFAs, I used this command:
+
+```
+$ regex-cli generate serialize dense regex \
+    MULTI_PATTERN_V2 \
+    tests/gen/dense/ \
+    --rustfmt \
+    --safe \
+    --starts-for-each-pattern \
+    --specialize-start-states \
+    --start-kind both \
+    --unicode-word-boundary \
+    --minimize \
+    '\b[a-zA-Z]+\b' \
+    '(?m)^\S+$' \
+    '(?Rm)^\S+$'
+```
+
+And to generate the sparse DFAs, I used this command, which is the same as the
+one above but with `s/dense/sparse/g` applied:
+
+```
+$ regex-cli generate serialize sparse regex \
+    MULTI_PATTERN_V2 \
+    tests/gen/sparse/ \
+    --rustfmt \
+    --safe \
+    --starts-for-each-pattern \
+    --specialize-start-states \
+    --start-kind both \
+    --unicode-word-boundary \
+    --minimize \
+    '\b[a-zA-Z]+\b' \
+    '(?m)^\S+$' \
+    '(?Rm)^\S+$'
+```
+
+The idea is to try to enable as many of the DFA's options as possible in order
+to test that serialization works for all of them.
+
+Arguably we should increase test coverage here, but this is a start. Note
+that, in particular, these tests do not need to check that serialization and
+deserialization round-trip correctly. Indeed, the normal regex test suite has
+a test that does a serialization round trip for every test supported by DFAs.
+So that has very good coverage. What we're interested in testing here is our
+compatibility promise: do DFAs generated with an older revision of the code
+still deserialize correctly?
diff --git a/third_party/rust/regex-automata/tests/gen/dense/mod.rs b/third_party/rust/regex-automata/tests/gen/dense/mod.rs
new file mode 100644
index 0000000000..b4365d4e19
--- /dev/null
+++ b/third_party/rust/regex-automata/tests/gen/dense/mod.rs
@@ -0,0 +1,22 @@
+use regex_automata::{Input, Match};
+
+mod multi_pattern_v2;
+
+#[test]
+fn multi_pattern_v2() {
+    use multi_pattern_v2::MULTI_PATTERN_V2 as RE;
+
+    assert_eq!(Some(Match::must(0, 0..4)), RE.find("abcd"));
+    assert_eq!(Some(Match::must(0, 2..6)), RE.find("@ abcd @"));
+    assert_eq!(Some(Match::must(1, 0..6)), RE.find("@abcd@"));
+    assert_eq!(Some(Match::must(0, 1..5)), RE.find("\nabcd\n"));
+    assert_eq!(Some(Match::must(0, 1..5)), RE.find("\nabcd wxyz\n"));
+    assert_eq!(Some(Match::must(1, 1..7)), RE.find("\n@abcd@\n"));
+    assert_eq!(Some(Match::must(2, 0..6)), RE.find("@abcd@\r\n"));
+    assert_eq!(Some(Match::must(1, 2..8)), RE.find("\r\n@abcd@"));
+    assert_eq!(Some(Match::must(2, 2..8)), RE.find("\r\n@abcd@\r\n"));
+
+    // The search must return an error: with heuristic Unicode word boundary
+    // support enabled, the DFA quits when it sees a non-ASCII byte like \xFF.
+    assert!(RE.try_search(&Input::new(b"\xFF@abcd@\xFF")).is_err());
+}
diff --git a/third_party/rust/regex-automata/tests/gen/dense/multi_pattern_v2.rs b/third_party/rust/regex-automata/tests/gen/dense/multi_pattern_v2.rs
new file mode 100644
index 0000000000..a95fd204b5
--- /dev/null
+++ b/third_party/rust/regex-automata/tests/gen/dense/multi_pattern_v2.rs
@@ -0,0 +1,43 @@
+// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
+//
+//     regex-cli generate serialize dense regex MULTI_PATTERN_V2 tests/gen/dense/ --rustfmt --safe --starts-for-each-pattern --specialize-start-states --start-kind both --unicode-word-boundary --minimize \b[a-zA-Z]+\b (?m)^\S+$ (?Rm)^\S+$
+//
+// regex-cli 0.0.1 is available on crates.io.
+
+use regex_automata::{
+    dfa::{dense::DFA, regex::Regex},
+    util::{lazy::Lazy, wire::AlignAs},
+};
+
+pub static MULTI_PATTERN_V2: Lazy<Regex<DFA<&'static [u32]>>> =
+    Lazy::new(|| {
+        let dfafwd = {
+            static ALIGNED: &AlignAs<[u8], u32> = &AlignAs {
+                _align: [],
+                #[cfg(target_endian = "big")]
+                bytes: *include_bytes!("multi_pattern_v2_fwd.bigendian.dfa"),
+                #[cfg(target_endian = "little")]
+                bytes: *include_bytes!(
+                    "multi_pattern_v2_fwd.littleendian.dfa"
+                ),
+            };
+            DFA::from_bytes(&ALIGNED.bytes)
+                .expect("serialized forward DFA should be valid")
+                .0
+        };
+        let dfarev = {
+            static ALIGNED: &AlignAs<[u8], u32> = &AlignAs {
+                _align: [],
+                #[cfg(target_endian = "big")]
+                bytes: *include_bytes!("multi_pattern_v2_rev.bigendian.dfa"),
+                #[cfg(target_endian = "little")]
+                bytes: *include_bytes!(
+                    "multi_pattern_v2_rev.littleendian.dfa"
+                ),
+            };
+            DFA::from_bytes(&ALIGNED.bytes)
+                .expect("serialized reverse DFA should be valid")
+                .0
+        };
+        Regex::builder().build_from_dfas(dfafwd, dfarev)
+    });
diff --git a/third_party/rust/regex-automata/tests/gen/dense/multi_pattern_v2_fwd.bigendian.dfa b/third_party/rust/regex-automata/tests/gen/dense/multi_pattern_v2_fwd.bigendian.dfa
new file mode 100644
index 0000000000..6d6e040c36
--- /dev/null
+++ b/third_party/rust/regex-automata/tests/gen/dense/multi_pattern_v2_fwd.bigendian.dfa
diff --git a/third_party/rust/regex-automata/tests/gen/dense/multi_pattern_v2_fwd.littleendian.dfa b/third_party/rust/regex-automata/tests/gen/dense/multi_pattern_v2_fwd.littleendian.dfa
new file mode 100644
index 0000000000..a1f4b3da15
--- /dev/null
+++ b/third_party/rust/regex-automata/tests/gen/dense/multi_pattern_v2_fwd.littleendian.dfa
diff --git a/third_party/rust/regex-automata/tests/gen/dense/multi_pattern_v2_rev.bigendian.dfa b/third_party/rust/regex-automata/tests/gen/dense/multi_pattern_v2_rev.bigendian.dfa
new file mode 100644
index 0000000000..74f74ec2a9
--- /dev/null
+++ b/third_party/rust/regex-automata/tests/gen/dense/multi_pattern_v2_rev.bigendian.dfa
diff --git a/third_party/rust/regex-automata/tests/gen/dense/multi_pattern_v2_rev.littleendian.dfa b/third_party/rust/regex-automata/tests/gen/dense/multi_pattern_v2_rev.littleendian.dfa
new file mode 100644
index 0000000000..663bdb9ead
--- /dev/null
+++ b/third_party/rust/regex-automata/tests/gen/dense/multi_pattern_v2_rev.littleendian.dfa
diff --git a/third_party/rust/regex-automata/tests/gen/mod.rs b/third_party/rust/regex-automata/tests/gen/mod.rs
new file mode 100644
index 0000000000..960cb4251a
--- /dev/null
+++ b/third_party/rust/regex-automata/tests/gen/mod.rs
@@ -0,0 +1,2 @@
+mod dense;
+mod sparse;
diff --git a/third_party/rust/regex-automata/tests/gen/sparse/mod.rs b/third_party/rust/regex-automata/tests/gen/sparse/mod.rs
new file mode 100644
index 0000000000..b4365d4e19
--- /dev/null
+++ b/third_party/rust/regex-automata/tests/gen/sparse/mod.rs
@@ -0,0 +1,22 @@
+use regex_automata::{Input, Match};
+
+mod multi_pattern_v2;
+
+#[test]
+fn multi_pattern_v2() {
+    use multi_pattern_v2::MULTI_PATTERN_V2 as RE;
+
+    assert_eq!(Some(Match::must(0, 0..4)), RE.find("abcd"));
+    assert_eq!(Some(Match::must(0, 2..6)), RE.find("@ abcd @"));
+    assert_eq!(Some(Match::must(1, 0..6)), RE.find("@abcd@"));
+    assert_eq!(Some(Match::must(0, 1..5)), RE.find("\nabcd\n"));
+    assert_eq!(Some(Match::must(0, 1..5)), RE.find("\nabcd wxyz\n"));
+    assert_eq!(Some(Match::must(1, 1..7)), RE.find("\n@abcd@\n"));
+    assert_eq!(Some(Match::must(2, 0..6)), RE.find("@abcd@\r\n"));
+    assert_eq!(Some(Match::must(1, 2..8)), RE.find("\r\n@abcd@"));
+    assert_eq!(Some(Match::must(2, 2..8)), RE.find("\r\n@abcd@\r\n"));
+
+    // The search must return an error: with heuristic Unicode word boundary
+    // support enabled, the DFA quits when it sees a non-ASCII byte like \xFF.
+    assert!(RE.try_search(&Input::new(b"\xFF@abcd@\xFF")).is_err());
+}
diff --git a/third_party/rust/regex-automata/tests/gen/sparse/multi_pattern_v2.rs b/third_party/rust/regex-automata/tests/gen/sparse/multi_pattern_v2.rs
new file mode 100644
index 0000000000..911e3f5ddc
--- /dev/null
+++ b/third_party/rust/regex-automata/tests/gen/sparse/multi_pattern_v2.rs
@@ -0,0 +1,37 @@
+// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
+//
+//     regex-cli generate serialize sparse regex MULTI_PATTERN_V2 regex-automata/tests/gen/sparse/ --rustfmt --safe --starts-for-each-pattern --specialize-start-states --start-kind both --unicode-word-boundary --minimize \b[a-zA-Z]+\b (?m)^\S+$ (?Rm)^\S+$
+//
+// regex-cli 0.0.1 is available on crates.io.
+
+use regex_automata::{
+    dfa::{regex::Regex, sparse::DFA},
+    util::lazy::Lazy,
+};
+
+pub static MULTI_PATTERN_V2: Lazy<Regex<DFA<&'static [u8]>>> =
+    Lazy::new(|| {
+        let dfafwd = {
+            #[cfg(target_endian = "big")]
+            static BYTES: &'static [u8] =
+                include_bytes!("multi_pattern_v2_fwd.bigendian.dfa");
+            #[cfg(target_endian = "little")]
+            static BYTES: &'static [u8] =
+                include_bytes!("multi_pattern_v2_fwd.littleendian.dfa");
+            DFA::from_bytes(BYTES)
+                .expect("serialized forward DFA should be valid")
+                .0
+        };
+        let dfarev = {
+            #[cfg(target_endian = "big")]
+            static BYTES: &'static [u8] =
+                include_bytes!("multi_pattern_v2_rev.bigendian.dfa");
+            #[cfg(target_endian = "little")]
+            static BYTES: &'static [u8] =
+                include_bytes!("multi_pattern_v2_rev.littleendian.dfa");
+            DFA::from_bytes(BYTES)
+                .expect("serialized reverse DFA should be valid")
+                .0
+        };
+        Regex::builder().build_from_dfas(dfafwd, dfarev)
+    });
diff --git a/third_party/rust/regex-automata/tests/gen/sparse/multi_pattern_v2_fwd.bigendian.dfa b/third_party/rust/regex-automata/tests/gen/sparse/multi_pattern_v2_fwd.bigendian.dfa
new file mode 100644
index 0000000000..aa04f63162
--- /dev/null
+++ b/third_party/rust/regex-automata/tests/gen/sparse/multi_pattern_v2_fwd.bigendian.dfa
diff --git a/third_party/rust/regex-automata/tests/gen/sparse/multi_pattern_v2_fwd.littleendian.dfa b/third_party/rust/regex-automata/tests/gen/sparse/multi_pattern_v2_fwd.littleendian.dfa
new file mode 100644
index 0000000000..c27d92abe1
--- /dev/null
+++ b/third_party/rust/regex-automata/tests/gen/sparse/multi_pattern_v2_fwd.littleendian.dfa
diff --git a/third_party/rust/regex-automata/tests/gen/sparse/multi_pattern_v2_rev.bigendian.dfa b/third_party/rust/regex-automata/tests/gen/sparse/multi_pattern_v2_rev.bigendian.dfa
new file mode 100644
index 0000000000..89867d30f6
--- /dev/null
+++ b/third_party/rust/regex-automata/tests/gen/sparse/multi_pattern_v2_rev.bigendian.dfa
diff --git a/third_party/rust/regex-automata/tests/gen/sparse/multi_pattern_v2_rev.littleendian.dfa b/third_party/rust/regex-automata/tests/gen/sparse/multi_pattern_v2_rev.littleendian.dfa
new file mode 100644
index 0000000000..c0ca807f89
--- /dev/null
+++ b/third_party/rust/regex-automata/tests/gen/sparse/multi_pattern_v2_rev.littleendian.dfa