summaryrefslogtreecommitdiffstats
path: root/third_party/rust/regex-automata/src/dfa
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-19 00:47:55 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-19 00:47:55 +0000
commit26a029d407be480d791972afb5975cf62c9360a6 (patch)
treef435a8308119effd964b339f76abb83a57c29483 /third_party/rust/regex-automata/src/dfa
parentInitial commit. (diff)
downloadfirefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz
firefox-26a029d407be480d791972afb5975cf62c9360a6.zip
Adding upstream version 124.0.1.upstream/124.0.1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/regex-automata/src/dfa')
-rw-r--r--third_party/rust/regex-automata/src/dfa/accel.rs516
-rw-r--r--third_party/rust/regex-automata/src/dfa/automaton.rs2120
-rw-r--r--third_party/rust/regex-automata/src/dfa/dense.rs5139
-rw-r--r--third_party/rust/regex-automata/src/dfa/determinize.rs599
-rw-r--r--third_party/rust/regex-automata/src/dfa/minimize.rs463
-rw-r--r--third_party/rust/regex-automata/src/dfa/mod.rs360
-rw-r--r--third_party/rust/regex-automata/src/dfa/onepass.rs3188
-rw-r--r--third_party/rust/regex-automata/src/dfa/regex.rs871
-rw-r--r--third_party/rust/regex-automata/src/dfa/remapper.rs242
-rw-r--r--third_party/rust/regex-automata/src/dfa/search.rs654
-rw-r--r--third_party/rust/regex-automata/src/dfa/sparse.rs2656
-rw-r--r--third_party/rust/regex-automata/src/dfa/special.rs494
-rw-r--r--third_party/rust/regex-automata/src/dfa/start.rs74
13 files changed, 17376 insertions, 0 deletions
diff --git a/third_party/rust/regex-automata/src/dfa/accel.rs b/third_party/rust/regex-automata/src/dfa/accel.rs
new file mode 100644
index 0000000000..5ea2423dd0
--- /dev/null
+++ b/third_party/rust/regex-automata/src/dfa/accel.rs
@@ -0,0 +1,516 @@
+// This module defines some core types for dealing with accelerated DFA states.
+// Briefly, a DFA state can be "accelerated" if all of its transitions except
+// for a few loop back to itself. This directly implies that the only way out
+// of such a state is if a byte corresponding to one of those non-loopback
+// transitions is found. Such states are often found in simple repetitions in
+// non-Unicode regexes. For example, consider '(?-u)[^a]+a'. We can look at its
+// DFA with regex-cli:
+//
+// $ regex-cli debug dfa dense '(?-u)[^a]+a' -BbC
+// dense::DFA(
+// D 000000:
+// Q 000001:
+// *000002:
+// A 000003: \x00-` => 3, a => 5, b-\xFF => 3
+// >000004: \x00-` => 3, a => 4, b-\xFF => 3
+// 000005: \x00-\xFF => 2, EOI => 2
+// )
+//
+// In particular, state 3 is accelerated (shown via the 'A' indicator) since
+// the only way to leave that state once entered is to see an 'a' byte. If
+// there is a long run of non-'a' bytes, then using something like 'memchr'
+// to find the next 'a' byte can be significantly faster than just using the
+// standard byte-at-a-time state machine.
+//
+// Unfortunately, this optimization rarely applies when Unicode is enabled.
+// For example, patterns like '[^a]' don't actually match any byte that isn't
+// 'a', but rather, any UTF-8 encoding of a Unicode scalar value that isn't
+// 'a'. This makes the state machine much more complex---far beyond a single
+// state---and removes the ability to easily accelerate it. (Because if the
+// machine sees a non-UTF-8 sequence, then the machine won't match through it.)
+//
+// In practice, we only consider accelerating states that have 3 or fewer
+// non-loop transitions. At a certain point, you get diminishing returns, but
+// also because that's what the memchr crate supports. The structures below
+// hard-code this assumption and provide (de)serialization APIs for use inside
+// a DFA.
+//
+// And finally, note that there is some trickery involved in making it very
+// fast to not only check whether a state is accelerated at search time, but
+// also to access the bytes to search for to implement the acceleration itself.
+// dfa/special.rs provides more detail, but the short story is that all
+// accelerated states appear contiguously in a DFA. This means we can represent
+// the ID space of all accelerated DFA states with a single range. So given
+// a state ID, we can determine whether it's accelerated via
+//
+// min_accel_id <= id <= max_accel_id
+//
+// And find its corresponding accelerator with:
+//
+// accels.get((id - min_accel_id) / dfa_stride)
+
+#[cfg(feature = "dfa-build")]
+use alloc::{vec, vec::Vec};
+
+use crate::util::{
+ int::Pointer,
+ memchr,
+ wire::{self, DeserializeError, Endian, SerializeError},
+};
+
+/// The base type used to represent a collection of accelerators.
+///
+/// While an `Accel` is represented as a fixed size array of bytes, a
+/// *collection* of `Accel`s (called `Accels`) is represented internally as a
+/// slice of u32. While it's a bit unnatural to do this and costs us a bit of
+/// fairly low-risk unsafe code, it lets us remove the need for a second type
+/// parameter in the definition of dense::DFA. (Which really wants everything
+/// to be a slice of u32.)
+type AccelTy = u32;
+
+/// The size of the unit of representation for accelerators.
+///
+/// ACCEL_CAP *must* be a multiple of this size.
+const ACCEL_TY_SIZE: usize = core::mem::size_of::<AccelTy>();
+
+/// The maximum length in bytes that a single Accel can be. This is distinct
+/// from the capacity of an accelerator in that the length represents only the
+/// bytes that should be read.
+const ACCEL_LEN: usize = 4;
+
+/// The capacity of each accelerator, in bytes. We set this to 8 since it's a
+/// multiple of 4 (our ID size) and because it gives us a little wiggle room
+/// if we want to support more accel bytes in the future without a breaking
+/// change.
+///
+/// This MUST be a multiple of ACCEL_TY_SIZE.
+const ACCEL_CAP: usize = 8;
+
+/// Search for between 1 and 3 needle bytes in the given haystack, starting the
+/// search at the given position. If `needles` has a length other than 1-3,
+/// then this panics.
+#[cfg_attr(feature = "perf-inline", inline(always))]
+pub(crate) fn find_fwd(
+ needles: &[u8],
+ haystack: &[u8],
+ at: usize,
+) -> Option<usize> {
+ let bs = needles;
+ let i = match needles.len() {
+ 1 => memchr::memchr(bs[0], &haystack[at..])?,
+ 2 => memchr::memchr2(bs[0], bs[1], &haystack[at..])?,
+ 3 => memchr::memchr3(bs[0], bs[1], bs[2], &haystack[at..])?,
+ 0 => panic!("cannot find with empty needles"),
+ n => panic!("invalid needles length: {}", n),
+ };
+ Some(at + i)
+}
+
+/// Search for between 1 and 3 needle bytes in the given haystack in reverse,
+/// starting the search at the given position. If `needles` has a length other
+/// than 1-3, then this panics.
+#[cfg_attr(feature = "perf-inline", inline(always))]
+pub(crate) fn find_rev(
+ needles: &[u8],
+ haystack: &[u8],
+ at: usize,
+) -> Option<usize> {
+ let bs = needles;
+ match needles.len() {
+ 1 => memchr::memrchr(bs[0], &haystack[..at]),
+ 2 => memchr::memrchr2(bs[0], bs[1], &haystack[..at]),
+ 3 => memchr::memrchr3(bs[0], bs[1], bs[2], &haystack[..at]),
+ 0 => panic!("cannot find with empty needles"),
+ n => panic!("invalid needles length: {}", n),
+ }
+}
+
+/// Represents the accelerators for all accelerated states in a dense DFA.
+///
+/// The `A` type parameter represents the type of the underlying bytes.
+/// Generally, this is either `&[AccelTy]` or `Vec<AccelTy>`.
+#[derive(Clone)]
+pub(crate) struct Accels<A> {
+ /// A length prefixed slice of contiguous accelerators. See the top comment
+ /// in this module for more details on how we can jump from a DFA's state
+ /// ID to an accelerator in this list.
+ ///
+ /// The first 4 bytes always correspond to the number of accelerators
+ /// that follow.
+ accels: A,
+}
+
+#[cfg(feature = "dfa-build")]
+impl Accels<Vec<AccelTy>> {
+ /// Create an empty sequence of accelerators for a DFA.
+ pub fn empty() -> Accels<Vec<AccelTy>> {
+ Accels { accels: vec![0] }
+ }
+
+ /// Add an accelerator to this sequence.
+ ///
+ /// This adds the accelerator to the end of the sequence and therefore
+ /// should be done in correspondence with its state in the DFA.
+ ///
+ /// This panics if this results in more accelerators than AccelTy::MAX.
+ pub fn add(&mut self, accel: Accel) {
+ self.accels.extend_from_slice(&accel.as_accel_tys());
+ let len = self.len();
+ self.set_len(len + 1);
+ }
+
+ /// Set the number of accelerators in this sequence, which is encoded in
+ /// the first 4 bytes of the underlying bytes.
+ fn set_len(&mut self, new_len: usize) {
+ // The only way an accelerator gets added is if a state exists for
+ // it, and if a state exists, then its index is guaranteed to be
+ // representable by a AccelTy by virtue of the guarantees provided by
+ // StateID.
+ let new_len = AccelTy::try_from(new_len).unwrap();
+ self.accels[0] = new_len;
+ }
+}
+
+impl<'a> Accels<&'a [AccelTy]> {
+ /// Deserialize a sequence of accelerators from the given bytes. If there
+ /// was a problem deserializing, then an error is returned.
+ ///
+ /// This is guaranteed to run in constant time. This does not guarantee
+ /// that every accelerator in the returned collection is valid. Thus,
+ /// accessing one may panic, or unsafe code that relies on accelerators
+ /// being correct may result in UB.
+ ///
+ /// Callers may check the validity of every accelerator with the `validate`
+ /// method.
+ pub fn from_bytes_unchecked(
+ mut slice: &'a [u8],
+ ) -> Result<(Accels<&'a [AccelTy]>, usize), DeserializeError> {
+ let slice_start = slice.as_ptr().as_usize();
+
+ let (accel_len, _) =
+ wire::try_read_u32_as_usize(slice, "accelerators length")?;
+ // The accelerator length is part of the accel_tys slice that
+ // we deserialize. This is perhaps a bit idiosyncratic. It would
+ // probably be better to split out the length into a real field.
+
+ let accel_tys_len = wire::add(
+ wire::mul(accel_len, 2, "total number of accelerator accel_tys")?,
+ 1,
+ "total number of accel_tys",
+ )?;
+ let accel_tys_bytes_len = wire::mul(
+ ACCEL_TY_SIZE,
+ accel_tys_len,
+ "total number of bytes in accelerators",
+ )?;
+ wire::check_slice_len(slice, accel_tys_bytes_len, "accelerators")?;
+ wire::check_alignment::<AccelTy>(slice)?;
+ let accel_tys = &slice[..accel_tys_bytes_len];
+ slice = &slice[accel_tys_bytes_len..];
+ // SAFETY: We've checked the length and alignment above, and since
+ // slice is just bytes and AccelTy is just a u32, we can safely cast to
+ // a slice of &[AccelTy].
+ let accels = unsafe {
+ core::slice::from_raw_parts(
+ accel_tys.as_ptr().cast::<AccelTy>(),
+ accel_tys_len,
+ )
+ };
+ Ok((Accels { accels }, slice.as_ptr().as_usize() - slice_start))
+ }
+}
+
+impl<A: AsRef<[AccelTy]>> Accels<A> {
+ /// Return an owned version of the accelerators.
+ #[cfg(feature = "alloc")]
+ pub fn to_owned(&self) -> Accels<alloc::vec::Vec<AccelTy>> {
+ Accels { accels: self.accels.as_ref().to_vec() }
+ }
+
+ /// Return a borrowed version of the accelerators.
+ pub fn as_ref(&self) -> Accels<&[AccelTy]> {
+ Accels { accels: self.accels.as_ref() }
+ }
+
+ /// Return the bytes representing the serialization of the accelerators.
+ pub fn as_bytes(&self) -> &[u8] {
+ let accels = self.accels.as_ref();
+ // SAFETY: This is safe because accels is just a slice of AccelTy,
+ // and u8 always has a smaller alignment.
+ unsafe {
+ core::slice::from_raw_parts(
+ accels.as_ptr().cast::<u8>(),
+ accels.len() * ACCEL_TY_SIZE,
+ )
+ }
+ }
+
+ /// Returns the memory usage, in bytes, of these accelerators.
+ ///
+ /// The memory usage is computed based on the number of bytes used to
+ /// represent all of the accelerators.
+ ///
+ /// This does **not** include the stack size used by this value.
+ pub fn memory_usage(&self) -> usize {
+ self.as_bytes().len()
+ }
+
+ /// Return the bytes to search for corresponding to the accelerator in this
+ /// sequence at index `i`. If no such accelerator exists, then this panics.
+ ///
+ /// The significance of the index is that it should be in correspondence
+ /// with the index of the corresponding DFA. That is, accelerated DFA
+ /// states are stored contiguously in the DFA and have an ordering implied
+ /// by their respective state IDs. The state's index in that sequence
+ /// corresponds to the index of its corresponding accelerator.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub fn needles(&self, i: usize) -> &[u8] {
+ if i >= self.len() {
+ panic!("invalid accelerator index {}", i);
+ }
+ let bytes = self.as_bytes();
+ let offset = ACCEL_TY_SIZE + i * ACCEL_CAP;
+ let len = usize::from(bytes[offset]);
+ &bytes[offset + 1..offset + 1 + len]
+ }
+
+ /// Return the total number of accelerators in this sequence.
+ pub fn len(&self) -> usize {
+ // This should never panic since deserialization checks that the
+ // length can fit into a usize.
+ usize::try_from(self.accels.as_ref()[0]).unwrap()
+ }
+
+ /// Return the accelerator in this sequence at index `i`. If no such
+ /// accelerator exists, then this returns None.
+ ///
+ /// See the docs for `needles` on the significance of the index.
+ fn get(&self, i: usize) -> Option<Accel> {
+ if i >= self.len() {
+ return None;
+ }
+ let offset = ACCEL_TY_SIZE + i * ACCEL_CAP;
+ let accel = Accel::from_slice(&self.as_bytes()[offset..])
+ .expect("Accels must contain valid accelerators");
+ Some(accel)
+ }
+
+ /// Returns an iterator of accelerators in this sequence.
+ fn iter(&self) -> IterAccels<'_, A> {
+ IterAccels { accels: self, i: 0 }
+ }
+
+ /// Writes these accelerators to the given byte buffer using the indicated
+ /// endianness. If the given buffer is too small, then an error is
+ /// returned. Upon success, the total number of bytes written is returned.
+ /// The number of bytes written is guaranteed to be a multiple of 4.
+ pub fn write_to<E: Endian>(
+ &self,
+ dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ let nwrite = self.write_to_len();
+ assert_eq!(
+ nwrite % ACCEL_TY_SIZE,
+ 0,
+ "expected accelerator bytes written to be a multiple of {}",
+ ACCEL_TY_SIZE,
+ );
+ if dst.len() < nwrite {
+ return Err(SerializeError::buffer_too_small("accelerators"));
+ }
+
+ // The number of accelerators can never exceed AccelTy::MAX.
+ E::write_u32(AccelTy::try_from(self.len()).unwrap(), dst);
+ // The actual accelerators are just raw bytes and thus their endianness
+ // is irrelevant. So we can copy them as bytes.
+ dst[ACCEL_TY_SIZE..nwrite]
+ .copy_from_slice(&self.as_bytes()[ACCEL_TY_SIZE..nwrite]);
+ Ok(nwrite)
+ }
+
+ /// Validates that every accelerator in this collection can be successfully
+ /// deserialized as a valid accelerator.
+ pub fn validate(&self) -> Result<(), DeserializeError> {
+ for chunk in self.as_bytes()[ACCEL_TY_SIZE..].chunks(ACCEL_CAP) {
+ let _ = Accel::from_slice(chunk)?;
+ }
+ Ok(())
+ }
+
+ /// Returns the total number of bytes written by `write_to`.
+ pub fn write_to_len(&self) -> usize {
+ self.as_bytes().len()
+ }
+}
+
+impl<A: AsRef<[AccelTy]>> core::fmt::Debug for Accels<A> {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(f, "Accels(")?;
+ let mut list = f.debug_list();
+ for a in self.iter() {
+ list.entry(&a);
+ }
+ list.finish()?;
+ write!(f, ")")
+ }
+}
+
+#[derive(Debug)]
+struct IterAccels<'a, A: AsRef<[AccelTy]>> {
+ accels: &'a Accels<A>,
+ i: usize,
+}
+
+impl<'a, A: AsRef<[AccelTy]>> Iterator for IterAccels<'a, A> {
+ type Item = Accel;
+
+ fn next(&mut self) -> Option<Accel> {
+ let accel = self.accels.get(self.i)?;
+ self.i += 1;
+ Some(accel)
+ }
+}
+
+/// Accel represents a structure for determining how to "accelerate" a DFA
+/// state.
+///
+/// Namely, it contains zero or more bytes that must be seen in order for the
+/// DFA to leave the state it is associated with. In practice, the actual range
+/// is 1 to 3 bytes.
+///
+/// The purpose of acceleration is to identify states whose vast majority
+/// of transitions are just loops back to the same state. For example,
+/// in the regex `(?-u)^[^a]+b`, the corresponding DFA will have a state
+/// (corresponding to `[^a]+`) where all transitions *except* for `a` and
+/// `b` loop back to itself. Thus, this state can be "accelerated" by simply
+/// looking for the next occurrence of either `a` or `b` instead of explicitly
+/// following transitions. (In this case, `b` transitions to the next state
+/// whereas `a` would transition to the dead state.)
+#[derive(Clone)]
+pub(crate) struct Accel {
+ /// The first byte is the length. Subsequent bytes are the accelerated
+ /// bytes.
+ ///
+ /// Note that we make every accelerator 8 bytes as a slightly wasteful
+ /// way of making sure alignment is always correct for state ID sizes of
+ /// 1, 2, 4 and 8. This should be okay since accelerated states aren't
+ /// particularly common, especially when Unicode is enabled.
+ bytes: [u8; ACCEL_CAP],
+}
+
+impl Accel {
+ /// Returns an empty accel, where no bytes are accelerated.
+ #[cfg(feature = "dfa-build")]
+ pub fn new() -> Accel {
+ Accel { bytes: [0; ACCEL_CAP] }
+ }
+
+ /// Returns a verified accelerator derived from the beginning of the given
+ /// slice.
+ ///
+ /// If the slice is not long enough or contains invalid bytes for an
+ /// accelerator, then this returns an error.
+ pub fn from_slice(mut slice: &[u8]) -> Result<Accel, DeserializeError> {
+ slice = &slice[..core::cmp::min(ACCEL_LEN, slice.len())];
+ let bytes = slice
+ .try_into()
+ .map_err(|_| DeserializeError::buffer_too_small("accelerator"))?;
+ Accel::from_bytes(bytes)
+ }
+
+ /// Returns a verified accelerator derived from raw bytes.
+ ///
+ /// If the given bytes are invalid, then this returns an error.
+ fn from_bytes(bytes: [u8; 4]) -> Result<Accel, DeserializeError> {
+ if usize::from(bytes[0]) >= ACCEL_LEN {
+ return Err(DeserializeError::generic(
+ "accelerator bytes cannot have length more than 3",
+ ));
+ }
+ Ok(Accel::from_bytes_unchecked(bytes))
+ }
+
+ /// Returns an accelerator derived from raw bytes.
+ ///
+ /// This does not check whether the given bytes are valid. Invalid bytes
+ /// cannot sacrifice memory safety, but may result in panics or silent
+ /// logic bugs.
+ fn from_bytes_unchecked(bytes: [u8; 4]) -> Accel {
+ Accel { bytes: [bytes[0], bytes[1], bytes[2], bytes[3], 0, 0, 0, 0] }
+ }
+
+ /// Attempts to add the given byte to this accelerator. If the accelerator
+ /// is already full or thinks the byte is a poor accelerator, then this
+ /// returns false. Otherwise, returns true.
+ ///
+ /// If the given byte is already in this accelerator, then it panics.
+ #[cfg(feature = "dfa-build")]
+ pub fn add(&mut self, byte: u8) -> bool {
+ if self.len() >= 3 {
+ return false;
+ }
+ // As a special case, we totally reject trying to accelerate a state
+ // with an ASCII space. In most cases, it occurs very frequently, and
+ // tends to result in worse overall performance.
+ if byte == b' ' {
+ return false;
+ }
+ assert!(
+ !self.contains(byte),
+ "accelerator already contains {:?}",
+ crate::util::escape::DebugByte(byte)
+ );
+ self.bytes[self.len() + 1] = byte;
+ self.bytes[0] += 1;
+ true
+ }
+
+ /// Return the number of bytes in this accelerator.
+ pub fn len(&self) -> usize {
+ usize::from(self.bytes[0])
+ }
+
+ /// Returns true if and only if there are no bytes in this accelerator.
+ #[cfg(feature = "dfa-build")]
+ pub fn is_empty(&self) -> bool {
+ self.len() == 0
+ }
+
+ /// Returns the slice of bytes to accelerate.
+ ///
+ /// If this accelerator is empty, then this returns an empty slice.
+ fn needles(&self) -> &[u8] {
+ &self.bytes[1..1 + self.len()]
+ }
+
+ /// Returns true if and only if this accelerator will accelerate the given
+ /// byte.
+ #[cfg(feature = "dfa-build")]
+ fn contains(&self, byte: u8) -> bool {
+ self.needles().iter().position(|&b| b == byte).is_some()
+ }
+
+ /// Returns the accelerator bytes as an array of AccelTys.
+ #[cfg(feature = "dfa-build")]
+ fn as_accel_tys(&self) -> [AccelTy; 2] {
+ assert_eq!(ACCEL_CAP, 8);
+ // These unwraps are OK since ACCEL_CAP is set to 8.
+ let first =
+ AccelTy::from_ne_bytes(self.bytes[0..4].try_into().unwrap());
+ let second =
+ AccelTy::from_ne_bytes(self.bytes[4..8].try_into().unwrap());
+ [first, second]
+ }
+}
+
+impl core::fmt::Debug for Accel {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(f, "Accel(")?;
+ let mut set = f.debug_set();
+ for &b in self.needles() {
+ set.entry(&crate::util::escape::DebugByte(b));
+ }
+ set.finish()?;
+ write!(f, ")")
+ }
+}
diff --git a/third_party/rust/regex-automata/src/dfa/automaton.rs b/third_party/rust/regex-automata/src/dfa/automaton.rs
new file mode 100644
index 0000000000..7e2be9a151
--- /dev/null
+++ b/third_party/rust/regex-automata/src/dfa/automaton.rs
@@ -0,0 +1,2120 @@
+#[cfg(feature = "alloc")]
+use crate::util::search::PatternSet;
+use crate::{
+ dfa::search,
+ util::{
+ empty,
+ prefilter::Prefilter,
+ primitives::{PatternID, StateID},
+ search::{Anchored, HalfMatch, Input, MatchError},
+ },
+};
+
+/// A trait describing the interface of a deterministic finite automaton (DFA).
+///
+/// The complexity of this trait probably means that it's unlikely for others
+/// to implement it. The primary purpose of the trait is to provide for a way
+/// of abstracting over different types of DFAs. In this crate, that means
+/// dense DFAs and sparse DFAs. (Dense DFAs are fast but memory hungry, where
+/// as sparse DFAs are slower but come with a smaller memory footprint. But
+/// they otherwise provide exactly equivalent expressive power.) For example, a
+/// [`dfa::regex::Regex`](crate::dfa::regex::Regex) is generic over this trait.
+///
+/// Normally, a DFA's execution model is very simple. You might have a single
+/// start state, zero or more final or "match" states and a function that
+/// transitions from one state to the next given the next byte of input.
+/// Unfortunately, the interface described by this trait is significantly
+/// more complicated than this. The complexity has a number of different
+/// reasons, mostly motivated by performance, functionality or space savings:
+///
+/// * A DFA can search for multiple patterns simultaneously. This
+/// means extra information is returned when a match occurs. Namely,
+/// a match is not just an offset, but an offset plus a pattern ID.
+/// [`Automaton::pattern_len`] returns the number of patterns compiled into
+/// the DFA, [`Automaton::match_len`] returns the total number of patterns
+/// that match in a particular state and [`Automaton::match_pattern`] permits
+/// iterating over the patterns that match in a particular state.
+/// * A DFA can have multiple start states, and the choice of which start
+/// state to use depends on the content of the string being searched and
+/// position of the search, as well as whether the search is an anchored
+/// search for a specific pattern in the DFA. Moreover, computing the start
+/// state also depends on whether you're doing a forward or a reverse search.
+/// [`Automaton::start_state_forward`] and [`Automaton::start_state_reverse`]
+/// are used to compute the start state for forward and reverse searches,
+/// respectively.
+/// * All matches are delayed by one byte to support things like `$` and `\b`
+/// at the end of a pattern. Therefore, every use of a DFA is required to use
+/// [`Automaton::next_eoi_state`]
+/// at the end of the search to compute the final transition.
+/// * For optimization reasons, some states are treated specially. Every
+/// state is either special or not, which can be determined via the
+/// [`Automaton::is_special_state`] method. If it's special, then the state
+/// must be at least one of a few possible types of states. (Note that some
+/// types can overlap, for example, a match state can also be an accel state.
+/// But some types can't. If a state is a dead state, then it can never be any
+/// other type of state.) Those types are:
+/// * A dead state. A dead state means the DFA will never enter a match
+/// state. This can be queried via the [`Automaton::is_dead_state`] method.
+/// * A quit state. A quit state occurs if the DFA had to stop the search
+/// prematurely for some reason. This can be queried via the
+/// [`Automaton::is_quit_state`] method.
+/// * A match state. A match state occurs when a match is found. When a DFA
+/// enters a match state, the search may stop immediately (when looking
+/// for the earliest match), or it may continue to find the leftmost-first
+/// match. This can be queried via the [`Automaton::is_match_state`]
+/// method.
+/// * A start state. A start state is where a search begins. For every
+/// search, there is exactly one start state that is used, however, a
+/// DFA may contain many start states. When the search is in a start
+/// state, it may use a prefilter to quickly skip to candidate matches
+/// without executing the DFA on every byte. This can be queried via the
+/// [`Automaton::is_start_state`] method.
+/// * An accel state. An accel state is a state that is accelerated.
+/// That is, it is a state where _most_ of its transitions loop back to
+/// itself and only a small number of transitions lead to other states.
+/// This kind of state is said to be accelerated because a search routine
+/// can quickly look for the bytes leading out of the state instead of
+/// continuing to execute the DFA on each byte. This can be queried via the
+/// [`Automaton::is_accel_state`] method. And the bytes that lead out of
+/// the state can be queried via the [`Automaton::accelerator`] method.
+///
+/// There are a number of provided methods on this trait that implement
+/// efficient searching (for forwards and backwards) with a DFA using
+/// all of the above features of this trait. In particular, given the
+/// complexity of all these features, implementing a search routine in
+/// this trait can be a little subtle. With that said, it is possible to
+/// somewhat simplify the search routine. For example, handling accelerated
+/// states is strictly optional, since it is always correct to assume that
+/// `Automaton::is_accel_state` returns false. However, one complex part of
+/// writing a search routine using this trait is handling the 1-byte delay of a
+/// match. That is not optional.
+///
+/// # Safety
+///
+/// This trait is not safe to implement so that code may rely on the
+/// correctness of implementations of this trait to avoid undefined behavior.
+/// The primary correctness guarantees are:
+///
+/// * `Automaton::start_state` always returns a valid state ID or an error or
+/// panics.
+/// * `Automaton::next_state`, when given a valid state ID, always returns
+/// a valid state ID for all values of `anchored` and `byte`, or otherwise
+/// panics.
+///
+/// In general, the rest of the methods on `Automaton` need to uphold their
+/// contracts as well. For example, `Automaton::is_dead_state` should only
+/// return true if the given state ID is actually a dead state.
+pub unsafe trait Automaton {
+ /// Transitions from the current state to the next state, given the next
+ /// byte of input.
+ ///
+ /// Implementations must guarantee that the returned ID is always a valid
+ /// ID when `current` refers to a valid ID. Moreover, the transition
+ /// function must be defined for all possible values of `input`.
+ ///
+ /// # Panics
+ ///
+ /// If the given ID does not refer to a valid state, then this routine
+ /// may panic but it also may not panic and instead return an invalid ID.
+ /// However, if the caller provides an invalid ID then this must never
+ /// sacrifice memory safety.
+ ///
+ /// # Example
+ ///
+ /// This shows a simplistic example for walking a DFA for a given haystack
+ /// by using the `next_state` method.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense}, Input};
+ ///
+ /// let dfa = dense::DFA::new(r"[a-z]+r")?;
+ /// let haystack = "bar".as_bytes();
+ ///
+ /// // The start state is determined by inspecting the position and the
+ /// // initial bytes of the haystack.
+ /// let mut state = dfa.start_state_forward(&Input::new(haystack))?;
+ /// // Walk all the bytes in the haystack.
+ /// for &b in haystack {
+ /// state = dfa.next_state(state, b);
+ /// }
+ /// // Matches are always delayed by 1 byte, so we must explicitly walk the
+ /// // special "EOI" transition at the end of the search.
+ /// state = dfa.next_eoi_state(state);
+ /// assert!(dfa.is_match_state(state));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ fn next_state(&self, current: StateID, input: u8) -> StateID;
+
+ /// Transitions from the current state to the next state, given the next
+ /// byte of input.
+ ///
+ /// Unlike [`Automaton::next_state`], implementations may implement this
+ /// more efficiently by assuming that the `current` state ID is valid.
+ /// Typically, this manifests by eliding bounds checks.
+ ///
+ /// # Safety
+ ///
+ /// Callers of this method must guarantee that `current` refers to a valid
+ /// state ID. If `current` is not a valid state ID for this automaton, then
+ /// calling this routine may result in undefined behavior.
+ ///
+ /// If `current` is valid, then implementations must guarantee that the ID
+ /// returned is valid for all possible values of `input`.
+ unsafe fn next_state_unchecked(
+ &self,
+ current: StateID,
+ input: u8,
+ ) -> StateID;
+
+ /// Transitions from the current state to the next state for the special
+ /// EOI symbol.
+ ///
+ /// Implementations must guarantee that the returned ID is always a valid
+ /// ID when `current` refers to a valid ID.
+ ///
+ /// This routine must be called at the end of every search in a correct
+ /// implementation of search. Namely, DFAs in this crate delay matches
+ /// by one byte in order to support look-around operators. Thus, after
+ /// reaching the end of a haystack, a search implementation must follow one
+ /// last EOI transition.
+ ///
+ /// It is best to think of EOI as an additional symbol in the alphabet of
+ /// a DFA that is distinct from every other symbol. That is, the alphabet
+ /// of DFAs in this crate has a logical size of 257 instead of 256, where
+ /// 256 corresponds to every possible inhabitant of `u8`. (In practice, the
+ /// physical alphabet size may be smaller because of alphabet compression
+ /// via equivalence classes, but EOI is always represented somehow in the
+ /// alphabet.)
+ ///
+ /// # Panics
+ ///
+ /// If the given ID does not refer to a valid state, then this routine
+ /// may panic but it also may not panic and instead return an invalid ID.
+ /// However, if the caller provides an invalid ID then this must never
+ /// sacrifice memory safety.
+ ///
+ /// # Example
+ ///
+ /// This shows a simplistic example for walking a DFA for a given haystack,
+ /// and then finishing the search with the final EOI transition.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense}, Input};
+ ///
+ /// let dfa = dense::DFA::new(r"[a-z]+r")?;
+ /// let haystack = "bar".as_bytes();
+ ///
+ /// // The start state is determined by inspecting the position and the
+ /// // initial bytes of the haystack.
+ /// //
+ /// // Computing the start state can fail (e.g. on a "quit" byte), so the
+ /// // error is propagated with `?`.
+ /// let mut state = dfa.start_state_forward(&Input::new(haystack))?;
+ /// // Walk all the bytes in the haystack.
+ /// for &b in haystack {
+ /// state = dfa.next_state(state, b);
+ /// }
+ /// // Matches are always delayed by 1 byte, so we must explicitly walk
+ /// // the special "EOI" transition at the end of the search. Without this
+ /// // final transition, the assert below will fail since the DFA will not
+ /// // have entered a match state yet!
+ /// state = dfa.next_eoi_state(state);
+ /// assert!(dfa.is_match_state(state));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ fn next_eoi_state(&self, current: StateID) -> StateID;
+
+ /// Return the ID of the start state for this DFA when executing a
+ /// forward search.
+ ///
+ /// Unlike typical DFA implementations, the start state for DFAs in this
+ /// crate is dependent on a few different factors:
+ ///
+ /// * The [`Anchored`] mode of the search. Unanchored, anchored and
+ /// anchored searches for a specific [`PatternID`] all use different start
+ /// states.
+ /// * The position at which the search begins, via [`Input::start`]. This
+ /// and the byte immediately preceding the start of the search (if one
+ /// exists) influence which look-behind assertions are true at the start
+ /// of the search. This in turn influences which start state is selected.
+ /// * Whether the search is a forward or reverse search. This routine can
+ /// only be used for forward searches.
+ ///
+ /// # Errors
+ ///
+ /// This may return a [`MatchError`] if the search needs to give up
+ /// when determining the start state (for example, if it sees a "quit"
+ /// byte). This can also return an error if the given `Input` contains an
+ /// unsupported [`Anchored`] configuration.
+ fn start_state_forward(
+ &self,
+ input: &Input<'_>,
+ ) -> Result<StateID, MatchError>;
+
+ /// Return the ID of the start state for this DFA when executing a
+ /// reverse search.
+ ///
+ /// Unlike typical DFA implementations, the start state for DFAs in this
+ /// crate is dependent on a few different factors:
+ ///
+ /// * The [`Anchored`] mode of the search. Unanchored, anchored and
+ /// anchored searches for a specific [`PatternID`] all use different start
+ /// states.
+ /// * The position at which the search begins, via [`Input::start`]. This
+ /// and the byte immediately preceding the start of the search (if one
+ /// exists) influence which look-behind assertions are true at the start
+ /// of the search. This in turn influences which start state is selected.
+ /// * Whether the search is a forward or reverse search. This routine can
+ /// only be used for reverse searches.
+ ///
+ /// # Errors
+ ///
+ /// This may return a [`MatchError`] if the search needs to give up
+ /// when determining the start state (for example, if it sees a "quit"
+ /// byte). This can also return an error if the given `Input` contains an
+ /// unsupported [`Anchored`] configuration.
+ fn start_state_reverse(
+ &self,
+ input: &Input<'_>,
+ ) -> Result<StateID, MatchError>;
+
+ /// If this DFA has a universal starting state for the given anchor mode
+ /// and the DFA supports universal starting states, then this returns that
+ /// state's identifier.
+ ///
+ /// A DFA is said to have a universal starting state when the starting
+ /// state is invariant with respect to the haystack. Usually, the starting
+ /// state is chosen depending on the bytes immediately surrounding the
+ /// starting position of a search. However, the starting state only differs
+ /// when one or more of the patterns in the DFA have look-around assertions
+ /// in their prefix.
+ ///
+ /// Stated differently, if none of the patterns in a DFA have look-around
+ /// assertions in their prefix, then the DFA has a universal starting state
+ /// and _may_ be returned by this method.
+ ///
+ /// It is always correct for implementations to return `None`, and indeed,
+ /// this is what the default implementation does. When this returns `None`,
+ /// callers must use either `start_state_forward` or `start_state_reverse`
+ /// to get the starting state.
+ ///
+ /// # Use case
+ ///
+ /// There are a few reasons why one might want to use this:
+ ///
+ /// * If you know your regex patterns have no look-around assertions in
+ /// their prefix, then calling this routine is likely cheaper and perhaps
+ /// more semantically meaningful.
+ /// * When implementing prefilter support in a DFA regex implementation,
+ /// it is necessary to re-compute the start state after a candidate
+ /// is returned from the prefilter. However, this is only needed when
+ /// there isn't a universal start state. When one exists, one can avoid
+ /// re-computing the start state.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, dense::DFA},
+ /// Anchored,
+ /// };
+ ///
+ /// // There are no look-around assertions in the prefixes of any of the
+ /// // patterns, so we get a universal start state.
+ /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+$", "[A-Z]+"])?;
+ /// assert!(dfa.universal_start_state(Anchored::No).is_some());
+ /// assert!(dfa.universal_start_state(Anchored::Yes).is_some());
+ ///
+ /// // One of the patterns has a look-around assertion in its prefix,
+ /// // so this means there is no longer a universal start state.
+ /// let dfa = DFA::new_many(&["[0-9]+", "^[a-z]+$", "[A-Z]+"])?;
+ /// assert!(!dfa.universal_start_state(Anchored::No).is_some());
+ /// assert!(!dfa.universal_start_state(Anchored::Yes).is_some());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ fn universal_start_state(&self, _mode: Anchored) -> Option<StateID> {
+ None // always-correct default: report no universal start state
+ }
+
+ /// Returns true if and only if the given identifier corresponds to a
+ /// "special" state. A special state is one or more of the following:
+ /// a dead state, a quit state, a match state, a start state or an
+ /// accelerated state.
+ ///
+ /// A correct implementation _may_ always return false for states that
+ /// are either start states or accelerated states, since that information
+ /// is only intended to be used for optimization purposes. Correct
+ /// implementations must return true if the state is a dead, quit or match
+ /// state. This is because search routines using this trait must be able
+ /// to rely on `is_special_state` as an indicator that a state may need
+ /// special treatment. (For example, when a search routine sees a dead
+ /// state, it must terminate.)
+ ///
+ /// This routine permits search implementations to use a single branch to
+ /// check whether a state needs special attention before executing the next
+ /// transition. The example below shows how to do this.
+ ///
+ /// # Example
+ ///
+ /// This example shows how `is_special_state` can be used to implement a
+ /// correct search routine with minimal branching. In particular, this
+ /// search routine implements "leftmost" matching, which means that it
+ /// doesn't immediately stop once a match is found. Instead, it continues
+ /// until it reaches a dead state.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, dense},
+ /// HalfMatch, MatchError, Input,
+ /// };
+ ///
+ /// fn find<A: Automaton>(
+ /// dfa: &A,
+ /// haystack: &[u8],
+ /// ) -> Result<Option<HalfMatch>, MatchError> {
+ /// // The start state is determined by inspecting the position and the
+ /// // initial bytes of the haystack. Note that start states can never
+ /// // be match states (since DFAs in this crate delay matches by 1
+ /// // byte), so we don't need to check if the start state is a match.
+ /// let mut state = dfa.start_state_forward(&Input::new(haystack))?;
+ /// let mut last_match = None;
+ /// // Walk all the bytes in the haystack. We can quit early if we see
+ /// // a dead or a quit state. The former means the automaton will
+ /// // never transition to any other state. The latter means that the
+ /// // automaton entered a condition in which its search failed.
+ /// for (i, &b) in haystack.iter().enumerate() {
+ /// state = dfa.next_state(state, b);
+ /// if dfa.is_special_state(state) {
+ /// if dfa.is_match_state(state) {
+ /// last_match = Some(HalfMatch::new(
+ /// dfa.match_pattern(state, 0),
+ /// i,
+ /// ));
+ /// } else if dfa.is_dead_state(state) {
+ /// return Ok(last_match);
+ /// } else if dfa.is_quit_state(state) {
+ /// // It is possible to enter into a quit state after
+ /// // observing a match has occurred. In that case, we
+ /// // should return the match instead of an error.
+ /// if last_match.is_some() {
+ /// return Ok(last_match);
+ /// }
+ /// return Err(MatchError::quit(b, i));
+ /// }
+ /// // Implementors may also want to check for start or accel
+ /// // states and handle them differently for performance
+ /// // reasons. But it is not necessary for correctness.
+ /// }
+ /// }
+ /// // Matches are always delayed by 1 byte, so we must explicitly walk
+ /// // the special "EOI" transition at the end of the search.
+ /// state = dfa.next_eoi_state(state);
+ /// if dfa.is_match_state(state) {
+ /// last_match = Some(HalfMatch::new(
+ /// dfa.match_pattern(state, 0),
+ /// haystack.len(),
+ /// ));
+ /// }
+ /// Ok(last_match)
+ /// }
+ ///
+ /// // We use a greedy '+' operator to show how the search doesn't just
+ /// // stop once a match is detected. It continues extending the match.
+ /// // Using '[a-z]+?' would also work as expected and stop the search
+ /// // early. Greediness is built into the automaton.
+ /// let dfa = dense::DFA::new(r"[a-z]+")?;
+ /// let haystack = "123 foobar 4567".as_bytes();
+ /// let mat = find(&dfa, haystack)?.unwrap();
+ /// assert_eq!(mat.pattern().as_usize(), 0);
+ /// assert_eq!(mat.offset(), 10);
+ ///
+ /// // Here's another example that tests our handling of the special EOI
+ /// // transition. This will fail to find a match if we don't call
+ /// // 'next_eoi_state' at the end of the search since the match isn't
+ /// // found until the final byte in the haystack.
+ /// let dfa = dense::DFA::new(r"[0-9]{4}")?;
+ /// let haystack = "123 foobar 4567".as_bytes();
+ /// let mat = find(&dfa, haystack)?.unwrap();
+ /// assert_eq!(mat.pattern().as_usize(), 0);
+ /// assert_eq!(mat.offset(), 15);
+ ///
+ /// // And note that our search implementation above automatically works
+ /// // with multi-DFAs. Namely, `dfa.match_pattern(match_state, 0)` selects
+ /// // the appropriate pattern ID for us.
+ /// let dfa = dense::DFA::new_many(&[r"[a-z]+", r"[0-9]+"])?;
+ /// let haystack = "123 foobar 4567".as_bytes();
+ /// let mat = find(&dfa, haystack)?.unwrap();
+ /// assert_eq!(mat.pattern().as_usize(), 1);
+ /// assert_eq!(mat.offset(), 3);
+ /// let mat = find(&dfa, &haystack[3..])?.unwrap();
+ /// assert_eq!(mat.pattern().as_usize(), 0);
+ /// assert_eq!(mat.offset(), 7);
+ /// let mat = find(&dfa, &haystack[10..])?.unwrap();
+ /// assert_eq!(mat.pattern().as_usize(), 1);
+ /// assert_eq!(mat.offset(), 5);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ fn is_special_state(&self, id: StateID) -> bool;
+
+ /// Returns true if and only if the given identifier corresponds to a dead
+ /// state. When a DFA enters a dead state, it is impossible to leave. That
+ /// is, every transition on a dead state by definition leads back to the
+ /// same dead state.
+ ///
+ /// In practice, the dead state always corresponds to the identifier `0`.
+ /// Moreover, in practice, there is only one dead state.
+ ///
+ /// The existence of a dead state is not strictly required in the classical
+ /// model of finite state machines, where one generally only cares about
+ /// the question of whether an input sequence matches or not. Dead states
+ /// are not needed to answer that question, since one can immediately quit
+ /// as soon as one enters a final or "match" state. However, we don't just
+ /// care about matches but also care about the location of matches, and
+ /// more specifically, care about semantics like "greedy" matching.
+ ///
+ /// For example, given the pattern `a+` and the input `aaaz`, the dead
+ /// state won't be entered until the state machine reaches `z` in the
+ /// input, at which point, the search routine can quit. But without the
+ /// dead state, the search routine wouldn't know when to quit. In a
+ /// classical representation, the search routine would stop after seeing
+ /// the first `a` (which is when the search would enter a match state). But
+ /// this wouldn't implement "greedy" matching where `a+` matches as many
+ /// `a`'s as possible.
+ ///
+ /// # Example
+ ///
+ /// See the example for [`Automaton::is_special_state`] for how to use this
+ /// method correctly.
+ fn is_dead_state(&self, id: StateID) -> bool;
+
+ /// Returns true if and only if the given identifier corresponds to a quit
+ /// state. A quit state is like a dead state (it has no transitions other
+ /// than to itself), except it indicates that the DFA failed to complete
+ /// the search. When this occurs, callers can neither accept nor reject that
+ /// a match occurred.
+ ///
+ /// In practice, the quit state always corresponds to the state immediately
+ /// following the dead state. (Which is not usually represented by `1`,
+ /// since state identifiers are pre-multiplied by the state machine's
+ /// alphabet stride, and the alphabet stride varies between DFAs.)
+ ///
+ /// The typical way in which a quit state can occur is when heuristic
+ /// support for Unicode word boundaries is enabled via the
+ /// [`dense::Config::unicode_word_boundary`](crate::dfa::dense::Config::unicode_word_boundary)
+ /// option. But other options, like the lower level
+ /// [`dense::Config::quit`](crate::dfa::dense::Config::quit)
+ /// configuration, can also result in a quit state being entered. The
+ /// purpose of the quit state is to provide a way to execute a fast DFA
+ /// in common cases while delegating to slower routines when the DFA quits.
+ ///
+ /// The default search implementations provided by this crate will return a
+ /// [`MatchError::quit`] error when a quit state is entered.
+ ///
+ /// # Example
+ ///
+ /// See the example for [`Automaton::is_special_state`] for how to use this
+ /// method correctly.
+ fn is_quit_state(&self, id: StateID) -> bool;
+
+ /// Returns true if and only if the given identifier corresponds to a
+ /// match state. A match state is also referred to as a "final" state and
+ /// indicates that a match has been found.
+ ///
+ /// If all you care about is whether a particular pattern matches in the
+ /// input sequence, then a search routine can quit early as soon as the
+ /// machine enters a match state. However, if you're looking for the
+ /// standard "leftmost-first" match location, then search _must_ continue
+ /// until either the end of the input or until the machine enters a dead
+ /// state. (Since either condition implies that no other useful work can
+ /// be done.) Namely, when looking for the location of a match, then
+ /// search implementations should record the most recent location in
+ /// which a match state was entered, but otherwise continue executing the
+ /// search as normal. (The search may even leave the match state.) Once
+ /// the termination condition is reached, the most recently recorded match
+ /// location should be returned.
+ ///
+ /// Finally, one additional power given to match states in this crate
+ /// is that they are always associated with a specific pattern in order
+ /// to support multi-DFAs. See [`Automaton::match_pattern`] for more
+ /// details and an example for how to query the pattern associated with a
+ /// particular match state.
+ ///
+ /// # Example
+ ///
+ /// See the example for [`Automaton::is_special_state`] for how to use this
+ /// method correctly.
+ fn is_match_state(&self, id: StateID) -> bool;
+
+ /// Returns true only if the given identifier corresponds to a start
+ /// state.
+ ///
+ /// A start state is a state in which a DFA begins a search.
+ /// All searches begin in a start state. Moreover, since all matches are
+ /// delayed by one byte, a start state can never be a match state.
+ ///
+ /// The main role of a start state is, as mentioned, to be a starting
+ /// point for a DFA. This starting point is determined via one of
+ /// [`Automaton::start_state_forward`] or
+ /// [`Automaton::start_state_reverse`], depending on whether one is doing
+ /// a forward or a reverse search, respectively.
+ ///
+ /// A secondary use of start states is for prefix acceleration. Namely,
+ /// while executing a search, if one detects that you're in a start state,
+ /// then it may be faster to look for the next match of a prefix of the
+ /// pattern, if one exists. If a prefix exists, then since all matches
+ /// must begin with that prefix, skipping ahead to occurrences of that
+ /// prefix may be much faster than executing the DFA.
+ ///
+ /// As mentioned in the documentation for
+ /// [`is_special_state`](Automaton::is_special_state), implementations
+ /// _may_ always return false, even if the given identifier is a start
+ /// state. This is because knowing whether a state is a start state or not
+ /// is not necessary for correctness and is only treated as a potential
+ /// performance optimization. (For example, the implementations of this
+ /// trait in this crate will only return true when the given identifier
+ /// corresponds to a start state and when [specialization of start
+ /// states](crate::dfa::dense::Config::specialize_start_states) was enabled
+ /// during DFA construction. If start state specialization is disabled
+ /// (which is the default), then this method will always return false.)
+ ///
+ /// # Example
+ ///
+ /// This example shows how to implement your own search routine that does
+ /// a prefix search whenever the search enters a start state.
+ ///
+ /// Note that you do not need to implement your own search routine
+ /// to make use of prefilters like this. The search routines
+ /// provided by this crate already implement prefilter support via
+ /// the [`Prefilter`](crate::util::prefilter::Prefilter) trait.
+ /// A prefilter can be added to your search configuration with
+ /// [`dense::Config::prefilter`](crate::dfa::dense::Config::prefilter) for
+ /// dense and sparse DFAs in this crate.
+ ///
+ /// This example is meant to show how you might deal with prefilters in a
+ /// simplified case if you are implementing your own search routine.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, dense},
+ /// HalfMatch, MatchError, Input,
+ /// };
+ ///
+ /// fn find_byte(slice: &[u8], at: usize, byte: u8) -> Option<usize> {
+ /// // Would be faster to use the memchr crate, but this is still
+ /// // faster than running through the DFA.
+ /// slice[at..].iter().position(|&b| b == byte).map(|i| at + i)
+ /// }
+ ///
+ /// fn find<A: Automaton>(
+ /// dfa: &A,
+ /// haystack: &[u8],
+ /// prefix_byte: Option<u8>,
+ /// ) -> Result<Option<HalfMatch>, MatchError> {
+ /// // See the Automaton::is_special_state example for similar code
+ /// // with more comments.
+ ///
+ /// let mut state = dfa.start_state_forward(&Input::new(haystack))?;
+ /// let mut last_match = None;
+ /// let mut pos = 0;
+ /// while pos < haystack.len() {
+ /// let b = haystack[pos];
+ /// state = dfa.next_state(state, b);
+ /// pos += 1;
+ /// if dfa.is_special_state(state) {
+ /// if dfa.is_match_state(state) {
+ /// last_match = Some(HalfMatch::new(
+ /// dfa.match_pattern(state, 0),
+ /// pos - 1,
+ /// ));
+ /// } else if dfa.is_dead_state(state) {
+ /// return Ok(last_match);
+ /// } else if dfa.is_quit_state(state) {
+ /// // It is possible to enter into a quit state after
+ /// // observing a match has occurred. In that case, we
+ /// // should return the match instead of an error.
+ /// if last_match.is_some() {
+ /// return Ok(last_match);
+ /// }
+ /// return Err(MatchError::quit(b, pos - 1));
+ /// } else if dfa.is_start_state(state) {
+ /// // If we're in a start state and know all matches begin
+ /// // with a particular byte, then we can quickly skip to
+ /// // candidate matches without running the DFA through
+ /// // every byte in between.
+ /// if let Some(prefix_byte) = prefix_byte {
+ /// pos = match find_byte(haystack, pos, prefix_byte) {
+ /// Some(pos) => pos,
+ /// None => break,
+ /// };
+ /// }
+ /// }
+ /// }
+ /// }
+ /// // Matches are always delayed by 1 byte, so we must explicitly walk
+ /// // the special "EOI" transition at the end of the search.
+ /// state = dfa.next_eoi_state(state);
+ /// if dfa.is_match_state(state) {
+ /// last_match = Some(HalfMatch::new(
+ /// dfa.match_pattern(state, 0),
+ /// haystack.len(),
+ /// ));
+ /// }
+ /// Ok(last_match)
+ /// }
+ ///
+ /// // In this example, it's obvious that all occurrences of our pattern
+ /// // begin with 'Z', so we pass in 'Z'. Note also that we need to
+ /// // enable start state specialization, or else it won't be possible to
+ /// // detect start states during a search. ('is_start_state' would always
+ /// // return false.)
+ /// let dfa = dense::DFA::builder()
+ /// .configure(dense::DFA::config().specialize_start_states(true))
+ /// .build(r"Z[a-z]+")?;
+ /// let haystack = "123 foobar Zbaz quux".as_bytes();
+ /// let mat = find(&dfa, haystack, Some(b'Z'))?.unwrap();
+ /// assert_eq!(mat.pattern().as_usize(), 0);
+ /// assert_eq!(mat.offset(), 15);
+ ///
+ /// // But note that we don't need to pass in a prefix byte. If we don't,
+ /// // then the search routine does no acceleration.
+ /// let mat = find(&dfa, haystack, None)?.unwrap();
+ /// assert_eq!(mat.pattern().as_usize(), 0);
+ /// assert_eq!(mat.offset(), 15);
+ ///
+ /// // However, if we pass an incorrect byte, then the prefix search will
+ /// // result in incorrect results.
+ /// assert_eq!(find(&dfa, haystack, Some(b'X'))?, None);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ fn is_start_state(&self, id: StateID) -> bool;
+
+ /// Returns true if and only if the given identifier corresponds to an
+ /// accelerated state.
+ ///
+ /// An accelerated state is a special optimization
+ /// trick implemented by this crate. Namely, if
+ /// [`dense::Config::accelerate`](crate::dfa::dense::Config::accelerate) is
+ /// enabled (and it is by default), then DFAs generated by this crate will
+ /// tag states meeting certain characteristics as accelerated. States meet
+ /// this criterion whenever most of their transitions are self-transitions.
+ /// That is, transitions that loop back to the same state. When a small
+ /// number of transitions aren't self-transitions, then it follows that
+ /// there are only a small number of bytes that can cause the DFA to leave
+ /// that state. Thus, there is an opportunity to look for those bytes
+ /// using more optimized routines rather than continuing to run through
+ /// the DFA. This trick is similar to the prefilter idea described in
+ /// the documentation of [`Automaton::is_start_state`] with two main
+ /// differences:
+ ///
+ /// 1. It is more limited since acceleration only applies to single bytes.
+ /// This means states are rarely accelerated when Unicode mode is enabled
+ /// (which is enabled by default).
+ /// 2. It can occur anywhere in the DFA, which increases optimization
+ /// opportunities.
+ ///
+ /// Like the prefilter idea, the main downside (and a possible reason to
+ /// disable it) is that it can lead to worse performance in some cases.
+ /// Namely, if a state is accelerated for very common bytes, then the
+ /// overhead of checking for acceleration and using the more optimized
+ /// routines to look for those bytes can cause overall performance to be
+ /// worse than if acceleration wasn't enabled at all.
+ ///
+ /// A simple example of a regex that has an accelerated state is
+ /// `(?-u)[^a]+a`. Namely, the `[^a]+` sub-expression gets compiled down
+ /// into a single state where all transitions except for `a` loop back to
+ /// itself, and where `a` is the only transition (other than the special
+ /// EOI transition) that goes to some other state. Thus, this state can
+ /// be accelerated and implemented more efficiently by calling an
+ /// optimized routine like `memchr` with `a` as the needle. Notice that
+ /// the `(?-u)` to disable Unicode is necessary here, as without it,
+ /// `[^a]` will match any UTF-8 encoding of any Unicode scalar value other
+ /// than `a`. This more complicated expression compiles down to many DFA
+ /// states and the simple acceleration optimization is no longer available.
+ ///
+ /// Typically, this routine is used to guard calls to
+ /// [`Automaton::accelerator`], which returns the accelerated bytes for
+ /// the specified state.
+ fn is_accel_state(&self, id: StateID) -> bool;
+
+ /// Returns the total number of patterns compiled into this DFA.
+ ///
+ /// In the case of a DFA that contains no patterns, this must return `0`.
+ ///
+ /// # Example
+ ///
+ /// This example shows the pattern length for a DFA that never matches:
+ ///
+ /// ```
+ /// use regex_automata::dfa::{Automaton, dense::DFA};
+ ///
+ /// let dfa: DFA<Vec<u32>> = DFA::never_match()?;
+ /// assert_eq!(dfa.pattern_len(), 0);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// And another example for a DFA that matches at every position:
+ ///
+ /// ```
+ /// use regex_automata::dfa::{Automaton, dense::DFA};
+ ///
+ /// let dfa: DFA<Vec<u32>> = DFA::always_match()?;
+ /// assert_eq!(dfa.pattern_len(), 1);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// And finally, a DFA that was constructed from multiple patterns:
+ ///
+ /// ```
+ /// use regex_automata::dfa::{Automaton, dense::DFA};
+ ///
+ /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
+ /// assert_eq!(dfa.pattern_len(), 3);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ fn pattern_len(&self) -> usize;
+
+ /// Returns the total number of patterns that match in this state.
+ ///
+ /// If the given state is not a match state, then implementations may
+ /// panic.
+ ///
+ /// If the DFA was compiled with one pattern, then this must necessarily
+ /// always return `1` for all match states.
+ ///
+ /// Implementations must guarantee that [`Automaton::match_pattern`] can be
+ /// called with indices up to (but not including) the length returned by
+ /// this routine without panicking.
+ ///
+ /// # Panics
+ ///
+ /// Implementations are permitted to panic if the provided state ID does
+ /// not correspond to a match state.
+ ///
+ /// # Example
+ ///
+ /// This example shows a simple instance of implementing overlapping
+ /// matches. In particular, it shows not only how to determine how many
+ /// patterns have matched in a particular state, but also how to access
+ /// which specific patterns have matched.
+ ///
+ /// Notice that we must use
+ /// [`MatchKind::All`](crate::MatchKind::All)
+ /// when building the DFA. If we used
+ /// [`MatchKind::LeftmostFirst`](crate::MatchKind::LeftmostFirst)
+ /// instead, then the DFA would not be constructed in a way that
+ /// supports overlapping matches. (It would only report a single pattern
+ /// that matches at any particular point in time.)
+ ///
+ /// Another thing to take note of is the patterns used and the order in
+ /// which the pattern IDs are reported. In the example below, pattern `3`
+ /// is yielded first. Why? Because it corresponds to the match that
+ /// appears first. Namely, the `@` symbol is part of `\S+` but not part
+ /// of any of the other patterns. Since the `\S+` pattern has a match that
+ /// starts to the left of any other pattern, its ID is returned before any
+ /// other.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{dfa::{Automaton, dense}, Input, MatchKind};
+ ///
+ /// let dfa = dense::Builder::new()
+ /// .configure(dense::Config::new().match_kind(MatchKind::All))
+ /// .build_many(&[
+ /// r"[[:word:]]+", r"[a-z]+", r"[A-Z]+", r"[[:^space:]]+",
+ /// ])?;
+ /// let haystack = "@bar".as_bytes();
+ ///
+ /// // The start state is determined by inspecting the position and the
+ /// // initial bytes of the haystack.
+ /// let mut state = dfa.start_state_forward(&Input::new(haystack))?;
+ /// // Walk all the bytes in the haystack.
+ /// for &b in haystack {
+ /// state = dfa.next_state(state, b);
+ /// }
+ /// state = dfa.next_eoi_state(state);
+ ///
+ /// assert!(dfa.is_match_state(state));
+ /// assert_eq!(dfa.match_len(state), 3);
+ /// // The following calls are guaranteed to not panic since `match_len`
+ /// // returned `3` above.
+ /// assert_eq!(dfa.match_pattern(state, 0).as_usize(), 3);
+ /// assert_eq!(dfa.match_pattern(state, 1).as_usize(), 0);
+ /// assert_eq!(dfa.match_pattern(state, 2).as_usize(), 1);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ fn match_len(&self, id: StateID) -> usize;
+
+ /// Returns the pattern ID corresponding to the given match index in the
+ /// given state.
+ ///
+ /// See [`Automaton::match_len`] for an example of how to use this
+ /// method correctly. Note that if you know your DFA is compiled with a
+ /// single pattern, then this routine is never necessary since it will
+ /// always return a pattern ID of `0` for an index of `0` when `id`
+ /// corresponds to a match state.
+ ///
+ /// Typically, this routine is used when implementing an overlapping
+ /// search, as the example for `Automaton::match_len` does.
+ ///
+ /// # Panics
+ ///
+ /// If the state ID is not a match state or if the match index is out
+ /// of bounds for the given state, then this routine may either panic
+ /// or produce an incorrect result. If the state ID is correct and the
+ /// match index is correct, then this routine must always produce a valid
+ /// `PatternID`.
+ fn match_pattern(&self, id: StateID, index: usize) -> PatternID;
+
+ /// Returns true if and only if this automaton can match the empty string.
+ /// When it returns false, all possible matches are guaranteed to have a
+ /// non-zero length.
+ ///
+ /// This is useful as a cheap way to know whether code needs to handle the
+ /// case of a zero length match. This is particularly important when UTF-8
+ /// modes are enabled, as when UTF-8 mode is enabled, empty matches that
+ /// split a codepoint must never be reported. This extra handling can
+ /// sometimes be costly, and since regexes matching an empty string are
+ /// somewhat rare, it can be beneficial to treat such regexes specially.
+ ///
+ /// # Example
+ ///
+ /// This example shows a few different DFAs and whether they match the
+ /// empty string or not. Notice the empty string isn't merely a matter
+ /// of a string of length literally `0`, but rather, whether a match can
+ /// occur between specific pairs of bytes.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{dense::DFA, Automaton}, util::syntax};
+ ///
+ /// // The empty regex matches the empty string.
+ /// let dfa = DFA::new("")?;
+ /// assert!(dfa.has_empty(), "empty matches empty");
+ /// // The '+' repetition operator requires at least one match, and so
+ /// // does not match the empty string.
+ /// let dfa = DFA::new("a+")?;
+ /// assert!(!dfa.has_empty(), "+ does not match empty");
+ /// // But the '*' repetition operator does.
+ /// let dfa = DFA::new("a*")?;
+ /// assert!(dfa.has_empty(), "* does match empty");
+ /// // And wrapping '+' in an operator that can match an empty string also
+ /// // causes it to match the empty string too.
+ /// let dfa = DFA::new("(a+)*")?;
+ /// assert!(dfa.has_empty(), "+ inside of * matches empty");
+ ///
+ /// // If a regex is just made of a look-around assertion, even if the
+ /// // assertion requires some kind of non-empty string around it (such as
+ /// // \b), then it is still treated as if it matches the empty string.
+ /// // Namely, if a match occurs of just a look-around assertion, then the
+ /// // match returned is empty.
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().unicode_word_boundary(true))
+ /// .syntax(syntax::Config::new().utf8(false))
+ /// .build(r"^$\A\z\b\B(?-u:\b\B)")?;
+ /// assert!(dfa.has_empty(), "assertions match empty");
+ /// // Even when an assertion is wrapped in a '+', it still matches the
+ /// // empty string.
+ /// let dfa = DFA::new(r"^+")?;
+ /// assert!(dfa.has_empty(), "+ of an assertion matches empty");
+ ///
+ /// // An alternation with even one branch that can match the empty string
+ /// // is also said to match the empty string overall.
+ /// let dfa = DFA::new("foo|(bar)?|quux")?;
+ /// assert!(dfa.has_empty(), "alternations can match empty");
+ ///
+ /// // An NFA that matches nothing does not match the empty string.
+ /// let dfa = DFA::new("[a&&b]")?;
+ /// assert!(!dfa.has_empty(), "never matching means not matching empty");
+ /// // But if it's wrapped in something that doesn't require a match at
+ /// // all, then it can match the empty string!
+ /// let dfa = DFA::new("[a&&b]*")?;
+ /// assert!(dfa.has_empty(), "* on never-match still matches empty");
+ /// // Since a '+' requires a match, using it on something that can never
+ /// // match will itself produce a regex that can never match anything,
+ /// // and thus does not match the empty string.
+ /// let dfa = DFA::new("[a&&b]+")?;
+ /// assert!(!dfa.has_empty(), "+ on never-match still matches nothing");
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ fn has_empty(&self) -> bool;
+
+ /// Whether UTF-8 mode is enabled for this DFA or not.
+ ///
+ /// When UTF-8 mode is enabled, all matches reported by a DFA are
+ /// guaranteed to correspond to spans of valid UTF-8. This includes
+ /// zero-width matches. For example, the DFA must guarantee that the empty
+ /// regex will not match at the positions between code units in the UTF-8
+ /// encoding of a single codepoint.
+ ///
+ /// See [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) for
+ /// more information.
+ ///
+ /// # Example
+ ///
+ /// This example shows how UTF-8 mode can impact the match spans that may
+ /// be reported in certain cases.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{dense::DFA, Automaton},
+ /// nfa::thompson,
+ /// HalfMatch, Input,
+ /// };
+ ///
+ /// // UTF-8 mode is enabled by default.
+ /// let re = DFA::new("")?;
+ /// assert!(re.is_utf8());
+ /// let mut input = Input::new("☃");
+ /// let got = re.try_search_fwd(&input)?;
+ /// assert_eq!(Some(HalfMatch::must(0, 0)), got);
+ ///
+ /// // Even though an empty regex matches at 1..1, our next match is
+ /// // 3..3 because 1..1 and 2..2 split the snowman codepoint (which is
+ /// // three bytes long).
+ /// input.set_start(1);
+ /// let got = re.try_search_fwd(&input)?;
+ /// assert_eq!(Some(HalfMatch::must(0, 3)), got);
+ ///
+ /// // But if we disable UTF-8, then we'll get matches at 1..1 and 2..2:
+ /// let re = DFA::builder()
+ /// .thompson(thompson::Config::new().utf8(false))
+ /// .build("")?;
+ /// assert!(!re.is_utf8());
+ /// let got = re.try_search_fwd(&input)?;
+ /// assert_eq!(Some(HalfMatch::must(0, 1)), got);
+ ///
+ /// input.set_start(2);
+ /// let got = re.try_search_fwd(&input)?;
+ /// assert_eq!(Some(HalfMatch::must(0, 2)), got);
+ ///
+ /// input.set_start(3);
+ /// let got = re.try_search_fwd(&input)?;
+ /// assert_eq!(Some(HalfMatch::must(0, 3)), got);
+ ///
+ /// input.set_start(4);
+ /// let got = re.try_search_fwd(&input)?;
+ /// assert_eq!(None, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ fn is_utf8(&self) -> bool;
+
+ /// Returns true if and only if this DFA is limited to returning matches
+ /// whose start position is `0`.
+ ///
+ /// Note that if you're using DFAs provided by
+ /// this crate, then this is _orthogonal_ to
+ /// [`Config::start_kind`](crate::dfa::dense::Config::start_kind).
+ ///
+ /// This is useful in some cases because if a DFA is limited to producing
+ /// matches that start at offset `0`, then a reverse search is never
+ /// required for finding the start of a match.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::dfa::{dense::DFA, Automaton};
+ ///
+ /// // The empty regex matches anywhere.
+ /// let dfa = DFA::new("")?;
+ /// assert!(!dfa.is_always_start_anchored(), "empty matches anywhere");
+ /// // 'a' matches anywhere.
+ /// let dfa = DFA::new("a")?;
+ /// assert!(!dfa.is_always_start_anchored(), "'a' matches anywhere");
+ /// // '^' only matches at offset 0!
+ /// let dfa = DFA::new("^a")?;
+ /// assert!(dfa.is_always_start_anchored(), "'^a' matches only at 0");
+ /// // But '(?m:^)' matches at 0 and at other offsets too.
+ /// let dfa = DFA::new("(?m:^)a")?;
+ /// assert!(!dfa.is_always_start_anchored(), "'(?m:^)a' matches anywhere");
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ fn is_always_start_anchored(&self) -> bool;
+
+ /// Return a slice of bytes to accelerate for the given state, if possible.
+ ///
+ /// If the given state has no accelerator, then an empty slice must be
+ /// returned. If `Automaton::is_accel_state` returns true for the given ID,
+ /// then this routine _must_ return a non-empty slice. But note that it is
+ /// not required for an implementation of this trait to ever return `true`
+ /// for `is_accel_state`, even if the state _could_ be accelerated. That
+ /// is, acceleration is an optional optimization. But the return values of
+ /// `is_accel_state` and `accelerator` must be in sync.
+ ///
+ /// If the given ID is not a valid state ID for this automaton, then
+ /// implementations may panic or produce incorrect results.
+ ///
+ /// See [`Automaton::is_accel_state`] for more details on state
+ /// acceleration.
+ ///
+ /// By default, this method will always return an empty slice.
+ ///
+ /// # Example
+ ///
+ /// This example shows a contrived case in which we build a regex that we
+ /// know is accelerated and extract the accelerator from a state.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, dense},
+ /// util::{primitives::StateID, syntax},
+ /// };
+ ///
+ /// let dfa = dense::Builder::new()
+ /// // We disable Unicode everywhere and permit the regex to match
+ /// // invalid UTF-8. e.g., [^abc] matches \xFF, which is not valid
+ /// // UTF-8. If we left Unicode enabled, [^abc] would match any UTF-8
+ /// // encoding of any Unicode scalar value except for 'a', 'b' or 'c'.
+ /// // That translates to a much more complicated DFA, and also
+ /// // inhibits the 'accelerator' optimization that we are trying to
+ /// // demonstrate in this example.
+ /// .syntax(syntax::Config::new().unicode(false).utf8(false))
+ /// .build("[^abc]+a")?;
+ ///
+ /// // Here we just pluck out the state that we know is accelerated.
+ /// // While the stride calculations are something that can be relied
+ /// // on by callers, the specific position of the accelerated state is
+ /// // implementation defined.
+ /// //
+ /// // N.B. We get '3' by inspecting the state machine using 'regex-cli'.
+ /// // e.g., try `regex-cli debug dfa dense '[^abc]+a' -BbUC`.
+ /// let id = StateID::new(3 * dfa.stride()).unwrap();
+ /// let accelerator = dfa.accelerator(id);
+ /// // The `[^abc]+` sub-expression permits [a, b, c] to be accelerated.
+ /// assert_eq!(accelerator, &[b'a', b'b', b'c']);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ fn accelerator(&self, _id: StateID) -> &[u8] {
+ &[] // default: an empty slice means "no accelerator" for every state
+ }
+
+ /// Returns the prefilter associated with a DFA, if one exists.
+ ///
+ /// The default implementation of this method always returns `None`. And
+ /// indeed, it is always correct to return `None`.
+ ///
+ /// For DFAs in this crate, a prefilter can be attached to a DFA via
+ /// [`dense::Config::prefilter`](crate::dfa::dense::Config::prefilter).
+ ///
+ /// Do note that prefilters are not serialized by DFAs in this crate.
+ /// So if you deserialize a DFA that had a prefilter attached to it
+ /// at serialization time, then it will not have a prefilter after
+ /// deserialization.
+ #[inline]
+ fn get_prefilter(&self) -> Option<&Prefilter> {
+ None // default: no prefilter attached, which is always a correct answer
+ }
+
+ /// Executes a forward search and returns the end position of the leftmost
+ /// match that is found. If no match exists, then `None` is returned.
+ ///
+ /// In particular, this method continues searching even after it enters
+ /// a match state. The search only terminates once it has reached the
+ /// end of the input or when it has entered a dead or quit state. Upon
+ /// termination, the position of the last byte seen while still in a match
+ /// state is returned.
+ ///
+ /// # Errors
+ ///
+ /// This routine errors if the search could not complete. This can occur
+ /// in a number of circumstances:
+ ///
+ /// * The configuration of the DFA may permit it to "quit" the search.
+ /// For example, setting quit bytes or enabling heuristic support for
+ /// Unicode word boundaries. The default configuration does not enable any
+ /// option that could result in the DFA quitting.
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode.
+ ///
+ /// When a search returns an error, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Notes for implementors
+ ///
+ /// Implementors of this trait are not required to implement any particular
+ /// match semantics (such as leftmost-first), which are instead manifest in
+ /// the DFA's transitions. But this search routine should behave as a
+ /// general "leftmost" search.
+ ///
+ /// In particular, this method must continue searching even after it enters
+ /// a match state. The search should only terminate once it has reached
+ /// the end of the input or when it has entered a dead or quit state. Upon
+ /// termination, the position of the last byte seen while still in a match
+ /// state is returned.
+ ///
+ /// Since this trait provides an implementation for this method by default,
+ /// it's unlikely that one will need to implement this.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to use this method with a
+ /// [`dense::DFA`](crate::dfa::dense::DFA).
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input};
+ ///
+ /// let dfa = dense::DFA::new("foo[0-9]+")?;
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new(b"foo12345"))?);
+ ///
+ /// // Even though a match is found after reading the first byte (`a`),
+ /// // the leftmost first match semantics demand that we find the earliest
+ /// // match that prefers earlier parts of the pattern over latter parts.
+ /// let dfa = dense::DFA::new("abc|a")?;
+ /// let expected = Some(HalfMatch::must(0, 3));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new(b"abc"))?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: specific pattern search
+ ///
+ /// This example shows how to build a multi-DFA that permits searching for
+ /// specific patterns.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{
+ /// dfa::{Automaton, dense},
+ /// Anchored, HalfMatch, PatternID, Input,
+ /// };
+ ///
+ /// let dfa = dense::Builder::new()
+ /// .configure(dense::Config::new().starts_for_each_pattern(true))
+ /// .build_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?;
+ /// let haystack = "foo123".as_bytes();
+ ///
+ /// // Since we are using the default leftmost-first match and both
+ /// // patterns match at the same starting position, only the first pattern
+ /// // will be returned in this case when doing a search for any of the
+ /// // patterns.
+ /// let expected = Some(HalfMatch::must(0, 6));
+ /// let got = dfa.try_search_fwd(&Input::new(haystack))?;
+ /// assert_eq!(expected, got);
+ ///
+ /// // But if we want to check whether some other pattern matches, then we
+ /// // can provide its pattern ID.
+ /// let input = Input::new(haystack)
+ /// .anchored(Anchored::Pattern(PatternID::must(1)));
+ /// let expected = Some(HalfMatch::must(1, 6));
+ /// let got = dfa.try_search_fwd(&input)?;
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: specifying the bounds of a search
+ ///
+ /// This example shows how providing the bounds of a search can produce
+ /// different results than simply sub-slicing the haystack.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input};
+ ///
+ /// // N.B. We disable Unicode here so that we use a simple ASCII word
+ /// // boundary. Alternatively, we could enable heuristic support for
+ /// // Unicode word boundaries.
+ /// let dfa = dense::DFA::new(r"(?-u)\b[0-9]{3}\b")?;
+ /// let haystack = "foo123bar".as_bytes();
+ ///
+ /// // Since we sub-slice the haystack, the search doesn't know about the
+ /// // larger context and assumes that `123` is surrounded by word
+ /// // boundaries. And of course, the match position is reported relative
+ /// // to the sub-slice as well, which means we get `3` instead of `6`.
+ /// let input = Input::new(&haystack[3..6]);
+ /// let expected = Some(HalfMatch::must(0, 3));
+ /// let got = dfa.try_search_fwd(&input)?;
+ /// assert_eq!(expected, got);
+ ///
+ /// // But if we provide the bounds of the search within the context of the
+ /// // entire haystack, then the search can take the surrounding context
+ /// // into account. (And if we did find a match, it would be reported
+ /// // as a valid offset into `haystack` instead of its sub-slice.)
+ /// let input = Input::new(haystack).range(3..6);
+ /// let expected = None;
+ /// let got = dfa.try_search_fwd(&input)?;
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ fn try_search_fwd(
+ &self,
+ input: &Input<'_>,
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ // Codepoint-splitting matches are only possible (and only need to be
+ // filtered out) when the DFA can match the empty string and UTF-8 mode
+ // is enabled.
+ let utf8empty = self.has_empty() && self.is_utf8();
+ // `self` is already a reference, so it is handed to the search routine
+ // as-is rather than being borrowed a second time.
+ let hm = match search::find_fwd(self, input)? {
+ None => return Ok(None),
+ Some(hm) if !utf8empty => return Ok(Some(hm)),
+ Some(hm) => hm,
+ };
+ // We get to this point when we know our DFA can match the empty string
+ // AND when UTF-8 mode is enabled. In this case, we skip any matches
+ // whose offset splits a codepoint. Such a match is necessarily a
+ // zero-width match, because UTF-8 mode requires the underlying NFA
+ // to be built such that all non-empty matches span valid UTF-8.
+ // Therefore, any match that ends in the middle of a codepoint cannot
+ // be part of a span of valid UTF-8 and thus must be an empty match.
+ // In such cases, we skip it, so as not to report matches that split a
+ // codepoint.
+ //
+ // Note that this is not a checked assumption. Callers *can* provide an
+ // NFA with UTF-8 mode enabled that nevertheless produces non-empty
+ // matches spanning invalid UTF-8. But doing so is documented to result
+ // in unspecified behavior.
+ empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
+ let got = search::find_fwd(self, input)?;
+ Ok(got.map(|hm| (hm, hm.offset())))
+ })
+ }
+
+ /// Executes a reverse search and returns the start position of the
+ /// leftmost match that is found. If no match exists, then `None` is
+ /// returned.
+ ///
+ /// # Errors
+ ///
+ /// This routine errors if the search could not complete. This can occur
+ /// in a number of circumstances:
+ ///
+ /// * The configuration of the DFA may permit it to "quit" the search.
+ /// For example, setting quit bytes or enabling heuristic support for
+ /// Unicode word boundaries. The default configuration does not enable any
+ /// option that could result in the DFA quitting.
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode.
+ ///
+ /// When a search returns an error, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to use this method with a
+ /// [`dense::DFA`](crate::dfa::dense::DFA). In particular, this
+ /// routine is principally useful when used in conjunction with the
+ /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse)
+ /// configuration. In general, it's unlikely to be correct to use
+ /// both `try_search_fwd` and `try_search_rev` with the same DFA since
+ /// any particular DFA will only support searching in one direction with
+ /// respect to the pattern.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson,
+ /// dfa::{Automaton, dense},
+ /// HalfMatch, Input,
+ /// };
+ ///
+ /// let dfa = dense::Builder::new()
+ /// .thompson(thompson::Config::new().reverse(true))
+ /// .build("foo[0-9]+")?;
+ /// let expected = Some(HalfMatch::must(0, 0));
+ /// assert_eq!(expected, dfa.try_search_rev(&Input::new(b"foo12345"))?);
+ ///
+ /// // Even though a match is found after reading the last byte (`c`),
+ /// // the leftmost first match semantics demand that we find the earliest
+ /// // match that prefers earlier parts of the pattern over latter parts.
+ /// let dfa = dense::Builder::new()
+ /// .thompson(thompson::Config::new().reverse(true))
+ /// .build("abc|c")?;
+ /// let expected = Some(HalfMatch::must(0, 0));
+ /// assert_eq!(expected, dfa.try_search_rev(&Input::new(b"abc"))?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: UTF-8 mode
+ ///
+ /// This example demonstrates that UTF-8 mode applies to reverse
+ /// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all
+ /// matches reported must correspond to valid UTF-8 spans. This includes
+ /// prohibiting zero-width matches that split a codepoint.
+ ///
+ /// UTF-8 mode is enabled by default. Notice below how the only zero-width
+ /// matches reported are those at UTF-8 boundaries:
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{dense::DFA, Automaton},
+ /// nfa::thompson,
+ /// HalfMatch, Input, MatchKind,
+ /// };
+ ///
+ /// let dfa = DFA::builder()
+ /// .thompson(thompson::Config::new().reverse(true))
+ /// .build(r"")?;
+ ///
+ /// // Run the reverse DFA to collect all matches.
+ /// let mut input = Input::new("☃");
+ /// let mut matches = vec![];
+ /// loop {
+ /// match dfa.try_search_rev(&input)? {
+ /// None => break,
+ /// Some(hm) => {
+ /// matches.push(hm);
+ /// if hm.offset() == 0 || input.end() == 0 {
+ /// break;
+ /// } else if hm.offset() < input.end() {
+ /// input.set_end(hm.offset());
+ /// } else {
+ /// // This is only necessary to handle zero-width
+ /// // matches, which of course occur in this example.
+ /// // Without this, the search would never advance
+ /// // backwards beyond the initial match.
+ /// input.set_end(input.end() - 1);
+ /// }
+ /// }
+ /// }
+ /// }
+ ///
+ /// // No matches split a codepoint.
+ /// let expected = vec![
+ /// HalfMatch::must(0, 3),
+ /// HalfMatch::must(0, 0),
+ /// ];
+ /// assert_eq!(expected, matches);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// Now let's look at the same example, but with UTF-8 mode on the
+ /// original NFA disabled (which results in disabling UTF-8 mode on the
+ /// DFA):
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{dense::DFA, Automaton},
+ /// nfa::thompson,
+ /// HalfMatch, Input, MatchKind,
+ /// };
+ ///
+ /// let dfa = DFA::builder()
+ /// .thompson(thompson::Config::new().reverse(true).utf8(false))
+ /// .build(r"")?;
+ ///
+ /// // Run the reverse DFA to collect all matches.
+ /// let mut input = Input::new("☃");
+ /// let mut matches = vec![];
+ /// loop {
+ /// match dfa.try_search_rev(&input)? {
+ /// None => break,
+ /// Some(hm) => {
+ /// matches.push(hm);
+ /// if hm.offset() == 0 || input.end() == 0 {
+ /// break;
+ /// } else if hm.offset() < input.end() {
+ /// input.set_end(hm.offset());
+ /// } else {
+ /// // This is only necessary to handle zero-width
+ /// // matches, which of course occur in this example.
+ /// // Without this, the search would never advance
+ /// // backwards beyond the initial match.
+ /// input.set_end(input.end() - 1);
+ /// }
+ /// }
+ /// }
+ /// }
+ ///
+ /// // With UTF-8 mode disabled, matches may now split a codepoint.
+ /// let expected = vec![
+ /// HalfMatch::must(0, 3),
+ /// HalfMatch::must(0, 2),
+ /// HalfMatch::must(0, 1),
+ /// HalfMatch::must(0, 0),
+ /// ];
+ /// assert_eq!(expected, matches);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ fn try_search_rev(
+ &self,
+ input: &Input<'_>,
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ // A match can only split a codepoint if it is zero-width, and that only
+ // needs correcting when UTF-8 mode is enabled.
+ let must_skip_splits = self.has_empty() && self.is_utf8();
+ match search::find_rev(self, input)? {
+ None => Ok(None),
+ Some(found) if !must_skip_splits => Ok(Some(found)),
+ Some(found) => {
+ // Hand the candidate to `empty::skip_splits_rev` together with
+ // a callback that re-runs the reverse search on the input it
+ // is given.
+ empty::skip_splits_rev(input, found, found.offset(), |input| {
+ let next = search::find_rev(self, input)?;
+ Ok(next.map(|m| (m, m.offset())))
+ })
+ }
+ }
+ }
+
+ /// Executes an overlapping forward search. Matches, if one exists, can be
+ /// obtained via the [`OverlappingState::get_match`] method.
+ ///
+ /// This routine is principally only useful when searching for multiple
+ /// patterns on inputs where multiple patterns may match the same regions
+ /// of text. In particular, callers must preserve the automaton's search
+ /// state from prior calls so that the implementation knows where the last
+ /// match occurred.
+ ///
+ /// When using this routine to implement an iterator of overlapping
+ /// matches, the `start` of the search should always be set to the end
+ /// of the last match. If more patterns match at the previous location,
+ /// then they will be immediately returned. (This is tracked by the given
+ /// overlapping state.) Otherwise, the search continues at the starting
+ /// position given.
+ ///
+ /// If for some reason you want the search to forget about its previous
+ /// state and restart the search at a particular position, then setting the
+ /// state to [`OverlappingState::start`] will accomplish that.
+ ///
+ /// # Errors
+ ///
+ /// This routine errors if the search could not complete. This can occur
+ /// in a number of circumstances:
+ ///
+ /// * The configuration of the DFA may permit it to "quit" the search.
+ /// For example, setting quit bytes or enabling heuristic support for
+ /// Unicode word boundaries. The default configuration does not enable any
+ /// option that could result in the DFA quitting.
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode.
+ ///
+ /// When a search returns an error, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to run a basic overlapping search with a
+ /// [`dense::DFA`](crate::dfa::dense::DFA). Notice that we build the
+ /// automaton with a `MatchKind::All` configuration. Overlapping searches
+ /// are unlikely to work as one would expect when using the default
+ /// `MatchKind::LeftmostFirst` match semantics, since leftmost-first
+ /// matching is fundamentally incompatible with overlapping searches.
+ /// Namely, overlapping searches need to report matches as they are seen,
+ /// whereas leftmost-first searches will continue searching even after a
+ /// match has been observed in order to find the conventional end position
+ /// of the match. More concretely, leftmost-first searches use dead states
+ /// to terminate a search after a specific match can no longer be extended.
+ /// Overlapping searches instead do the opposite by continuing the search
+ /// to find totally new matches (potentially of other patterns).
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{
+ /// dfa::{Automaton, OverlappingState, dense},
+ /// HalfMatch, Input, MatchKind,
+ /// };
+ ///
+ /// let dfa = dense::Builder::new()
+ /// .configure(dense::Config::new().match_kind(MatchKind::All))
+ /// .build_many(&[r"[[:word:]]+$", r"[[:^space:]]+$"])?;
+ /// let haystack = "@foo";
+ /// let mut state = OverlappingState::start();
+ ///
+ /// let expected = Some(HalfMatch::must(1, 4));
+ /// dfa.try_search_overlapping_fwd(&Input::new(haystack), &mut state)?;
+ /// assert_eq!(expected, state.get_match());
+ ///
+ /// // The first pattern also matches at the same position, so re-running
+ /// // the search will yield another match. Notice also that the first
+ /// // pattern is returned after the second. This is because the second
+ /// // pattern begins its match before the first, is therefore an earlier
+ /// // match and is thus reported first.
+ /// let expected = Some(HalfMatch::must(0, 4));
+ /// dfa.try_search_overlapping_fwd(&Input::new(haystack), &mut state)?;
+ /// assert_eq!(expected, state.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ fn try_search_overlapping_fwd(
+ &self,
+ input: &Input<'_>,
+ state: &mut OverlappingState,
+ ) -> Result<(), MatchError> {
+ let utf8empty = self.has_empty() && self.is_utf8();
+ search::find_overlapping_fwd(self, input, state)?;
+ // Done if the search recorded no match, or if this DFA cannot produce
+ // an empty match in UTF-8 mode (so no split filtering is required).
+ if state.get_match().is_none() || !utf8empty {
+ return Ok(());
+ }
+ // Otherwise defer to the shared helper, giving it a callback that
+ // resumes the overlapping forward search.
+ skip_empty_utf8_splits_overlapping(input, state, |input, state| {
+ search::find_overlapping_fwd(self, input, state)
+ })
+ }
+
+ /// Executes a reverse overlapping search. Matches, if one exists, can be
+ /// obtained via the [`OverlappingState::get_match`] method.
+ ///
+ /// When using this routine to implement an iterator of overlapping
+ /// matches, the `start` of the search should remain invariant throughout
+ /// iteration. The `OverlappingState` given to the search will keep track
+ /// of the current position of the search. (This is because multiple
+ /// matches may be reported at the same position, so only the search
+ /// implementation itself knows when to advance the position.)
+ ///
+ /// If for some reason you want the search to forget about its previous
+ /// state and restart the search at a particular position, then setting the
+ /// state to [`OverlappingState::start`] will accomplish that.
+ ///
+ /// # Errors
+ ///
+ /// This routine errors if the search could not complete. This can occur
+ /// in a number of circumstances:
+ ///
+ /// * The configuration of the DFA may permit it to "quit" the search.
+ /// For example, setting quit bytes or enabling heuristic support for
+ /// Unicode word boundaries. The default configuration does not enable any
+ /// option that could result in the DFA quitting.
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode.
+ ///
+ /// When a search returns an error, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Example: UTF-8 mode
+ ///
+ /// This example demonstrates that UTF-8 mode applies to reverse
+ /// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all
+ /// matches reported must correspond to valid UTF-8 spans. This includes
+ /// prohibiting zero-width matches that split a codepoint.
+ ///
+ /// UTF-8 mode is enabled by default. Notice below how the only zero-width
+ /// matches reported are those at UTF-8 boundaries:
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{dense::DFA, Automaton, OverlappingState},
+ /// nfa::thompson,
+ /// HalfMatch, Input, MatchKind,
+ /// };
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().match_kind(MatchKind::All))
+ /// .thompson(thompson::Config::new().reverse(true))
+ /// .build_many(&[r"", r"☃"])?;
+ ///
+ /// // Run the reverse DFA to collect all matches.
+ /// let input = Input::new("☃");
+ /// let mut state = OverlappingState::start();
+ /// let mut matches = vec![];
+ /// loop {
+ /// dfa.try_search_overlapping_rev(&input, &mut state)?;
+ /// match state.get_match() {
+ /// None => break,
+ /// Some(hm) => matches.push(hm),
+ /// }
+ /// }
+ ///
+ /// // No matches split a codepoint.
+ /// let expected = vec![
+ /// HalfMatch::must(0, 3),
+ /// HalfMatch::must(1, 0),
+ /// HalfMatch::must(0, 0),
+ /// ];
+ /// assert_eq!(expected, matches);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// Now let's look at the same example, but with UTF-8 mode on the
+ /// original NFA disabled (which results in disabling UTF-8 mode on the
+ /// DFA):
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{dense::DFA, Automaton, OverlappingState},
+ /// nfa::thompson,
+ /// HalfMatch, Input, MatchKind,
+ /// };
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().match_kind(MatchKind::All))
+ /// .thompson(thompson::Config::new().reverse(true).utf8(false))
+ /// .build_many(&[r"", r"☃"])?;
+ ///
+ /// // Run the reverse DFA to collect all matches.
+ /// let input = Input::new("☃");
+ /// let mut state = OverlappingState::start();
+ /// let mut matches = vec![];
+ /// loop {
+ /// dfa.try_search_overlapping_rev(&input, &mut state)?;
+ /// match state.get_match() {
+ /// None => break,
+ /// Some(hm) => matches.push(hm),
+ /// }
+ /// }
+ ///
+ /// // Now *all* positions match, even within a codepoint,
+ /// // because we lifted the requirement that matches
+ /// // correspond to valid UTF-8 spans.
+ /// let expected = vec![
+ /// HalfMatch::must(0, 3),
+ /// HalfMatch::must(0, 2),
+ /// HalfMatch::must(0, 1),
+ /// HalfMatch::must(1, 0),
+ /// HalfMatch::must(0, 0),
+ /// ];
+ /// assert_eq!(expected, matches);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ fn try_search_overlapping_rev(
+ &self,
+ input: &Input<'_>,
+ state: &mut OverlappingState,
+ ) -> Result<(), MatchError> {
+ let utf8empty = self.has_empty() && self.is_utf8();
+ search::find_overlapping_rev(self, input, state)?;
+ // Done if the search recorded no match, or if this DFA cannot produce
+ // an empty match in UTF-8 mode (so no split filtering is required).
+ if state.get_match().is_none() || !utf8empty {
+ return Ok(());
+ }
+ // Otherwise defer to the shared helper, giving it a callback that
+ // resumes the overlapping reverse search.
+ skip_empty_utf8_splits_overlapping(input, state, |input, state| {
+ search::find_overlapping_rev(self, input, state)
+ })
+ }
+
+ /// Writes the set of patterns that match anywhere in the given search
+ /// configuration to `patset`. If multiple patterns match at the same
+ /// position and the underlying DFA supports overlapping matches, then all
+ /// matching patterns are written to the given set.
+ ///
+ /// Unless all of the patterns in this DFA are anchored, then generally
+ /// speaking, this will visit every byte in the haystack.
+ ///
+ /// This search routine *does not* clear the pattern set. This gives some
+ /// flexibility to the caller (e.g., running multiple searches with the
+ /// same pattern set), but does make the API bug-prone if you're reusing
+ /// the same pattern set for multiple searches but intended them to be
+ /// independent.
+ ///
+ /// If a pattern ID matched but the given `PatternSet` does not have
+ /// sufficient capacity to store it, then it is not inserted and silently
+ /// dropped.
+ ///
+ /// # Errors
+ ///
+ /// This routine errors if the search could not complete. This can occur
+ /// in a number of circumstances:
+ ///
+ /// * The configuration of the DFA may permit it to "quit" the search.
+ /// For example, setting quit bytes or enabling heuristic support for
+ /// Unicode word boundaries. The default configuration does not enable any
+ /// option that could result in the DFA quitting.
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode.
+ ///
+ /// When a search returns an error, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to find all matching patterns in a haystack,
+ /// even when some patterns match at the same position as other patterns.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{
+ /// dfa::{Automaton, dense::DFA},
+ /// Input, MatchKind, PatternSet,
+ /// };
+ ///
+ /// let patterns = &[
+ /// r"[[:word:]]+",
+ /// r"[0-9]+",
+ /// r"[[:alpha:]]+",
+ /// r"foo",
+ /// r"bar",
+ /// r"barfoo",
+ /// r"foobar",
+ /// ];
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().match_kind(MatchKind::All))
+ /// .build_many(patterns)?;
+ ///
+ /// let input = Input::new("foobar");
+ /// let mut patset = PatternSet::new(dfa.pattern_len());
+ /// dfa.try_which_overlapping_matches(&input, &mut patset)?;
+ /// let expected = vec![0, 2, 3, 4, 6];
+ /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect();
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "alloc")]
+ #[inline]
+ fn try_which_overlapping_matches(
+ &self,
+ input: &Input<'_>,
+ patset: &mut PatternSet,
+ ) -> Result<(), MatchError> {
+ let mut state = OverlappingState::start();
+ loop {
+ self.try_search_overlapping_fwd(input, &mut state)?;
+ let found = match state.get_match() {
+ Some(found) => found,
+ // The search produced no further match.
+ None => return Ok(()),
+ };
+ // The result of the insertion is deliberately ignored.
+ let _ = patset.insert(found.pattern());
+ // Stop once the set is full (nothing left to find) or when the
+ // caller asked for the earliest match only.
+ if patset.is_full() || input.get_earliest() {
+ return Ok(());
+ }
+ }
+ }
+}
+
+// Delegating implementation: a shared reference to an automaton is itself an
+// automaton. Every method below dereferences the reference and invokes the
+// method of the same name on the underlying `A`, passing all arguments
+// through unchanged.
+unsafe impl<'a, A: Automaton + ?Sized> Automaton for &'a A {
+    #[inline]
+    fn next_state(&self, current: StateID, input: u8) -> StateID {
+        A::next_state(&**self, current, input)
+    }
+
+    #[inline]
+    unsafe fn next_state_unchecked(
+        &self,
+        current: StateID,
+        input: u8,
+    ) -> StateID {
+        A::next_state_unchecked(&**self, current, input)
+    }
+
+    #[inline]
+    fn next_eoi_state(&self, current: StateID) -> StateID {
+        A::next_eoi_state(&**self, current)
+    }
+
+    #[inline]
+    fn start_state_forward(
+        &self,
+        input: &Input<'_>,
+    ) -> Result<StateID, MatchError> {
+        A::start_state_forward(&**self, input)
+    }
+
+    #[inline]
+    fn start_state_reverse(
+        &self,
+        input: &Input<'_>,
+    ) -> Result<StateID, MatchError> {
+        A::start_state_reverse(&**self, input)
+    }
+
+    #[inline]
+    fn universal_start_state(&self, mode: Anchored) -> Option<StateID> {
+        A::universal_start_state(&**self, mode)
+    }
+
+    #[inline]
+    fn is_special_state(&self, id: StateID) -> bool {
+        A::is_special_state(&**self, id)
+    }
+
+    #[inline]
+    fn is_dead_state(&self, id: StateID) -> bool {
+        A::is_dead_state(&**self, id)
+    }
+
+    #[inline]
+    fn is_quit_state(&self, id: StateID) -> bool {
+        A::is_quit_state(&**self, id)
+    }
+
+    #[inline]
+    fn is_match_state(&self, id: StateID) -> bool {
+        A::is_match_state(&**self, id)
+    }
+
+    #[inline]
+    fn is_start_state(&self, id: StateID) -> bool {
+        A::is_start_state(&**self, id)
+    }
+
+    #[inline]
+    fn is_accel_state(&self, id: StateID) -> bool {
+        A::is_accel_state(&**self, id)
+    }
+
+    #[inline]
+    fn pattern_len(&self) -> usize {
+        A::pattern_len(&**self)
+    }
+
+    #[inline]
+    fn match_len(&self, id: StateID) -> usize {
+        A::match_len(&**self, id)
+    }
+
+    #[inline]
+    fn match_pattern(&self, id: StateID, index: usize) -> PatternID {
+        A::match_pattern(&**self, id, index)
+    }
+
+    #[inline]
+    fn has_empty(&self) -> bool {
+        A::has_empty(&**self)
+    }
+
+    #[inline]
+    fn is_utf8(&self) -> bool {
+        A::is_utf8(&**self)
+    }
+
+    #[inline]
+    fn is_always_start_anchored(&self) -> bool {
+        A::is_always_start_anchored(&**self)
+    }
+
+    #[inline]
+    fn accelerator(&self, id: StateID) -> &[u8] {
+        A::accelerator(&**self, id)
+    }
+
+    #[inline]
+    fn get_prefilter(&self) -> Option<&Prefilter> {
+        A::get_prefilter(&**self)
+    }
+
+    #[inline]
+    fn try_search_fwd(
+        &self,
+        input: &Input<'_>,
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        A::try_search_fwd(&**self, input)
+    }
+
+    #[inline]
+    fn try_search_rev(
+        &self,
+        input: &Input<'_>,
+    ) -> Result<Option<HalfMatch>, MatchError> {
+        A::try_search_rev(&**self, input)
+    }
+
+    #[inline]
+    fn try_search_overlapping_fwd(
+        &self,
+        input: &Input<'_>,
+        state: &mut OverlappingState,
+    ) -> Result<(), MatchError> {
+        A::try_search_overlapping_fwd(&**self, input, state)
+    }
+
+    #[inline]
+    fn try_search_overlapping_rev(
+        &self,
+        input: &Input<'_>,
+        state: &mut OverlappingState,
+    ) -> Result<(), MatchError> {
+        A::try_search_overlapping_rev(&**self, input, state)
+    }
+
+    #[cfg(feature = "alloc")]
+    #[inline]
+    fn try_which_overlapping_matches(
+        &self,
+        input: &Input<'_>,
+        patset: &mut PatternSet,
+    ) -> Result<(), MatchError> {
+        A::try_which_overlapping_matches(&**self, input, patset)
+    }
+}
+
+/// Represents the current state of an overlapping search.
+///
+/// This is used for overlapping searches since they need to know something
+/// about the previous search. For example, when multiple patterns match at the
+/// same position, this state tracks the last reported pattern so that the next
+/// search knows whether to report another matching pattern or continue with
+/// the search at the next position. Additionally, it also tracks which state
+/// the last search call terminated in.
+///
+/// This type provides few introspection capabilities. The only thing a
+/// caller can do is construct it and pass it around to permit search routines
+/// to use it to track state, and also ask whether a match has been found.
+///
+/// Callers should always provide a fresh state constructed via
+/// [`OverlappingState::start`] when starting a new search. Reusing state from
+/// a previous search may result in incorrect results.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct OverlappingState {
+ /// The match reported by the most recent overlapping search to use this
+ /// state.
+ ///
+ /// If a search does not find any matches, then it is expected to clear
+ /// this value.
+ pub(crate) mat: Option<HalfMatch>,
+ /// The state ID of the state at which the search was in when the call
+ /// terminated. When this is a match state, `mat` must be set to a
+ /// non-None value.
+ ///
+ /// A `None` value indicates the start state of the corresponding
+ /// automaton. We cannot use the actual ID, since any one automaton may
+ /// have many start states, and which one is in use depends on several
+ /// search-time factors.
+ // NOTE(review): the comment above used to name `last_match`, which is not
+ // a field of this struct; `mat` looks like the intended field — confirm.
+ pub(crate) id: Option<StateID>,
+ /// The position of the search.
+ ///
+ /// When `id` is None (i.e., we are starting a search), this is set to
+ /// the beginning of the search as given by the caller regardless of its
+ /// current value. Subsequent calls to an overlapping search pick up at
+ /// this offset.
+ pub(crate) at: usize,
+ /// The index into the matching patterns of the next match to report if the
+ /// current state is a match state. Note that this may be 1 greater than
+ /// the total number of matches to report for the current match state. (In
+ /// which case, no more matches should be reported at the current position
+ /// and the search should advance to the next position.)
+ pub(crate) next_match_index: Option<usize>,
+ /// This is set to true when a reverse overlapping search has entered its
+ /// EOI transitions.
+ ///
+ /// This isn't used in a forward search because it knows to stop once the
+ /// position exceeds the end of the search range. In a reverse search,
+ /// since we use unsigned offsets, we don't "know" once we've gone past
+ /// `0`. So the only way to detect it is with this extra flag. The reverse
+ /// overlapping search knows to terminate specifically after it has
+ /// reported all matches after following the EOI transition.
+ pub(crate) rev_eoi: bool,
+}
+
+impl OverlappingState {
+    /// Build the state used to begin a brand new overlapping search.
+    ///
+    /// The returned state records no match and no automaton state (`id` is
+    /// `None`), which search routines interpret as "begin at the start
+    /// state" of whichever automaton it is used with.
+    pub fn start() -> OverlappingState {
+        OverlappingState {
+            rev_eoi: false,
+            next_match_index: None,
+            at: 0,
+            id: None,
+            mat: None,
+        }
+    }
+
+    /// Return the match, if any, recorded by the most recent search that
+    /// was run with this state.
+    ///
+    /// A search clears this result automatically, so when no match is
+    /// found this correctly reports `None`.
+    pub fn get_match(&self) -> Option<HalfMatch> {
+        let OverlappingState { mat, .. } = self;
+        *mat
+    }
+}
+
+/// Keeps driving the given overlapping `search` function (forwards or
+/// backwards) until the match it reports no longer falls in the middle of a
+/// codepoint, or until no match is reported at all.
+///
+/// This is *not* always correct to call. Only use it when the DFA has UTF-8
+/// mode enabled *and* it can produce zero-width matches. Calling it when
+/// either of those conditions does not hold might cause legitimate matches
+/// to be skipped.
+#[cold]
+#[inline(never)]
+fn skip_empty_utf8_splits_overlapping<F>(
+    input: &Input<'_>,
+    state: &mut OverlappingState,
+    mut search: F,
+) -> Result<(), MatchError>
+where
+    F: FnMut(&Input<'_>, &mut OverlappingState) -> Result<(), MatchError>,
+{
+    // Nothing here distinguishes forward from reverse searches. That is
+    // fine: an overlapping search advances itself through the
+    // `OverlappingState`, so all this routine has to do is keep calling it
+    // until a usable match (or no match) comes back.
+
+    let mut offset = match state.get_match() {
+        Some(hm) => hm.offset(),
+        None => return Ok(()),
+    };
+    if input.get_anchored().is_anchored() {
+        // An anchored search is not re-run. A match that splits a
+        // codepoint is simply dropped.
+        if !input.is_char_boundary(offset) {
+            state.mat = None;
+        }
+        return Ok(());
+    }
+    loop {
+        if input.is_char_boundary(offset) {
+            return Ok(());
+        }
+        search(input, state)?;
+        match state.get_match() {
+            Some(hm) => offset = hm.offset(),
+            None => return Ok(()),
+        }
+    }
+}
+
+/// Write a prefix "state" indicator for fmt::Debug impls.
+///
+/// The indicator is always exactly two characters wide so that the text
+/// printed after it lines up regardless of the kind of state:
+///
+/// * `D>` / `D ` for a dead state that is / is not also a start state
+/// * `Q ` for a quit state
+/// * `A>` / ` >` for a start state that is / is not accelerated
+/// * `A*` / ` *` for a match state that is / is not accelerated
+/// * `A ` for any other accelerated state
+/// * two spaces for a state with none of the above properties
+///
+/// The checks are made in the order listed above, so the first matching
+/// category wins when a state belongs to several of them.
+pub(crate) fn fmt_state_indicator<A: Automaton>(
+    f: &mut core::fmt::Formatter<'_>,
+    dfa: A,
+    id: StateID,
+) -> core::fmt::Result {
+    let indicator = if dfa.is_dead_state(id) {
+        if dfa.is_start_state(id) {
+            "D>"
+        } else {
+            "D "
+        }
+    } else if dfa.is_quit_state(id) {
+        "Q "
+    } else if dfa.is_start_state(id) {
+        if dfa.is_accel_state(id) {
+            "A>"
+        } else {
+            " >"
+        }
+    } else if dfa.is_match_state(id) {
+        if dfa.is_accel_state(id) {
+            "A*"
+        } else {
+            " *"
+        }
+    } else if dfa.is_accel_state(id) {
+        "A "
+    } else {
+        // Pad to the same two-character width as every other indicator.
+        // A one-character pad here would shift the rest of the line left
+        // for ordinary states.
+        "  "
+    };
+    f.write_str(indicator)
+}
+
+#[cfg(all(test, feature = "syntax", feature = "dfa-build"))]
+mod tests {
+    // Checks that the Automaton trait can be used as a trait object. Keeping
+    // it object safe is the main reason the search routines are not generic
+    // over Into<Input>.
+    #[test]
+    fn object_safe() {
+        use crate::{
+            dfa::{dense, Automaton},
+            HalfMatch, Input,
+        };
+
+        let concrete = dense::DFA::new("abc").unwrap();
+        // This coercion only compiles if the trait is object safe.
+        let erased: &dyn Automaton = &concrete;
+        let got = erased.try_search_fwd(&Input::new(b"xyzabcxyz"));
+        assert_eq!(Ok(Some(HalfMatch::must(0, 6))), got);
+    }
+}
diff --git a/third_party/rust/regex-automata/src/dfa/dense.rs b/third_party/rust/regex-automata/src/dfa/dense.rs
new file mode 100644
index 0000000000..6da865f977
--- /dev/null
+++ b/third_party/rust/regex-automata/src/dfa/dense.rs
@@ -0,0 +1,5139 @@
+/*!
+Types and routines specific to dense DFAs.
+
+This module is the home of [`dense::DFA`](DFA).
+
+This module also contains a [`dense::Builder`](Builder) and a
+[`dense::Config`](Config) for building and configuring a dense DFA.
+*/
+
+#[cfg(feature = "dfa-build")]
+use core::cmp;
+use core::{convert::TryFrom, fmt, iter, mem::size_of, slice};
+
+#[cfg(feature = "dfa-build")]
+use alloc::{
+ collections::{BTreeMap, BTreeSet},
+ vec,
+ vec::Vec,
+};
+
+#[cfg(feature = "dfa-build")]
+use crate::{
+ dfa::{
+ accel::Accel, determinize, minimize::Minimizer, remapper::Remapper,
+ sparse,
+ },
+ nfa::thompson,
+ util::{look::LookMatcher, search::MatchKind},
+};
+use crate::{
+ dfa::{
+ accel::Accels,
+ automaton::{fmt_state_indicator, Automaton},
+ special::Special,
+ start::StartKind,
+ DEAD,
+ },
+ util::{
+ alphabet::{self, ByteClasses, ByteSet},
+ int::{Pointer, Usize},
+ prefilter::Prefilter,
+ primitives::{PatternID, StateID},
+ search::{Anchored, Input, MatchError},
+ start::{Start, StartByteMap},
+ wire::{self, DeserializeError, Endian, SerializeError},
+ },
+};
+
+/// The label that is prepended to a serialized DFA.
+const LABEL: &str = "rust-regex-automata-dfa-dense";
+
+/// The format version of dense DFAs. This version gets incremented when a
+/// change occurs. A change may not necessarily be a breaking change, but the
+/// version does permit good error messages in the case where a breaking change
+/// is made.
+const VERSION: u32 = 2;
+
+/// The configuration used for compiling a dense DFA.
+///
+/// As a convenience, [`DFA::config`] is an alias for [`Config::new`]. The
+/// advantage of the former is that it often lets you avoid importing the
+/// `Config` type directly.
+///
+/// A dense DFA configuration is a simple data object that is typically used
+/// with [`dense::Builder::configure`](self::Builder::configure).
+///
+/// The default configuration guarantees that a search will never return
+/// a "quit" error, although it is possible for a search to fail if
+/// [`Config::starts_for_each_pattern`] wasn't enabled (which it is not by
+/// default) and an [`Anchored::Pattern`] mode is requested via [`Input`].
+#[cfg(feature = "dfa-build")]
+#[derive(Clone, Debug, Default)]
+pub struct Config {
+ // As with other configuration types in this crate, we put all our knobs
+ // in options so that we can distinguish between "default" and "not set."
+ // This makes it possible to easily combine multiple configurations
+ // without default values overwriting explicitly specified values. See the
+ // 'overwrite' method.
+ //
+ // For docs on the fields below, see the corresponding method setters.
+ //
+ // Since `Default` is derived and every field is an `Option`, a freshly
+ // constructed configuration has every knob unset (`None`).
+ //
+ // Set by `Config::accelerate`.
+ accelerate: Option<bool>,
+ // Set by `Config::prefilter`. The outer `Option` records whether the knob
+ // was set at all; the inner `Option` is the value the caller passed, where
+ // `None` means "no prefilter."
+ pre: Option<Option<Prefilter>>,
+ // Set by `Config::minimize`.
+ minimize: Option<bool>,
+ // Set by `Config::match_kind`.
+ match_kind: Option<MatchKind>,
+ // Set by `Config::start_kind`.
+ start_kind: Option<StartKind>,
+ // Set by `Config::starts_for_each_pattern`.
+ starts_for_each_pattern: Option<bool>,
+ // Set by `Config::byte_classes`.
+ byte_classes: Option<bool>,
+ unicode_word_boundary: Option<bool>,
+ quitset: Option<ByteSet>,
+ // Also filled in by `Config::prefilter` when it has not been set yet.
+ specialize_start_states: Option<bool>,
+ // NOTE(review): the two nested options below presumably follow the same
+ // "outer = was it set, inner = value" split as `pre` — confirm against
+ // their setters.
+ dfa_size_limit: Option<Option<usize>>,
+ determinize_size_limit: Option<Option<usize>>,
+}
+
+#[cfg(feature = "dfa-build")]
+impl Config {
+ /// Return a new default dense DFA compiler configuration.
+ pub fn new() -> Config {
+     // The derived `Default` leaves every knob unset (`None`).
+     <Config as Default>::default()
+ }
+
+ /// Enable state acceleration.
+ ///
+ /// When enabled, DFA construction will analyze each state to determine
+ /// whether it is eligible for simple acceleration. Acceleration typically
+ /// occurs when most of a state's transitions loop back to itself, leaving
+ /// only a select few bytes that will exit the state. When this occurs,
+ /// other routines like `memchr` can be used to look for those bytes which
+ /// may be much faster than traversing the DFA.
+ ///
+ /// Callers may elect to disable this if consistent performance is more
+ /// desirable than variable performance. Namely, acceleration can sometimes
+ /// make searching slower than it otherwise would be if the transitions
+ /// that leave accelerated states are traversed frequently.
+ ///
+ /// See [`Automaton::accelerator`](crate::dfa::Automaton::accelerator) for
+ /// an example.
+ ///
+ /// This is enabled by default.
+ pub fn accelerate(self, yes: bool) -> Config {
+     // Override only this knob; every other setting carries over as-is.
+     Config { accelerate: Some(yes), ..self }
+ }
+
+ /// Set a prefilter to be used whenever a start state is entered.
+ ///
+ /// A [`Prefilter`] in this context is meant to accelerate searches by
+ /// looking for literal prefixes that every match for the corresponding
+ /// pattern (or patterns) must start with. Once a prefilter produces a
+ /// match, the underlying search routine continues on to try and confirm
+ /// the match.
+ ///
+ /// Be warned that setting a prefilter does not guarantee that the search
+ /// will be faster. While it's usually a good bet, if the prefilter
+ /// produces a lot of false positive candidates (i.e., positions matched
+ /// by the prefilter but not by the regex), then the overall result can
+ /// be slower than if you had just executed the regex engine without any
+ /// prefilters.
+ ///
+ /// Note that unless [`Config::specialize_start_states`] has been
+ /// explicitly set, then setting this will also enable (when `pre` is
+ /// `Some`) or disable (when `pre` is `None`) start state specialization.
+ /// This occurs because without start state specialization, a prefilter
+ /// is likely to be less effective. And without a prefilter, start state
+ /// specialization is usually pointless.
+ ///
+ /// **WARNING:** Note that prefilters are not preserved as part of
+ /// serialization. Serializing a DFA will drop its prefilter.
+ ///
+ /// By default no prefilter is set.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{dense::DFA, Automaton},
+ /// util::prefilter::Prefilter,
+ /// Input, HalfMatch, MatchKind,
+ /// };
+ ///
+ /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]);
+ /// let re = DFA::builder()
+ /// .configure(DFA::config().prefilter(pre))
+ /// .build(r"(foo|bar)[a-z]+")?;
+ /// let input = Input::new("foo1 barfox bar");
+ /// assert_eq!(
+ /// Some(HalfMatch::must(0, 11)),
+ /// re.try_search_fwd(&input)?,
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// Be warned though that an incorrect prefilter can lead to incorrect
+ /// results!
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{dense::DFA, Automaton},
+ /// util::prefilter::Prefilter,
+ /// Input, HalfMatch, MatchKind,
+ /// };
+ ///
+ /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]);
+ /// let re = DFA::builder()
+ /// .configure(DFA::config().prefilter(pre))
+ /// .build(r"(foo|bar)[a-z]+")?;
+ /// let input = Input::new("foo1 barfox bar");
+ /// assert_eq!(
+ /// // No match reported even though there clearly is one!
+ /// None,
+ /// re.try_search_fwd(&input)?,
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn prefilter(self, pre: Option<Prefilter>) -> Config {
+     let mut config = Config { pre: Some(pre), ..self };
+     // Start state specialization is only derived from the prefilter when
+     // the caller has not already chosen a setting for it.
+     let unset = config.specialize_start_states.is_none();
+     if unset {
+         let has_prefilter = config.get_prefilter().is_some();
+         config.specialize_start_states = Some(has_prefilter);
+     }
+     config
+ }
+
+ /// Minimize the DFA.
+ ///
+ /// When enabled, the DFA built will be minimized such that it is as small
+ /// as possible.
+ ///
+ /// Whether one enables minimization or not depends on the types of costs
+ /// you're willing to pay and how much you care about its benefits. In
+ /// particular, minimization has worst case `O(n*k*logn)` time and `O(k*n)`
+ /// space, where `n` is the number of DFA states and `k` is the alphabet
+ /// size. In practice, minimization can be quite costly in terms of both
+ /// space and time, so it should only be done if you're willing to wait
+ /// longer to produce a DFA. In general, you might want a minimal DFA in
+ /// the following circumstances:
+ ///
+ /// 1. You would like to optimize for the size of the automaton. This can
+ /// manifest in one of two ways. Firstly, if you're converting the
+ /// DFA into Rust code (or a table embedded in the code), then a minimal
+ /// DFA will translate into a corresponding reduction in code size, and
+ /// thus, also the final compiled binary size. Secondly, if you are
+ /// building many DFAs and putting them on the heap, you'll be able to
+ /// fit more if they are smaller. Note though that building a minimal
+ /// DFA itself requires additional space; you only realize the space
+ /// savings once the minimal DFA is constructed (at which point, the
+ /// space used for minimization is freed).
+ /// 2. You've observed that a smaller DFA results in faster match
+ /// performance. Naively, this isn't guaranteed since there is no
+ /// inherent difference between matching with a bigger-than-minimal
+ /// DFA and a minimal DFA. However, a smaller DFA may make use of your
+ /// CPU's cache more efficiently.
+ /// 3. You are trying to establish an equivalence between regular
+ /// languages. The standard method for this is to build a minimal DFA
+ /// for each language and then compare them. If the DFAs are equivalent
+ /// (up to state renaming), then the languages are equivalent.
+ ///
+ /// Typically, minimization only makes sense as an offline process. That
+ /// is, one might minimize a DFA before serializing it to persistent
+ /// storage. In practical terms, minimization can take around an order of
+ /// magnitude more time than compiling the initial DFA via determinization.
+ ///
+ /// This option is disabled by default.
+ pub fn minimize(self, yes: bool) -> Config {
+     // Override only this knob; every other setting carries over as-is.
+     Config { minimize: Some(yes), ..self }
+ }
+
+ /// Set the desired match semantics.
+ ///
+ /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the
+ /// match semantics of Perl-like regex engines. That is, when multiple
+ /// patterns would match at the same leftmost position, the pattern that
+ /// appears first in the concrete syntax is chosen.
+ ///
+ /// Currently, the only other kind of match semantics supported is
+ /// [`MatchKind::All`]. This corresponds to classical DFA construction
+ /// where all possible matches are added to the DFA.
+ ///
+ /// Typically, `All` is used when one wants to execute an overlapping
+ /// search and `LeftmostFirst` otherwise. In particular, it rarely makes
+ /// sense to use `All` with the various "leftmost" find routines, since the
+ /// leftmost routines depend on the `LeftmostFirst` automata construction
+ /// strategy. Specifically, `LeftmostFirst` adds dead states to the DFA
+ /// as a way to terminate the search and report a match. `LeftmostFirst`
+ /// also supports non-greedy matches using this strategy whereas `All`
+ /// does not.
+ ///
+ /// # Example: overlapping search
+ ///
+ /// This example shows the typical use of `MatchKind::All`, which is to
+ /// report overlapping matches.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{
+ /// dfa::{Automaton, OverlappingState, dense},
+ /// HalfMatch, Input, MatchKind,
+ /// };
+ ///
+ /// let dfa = dense::Builder::new()
+ /// .configure(dense::Config::new().match_kind(MatchKind::All))
+ /// .build_many(&[r"\w+$", r"\S+$"])?;
+ /// let input = Input::new("@foo");
+ /// let mut state = OverlappingState::start();
+ ///
+ /// let expected = Some(HalfMatch::must(1, 4));
+ /// dfa.try_search_overlapping_fwd(&input, &mut state)?;
+ /// assert_eq!(expected, state.get_match());
+ ///
+ /// // The first pattern also matches at the same position, so re-running
+ /// // the search will yield another match. Notice also that the first
+ /// // pattern is returned after the second. This is because the second
+ /// // pattern begins its match before the first, is therefore an earlier
+ /// // match and is thus reported first.
+ /// let expected = Some(HalfMatch::must(0, 4));
+ /// dfa.try_search_overlapping_fwd(&input, &mut state)?;
+ /// assert_eq!(expected, state.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: reverse automaton to find start of match
+ ///
+ /// Another example for using `MatchKind::All` is for constructing a
+ /// reverse automaton to find the start of a match. `All` semantics are
+ /// used for this in order to find the longest possible match, which
+ /// corresponds to the leftmost starting position.
+ ///
+ /// Note that if you need the starting position then
+ /// [`dfa::regex::Regex`](crate::dfa::regex::Regex) will handle this for
+ /// you, so it's usually not necessary to do this yourself.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{dense, Automaton, StartKind},
+ /// nfa::thompson::NFA,
+ /// Anchored, HalfMatch, Input, MatchKind,
+ /// };
+ ///
+ /// let haystack = "123foobar456".as_bytes();
+ /// let pattern = r"[a-z]+r";
+ ///
+ /// let dfa_fwd = dense::DFA::new(pattern)?;
+ /// let dfa_rev = dense::Builder::new()
+ /// .thompson(NFA::config().reverse(true))
+ /// .configure(dense::Config::new()
+ /// // This isn't strictly necessary since both anchored and
+ /// // unanchored searches are supported by default. But since
+ /// // finding the start-of-match only requires anchored searches,
+ /// // we can get rid of the unanchored configuration and possibly
+ /// // slim down our DFA considerably.
+ /// .start_kind(StartKind::Anchored)
+ /// .match_kind(MatchKind::All)
+ /// )
+ /// .build(pattern)?;
+ /// let expected_fwd = HalfMatch::must(0, 9);
+ /// let expected_rev = HalfMatch::must(0, 3);
+ /// let got_fwd = dfa_fwd.try_search_fwd(&Input::new(haystack))?.unwrap();
+ /// // Here we don't specify the pattern to search for since there's only
+ /// // one pattern and we're doing a leftmost search. But if this were an
+ /// // overlapping search, you'd need to specify the pattern that matched
+ /// // in the forward direction. (Otherwise, you might wind up finding the
+ /// // starting position of a match of some other pattern.) That in turn
+ /// // requires building the reverse automaton with starts_for_each_pattern
+ /// // enabled. Indeed, this is what Regex does internally.
+ /// let input = Input::new(haystack)
+ /// .range(..got_fwd.offset())
+ /// .anchored(Anchored::Yes);
+ /// let got_rev = dfa_rev.try_search_rev(&input)?.unwrap();
+ /// assert_eq!(expected_fwd, got_fwd);
+ /// assert_eq!(expected_rev, got_rev);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn match_kind(self, kind: MatchKind) -> Config {
+     // Override only this knob; every other setting carries over as-is.
+     Config { match_kind: Some(kind), ..self }
+ }
+
+ /// The type of starting state configuration to use for a DFA.
+ ///
+ /// By default, the starting state configuration is [`StartKind::Both`].
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{dense::DFA, Automaton, StartKind},
+ /// Anchored, HalfMatch, Input,
+ /// };
+ ///
+ /// let haystack = "quux foo123";
+ /// let expected = HalfMatch::must(0, 11);
+ ///
+ /// // By default, DFAs support both anchored and unanchored searches.
+ /// let dfa = DFA::new(r"[0-9]+")?;
+ /// let input = Input::new(haystack);
+ /// assert_eq!(Some(expected), dfa.try_search_fwd(&input)?);
+ ///
+ /// // But if we only need anchored searches, then we can build a DFA
+ /// // that only supports anchored searches. This leads to a smaller DFA
+ /// // (potentially significantly smaller in some cases), but a DFA that
+ /// // will return an error if you try to use it with an unanchored search.
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().start_kind(StartKind::Anchored))
+ /// .build(r"[0-9]+")?;
+ /// let input = Input::new(haystack)
+ /// .range(8..)
+ /// .anchored(Anchored::Yes);
+ /// assert_eq!(Some(expected), dfa.try_search_fwd(&input)?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn start_kind(self, kind: StartKind) -> Config {
+     // Override only this knob; every other setting carries over as-is.
+     Config { start_kind: Some(kind), ..self }
+ }
+
+ /// Whether to compile a separate start state for each pattern in the
+ /// automaton.
+ ///
+ /// When enabled, a separate **anchored** start state is added for each
+ /// pattern in the DFA. When this start state is used, then the DFA will
+ /// only search for matches for the pattern specified, even if there are
+ /// other patterns in the DFA.
+ ///
+ /// The main downside of this option is that it can potentially increase
+ /// the size of the DFA and/or increase the time it takes to build the DFA.
+ ///
+ /// There are a few reasons one might want to enable this (it's disabled
+ /// by default):
+ ///
+ /// 1. When looking for the start of an overlapping match (using a
+ /// reverse DFA), doing it correctly requires starting the reverse search
+ /// using the starting state of the pattern that matched in the forward
+ /// direction. Indeed, when building a [`Regex`](crate::dfa::regex::Regex),
+ /// it will automatically enable this option when building the reverse DFA
+ /// internally.
+ /// 2. When you want to use a DFA with multiple patterns to both search
+ /// for matches of any pattern or to search for anchored matches of one
+ /// particular pattern while using the same DFA. (Otherwise, you would need
+ /// to compile a new DFA for each pattern.)
+ /// 3. Since the start states added for each pattern are anchored, if you
+ /// compile an unanchored DFA with one pattern while also enabling this
+ /// option, then you can use the same DFA to perform anchored or unanchored
+ /// searches. The latter you get with the standard search APIs. The former
+ /// you get by setting [`Anchored::Pattern`] on the [`Input`], which lets you
+ /// specify a pattern ID to search for.
+ ///
+ /// By default this is disabled.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to use this option to permit the same DFA to
+ /// run both anchored and unanchored searches for a single pattern.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{dense, Automaton},
+ /// Anchored, HalfMatch, PatternID, Input,
+ /// };
+ ///
+ /// let dfa = dense::Builder::new()
+ /// .configure(dense::Config::new().starts_for_each_pattern(true))
+ /// .build(r"foo[0-9]+")?;
+ /// let haystack = "quux foo123";
+ ///
+ /// // Here's a normal unanchored search. Notice that we don't specify a
+ /// // pattern ID. Since the DFA was built as an unanchored machine, it
+ /// // uses its default unanchored starting state.
+ /// let expected = HalfMatch::must(0, 11);
+ /// let input = Input::new(haystack);
+ /// assert_eq!(Some(expected), dfa.try_search_fwd(&input)?);
+ /// // But now if we explicitly specify the pattern to search ('0' being
+ /// // the only pattern in the DFA), then it will use the starting state
+ /// // for that specific pattern which is always anchored. Since the
+ /// // pattern doesn't have a match at the beginning of the haystack, we
+ /// // find nothing.
+ /// let input = Input::new(haystack)
+ /// .anchored(Anchored::Pattern(PatternID::must(0)));
+ /// assert_eq!(None, dfa.try_search_fwd(&input)?);
+ /// // And finally, an anchored search is not the same as putting a '^' at
+ /// // the beginning of the pattern. An anchored search can only match at
+ /// // the beginning of the *search*, which we can change:
+ /// let input = Input::new(haystack)
+ /// .anchored(Anchored::Pattern(PatternID::must(0)))
+ /// .range(5..);
+ /// assert_eq!(Some(expected), dfa.try_search_fwd(&input)?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn starts_for_each_pattern(self, yes: bool) -> Config {
+     // Override only this knob; every other setting carries over as-is.
+     Config { starts_for_each_pattern: Some(yes), ..self }
+ }
+
+ /// Whether to attempt to shrink the size of the DFA's alphabet or not.
+ ///
+ /// This option is enabled by default and should never be disabled unless
+ /// one is debugging a generated DFA.
+ ///
+ /// When enabled, the DFA will use a map from all possible bytes to their
+ /// corresponding equivalence class. Each equivalence class represents a
+ /// set of bytes that does not discriminate between a match and a non-match
+ /// in the DFA. For example, the pattern `[ab]+` has at least two
+ /// equivalence classes: a set containing `a` and `b` and a set containing
+ /// every byte except for `a` and `b`. `a` and `b` are in the same
+ /// equivalence class because they never discriminate between a match and a
+ /// non-match.
+ ///
+ /// The advantage of this map is that the size of the transition table
+ /// can be reduced drastically from `#states * 256 * sizeof(StateID)` to
+ /// `#states * k * sizeof(StateID)` where `k` is the number of equivalence
+ /// classes (rounded up to the nearest power of 2). As a result, total
+ /// space usage can decrease substantially. Moreover, since a smaller
+ /// alphabet is used, DFA compilation becomes faster as well.
+ ///
+ /// **WARNING:** This is only useful for debugging DFAs. Disabling this
+ /// does not yield any speed advantages. Namely, even when this is
+ /// disabled, a byte class map is still used while searching. The only
+ /// difference is that every byte will be forced into its own distinct
+ /// equivalence class. This is useful for debugging the actual generated
+ /// transitions because it lets one see the transitions defined on actual
+ /// bytes instead of the equivalence classes.
+ pub fn byte_classes(self, yes: bool) -> Config {
+     // Override only this knob; every other setting carries over as-is.
+     Config { byte_classes: Some(yes), ..self }
+ }
+
+ /// Heuristically enable Unicode word boundaries.
+ ///
+ /// When set, this will attempt to implement Unicode word boundaries as if
+ /// they were ASCII word boundaries. This only works when the search input
+ /// is ASCII only. If a non-ASCII byte is observed while searching, then a
+ /// [`MatchError::quit`](crate::MatchError::quit) error is returned.
+ ///
+ /// A possible alternative to enabling this option is to simply use an
+ /// ASCII word boundary, e.g., via `(?-u:\b)`. The main reason to use this
+ /// option is if you absolutely need Unicode support. This option lets one
+ /// use a fast search implementation (a DFA) for some potentially very
+ /// common cases, while providing the option to fall back to some other
+ /// regex engine to handle the general case when an error is returned.
+ ///
+ /// If the pattern provided has no Unicode word boundary in it, then this
+ /// option has no effect. (That is, quitting on a non-ASCII byte only
+ /// occurs when this option is enabled _and_ a Unicode word boundary is
+ /// present in the pattern.)
+ ///
+ /// This is almost equivalent to setting all non-ASCII bytes to be quit
+ /// bytes. The only difference is that this will cause non-ASCII bytes to
+ /// be quit bytes _only_ when a Unicode word boundary is present in the
+ /// pattern.
+ ///
+ /// When enabling this option, callers _must_ be prepared to handle
+ /// a [`MatchError`](crate::MatchError) error during search.
+ /// When using a [`Regex`](crate::dfa::regex::Regex), this corresponds
+ /// to using the `try_` suite of methods. Alternatively, if
+ /// callers can guarantee that their input is ASCII only, then a
+ /// [`MatchError::quit`](crate::MatchError::quit) error will never be
+ /// returned while searching.
+ ///
+ /// This is disabled by default.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to heuristically enable Unicode word boundaries
+ /// in a pattern. It also shows what happens when a search comes across a
+ /// non-ASCII byte.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, dense},
+ /// HalfMatch, Input, MatchError,
+ /// };
+ ///
+ /// let dfa = dense::Builder::new()
+ /// .configure(dense::Config::new().unicode_word_boundary(true))
+ /// .build(r"\b[0-9]+\b")?;
+ ///
+ /// // The match occurs before the search ever observes the snowman
+ /// // character, so no error occurs.
+ /// let haystack = "foo 123  ☃".as_bytes();
+ /// let expected = Some(HalfMatch::must(0, 7));
+ /// let got = dfa.try_search_fwd(&Input::new(haystack))?;
+ /// assert_eq!(expected, got);
+ ///
+ /// // Notice that this search fails, even though the snowman character
+ /// // occurs after the ending match offset. This is because search
+ /// // routines read one byte past the end of the search to account for
+ /// // look-around, and indeed, this is required here to determine whether
+ /// // the trailing \b matches.
+ /// let haystack = "foo 123 ☃".as_bytes();
+ /// let expected = MatchError::quit(0xE2, 8);
+ /// let got = dfa.try_search_fwd(&Input::new(haystack));
+ /// assert_eq!(Err(expected), got);
+ ///
+ /// // Another example is executing a search where the span of the haystack
+ /// // we specify is all ASCII, but there is non-ASCII just before it. This
+ /// // correctly also reports an error.
+ /// let input = Input::new("β123").range(2..);
+ /// let expected = MatchError::quit(0xB2, 1);
+ /// let got = dfa.try_search_fwd(&input);
+ /// assert_eq!(Err(expected), got);
+ ///
+ /// // And similarly for the trailing word boundary.
+ /// let input = Input::new("123β").range(..3);
+ /// let expected = MatchError::quit(0xCE, 3);
+ /// let got = dfa.try_search_fwd(&input);
+ /// assert_eq!(Err(expected), got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn unicode_word_boundary(self, yes: bool) -> Config {
+     // This is deliberately its own option rather than a shortcut that adds
+     // quit bytes right here: quit bytes should not be set for every regex,
+     // only for those that actually contain a Unicode word boundary.
+     Config { unicode_word_boundary: Some(yes), ..self }
+ }
+
+ /// Add a "quit" byte to the DFA.
+ ///
+ /// When a quit byte is seen during search time, then search will return
+ /// a [`MatchError::quit`](crate::MatchError::quit) error indicating the
+ /// offset at which the search stopped.
+ ///
+ /// A quit byte will always overrule any other aspects of a regex. For
+ /// example, if the `x` byte is added as a quit byte and the regex `\w` is
+ /// used, then observing `x` will cause the search to quit immediately
+ /// despite the fact that `x` is in the `\w` class.
+ ///
+ /// This mechanism is primarily useful for heuristically enabling certain
+ /// features like Unicode word boundaries in a DFA. Namely, if the input
+ /// to search is ASCII, then a Unicode word boundary can be implemented
+ /// via an ASCII word boundary with no change in semantics. Thus, a DFA
+ /// can attempt to match a Unicode word boundary but give up as soon as it
+ /// observes a non-ASCII byte. Indeed, if callers set all non-ASCII bytes
+ /// to be quit bytes, then Unicode word boundaries will be permitted when
+ /// building DFAs. Of course, callers should enable
+ /// [`Config::unicode_word_boundary`] if they want this behavior instead.
+ /// (The advantage being that non-ASCII quit bytes will only be added if a
+ /// Unicode word boundary is in the pattern.)
+ ///
+ /// When enabling this option, callers _must_ be prepared to handle a
+ /// [`MatchError`](crate::MatchError) error during search. When using a
+ /// [`Regex`](crate::dfa::regex::Regex), this corresponds to using the
+ /// `try_` suite of methods.
+ ///
+ /// By default, there are no quit bytes set.
+ ///
+ /// # Panics
+ ///
+ /// This panics if heuristic Unicode word boundaries are enabled and any
+ /// non-ASCII byte is removed from the set of quit bytes. Namely, enabling
+ /// Unicode word boundaries requires setting every non-ASCII byte to a quit
+ /// byte. So if the caller attempts to undo any of that, then this will
+ /// panic.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to cause a search to terminate if it sees a
+ /// `\n` byte. This could be useful if, for example, you wanted to prevent
+ /// a user supplied pattern from matching across a line boundary.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{dfa::{Automaton, dense}, Input, MatchError};
+ ///
+ /// let dfa = dense::Builder::new()
+ /// .configure(dense::Config::new().quit(b'\n', true))
+ /// .build(r"foo\p{any}+bar")?;
+ ///
+ /// let haystack = "foo\nbar".as_bytes();
+ /// // Normally this would produce a match, since \p{any} contains '\n'.
+ /// // But since we instructed the automaton to enter a quit state if a
+ /// // '\n' is observed, this produces a match error instead.
+ /// let expected = MatchError::quit(b'\n', 3);
+ /// let got = dfa.try_search_fwd(&Input::new(haystack)).unwrap_err();
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn quit(mut self, byte: u8, yes: bool) -> Config {
+     // Heuristic Unicode word boundary support requires every non-ASCII
+     // byte to be a quit byte, so refuse to undo that while it is enabled.
+     let removing_non_ascii = !yes && !byte.is_ascii();
+     if removing_non_ascii && self.get_unicode_word_boundary() {
+         panic!(
+             "cannot set non-ASCII byte to be non-quit when \
+              Unicode word boundaries are enabled"
+         );
+     }
+     // Create the set lazily, the first time any quit byte is configured.
+     let quitset = self.quitset.get_or_insert_with(ByteSet::empty);
+     if yes {
+         quitset.add(byte);
+     } else {
+         quitset.remove(byte);
+     }
+     self
+ }
+
+ /// Enable specializing start states in the DFA.
+ ///
+ /// When start states are specialized, an implementor of a search routine
+ /// using a DFA can tell when the search has entered a starting state.
+ /// When start states aren't specialized, then it is impossible to know
+ /// whether the search has entered a start state.
+ ///
+ /// Ideally, this option wouldn't need to exist and we could always
+ /// specialize start states. The problem is that start states can be quite
+ /// active. This in turn means that an efficient search routine is likely
+ /// to ping-pong between a heavily optimized hot loop that handles most
+ /// states and to a less optimized specialized handling of start states.
+ /// This causes branches to get heavily mispredicted and overall can
+ /// materially decrease throughput. Therefore, specializing start states
+ /// should only be enabled when it is needed.
+ ///
+ /// Knowing whether a search is in a start state is typically useful when a
+ /// prefilter is active for the search. A prefilter is typically only run
+ /// when in a start state and a prefilter can greatly accelerate a search.
+ /// Therefore, the possible cost of specializing start states is worth it
+ /// in this case. Otherwise, if you have no prefilter, there is likely no
+ /// reason to specialize start states.
+ ///
+ /// This is disabled by default, but note that it is automatically
+ /// enabled (or disabled) if [`Config::prefilter`] is set. Namely, unless
+ /// `specialize_start_states` has already been set, [`Config::prefilter`]
+ /// will automatically enable or disable it based on whether a prefilter
+ /// is present or not, respectively. This is done because a prefilter's
+ /// effectiveness is rooted in being executed whenever the DFA is in a
+ /// start state, and that's only possible to do when they are specialized.
+ ///
+ /// Note that it is plausibly reasonable to _disable_ this option
+ /// explicitly while _enabling_ a prefilter. In that case, a prefilter
+ /// will still be run at the beginning of a search, but never again. This
+ /// in theory could strike a good balance if you're in a situation where a
+ /// prefilter is likely to produce many false positive candidates.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to enable start state specialization and then
+ /// shows how to check whether a state is a start state or not.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, Input};
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().specialize_start_states(true))
+ /// .build(r"[a-z]+")?;
+ ///
+ /// let haystack = "123 foobar 4567".as_bytes();
+ /// let sid = dfa.start_state_forward(&Input::new(haystack))?;
+ /// // The ID returned by 'start_state_forward' will always be tagged as
+ /// // a start state when start state specialization is enabled.
+ /// assert!(dfa.is_special_state(sid));
+ /// assert!(dfa.is_start_state(sid));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// Compare the above with the default DFA configuration where start states
+ /// are _not_ specialized. In this case, the start state is not tagged at
+ /// all:
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, Input};
+ ///
+ /// let dfa = DFA::new(r"[a-z]+")?;
+ ///
+ /// let haystack = "123 foobar 4567";
+ /// let sid = dfa.start_state_forward(&Input::new(haystack))?;
+ /// // Start states are not special in the default configuration!
+ /// assert!(!dfa.is_special_state(sid));
+ /// assert!(!dfa.is_start_state(sid));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn specialize_start_states(self, yes: bool) -> Config {
+     // Storing `Some(..)` marks the option as explicitly chosen by the
+     // caller, as opposed to left at its default.
+     Config { specialize_start_states: Some(yes), ..self }
+ }
+
+ /// Set a size limit on the total heap used by a DFA.
+ ///
+ /// This size limit is expressed in bytes and is applied during
+ /// determinization of an NFA into a DFA. If the DFA's heap usage, and only
+ /// the DFA, exceeds this configured limit, then determinization is stopped
+ /// and an error is returned.
+ ///
+ /// This limit does not apply to auxiliary storage used during
+ /// determinization that isn't part of the generated DFA.
+ ///
+ /// This limit is only applied during determinization. Currently, there is
+ /// no way to postpone this check to after minimization if minimization
+ /// was enabled.
+ ///
+ /// The total limit on heap used during determinization is the sum of the
+ /// DFA and determinization size limits.
+ ///
+ /// The default is no limit.
+ ///
+ /// # Example
+ ///
+ /// This example shows a DFA that fails to build because of a configured
+ /// size limit. This particular example also serves as a cautionary tale
+ /// demonstrating just how big DFAs with large Unicode character classes
+ /// can get.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{dfa::{dense, Automaton}, Input};
+ ///
+ /// // 6MB isn't enough!
+ /// dense::Builder::new()
+ /// .configure(dense::Config::new().dfa_size_limit(Some(6_000_000)))
+ /// .build(r"\w{20}")
+ /// .unwrap_err();
+ ///
+ /// // ... but 7MB probably is!
+ /// // (Note that DFA sizes aren't necessarily stable between releases.)
+ /// let dfa = dense::Builder::new()
+ /// .configure(dense::Config::new().dfa_size_limit(Some(7_000_000)))
+ /// .build(r"\w{20}")?;
+ /// let haystack = "A".repeat(20).into_bytes();
+ /// assert!(dfa.try_search_fwd(&Input::new(&haystack))?.is_some());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// While one needs a little more than 6MB to represent `\w{20}`, it
+ /// turns out that you only need a little more than 6KB to represent
+ /// `(?-u:\w{20})`. So only use Unicode if you need it!
+ ///
+ /// As with [`Config::determinize_size_limit`], the size of a DFA is
+ /// influenced by other factors, such as what start state configurations
+ /// to support. For example, if you only need unanchored searches and not
+ /// anchored searches, then configuring the DFA to only support unanchored
+ /// searches can reduce its size. By default, DFAs support both unanchored
+ /// and anchored searches.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{dfa::{dense, Automaton, StartKind}, Input};
+ ///
+ /// // 3MB isn't enough!
+ /// dense::Builder::new()
+ /// .configure(dense::Config::new()
+ /// .dfa_size_limit(Some(3_000_000))
+ /// .start_kind(StartKind::Unanchored)
+ /// )
+ /// .build(r"\w{20}")
+ /// .unwrap_err();
+ ///
+ /// // ... but 4MB probably is!
+ /// // (Note that DFA sizes aren't necessarily stable between releases.)
+ /// let dfa = dense::Builder::new()
+ /// .configure(dense::Config::new()
+ /// .dfa_size_limit(Some(4_000_000))
+ /// .start_kind(StartKind::Unanchored)
+ /// )
+ /// .build(r"\w{20}")?;
+ /// let haystack = "A".repeat(20).into_bytes();
+ /// assert!(dfa.try_search_fwd(&Input::new(&haystack))?.is_some());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn dfa_size_limit(self, bytes: Option<usize>) -> Config {
+     // The outer `Some` records that the option was set explicitly, even
+     // when `bytes` itself is `None` (an explicit "no limit").
+     Config { dfa_size_limit: Some(bytes), ..self }
+ }
+
+ /// Set a size limit on the total heap used by determinization.
+ ///
+ /// This size limit is expressed in bytes and is applied during
+ /// determinization of an NFA into a DFA. If the heap used for auxiliary
+ /// storage during determinization (memory that is not in the DFA but
+ /// necessary for building the DFA) exceeds this configured limit, then
+ /// determinization is stopped and an error is returned.
+ ///
+ /// This limit does not apply to heap used by the DFA itself.
+ ///
+ /// The total limit on heap used during determinization is the sum of the
+ /// DFA and determinization size limits.
+ ///
+ /// The default is no limit.
+ ///
+ /// # Example
+ ///
+ /// This example shows a DFA that fails to build because of a
+ /// configured size limit on the amount of heap space used by
+ /// determinization. This particular example complements the example for
+ /// [`Config::dfa_size_limit`] by demonstrating that not only does Unicode
+ /// potentially make DFAs themselves big, but it also results in more
+ /// auxiliary storage during determinization. (Although, auxiliary storage
+ /// is still not as much as the DFA itself.)
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039
+ /// use regex_automata::{dfa::{dense, Automaton}, Input};
+ ///
+ /// // 600KB isn't enough!
+ /// dense::Builder::new()
+ /// .configure(dense::Config::new()
+ /// .determinize_size_limit(Some(600_000))
+ /// )
+ /// .build(r"\w{20}")
+ /// .unwrap_err();
+ ///
+ /// // ... but 700KB probably is!
+ /// // (Note that auxiliary storage sizes aren't necessarily stable between
+ /// // releases.)
+ /// let dfa = dense::Builder::new()
+ /// .configure(dense::Config::new()
+ /// .determinize_size_limit(Some(700_000))
+ /// )
+ /// .build(r"\w{20}")?;
+ /// let haystack = "A".repeat(20).into_bytes();
+ /// assert!(dfa.try_search_fwd(&Input::new(&haystack))?.is_some());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// Note that some parts of the configuration on a DFA can have a
+ /// big impact on how big the DFA is, and thus, how much memory is
+ /// used. For example, the default setting for [`Config::start_kind`] is
+ /// [`StartKind::Both`]. But if you only need an anchored search, for
+ /// example, then it can be much cheaper to build a DFA that only supports
+ /// anchored searches. (Running an unanchored search with it would panic.)
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039
+ /// use regex_automata::{
+ /// dfa::{dense, Automaton, StartKind},
+ /// Anchored, Input,
+ /// };
+ ///
+ /// // 200KB isn't enough!
+ /// dense::Builder::new()
+ /// .configure(dense::Config::new()
+ /// .determinize_size_limit(Some(200_000))
+ /// .start_kind(StartKind::Anchored)
+ /// )
+ /// .build(r"\w{20}")
+ /// .unwrap_err();
+ ///
+ /// // ... but 300KB probably is!
+ /// // (Note that auxiliary storage sizes aren't necessarily stable between
+ /// // releases.)
+ /// let dfa = dense::Builder::new()
+ /// .configure(dense::Config::new()
+ /// .determinize_size_limit(Some(300_000))
+ /// .start_kind(StartKind::Anchored)
+ /// )
+ /// .build(r"\w{20}")?;
+ /// let haystack = "A".repeat(20).into_bytes();
+ /// let input = Input::new(&haystack).anchored(Anchored::Yes);
+ /// assert!(dfa.try_search_fwd(&input)?.is_some());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn determinize_size_limit(self, bytes: Option<usize>) -> Config {
+     // The outer `Some` records that the option was set explicitly, even
+     // when `bytes` itself is `None` (an explicit "no limit").
+     Config { determinize_size_limit: Some(bytes), ..self }
+ }
+
+ /// Reports whether simple state acceleration is enabled in this
+ /// configuration. If the option was never set, this returns `true`.
+ pub fn get_accelerate(&self) -> bool {
+     match self.accelerate {
+         Some(yes) => yes,
+         None => true,
+     }
+ }
+
+ /// Borrows the prefilter attached to this configuration. `None` is
+ /// returned both when the option was never set and when it was explicitly
+ /// set to "no prefilter".
+ pub fn get_prefilter(&self) -> Option<&Prefilter> {
+     self.pre.as_ref().and_then(Option::as_ref)
+ }
+
+ /// Reports whether this configuration asks for the expensive DFA
+ /// minimization step. If the option was never set, this returns `false`.
+ pub fn get_minimize(&self) -> bool {
+     match self.minimize {
+         Some(yes) => yes,
+         None => false,
+     }
+ }
+
+ /// Returns the match semantics chosen in this configuration, falling back
+ /// to `MatchKind::LeftmostFirst` when none was set.
+ pub fn get_match_kind(&self) -> MatchKind {
+     match self.match_kind {
+         Some(kind) => kind,
+         None => MatchKind::LeftmostFirst,
+     }
+ }
+
+ /// Returns the starting state configuration for a DFA, falling back to
+ /// `StartKind::Both` when none was set.
+ pub fn get_starts(&self) -> StartKind {
+     match self.start_kind {
+         Some(kind) => kind,
+         None => StartKind::Both,
+     }
+ }
+
+ /// Reports whether this configuration enables anchored starting states
+ /// for every pattern in the DFA. If the option was never set, this
+ /// returns `false`.
+ pub fn get_starts_for_each_pattern(&self) -> bool {
+     match self.starts_for_each_pattern {
+         Some(yes) => yes,
+         None => false,
+     }
+ }
+
+ /// Reports whether byte classes are enabled in this configuration. This
+ /// is mostly a debugging aid, since turning byte classes off brings no
+ /// speed benefit. If the option was never set, this returns `true`.
+ pub fn get_byte_classes(&self) -> bool {
+     match self.byte_classes {
+         Some(yes) => yes,
+         None => true,
+     }
+ }
+
+ /// Reports whether heuristic Unicode word boundary support is enabled in
+ /// this configuration. When it is, a search may return an error. If the
+ /// option was never set, this returns `false`.
+ pub fn get_unicode_word_boundary(&self) -> bool {
+     match self.unicode_word_boundary {
+         Some(yes) => yes,
+         None => false,
+     }
+ }
+
+ /// Reports whether this configuration tells the DFA to enter a quit state
+ /// as soon as `byte` is seen during a search. Once at least one byte is
+ /// configured this way, a search may return an error.
+ pub fn get_quit(&self, byte: u8) -> bool {
+     match self.quitset {
+         Some(set) => set.contains(byte),
+         // No quit byte was ever configured.
+         None => false,
+     }
+ }
+
+ /// Reports whether this configuration tells the DFA to "specialize" its
+ /// start states. When enabled, start states are marked as "special" so
+ /// that a search routine can notice when it is in one and apply some
+ /// optimization (such as running a prefilter). If the option was never
+ /// set, this returns `false`.
+ pub fn get_specialize_start_states(&self) -> bool {
+     match self.specialize_start_states {
+         Some(yes) => yes,
+         None => false,
+     }
+ }
+
+ /// Returns the DFA size limit of this configuration, if one was set.
+ /// The limit is the total number of heap bytes a DFA is permitted to use.
+ /// When the DFA exceeds it during construction, construction stops and
+ /// an error is returned.
+ pub fn get_dfa_size_limit(&self) -> Option<usize> {
+     // "Never configured" and "explicitly no limit" both yield `None`.
+     self.dfa_size_limit.flatten()
+ }
+
+ /// Returns the determinization size limit of this configuration, if one
+ /// was set. The limit is the total number of heap bytes determinization
+ /// is permitted to use. When determinization exceeds it during
+ /// construction, construction stops and an error is returned.
+ ///
+ /// Unlike the DFA size limit, this only covers the auxiliary storage
+ /// used during determinization, which is freed once determinization is
+ /// complete.
+ ///
+ /// The limit on the total heap memory used is the sum of the DFA and
+ /// determinization size limits.
+ pub fn get_determinize_size_limit(&self) -> Option<usize> {
+     // "Never configured" and "explicitly no limit" both yield `None`.
+     self.determinize_size_limit.flatten()
+ }
+
+ /// Layer the options of `o` on top of this configuration. Any option set
+ /// in `o` wins. An option left unset in `o` falls back to the value in
+ /// `self`, and stays unset if `self` does not set it either.
+ pub(crate) fn overwrite(&self, o: Config) -> Config {
+     // Destructure exhaustively so that a newly added field cannot be
+     // forgotten here without a compile error.
+     let Config {
+         accelerate,
+         pre,
+         minimize,
+         match_kind,
+         start_kind,
+         starts_for_each_pattern,
+         byte_classes,
+         unicode_word_boundary,
+         quitset,
+         specialize_start_states,
+         dfa_size_limit,
+         determinize_size_limit,
+     } = o;
+     Config {
+         accelerate: accelerate.or(self.accelerate),
+         // The prefilter is only cloned when `o` did not provide one.
+         pre: pre.or_else(|| self.pre.clone()),
+         minimize: minimize.or(self.minimize),
+         match_kind: match_kind.or(self.match_kind),
+         start_kind: start_kind.or(self.start_kind),
+         starts_for_each_pattern: starts_for_each_pattern
+             .or(self.starts_for_each_pattern),
+         byte_classes: byte_classes.or(self.byte_classes),
+         unicode_word_boundary: unicode_word_boundary
+             .or(self.unicode_word_boundary),
+         quitset: quitset.or(self.quitset),
+         specialize_start_states: specialize_start_states
+             .or(self.specialize_start_states),
+         dfa_size_limit: dfa_size_limit.or(self.dfa_size_limit),
+         determinize_size_limit: determinize_size_limit
+             .or(self.determinize_size_limit),
+     }
+ }
+}
+
+/// A builder for constructing a deterministic finite automaton from regular
+/// expressions.
+///
+/// This builder provides two main things:
+///
+/// 1. It provides a few different `build` routines for actually constructing
+/// a DFA from different kinds of inputs. The most convenient is
+/// [`Builder::build`], which builds a DFA directly from a pattern string. The
+/// most flexible is [`Builder::build_from_nfa`], which builds a DFA straight
+/// from an NFA.
+/// 2. The builder permits configuring a number of things.
+/// [`Builder::configure`] is used with [`Config`] to configure aspects of
+/// the DFA and the construction process itself. [`Builder::syntax`] and
+/// [`Builder::thompson`] permit configuring the regex parser and Thompson NFA
+/// construction, respectively. The syntax and thompson configurations only
+/// apply when building from a pattern string.
+///
+/// This builder always constructs a *single* DFA. As such, this builder
+/// can only be used to construct regexes that either detect the presence
+/// of a match or find the end location of a match. A single DFA cannot
+/// produce both the start and end of a match. For that information, use a
+/// [`Regex`](crate::dfa::regex::Regex), which can be similarly configured
+/// using [`regex::Builder`](crate::dfa::regex::Builder). The main reason to
+/// use a DFA directly is if the end location of a match is enough for your use
+/// case. Namely, a `Regex` will construct two DFAs instead of one, since a
+/// second reverse DFA is needed to find the start of a match.
+///
+/// Note that if one wants to build a sparse DFA, you must first build a dense
+/// DFA and convert that to a sparse DFA. There is no way to build a sparse
+/// DFA without first building a dense DFA.
+///
+/// # Example
+///
+ /// This example shows how to build a DFA (with minimization explicitly
+ /// disabled) that completely disables Unicode. That is:
+///
+/// * Things such as `\w`, `.` and `\b` are no longer Unicode-aware. `\w`
+/// and `\b` are ASCII-only while `.` matches any byte except for `\n`
+/// (instead of any UTF-8 encoding of a Unicode scalar value except for
+/// `\n`). Things that are Unicode only, such as `\pL`, are not allowed.
+/// * The pattern itself is permitted to match invalid UTF-8. For example,
+/// things like `[^a]` that match any byte except for `a` are permitted.
+///
+/// ```
+/// use regex_automata::{
+/// dfa::{Automaton, dense},
+/// util::syntax,
+/// HalfMatch, Input,
+/// };
+///
+/// let dfa = dense::Builder::new()
+/// .configure(dense::Config::new().minimize(false))
+/// .syntax(syntax::Config::new().unicode(false).utf8(false))
+/// .build(r"foo[^b]ar.*")?;
+///
+/// let haystack = b"\xFEfoo\xFFar\xE2\x98\xFF\n";
+/// let expected = Some(HalfMatch::must(0, 10));
+/// let got = dfa.try_search_fwd(&Input::new(haystack))?;
+/// assert_eq!(expected, got);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+ #[cfg(feature = "dfa-build")]
+ #[derive(Clone, Debug)]
+ pub struct Builder {
+ /// The dense DFA configuration consulted by the build routines.
+ config: Config,
+ /// The Thompson NFA compiler used to turn pattern strings into an NFA.
+ /// This field only exists when the `syntax` feature is enabled.
+ #[cfg(feature = "syntax")]
+ thompson: thompson::Compiler,
+ }
+
+#[cfg(feature = "dfa-build")]
+impl Builder {
+ /// Construct a dense DFA builder that starts out with the default
+ /// configuration.
+ pub fn new() -> Builder {
+     let config = Config::default();
+     Builder {
+         config,
+         #[cfg(feature = "syntax")]
+         thompson: thompson::Compiler::new(),
+     }
+ }
+
+ /// Compile a single pattern into a DFA.
+ ///
+ /// An error is returned if the pattern could not be parsed or compiled.
+ #[cfg(feature = "syntax")]
+ pub fn build(&self, pattern: &str) -> Result<OwnedDFA, BuildError> {
+     // A single pattern is just the one-element case of `build_many`.
+     let patterns = [pattern];
+     self.build_many(&patterns)
+ }
+
+ /// Compile several patterns into a single DFA.
+ ///
+ /// The pattern ID reported with a match is the index of the matching
+ /// pattern within the given slice.
+ #[cfg(feature = "syntax")]
+ pub fn build_many<P: AsRef<str>>(
+     &self,
+     patterns: &[P],
+ ) -> Result<OwnedDFA, BuildError> {
+     // Work on a copy of the compiler so the builder itself is untouched.
+     let mut compiler = self.thompson.clone();
+     // DFAs do not support captures, so it is always fine to forcefully
+     // disable them here.
+     let no_captures = thompson::Config::new()
+         .which_captures(thompson::WhichCaptures::None);
+     compiler.configure(no_captures);
+     let nfa = compiler.build_many(patterns).map_err(BuildError::nfa)?;
+     self.build_from_nfa(&nfa)
+ }
+
+ /// Build a DFA from the given NFA.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to build a DFA if you already have an NFA in
+ /// hand.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, dense},
+ /// nfa::thompson::NFA,
+ /// HalfMatch, Input,
+ /// };
+ ///
+ /// let haystack = "foo123bar".as_bytes();
+ ///
+ /// // This shows how to set non-default options for building an NFA.
+ /// let nfa = NFA::compiler()
+ /// .configure(NFA::config().shrink(true))
+ /// .build(r"[0-9]+")?;
+ /// let dfa = dense::Builder::new().build_from_nfa(&nfa)?;
+ /// let expected = Some(HalfMatch::must(0, 6));
+ /// let got = dfa.try_search_fwd(&Input::new(haystack))?;
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn build_from_nfa(
+     &self,
+     nfa: &thompson::NFA,
+ ) -> Result<OwnedDFA, BuildError> {
+     let mut quitset = self.config.quitset.unwrap_or(ByteSet::empty());
+     // Heuristic Unicode word boundary support: every non-ASCII byte
+     // becomes a quit byte, but only when the option is enabled *and* the
+     // NFA actually contains a Unicode word boundary.
+     if self.config.get_unicode_word_boundary()
+         && nfa.look_set_any().contains_word_unicode()
+     {
+         for b in 0x80..=0xFF {
+             quitset.add(b);
+         }
+     }
+     let classes = if !self.config.get_byte_classes() {
+         // DFAs will always use the equivalence class map, but disabling
+         // the byte classes option is useful for debugging. Namely, this
+         // will cause all transitions to be defined over their actual
+         // bytes instead of an opaque equivalence class identifier. The
+         // former is much easier to grok as a human.
+         ByteClasses::singletons()
+     } else {
+         let mut set = nfa.byte_class_set().clone();
+         // It is important to distinguish any "quit" bytes from all other
+         // bytes. Otherwise, a non-quit byte may end up in the same class
+         // as a quit byte, and thus cause the DFA to stop when it
+         // shouldn't.
+         //
+         // Test case:
+         //
+         //   regex-cli find hybrid regex -w @conn.json.1000x.log \
+         //     '^#' '\b10\.55\.182\.100\b'
+         if !quitset.is_empty() {
+             set.add_set(&quitset);
+         }
+         set.byte_classes()
+     };
+
+     let mut dfa = DFA::initial(
+         classes,
+         nfa.pattern_len(),
+         self.config.get_starts(),
+         nfa.look_matcher(),
+         self.config.get_starts_for_each_pattern(),
+         self.config.get_prefilter().cloned(),
+         quitset,
+         Flags::from_nfa(&nfa),
+     )?;
+     determinize::Config::new()
+         .match_kind(self.config.get_match_kind())
+         .quit(quitset)
+         .dfa_size_limit(self.config.get_dfa_size_limit())
+         .determinize_size_limit(self.config.get_determinize_size_limit())
+         .run(nfa, &mut dfa)?;
+     if self.config.get_minimize() {
+         dfa.minimize();
+     }
+     if self.config.get_accelerate() {
+         dfa.accelerate();
+     }
+     // The state shuffling done before this point always assumes that start
+     // states should be marked as "special," even though it isn't the
+     // default configuration. State shuffling is complex enough as it is,
+     // so it's simpler to just "fix" our special state ID ranges to not
+     // include starting states after-the-fact.
+     if !self.config.get_specialize_start_states() {
+         dfa.special.set_no_special_start_states();
+     }
+     // Look for and set the universal starting states.
+     dfa.set_universal_starts();
+     Ok(dfa)
+ }
+
+ /// Merge the given dense DFA configuration into this builder. Options set
+ /// in `config` take precedence; options it leaves unset keep whatever
+ /// value the builder already had.
+ pub fn configure(&mut self, config: Config) -> &mut Builder {
+     let merged = self.config.overwrite(config);
+     self.config = merged;
+     self
+ }
+
+ /// Replace the syntax configuration of this builder with the given
+ /// [`syntax::Config`](crate::util::syntax::Config).
+ ///
+ /// The syntax configuration controls things like case insensitivity,
+ /// Unicode and multi line mode.
+ ///
+ /// It is only consulted when a DFA is constructed directly from a
+ /// pattern.
+ #[cfg(feature = "syntax")]
+ pub fn syntax(
+     &mut self,
+     config: crate::util::syntax::Config,
+ ) -> &mut Builder {
+     // The syntax settings live on the underlying Thompson compiler.
+     let compiler = &mut self.thompson;
+     compiler.syntax(config);
+     self
+ }
+
+ /// Apply the given
+ /// [`nfa::thompson::Config`](crate::nfa::thompson::Config) to the
+ /// Thompson NFA compiler of this builder.
+ ///
+ /// This controls things like whether the DFA should match the regex in
+ /// reverse, or whether additional time should be spent shrinking the
+ /// size of the NFA.
+ ///
+ /// It is only consulted when a DFA is constructed directly from a
+ /// pattern.
+ #[cfg(feature = "syntax")]
+ pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
+     let compiler = &mut self.thompson;
+     compiler.configure(config);
+     self
+ }
+}
+
+ #[cfg(feature = "dfa-build")]
+ impl Default for Builder {
+     /// The default builder is exactly what `Builder::new` returns.
+     fn default() -> Builder {
+         Self::new()
+     }
+ }
+
+ /// A convenience alias for an owned DFA. We use this particular instantiation
+ /// a lot in this crate, so it's worth giving it a name. This instantiation
+ /// is commonly used for mutable APIs on the DFA while building it. The main
+ /// reason for making DFAs generic is no_std support, and more generally,
+ /// making it possible to load a DFA from an arbitrary slice of bytes.
+ ///
+ /// The backing storage here is a `Vec<u32>` from the `alloc` crate, which is
+ /// why this alias is only available when the `alloc` feature is enabled.
+ #[cfg(feature = "alloc")]
+ pub(crate) type OwnedDFA = DFA<alloc::vec::Vec<u32>>;
+
+/// A dense table-based deterministic finite automaton (DFA).
+///
+/// All dense DFAs have one or more start states, zero or more match states
+/// and a transition table that maps the current state and the current byte
+/// of input to the next state. A DFA can use this information to implement
+/// fast searching. In particular, the use of a dense DFA generally makes the
+/// trade off that match speed is the most valuable characteristic, even if
+/// building the DFA may take significant time *and* space. (More concretely,
+/// building a DFA takes time and space that is exponential in the size of the
+/// pattern in the worst case.) As such, the processing of every byte of input
+/// is done with a small constant number of operations that does not vary with
+/// the pattern, its size or the size of the alphabet. If your needs don't line
+/// up with this trade off, then a dense DFA may not be an adequate solution to
+/// your problem.
+///
+/// In contrast, a [`sparse::DFA`] makes the opposite
+/// trade off: it uses less space but will execute a variable number of
+/// instructions per byte at match time, which makes it slower for matching.
+/// (Note that space usage is still exponential in the size of the pattern in
+/// the worst case.)
+///
+/// A DFA can be built using the default configuration via the
+/// [`DFA::new`] constructor. Otherwise, one can
+/// configure various aspects via [`dense::Builder`](Builder).
+///
+/// A single DFA fundamentally supports the following operations:
+///
+/// 1. Detection of a match.
+/// 2. Location of the end of a match.
+/// 3. In the case of a DFA with multiple patterns, which pattern matched is
+/// reported as well.
+///
+/// A notable absence from the above list of capabilities is the location of
+/// the *start* of a match. In order to provide both the start and end of
+/// a match, *two* DFAs are required. This functionality is provided by a
+/// [`Regex`](crate::dfa::regex::Regex).
+///
+/// # Type parameters
+///
+/// A `DFA` has one type parameter, `T`: the storage for its transition, start,
+/// match and accelerator tables. `T` is typically a `Vec<u32>` or a `&[u32]`.
+///
+/// # The `Automaton` trait
+///
+/// This type implements the [`Automaton`] trait, which means it can be used
+/// for searching. For example:
+///
+/// ```
+/// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
+///
+/// let dfa = DFA::new("foo[0-9]+")?;
+/// let expected = HalfMatch::must(0, 8);
+/// assert_eq!(Some(expected), dfa.try_search_fwd(&Input::new("foo12345"))?);
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone)]
+pub struct DFA<T> {
+ /// The transition table for this DFA. This includes the transitions
+ /// themselves, along with the stride, number of states and the equivalence
+ /// class mapping.
+ tt: TransitionTable<T>,
+ /// The set of starting state identifiers for this DFA. The starting state
+ /// IDs act as pointers into the transition table. The specific starting
+ /// state chosen for each search is dependent on the context at which the
+ /// search begins.
+ st: StartTable<T>,
+ /// The set of match states and the patterns that match for each
+ /// corresponding match state.
+ ///
+ /// This structure is technically only needed because of support for
+ /// multi-regexes. Namely, multi-regexes require answering not just whether
+ /// a match exists, but _which_ patterns match. So we need to store the
+ /// matching pattern IDs for each match state. We do this even when there
+ /// is only one pattern for the sake of simplicity. In practice, this uses
+ /// up very little space for the case of one pattern.
+ ms: MatchStates<T>,
+ /// Information about which states are "special." Special states are states
+ /// that are dead, quit, matching, starting or accelerated. For more info,
+ /// see the docs for `Special`.
+ special: Special,
+ /// The accelerators for this DFA.
+ ///
+ /// If a state is accelerated, then there exist only a small number of
+ /// bytes that can cause the DFA to leave the state. This permits searching
+ /// to use optimized routines to find those specific bytes instead of using
+ /// the transition table.
+ ///
+ /// All accelerated states exist in a contiguous range in the DFA's
+ /// transition table. See dfa/special.rs for more details on how states are
+ /// arranged.
+ accels: Accels<T>,
+ /// Any prefilter attached to this DFA.
+ ///
+ /// Note that currently prefilters are not serialized. When deserializing
+ /// a DFA from bytes, this is always set to `None`.
+ pre: Option<Prefilter>,
+ /// The set of "quit" bytes for this DFA.
+ ///
+ /// This is only used when computing the start state for a particular
+ /// position in a haystack. Namely, in the case where there is a quit
+ /// byte immediately before the start of the search, this set needs to be
+ /// explicitly consulted. In all other cases, quit bytes are detected by
+ /// the DFA itself, by transitioning all quit bytes to a special "quit
+ /// state."
+ quitset: ByteSet,
+ /// Various flags describing the behavior of this DFA.
+ flags: Flags,
+}
+
+#[cfg(feature = "dfa-build")]
+impl OwnedDFA {
+ /// Parse the given regular expression using a default configuration and
+ /// return the corresponding DFA.
+ ///
+ /// This is equivalent to `Builder::new().build(pattern)`. If you want a
+ /// non-default configuration, then use [`dense::Builder`](Builder) instead.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input};
+ ///
+ /// let dfa = dense::DFA::new("foo[0-9]+bar")?;
+ /// let expected = Some(HalfMatch::must(0, 11));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "syntax")]
+ pub fn new(pattern: &str) -> Result<OwnedDFA, BuildError> {
+ Builder::new().build(pattern)
+ }
+
+ /// Parse the given regular expressions using a default configuration and
+ /// return the corresponding multi-DFA.
+ ///
+ /// This is equivalent to `Builder::new().build_many(patterns)`. If you want a
+ /// non-default configuration, then use [`dense::Builder`](Builder) instead.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input};
+ ///
+ /// let dfa = dense::DFA::new_many(&["[0-9]+", "[a-z]+"])?;
+ /// let expected = Some(HalfMatch::must(1, 3));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "syntax")]
+ pub fn new_many<P: AsRef<str>>(
+ patterns: &[P],
+ ) -> Result<OwnedDFA, BuildError> {
+ Builder::new().build_many(patterns)
+ }
+}
+
+#[cfg(feature = "dfa-build")]
+impl OwnedDFA {
+ /// Create a new DFA that matches every input.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input};
+ ///
+ /// let dfa = dense::DFA::always_match()?;
+ ///
+ /// let expected = Some(HalfMatch::must(0, 0));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new(""))?);
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo"))?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn always_match() -> Result<OwnedDFA, BuildError> {
+ let nfa = thompson::NFA::always_match();
+ Builder::new().build_from_nfa(&nfa)
+ }
+
+ /// Create a new DFA that never matches any input.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense}, Input};
+ ///
+ /// let dfa = dense::DFA::never_match()?;
+ /// assert_eq!(None, dfa.try_search_fwd(&Input::new(""))?);
+ /// assert_eq!(None, dfa.try_search_fwd(&Input::new("foo"))?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn never_match() -> Result<OwnedDFA, BuildError> {
+ let nfa = thompson::NFA::never_match();
+ Builder::new().build_from_nfa(&nfa)
+ }
+
+ /// Create an initial DFA with the given equivalence classes, pattern length,
+ /// start kind, look matcher, prefilter, quit bytes and flags. An initial
+ /// DFA can be further mutated via determinization.
+ fn initial(
+ classes: ByteClasses,
+ pattern_len: usize,
+ starts: StartKind,
+ lookm: &LookMatcher,
+ starts_for_each_pattern: bool,
+ pre: Option<Prefilter>,
+ quitset: ByteSet,
+ flags: Flags,
+ ) -> Result<OwnedDFA, BuildError> {
+ let start_pattern_len =
+ if starts_for_each_pattern { Some(pattern_len) } else { None }; // `None` means no per-pattern start states
+ Ok(DFA {
+ tt: TransitionTable::minimal(classes),
+ st: StartTable::dead(starts, lookm, start_pattern_len)?,
+ ms: MatchStates::empty(pattern_len),
+ special: Special::new(),
+ accels: Accels::empty(),
+ pre,
+ quitset,
+ flags,
+ })
+ }
+}
+
+#[cfg(feature = "dfa-build")]
+impl DFA<&[u32]> {
+ /// Return a new default dense DFA compiler configuration.
+ ///
+ /// This is equivalent to `Config::new()` and exists so that callers need not
+ /// import the [`Config`] type when customizing the construction of a dense DFA.
+ pub fn config() -> Config {
+ Config::new()
+ }
+
+ /// Create a new dense DFA builder with the default configuration.
+ ///
+ /// This is equivalent to `Builder::new()` and exists so that callers need not
+ /// import the [`Builder`] type in common cases.
+ pub fn builder() -> Builder {
+ Builder::new()
+ }
+}
+
+impl<T: AsRef<[u32]>> DFA<T> {
+ /// Cheaply return a borrowed version of this dense DFA. Specifically,
+ /// the DFA returned always uses `&[u32]` as the storage for its tables.
+ pub fn as_ref(&self) -> DFA<&'_ [u32]> {
+ DFA {
+ tt: self.tt.as_ref(),
+ st: self.st.as_ref(),
+ ms: self.ms.as_ref(),
+ special: self.special,
+ accels: self.accels(),
+ pre: self.pre.clone(),
+ quitset: self.quitset,
+ flags: self.flags,
+ }
+ }
+
+ /// Return an owned version of this dense DFA. Specifically, the DFA
+ /// returned always uses `Vec<u32>` for its transition table.
+ ///
+ /// Effectively, this returns a dense DFA whose transition table lives on
+ /// the heap.
+ #[cfg(feature = "alloc")]
+ pub fn to_owned(&self) -> OwnedDFA {
+ DFA {
+ tt: self.tt.to_owned(),
+ st: self.st.to_owned(),
+ ms: self.ms.to_owned(),
+ special: self.special,
+ accels: self.accels().to_owned(),
+ pre: self.pre.clone(),
+ quitset: self.quitset,
+ flags: self.flags,
+ }
+ }
+
+ /// Returns the starting state configuration for this DFA.
+ ///
+ /// The default is [`StartKind::Both`], which means the DFA supports both
+ /// unanchored and anchored searches. However, this can generally lead to
+ /// bigger DFAs. Therefore, a DFA might be compiled with support for just
+ /// unanchored or anchored searches. In that case, running a search with
+ /// an unsupported configuration will panic.
+ pub fn start_kind(&self) -> StartKind {
+ self.st.kind
+ }
+
+ /// Returns the start byte map used for computing the `Start` configuration
+ /// at the beginning of a search.
+ pub(crate) fn start_map(&self) -> &StartByteMap {
+ &self.st.start_map
+ }
+
+ /// Returns true only if this DFA has starting states for each pattern.
+ ///
+ /// When a DFA has starting states for each pattern, then a search with the
+ /// DFA can be configured to only look for anchored matches of a specific
+ /// pattern. Specifically, APIs like [`Automaton::try_search_fwd`] can
+ /// accept a non-None `pattern_id` if and only if this method returns true.
+ /// Otherwise, calling `try_search_fwd` will panic.
+ ///
+ /// Note that if the DFA has no patterns, this always returns false.
+ pub fn starts_for_each_pattern(&self) -> bool {
+ self.st.pattern_len.is_some()
+ }
+
+ /// Returns the equivalence classes that make up the alphabet for this DFA.
+ ///
+ /// Unless [`Config::byte_classes`] was disabled, it is possible that
+ /// multiple distinct bytes are grouped into the same equivalence class
+ /// if it is impossible for them to discriminate between a match and a
+ /// non-match. This has the effect of reducing the overall alphabet size
+ /// and in turn potentially substantially reducing the size of the DFA's
+ /// transition table.
+ ///
+ /// The downside of using equivalence classes like this is that every state
+ /// transition will automatically use this map to convert an arbitrary
+ /// byte to its corresponding equivalence class. In practice this has a
+ /// negligible impact on performance.
+ pub fn byte_classes(&self) -> &ByteClasses {
+ &self.tt.classes
+ }
+
+ /// Returns the total number of elements in the alphabet for this DFA.
+ ///
+ /// That is, this returns the total number of transitions that each state
+ /// in this DFA must have. Typically, a normal byte oriented DFA would
+ /// always have an alphabet size of 256, corresponding to the number of
+ /// unique values in a single byte. However, this implementation has two
+ /// peculiarities that impact the alphabet length:
+ ///
+ /// * Every state has a special "EOI" transition that is only followed
+ /// after the end of some haystack is reached. This EOI transition is
+ /// necessary to account for one byte of look-ahead when implementing
+ /// things like `\b` and `$`.
+ /// * Bytes are grouped into equivalence classes such that no two bytes in
+ /// the same class can distinguish a match from a non-match. For example,
+ /// in the regex `^[a-z]+$`, the ASCII bytes `a-z` could all be in the
+ /// same equivalence class. This leads to a massive space savings.
+ ///
+ /// Note though that the alphabet length does _not_ necessarily equal the
+ /// total stride space taken up by a single DFA state in the transition
+ /// table. Namely, for performance reasons, the stride is always the
+ /// smallest power of two that is greater than or equal to the alphabet
+ /// length. For this reason, [`DFA::stride`] or [`DFA::stride2`] are
+ /// often more useful. The alphabet length is typically useful only for
+ /// informational purposes.
+ pub fn alphabet_len(&self) -> usize {
+ self.tt.alphabet_len()
+ }
+
+ /// Returns the total stride for every state in this DFA, expressed as the
+ /// exponent of a power of 2. The stride is the amount of space each state
+ /// takes up in the transition table, expressed as a number of transitions.
+ /// (Unused transitions map to dead states.)
+ ///
+ /// The stride of a DFA is always equivalent to the smallest power of 2
+ /// that is greater than or equal to the DFA's alphabet length. This
+ /// definition uses extra space, but permits faster translation between
+ /// premultiplied state identifiers and contiguous indices (by using shifts
+ /// instead of relying on integer division).
+ ///
+ /// For example, if the DFA's stride is 16 transitions, then its `stride2`
+ /// is `4` since `2^4 = 16`.
+ ///
+ /// The minimum `stride2` value is `1` (corresponding to a stride of `2`)
+ /// while the maximum `stride2` value is `9` (corresponding to a stride of
+ /// `512`). The maximum is not `8` since the maximum alphabet size is `257`
+ /// when accounting for the special EOI transition. However, an alphabet
+ /// length of that size is exceptionally rare since the alphabet is shrunk
+ /// into equivalence classes.
+ pub fn stride2(&self) -> usize {
+ self.tt.stride2
+ }
+
+ /// Returns the total stride for every state in this DFA. This corresponds
+ /// to the total number of transitions used by each state in this DFA's
+ /// transition table.
+ ///
+ /// Please see [`DFA::stride2`] for more information. In particular, this
+ /// returns the stride as the number of transitions, whereas `stride2`
+ /// returns it as the exponent of a power of 2.
+ pub fn stride(&self) -> usize {
+ self.tt.stride()
+ }
+
+ /// Returns the memory usage, in bytes, of this DFA.
+ ///
+ /// The memory usage is the sum of the bytes used by the transition table,
+ /// start table, match states and accelerators of this DFA.
+ ///
+ /// This does **not** include the stack size used up by this DFA. To
+ /// compute that, use `std::mem::size_of::<dense::DFA<T>>()`.
+ pub fn memory_usage(&self) -> usize {
+ self.tt.memory_usage()
+ + self.st.memory_usage()
+ + self.ms.memory_usage()
+ + self.accels.memory_usage()
+ }
+}
+
+/// Routines for converting a dense DFA to other representations, such as
+/// sparse DFAs or raw bytes suitable for persistent storage.
+impl<T: AsRef<[u32]>> DFA<T> {
+ /// Convert this dense DFA to a sparse DFA.
+ ///
+ /// If a `StateID` is too small to represent all states in the sparse
+ /// DFA, then this returns an error. In most cases, if a dense DFA is
+ /// constructable with `StateID` then a sparse DFA will be as well.
+ /// However, it is not guaranteed.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input};
+ ///
+ /// let dense = dense::DFA::new("foo[0-9]+")?;
+ /// let sparse = dense.to_sparse()?;
+ ///
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, sparse.try_search_fwd(&Input::new("foo12345"))?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "dfa-build")]
+ pub fn to_sparse(&self) -> Result<sparse::DFA<Vec<u8>>, BuildError> {
+ sparse::DFA::from_dense(self)
+ }
+
+ /// Serialize this DFA as raw bytes to a `Vec<u8>` in little endian
+ /// format. Upon success, the `Vec<u8>` and the initial padding length are
+ /// returned.
+ ///
+ /// The written bytes are guaranteed to be deserialized correctly and
+ /// without errors in a semver compatible release of this crate by a
+ /// `DFA`'s deserialization APIs (assuming all other criteria for the
+ /// deserialization APIs have been satisfied):
+ ///
+ /// * [`DFA::from_bytes`]
+ /// * [`DFA::from_bytes_unchecked`]
+ ///
+ /// The padding returned is non-zero if the returned `Vec<u8>` starts at
+ /// an address that does not have the same alignment as `u32`. The padding
+ /// corresponds to the number of leading bytes written to the returned
+ /// `Vec<u8>`.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize and deserialize a DFA:
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
+ ///
+ /// // Compile our original DFA.
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// // N.B. We use native endianness here to make the example work, but
+ /// // using to_bytes_little_endian would work on a little endian target.
+ /// let (buf, _) = original_dfa.to_bytes_native_endian();
+ /// // Even if buf has initial padding, DFA::from_bytes will automatically
+ /// // ignore it.
+ /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf)?.0;
+ ///
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "dfa-build")]
+ pub fn to_bytes_little_endian(&self) -> (Vec<u8>, usize) {
+ self.to_bytes::<wire::LE>()
+ }
+
+ /// Serialize this DFA as raw bytes to a `Vec<u8>` in big endian
+ /// format. Upon success, the `Vec<u8>` and the initial padding length are
+ /// returned.
+ ///
+ /// The written bytes are guaranteed to be deserialized correctly and
+ /// without errors in a semver compatible release of this crate by a
+ /// `DFA`'s deserialization APIs (assuming all other criteria for the
+ /// deserialization APIs have been satisfied):
+ ///
+ /// * [`DFA::from_bytes`]
+ /// * [`DFA::from_bytes_unchecked`]
+ ///
+ /// The padding returned is non-zero if the returned `Vec<u8>` starts at
+ /// an address that does not have the same alignment as `u32`. The padding
+ /// corresponds to the number of leading bytes written to the returned
+ /// `Vec<u8>`.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize and deserialize a DFA:
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
+ ///
+ /// // Compile our original DFA.
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// // N.B. We use native endianness here to make the example work, but
+ /// // using to_bytes_big_endian would work on a big endian target.
+ /// let (buf, _) = original_dfa.to_bytes_native_endian();
+ /// // Even if buf has initial padding, DFA::from_bytes will automatically
+ /// // ignore it.
+ /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf)?.0;
+ ///
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "dfa-build")]
+ pub fn to_bytes_big_endian(&self) -> (Vec<u8>, usize) {
+ self.to_bytes::<wire::BE>()
+ }
+
+ /// Serialize this DFA as raw bytes to a `Vec<u8>` in native endian
+ /// format. Upon success, the `Vec<u8>` and the initial padding length are
+ /// returned.
+ ///
+ /// The written bytes are guaranteed to be deserialized correctly and
+ /// without errors in a semver compatible release of this crate by a
+ /// `DFA`'s deserialization APIs (assuming all other criteria for the
+ /// deserialization APIs have been satisfied):
+ ///
+ /// * [`DFA::from_bytes`]
+ /// * [`DFA::from_bytes_unchecked`]
+ ///
+ /// The padding returned is non-zero if the returned `Vec<u8>` starts at
+ /// an address that does not have the same alignment as `u32`. The padding
+ /// corresponds to the number of leading bytes written to the returned
+ /// `Vec<u8>`.
+ ///
+ /// Generally speaking, native endian format should only be used when
+ /// you know that the target you're compiling the DFA for matches the
+ /// endianness of the target on which you're compiling DFA. For example,
+ /// if serialization and deserialization happen in the same process or on
+ /// the same machine. Otherwise, when serializing a DFA for use in a
+ /// portable environment, you'll almost certainly want to serialize _both_
+ /// a little endian and a big endian version and then load the correct one
+ /// based on the target's configuration.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize and deserialize a DFA:
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
+ ///
+ /// // Compile our original DFA.
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// let (buf, _) = original_dfa.to_bytes_native_endian();
+ /// // Even if buf has initial padding, DFA::from_bytes will automatically
+ /// // ignore it.
+ /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf)?.0;
+ ///
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "dfa-build")]
+ pub fn to_bytes_native_endian(&self) -> (Vec<u8>, usize) {
+ self.to_bytes::<wire::NE>()
+ }
+
+ /// The implementation of the public `to_bytes` serialization methods,
+ /// which is generic over endianness.
+ #[cfg(feature = "dfa-build")]
+ fn to_bytes<E: Endian>(&self) -> (Vec<u8>, usize) {
+ let len = self.write_to_len();
+ let (mut buf, padding) = wire::alloc_aligned_buffer::<u32>(len);
+ // This should always succeed since the only possible serialization
+ // error is providing a buffer that's too small, but we've ensured that
+ // `buf` is big enough here.
+ self.as_ref().write_to::<E>(&mut buf[padding..]).unwrap();
+ (buf, padding)
+ }
+
+ /// Serialize this DFA as raw bytes to the given slice, in little endian
+ /// format. Upon success, the total number of bytes written to `dst` is
+ /// returned.
+ ///
+ /// The written bytes are guaranteed to be deserialized correctly and
+ /// without errors in a semver compatible release of this crate by a
+ /// `DFA`'s deserialization APIs (assuming all other criteria for the
+ /// deserialization APIs have been satisfied):
+ ///
+ /// * [`DFA::from_bytes`]
+ /// * [`DFA::from_bytes_unchecked`]
+ ///
+ /// Note that unlike the various `to_bytes_*` routines, this does not write
+ /// any padding. Callers are responsible for handling alignment correctly.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the given destination slice is not big enough
+ /// to contain the full serialized DFA. If an error occurs, then nothing
+ /// is written to `dst`.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize and deserialize a DFA without
+ /// dynamic memory allocation.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
+ ///
+ /// // Compile our original DFA.
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// // Create a 4KB buffer on the stack to store our serialized DFA. We
+ /// // need to use a special type to force the alignment of our [u8; N]
+ /// // array to be aligned to a 4 byte boundary. Otherwise, deserializing
+ /// // the DFA may fail because of an alignment mismatch.
+ /// #[repr(C)]
+ /// struct Aligned<B: ?Sized> {
+ /// _align: [u32; 0],
+ /// bytes: B,
+ /// }
+ /// let mut buf = Aligned { _align: [], bytes: [0u8; 4 * (1<<10)] };
+ /// // N.B. We use native endianness here to make the example work, but
+ /// // using write_to_little_endian would work on a little endian target.
+ /// let written = original_dfa.write_to_native_endian(&mut buf.bytes)?;
+ /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf.bytes[..written])?.0;
+ ///
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn write_to_little_endian(
+ &self,
+ dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ self.as_ref().write_to::<wire::LE>(dst)
+ }
+
+ /// Serialize this DFA as raw bytes to the given slice, in big endian
+ /// format. Upon success, the total number of bytes written to `dst` is
+ /// returned.
+ ///
+ /// The written bytes are guaranteed to be deserialized correctly and
+ /// without errors in a semver compatible release of this crate by a
+ /// `DFA`'s deserialization APIs (assuming all other criteria for the
+ /// deserialization APIs have been satisfied):
+ ///
+ /// * [`DFA::from_bytes`]
+ /// * [`DFA::from_bytes_unchecked`]
+ ///
+ /// Note that unlike the various `to_bytes_*` routines, this does not write
+ /// any padding. Callers are responsible for handling alignment correctly.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the given destination slice is not big enough
+ /// to contain the full serialized DFA. If an error occurs, then nothing
+ /// is written to `dst`.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize and deserialize a DFA without
+ /// dynamic memory allocation.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
+ ///
+ /// // Compile our original DFA.
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// // Create a 4KB buffer on the stack to store our serialized DFA. We
+ /// // need to use a special type to force the alignment of our [u8; N]
+ /// // array to be aligned to a 4 byte boundary. Otherwise, deserializing
+ /// // the DFA may fail because of an alignment mismatch.
+ /// #[repr(C)]
+ /// struct Aligned<B: ?Sized> {
+ /// _align: [u32; 0],
+ /// bytes: B,
+ /// }
+ /// let mut buf = Aligned { _align: [], bytes: [0u8; 4 * (1<<10)] };
+ /// // N.B. We use native endianness here to make the example work, but
+ /// // using write_to_big_endian would work on a big endian target.
+ /// let written = original_dfa.write_to_native_endian(&mut buf.bytes)?;
+ /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf.bytes[..written])?.0;
+ ///
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn write_to_big_endian(
+ &self,
+ dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ self.as_ref().write_to::<wire::BE>(dst)
+ }
+
+ /// Serialize this DFA as raw bytes to the given slice, in native endian
+ /// format. Upon success, the total number of bytes written to `dst` is
+ /// returned.
+ ///
+ /// The written bytes are guaranteed to be deserialized correctly and
+ /// without errors in a semver compatible release of this crate by a
+ /// `DFA`'s deserialization APIs (assuming all other criteria for the
+ /// deserialization APIs have been satisfied):
+ ///
+ /// * [`DFA::from_bytes`]
+ /// * [`DFA::from_bytes_unchecked`]
+ ///
+ /// Generally speaking, native endian format should only be used when
+ /// you know that the target you're compiling the DFA for matches the
+ /// endianness of the target on which you're compiling DFA. For example,
+ /// if serialization and deserialization happen in the same process or on
+ /// the same machine. Otherwise, when serializing a DFA for use in a
+ /// portable environment, you'll almost certainly want to serialize _both_
+ /// a little endian and a big endian version and then load the correct one
+ /// based on the target's configuration.
+ ///
+ /// Note that unlike the various `to_bytes_*` routines, this does not write
+ /// any padding. Callers are responsible for handling alignment correctly.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the given destination slice is not big enough
+ /// to contain the full serialized DFA. If an error occurs, then nothing
+ /// is written to `dst`.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize and deserialize a DFA without
+ /// dynamic memory allocation.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
+ ///
+ /// // Compile our original DFA.
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// // Create a 4KB buffer on the stack to store our serialized DFA. We
+ /// // need to use a special type to force the alignment of our [u8; N]
+ /// // array to be aligned to a 4 byte boundary. Otherwise, deserializing
+ /// // the DFA may fail because of an alignment mismatch.
+ /// #[repr(C)]
+ /// struct Aligned<B: ?Sized> {
+ /// _align: [u32; 0],
+ /// bytes: B,
+ /// }
+ /// let mut buf = Aligned { _align: [], bytes: [0u8; 4 * (1<<10)] };
+ /// let written = original_dfa.write_to_native_endian(&mut buf.bytes)?;
+ /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf.bytes[..written])?.0;
+ ///
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn write_to_native_endian(
+ &self,
+ dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ self.as_ref().write_to::<wire::NE>(dst)
+ }
+
+ /// Return the total number of bytes required to serialize this DFA.
+ ///
+ /// This is useful for determining the size of the buffer required to pass
+ /// to one of the serialization routines:
+ ///
+ /// * [`DFA::write_to_little_endian`]
+ /// * [`DFA::write_to_big_endian`]
+ /// * [`DFA::write_to_native_endian`]
+ ///
+ /// Passing a buffer smaller than the size returned by this method will
+ /// result in a serialization error. Serialization routines are guaranteed
+ /// to succeed when the buffer is big enough.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to dynamically allocate enough room to serialize
+ /// a DFA.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
+ ///
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// let mut buf = vec![0; original_dfa.write_to_len()];
+ /// // This is guaranteed to succeed, because the only serialization error
+ /// // that can occur is when the provided buffer is too small. But
+ /// // write_to_len guarantees a correct size.
+ /// let written = original_dfa.write_to_native_endian(&mut buf).unwrap();
+ /// // But this is not guaranteed to succeed! In particular,
+ /// // deserialization requires proper alignment for &[u32], but our buffer
+ /// // was allocated as a &[u8] whose required alignment is smaller than
+ /// // &[u32]. However, it's likely to work in practice because of how most
+ /// // allocators work. So if you write code like this, make sure to either
+ /// // handle the error correctly and/or run it under Miri since Miri will
+ /// // likely provoke the error by returning Vec<u8> buffers with alignment
+ /// // less than &[u32].
+ /// let dfa: DFA<&[u32]> = match DFA::from_bytes(&buf[..written]) {
+ /// // As mentioned above, it is legal for an error to be returned
+ /// // here. It is quite difficult to get a Vec<u8> with a guaranteed
+ /// // alignment equivalent to Vec<u32>.
+ /// Err(_) => return Ok(()),
+ /// Ok((dfa, _)) => dfa,
+ /// };
+ ///
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// Note that this example isn't actually guaranteed to work! In
+ /// particular, if `buf` is not aligned to a 4-byte boundary, then the
+ /// `DFA::from_bytes` call will fail. If you need this to work, then you
+ /// either need to deal with adding some initial padding yourself, or use
+ /// one of the `to_bytes` methods, which will do it for you.
+ pub fn write_to_len(&self) -> usize {
+ wire::write_label_len(LABEL)
+ + wire::write_endianness_check_len()
+ + wire::write_version_len()
+ + size_of::<u32>() // unused, intended for future flexibility
+ + self.flags.write_to_len()
+ + self.tt.write_to_len()
+ + self.st.write_to_len()
+ + self.ms.write_to_len()
+ + self.special.write_to_len()
+ + self.accels.write_to_len()
+ + self.quitset.write_to_len()
+ }
+}
+
+impl<'a> DFA<&'a [u32]> {
+ /// Safely deserialize a DFA with a specific state identifier
+ /// representation. Upon success, this returns both the deserialized DFA
+ /// and the number of bytes read from the given slice. Namely, the contents
+ /// of the slice beyond the DFA are not read.
+ ///
+ /// Deserializing a DFA using this routine will never allocate heap memory.
+ /// For safety purposes, the DFA's transition table will be verified such
+ /// that every transition points to a valid state. If this verification is
+ /// too costly, then a [`DFA::from_bytes_unchecked`] API is provided, which
+ /// will always execute in constant time.
+ ///
+ /// The bytes given must be generated by one of the serialization APIs
+ /// of a `DFA` using a semver compatible release of this crate. Those
+ /// include:
+ ///
+ /// * [`DFA::to_bytes_little_endian`]
+ /// * [`DFA::to_bytes_big_endian`]
+ /// * [`DFA::to_bytes_native_endian`]
+ /// * [`DFA::write_to_little_endian`]
+ /// * [`DFA::write_to_big_endian`]
+ /// * [`DFA::write_to_native_endian`]
+ ///
+ /// The `to_bytes` methods allocate and return a `Vec<u8>` for you, along
+ /// with handling alignment correctly. The `write_to` methods do not
+ /// allocate and write to an existing slice (which may be on the stack).
+ /// Since deserialization always uses the native endianness of the target
+ /// platform, the serialization API you use should match the endianness of
+ /// the target platform. (It's often a good idea to generate serialized
+ /// DFAs for both forms of endianness and then load the correct one based
+ /// on endianness.)
+ ///
+ /// # Errors
+ ///
+ /// Generally speaking, it's easier to state the conditions in which an
+ /// error is _not_ returned. All of the following must be true:
+ ///
+ /// * The bytes given must be produced by one of the serialization APIs
+ /// on this DFA, as mentioned above.
+ /// * The endianness of the target platform matches the endianness used to
+ /// serialize the provided DFA.
+ /// * The slice given must have the same alignment as `u32`.
+ ///
+ /// If any of the above are not true, then an error will be returned.
+ ///
+ /// # Panics
+ ///
+ /// This routine will never panic for any input.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize a DFA to raw bytes, deserialize it
+ /// and then use it for searching.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
+ ///
+ /// let initial = DFA::new("foo[0-9]+")?;
+ /// let (bytes, _) = initial.to_bytes_native_endian();
+ /// let dfa: DFA<&[u32]> = DFA::from_bytes(&bytes)?.0;
+ ///
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: dealing with alignment and padding
+ ///
+ /// In the above example, we used the `to_bytes_native_endian` method to
+ /// serialize a DFA, but we ignored part of its return value corresponding
+ /// to padding added to the beginning of the serialized DFA. This is OK
+ /// because deserialization will skip this initial padding. What matters
+ /// is that the address immediately following the padding has an alignment
+ /// that matches `u32`. That is, the following is an equivalent but
+ /// alternative way to write the above example:
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
+ ///
+ /// let initial = DFA::new("foo[0-9]+")?;
+ /// // Serialization returns the number of leading padding bytes added to
+ /// // the returned Vec<u8>.
+ /// let (bytes, pad) = initial.to_bytes_native_endian();
+ /// let dfa: DFA<&[u32]> = DFA::from_bytes(&bytes[pad..])?.0;
+ ///
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// This padding is necessary because Rust's standard library does
+ /// not expose any safe and robust way of creating a `Vec<u8>` with a
+ /// guaranteed alignment other than 1. Now, in practice, the underlying
+ /// allocator is likely to provide a `Vec<u8>` that meets our alignment
+ /// requirements, which means `pad` is zero in practice most of the time.
+ ///
+ /// The purpose of exposing the padding like this is flexibility for the
+ /// caller. For example, if one wants to embed a serialized DFA into a
+ /// compiled program, then it's important to guarantee that it starts at a
+ /// `u32`-aligned address. The simplest way to do this is to discard the
+ /// padding bytes and set it up so that the serialized DFA itself begins at
+ /// a properly aligned address. We can show this in two parts. The first
+ /// part is serializing the DFA to a file:
+ ///
+ /// ```no_run
+ /// use regex_automata::dfa::dense::DFA;
+ ///
+ /// let dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// let (bytes, pad) = dfa.to_bytes_big_endian();
+ /// // Write the contents of the DFA *without* the initial padding.
+ /// std::fs::write("foo.bigendian.dfa", &bytes[pad..])?;
+ ///
+ /// // Do it again, but this time for little endian.
+ /// let (bytes, pad) = dfa.to_bytes_little_endian();
+ /// std::fs::write("foo.littleendian.dfa", &bytes[pad..])?;
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// And now the second part is embedding the DFA into the compiled program
+ /// and deserializing it at runtime on first use. We use conditional
+ /// compilation to choose the correct endianness.
+ ///
+ /// ```no_run
+ /// use regex_automata::{
+ /// dfa::{Automaton, dense::DFA},
+ /// util::{lazy::Lazy, wire::AlignAs},
+ /// HalfMatch, Input,
+ /// };
+ ///
+ /// // This crate provides its own "lazy" type, kind of like
+ /// // lazy_static! or once_cell::sync::Lazy. But it works in no-alloc
+ /// // no-std environments and lets us write this using completely
+ /// // safe code.
+ /// static RE: Lazy<DFA<&'static [u32]>> = Lazy::new(|| {
+ /// # const _: &str = stringify! {
+ /// // This assignment is made possible (implicitly) via the
+ /// // CoerceUnsized trait. This is what guarantees that our
+ /// // bytes are stored in memory on a 4 byte boundary. You
+ /// // *must* do this or something equivalent for correct
+ /// // deserialization.
+ /// static ALIGNED: &AlignAs<[u8], u32> = &AlignAs {
+ /// _align: [],
+ /// #[cfg(target_endian = "big")]
+ /// bytes: *include_bytes!("foo.bigendian.dfa"),
+ /// #[cfg(target_endian = "little")]
+ /// bytes: *include_bytes!("foo.littleendian.dfa"),
+ /// };
+ /// # };
+ /// # static ALIGNED: &AlignAs<[u8], u32> = &AlignAs {
+ /// # _align: [],
+ /// # bytes: [],
+ /// # };
+ ///
+ /// let (dfa, _) = DFA::from_bytes(&ALIGNED.bytes)
+ /// .expect("serialized DFA should be valid");
+ /// dfa
+ /// });
+ ///
+ /// let expected = Ok(Some(HalfMatch::must(0, 8)));
+ /// assert_eq!(expected, RE.try_search_fwd(&Input::new("foo12345")));
+ /// ```
+ ///
+ /// An alternative to [`util::lazy::Lazy`](crate::util::lazy::Lazy)
+ /// is [`lazy_static`](https://crates.io/crates/lazy_static) or
+ /// [`once_cell`](https://crates.io/crates/once_cell), which provide
+ /// stronger guarantees (like the initialization function only being
+ /// executed once). And `once_cell` in particular provides a more
+ /// expressive API. But a `Lazy` value from this crate is likely just fine
+ /// in most circumstances.
+ ///
+ /// Note that regardless of which initialization method you use, you
+ /// will still need to use the [`AlignAs`](crate::util::wire::AlignAs)
+ /// trick above to force correct alignment, but this is safe to do and
+ /// `from_bytes` will return an error if you get it wrong.
+ pub fn from_bytes(
+     slice: &'a [u8],
+ ) -> Result<(DFA<&'a [u32]>, usize), DeserializeError> {
+     // SAFETY: The unchecked constructor skips validation of the transition
+     // table, start table, match states and accelerators. Each of those is
+     // validated immediately below, and the first failure is returned as
+     // an error before the DFA is handed to the caller.
+     let (candidate, consumed) =
+         unsafe { DFA::from_bytes_unchecked(slice) }?;
+     candidate.tt.validate(&candidate.special)?;
+     candidate.st.validate(&candidate.tt)?;
+     candidate.ms.validate(&candidate)?;
+     candidate.accels.validate()?;
+     // N.B. `special` doesn't have a way to do unchecked deserialization,
+     // so it has already been validated by the time we get here.
+     Ok((candidate, consumed))
+ }
+
+ /// Deserialize a DFA with a specific state identifier representation in
+ /// constant time by omitting the verification of the validity of the
+ /// transition table and other data inside the DFA.
+ ///
+ /// This is just like [`DFA::from_bytes`], except it can potentially return
+ /// a DFA that exhibits undefined behavior if its transition table contains
+ /// invalid state identifiers.
+ ///
+ /// This routine is useful if you need to deserialize a DFA cheaply
+ /// and cannot afford the transition table validation performed by
+ /// `from_bytes`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
+ ///
+ /// let initial = DFA::new("foo[0-9]+")?;
+ /// let (bytes, _) = initial.to_bytes_native_endian();
+ /// // SAFETY: This is guaranteed to be safe since the bytes given come
+ /// // directly from a compatible serialization routine.
+ /// let dfa: DFA<&[u32]> = unsafe { DFA::from_bytes_unchecked(&bytes)?.0 };
+ ///
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub unsafe fn from_bytes_unchecked(
+     slice: &'a [u8],
+ ) -> Result<(DFA<&'a [u32]>, usize), DeserializeError> {
+     // `pos` is the running count of bytes consumed from `slice`. It is
+     // also the second element of the returned tuple.
+     let mut pos = wire::skip_initial_padding(slice);
+     wire::check_alignment::<StateID>(&slice[pos..])?;
+     pos += wire::read_label(&slice[pos..], LABEL)?;
+     pos += wire::read_endianness_check(&slice[pos..])?;
+     pos += wire::read_version(&slice[pos..], VERSION)?;
+
+     // A u32 that is currently unused. The read must still succeed, but
+     // its value is ignored.
+     let _reserved = wire::try_read_u32(&slice[pos..], "unused space")?;
+     pos += size_of::<u32>();
+
+     let (flags, len) = Flags::from_bytes(&slice[pos..])?;
+     pos += len;
+
+     let (tt, len) = TransitionTable::from_bytes_unchecked(&slice[pos..])?;
+     pos += len;
+
+     let (st, len) = StartTable::from_bytes_unchecked(&slice[pos..])?;
+     pos += len;
+
+     let (ms, len) = MatchStates::from_bytes_unchecked(&slice[pos..])?;
+     pos += len;
+
+     // The special state ranges are checked against the number of states
+     // in the transition table as soon as they are read.
+     let (special, len) = Special::from_bytes(&slice[pos..])?;
+     pos += len;
+     special.validate_state_len(tt.len(), tt.stride2)?;
+
+     let (accels, len) = Accels::from_bytes_unchecked(&slice[pos..])?;
+     pos += len;
+
+     let (quitset, len) = ByteSet::from_bytes(&slice[pos..])?;
+     pos += len;
+
+     // Prefilters don't support serialization, so they're always absent.
+     let dfa =
+         DFA { tt, st, ms, special, accels, pre: None, quitset, flags };
+     Ok((dfa, pos))
+ }
+
+ /// The implementation of the public `write_to` serialization methods,
+ /// which is generic over endianness.
+ ///
+ /// This is defined only for &[u32] to reduce binary size/compilation time.
+ fn write_to<E: Endian>(
+     &self,
+     mut dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+     // Refuse to write anything unless the entire DFA is going to fit.
+     let required = self.write_to_len();
+     if required > dst.len() {
+         return Err(SerializeError::buffer_too_small("dense DFA"));
+     }
+     dst = &mut dst[..required];
+
+     // `written` is the running count of bytes emitted so far and doubles
+     // as the offset at which the next component is written.
+     let mut written = 0;
+     written += wire::write_label(LABEL, &mut dst[written..])?;
+     written += wire::write_endianness_check::<E>(&mut dst[written..])?;
+     written += wire::write_version::<E>(VERSION, &mut dst[written..])?;
+
+     // Currently unused, intended for future flexibility.
+     E::write_u32(0, &mut dst[written..]);
+     written += size_of::<u32>();
+
+     written += self.flags.write_to::<E>(&mut dst[written..])?;
+     written += self.tt.write_to::<E>(&mut dst[written..])?;
+     written += self.st.write_to::<E>(&mut dst[written..])?;
+     written += self.ms.write_to::<E>(&mut dst[written..])?;
+     written += self.special.write_to::<E>(&mut dst[written..])?;
+     written += self.accels.write_to::<E>(&mut dst[written..])?;
+     written += self.quitset.write_to::<E>(&mut dst[written..])?;
+     Ok(written)
+ }
+}
+
+// The following methods implement mutable routines on the internal
+// representation of a DFA. As such, we must fix the first type parameter to a
+// `Vec<u32>` since a generic `T: AsRef<[u32]>` does not permit mutation. We
+// can get away with this because these methods are internal to the crate and
+// are exclusively used during construction of the DFA.
+#[cfg(feature = "dfa-build")]
+impl OwnedDFA {
+ /// Set the start state for the given `anchored` mode and `start` kind.
+ pub(crate) fn set_start_state(
+ &mut self,
+ anchored: Anchored,
+ start: Start,
+ id: StateID,
+ ) {
+ assert!(self.tt.is_valid(id), "invalid start state");
+ self.st.set_start(anchored, start, id);
+ }
+
+ /// Set the transition out of `from` on `byte` so that it leads to `to`.
+ /// Both the `from` and `to` states must already exist.
+ pub(crate) fn set_transition(
+ &mut self,
+ from: StateID,
+ byte: alphabet::Unit,
+ to: StateID,
+ ) {
+ self.tt.set(from, byte, to);
+ }
+
+ /// Add an empty state (a state where all transitions lead to a dead state)
+ /// and return its identifier. The identifier returned is guaranteed to
+ /// not point to any other existing state.
+ ///
+ /// If adding a state would exceed `StateID::LIMIT`, then this returns an
+ /// error.
+ pub(crate) fn add_empty_state(&mut self) -> Result<StateID, BuildError> {
+ self.tt.add_empty_state()
+ }
+
+ /// Swap the two given states, `id1` and `id2`, in the transition table.
+ ///
+ /// This routine does not do anything to check the correctness of this
+ /// swap. Callers must ensure that other states pointing to `id1` and `id2`
+ /// are updated appropriately.
+ pub(crate) fn swap_states(&mut self, id1: StateID, id2: StateID) {
+ self.tt.swap(id1, id2);
+ }
+
+ /// Rewrite every state identifier stored in this DFA by passing it through
+ /// `map`. This covers every entry of the transition table as well as every
+ /// entry of the start table.
+ pub(crate) fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
+     // Visiting each state and calling 'remap_state' would also work, but
+     // mapping every table entry directly is more straightforward. It might
+     // technically do a little extra work, since the alphabet length is
+     // likely less than the stride. If that turns out to matter, it should
+     // be benchmarked before being changed.
+     self.tt.table_mut().iter_mut().for_each(|slot| *slot = map(*slot));
+     self.st.table_mut().iter_mut().for_each(|slot| *slot = map(*slot));
+ }
+
+ /// Remap the transitions for the state `id` according to the function
+ /// `map`. This applies the given map function to every transition in the
+ /// given state and changes the transition in place to the result of the
+ /// map function for that transition.
+ pub(crate) fn remap_state(
+ &mut self,
+ id: StateID,
+ map: impl Fn(StateID) -> StateID,
+ ) {
+ self.tt.remap(id, map);
+ }
+
+ /// Truncate the states in this DFA's transition table to the given length.
+ ///
+ /// This routine does not do anything to check the correctness of this
+ /// truncation. Callers must ensure that other states pointing to truncated
+ /// states are updated appropriately.
+ pub(crate) fn truncate_states(&mut self, len: usize) {
+ self.tt.truncate(len);
+ }
+
+ /// Minimize this DFA in place using Hopcroft's algorithm (see `Minimizer`).
+ pub(crate) fn minimize(&mut self) {
+ Minimizer::new(self).run();
+ }
+
+ /// Replaces this DFA's match states with ones built from the given map of
+ /// match state ID to pattern IDs.
+ ///
+ /// The representation a DFA uses for its match states is not amenable to
+ /// mutation. When match states (and their pattern IDs) need to change,
+ /// such as when shuffling states, it is often easier to edit them in map
+ /// form and then install the result with this routine.
+ ///
+ /// If the new match states cannot be built from `map`, the error is
+ /// returned and the existing match states are left untouched.
+ pub(crate) fn set_pattern_map(
+     &mut self,
+     map: &BTreeMap<StateID, Vec<PatternID>>,
+ ) -> Result<(), BuildError> {
+     let rebuilt = self.ms.new_with_map(map)?;
+     self.ms = rebuilt;
+     Ok(())
+ }
+
+ /// Find states that have a small number of non-loop transitions and mark
+ /// them as accelerated, shuffling them into one contiguous ID range.
+ pub(crate) fn accelerate(&mut self) {
+ // The dead and quit states can never be accelerated.
+ if self.state_len() <= 2 {
+ return;
+ }
+
+ // Go through every state and record their accelerator, if possible.
+ let mut accels = BTreeMap::new();
+ // Count the number of accelerated match, start and non-match/start
+ // states.
+ let (mut cmatch, mut cstart, mut cnormal) = (0, 0, 0);
+ for state in self.states() {
+ if let Some(accel) = state.accelerate(self.byte_classes()) {
+ debug!(
+ "accelerating full DFA state {}: {:?}",
+ state.id().as_usize(),
+ accel,
+ );
+ accels.insert(state.id(), accel);
+ if self.is_match_state(state.id()) {
+ cmatch += 1;
+ } else if self.is_start_state(state.id()) {
+ cstart += 1;
+ } else {
+ assert!(!self.is_dead_state(state.id()));
+ assert!(!self.is_quit_state(state.id()));
+ cnormal += 1;
+ }
+ }
+ }
+ // If no states were able to be accelerated, then we're done.
+ if accels.is_empty() {
+ return;
+ }
+ let original_accels_len = accels.len();
+
+ // A remapper keeps track of state ID changes. Once we're done
+ // shuffling, the remapper is used to rewrite all transitions in the
+ // DFA based on the new positions of states.
+ let mut remapper = Remapper::new(self);
+
+ // As we swap states, if they are match states, we need to swap their
+ // pattern ID lists too (for multi-regexes). We do this by converting
+ // the lists to an easily swappable map, and then convert back to
+ // MatchStates once we're done.
+ let mut new_matches = self.ms.to_map(self);
+
+ // There is at least one state that gets accelerated, so these are
+ // guaranteed to get set to sensible values below.
+ self.special.min_accel = StateID::MAX;
+ self.special.max_accel = StateID::ZERO;
+ let update_special_accel =
+ |special: &mut Special, accel_id: StateID| {
+ special.min_accel = cmp::min(special.min_accel, accel_id);
+ special.max_accel = cmp::max(special.max_accel, accel_id);
+ };
+
+ // Start by shuffling match states. Any match states that are
+ // accelerated get moved to the end of the match state range.
+ if cmatch > 0 && self.special.matches() {
+ // N.B. special.{min,max}_match do not need updating, since the
+ // range/number of match states does not change. Only the ordering
+ // of match states may change.
+ let mut next_id = self.special.max_match;
+ let mut cur_id = next_id;
+ while cur_id >= self.special.min_match {
+ if let Some(accel) = accels.remove(&cur_id) {
+ accels.insert(next_id, accel);
+ update_special_accel(&mut self.special, next_id);
+
+ // No need to do any actual swapping for equivalent IDs.
+ if cur_id != next_id {
+ remapper.swap(self, cur_id, next_id);
+
+ // Swap pattern IDs for match states.
+ let cur_pids = new_matches.remove(&cur_id).unwrap();
+ let next_pids = new_matches.remove(&next_id).unwrap();
+ new_matches.insert(cur_id, next_pids);
+ new_matches.insert(next_id, cur_pids);
+ }
+ next_id = self.tt.prev_state_id(next_id);
+ }
+ cur_id = self.tt.prev_state_id(cur_id);
+ }
+ }
+
+ // This is where it gets tricky. Without acceleration, start states
+ // normally come right after match states. But we want accelerated
+ // states to be a single contiguous range (to make it very fast
+ // to determine whether a state *is* accelerated), while also keeping
+ // match and starting states as contiguous ranges for the same reason.
+ // So what we do here is shuffle states such that it looks like this:
+ //
+ // DQMMMMAAAAASSSSSSNNNNNNN
+ // | |
+ // |---------|
+ // accelerated states
+ //
+ // Where:
+ // D - dead state
+ // Q - quit state
+ // M - match state (may be accelerated)
+ // A - normal state that is accelerated
+ // S - start state (may be accelerated)
+ // N - normal state that is NOT accelerated
+ //
+ // We implement this by shuffling states, which is done by a sequence
+ // of pairwise swaps. We start by looking at all normal states to be
+ // accelerated. When we find one, we swap it with the earliest starting
+ // state, and then swap that with the earliest normal state. This
+ // preserves the contiguous property.
+ //
+ // Once we're done looking for accelerated normal states, now we look
+ // for accelerated starting states by moving them to the beginning
+ // of the starting state range (just like we moved accelerated match
+ // states to the end of the matching state range).
+ //
+ // For a more detailed/different perspective on this, see the docs
+ // in dfa/special.rs.
+ if cnormal > 0 {
+ // Our next available starting and normal states for swapping.
+ let mut next_start_id = self.special.min_start;
+ let mut cur_id = self.to_state_id(self.state_len() - 1);
+ // This is guaranteed to exist since cnormal > 0.
+ let mut next_norm_id =
+ self.tt.next_state_id(self.special.max_start);
+ while cur_id >= next_norm_id {
+ if let Some(accel) = accels.remove(&cur_id) {
+ remapper.swap(self, next_start_id, cur_id);
+ remapper.swap(self, next_norm_id, cur_id);
+ // Keep our accelerator map updated with new IDs if the
+ // states we swapped were also accelerated.
+ if let Some(accel2) = accels.remove(&next_norm_id) {
+ accels.insert(cur_id, accel2);
+ }
+ if let Some(accel2) = accels.remove(&next_start_id) {
+ accels.insert(next_norm_id, accel2);
+ }
+ accels.insert(next_start_id, accel);
+ update_special_accel(&mut self.special, next_start_id);
+ // Our start range shifts one to the right now.
+ self.special.min_start =
+ self.tt.next_state_id(self.special.min_start);
+ self.special.max_start =
+ self.tt.next_state_id(self.special.max_start);
+ next_start_id = self.tt.next_state_id(next_start_id);
+ next_norm_id = self.tt.next_state_id(next_norm_id);
+ }
+ // This is pretty tricky, but if our 'next_norm_id' state also
+ // happened to be accelerated, then the result is that it is
+ // now in the position of cur_id, so we need to consider it
+ // again. This loop is still guaranteed to terminate though,
+ // because when accels contains cur_id, we're guaranteed to
+ // increment next_norm_id even if cur_id remains unchanged.
+ if !accels.contains_key(&cur_id) {
+ cur_id = self.tt.prev_state_id(cur_id);
+ }
+ }
+ }
+ // Just like we did for match states, but we want to move accelerated
+ // start states to the beginning of the range instead of the end.
+ if cstart > 0 {
+ // N.B. special.{min,max}_start do not need updating, since the
+ // range/number of start states does not change at this point. Only
+ // the ordering of start states may change.
+ let mut next_id = self.special.min_start;
+ let mut cur_id = next_id;
+ while cur_id <= self.special.max_start {
+ if let Some(accel) = accels.remove(&cur_id) {
+ remapper.swap(self, cur_id, next_id);
+ accels.insert(next_id, accel);
+ update_special_accel(&mut self.special, next_id);
+ next_id = self.tt.next_state_id(next_id);
+ }
+ cur_id = self.tt.next_state_id(cur_id);
+ }
+ }
+
+ // Remap all transitions in our DFA and assert some things.
+ remapper.remap(self);
+ // This unwrap is OK because acceleration never changes the number of
+ // match states or patterns in those match states. Since acceleration
+ // runs after the pattern map has been set at least once, we know that
+ // our match states cannot error.
+ self.set_pattern_map(&new_matches).unwrap();
+ self.special.set_max();
+ self.special.validate().expect("special state ranges should validate");
+ self.special
+ .validate_state_len(self.state_len(), self.stride2())
+ .expect(
+ "special state ranges should be consistent with state length",
+ );
+ assert_eq!(
+ self.special.accel_len(self.stride()),
+ // We record the number of accelerated states initially detected
+ // since the accels map is itself mutated in the process above.
+ // If mutated incorrectly, its size may change, and thus can't be
+ // trusted as a source of truth of how many accelerated states we
+ // expected there to be.
+ original_accels_len,
+ "mismatch with expected number of accelerated states",
+ );
+
+ // And finally record our accelerators. We kept our accels map updated
+ // as we shuffled states above, so the accelerators should now
+ // correspond to a contiguous range in the state ID space. (Which we
+ // assert.)
+ let mut prev: Option<StateID> = None;
+ for (id, accel) in accels {
+ assert!(prev.map_or(true, |p| self.tt.next_state_id(p) == id));
+ prev = Some(id);
+ self.accels.add(accel);
+ }
+ }
+
+ /// Shuffle the states in this DFA so that match states and starting
+ /// states each occupy a contiguous range of state IDs.
+ ///
+ /// See dfa/special.rs for more details.
+ pub(crate) fn shuffle(
+ &mut self,
+ mut matches: BTreeMap<StateID, Vec<PatternID>>,
+ ) -> Result<(), BuildError> {
+ // The determinizer always adds a quit state and it is always second.
+ self.special.quit_id = self.to_state_id(1);
+ // If all we have are the dead and quit states, then we're done and
+ // the DFA will never produce a match.
+ if self.state_len() <= 2 {
+ self.special.set_max();
+ return Ok(());
+ }
+
+ // Collect all our non-DEAD start states into a convenient set and
+ // confirm there is no overlap with match states. In the classical DFA
+ // construction, start states can be match states. But because of
+ // look-around, we delay all matches by a byte, which prevents start
+ // states from being match states.
+ let mut is_start: BTreeSet<StateID> = BTreeSet::new();
+ for (start_id, _, _) in self.starts() {
+ // If a starting configuration points to a DEAD state, then we
+ // don't want to shuffle it. The DEAD state is always the first
+ // state with ID=0. So we can just leave it be.
+ if start_id == DEAD {
+ continue;
+ }
+ assert!(
+ !matches.contains_key(&start_id),
+ "{:?} is both a start and a match state, which is not allowed",
+ start_id,
+ );
+ is_start.insert(start_id);
+ }
+
+ // We implement shuffling by a sequence of pairwise swaps of states.
+ // Since we have a number of things referencing states via their
+ // IDs and swapping them changes their IDs, we need to record every
+ // swap we make so that we can remap IDs. The remapper handles this
+ // book-keeping for us.
+ let mut remapper = Remapper::new(self);
+
+ // Shuffle matching states.
+ if matches.is_empty() {
+ self.special.min_match = DEAD;
+ self.special.max_match = DEAD;
+ } else {
+ // The determinizer guarantees that the first two states are the
+ // dead and quit states, respectively. We want our match states to
+ // come right after quit.
+ let mut next_id = self.to_state_id(2);
+ let mut new_matches = BTreeMap::new();
+ self.special.min_match = next_id;
+ for (id, pids) in matches {
+ remapper.swap(self, next_id, id);
+ new_matches.insert(next_id, pids);
+ // If we swapped a start state, then update our set.
+ if is_start.contains(&next_id) {
+ is_start.remove(&next_id);
+ is_start.insert(id);
+ }
+ next_id = self.tt.next_state_id(next_id);
+ }
+ matches = new_matches;
+ self.special.max_match = cmp::max(
+ self.special.min_match,
+ self.tt.prev_state_id(next_id),
+ );
+ }
+
+ // Shuffle starting states.
+ {
+ let mut next_id = self.to_state_id(2);
+ if self.special.matches() {
+ next_id = self.tt.next_state_id(self.special.max_match);
+ }
+ self.special.min_start = next_id;
+ for id in is_start {
+ remapper.swap(self, next_id, id);
+ next_id = self.tt.next_state_id(next_id);
+ }
+ self.special.max_start = cmp::max(
+ self.special.min_start,
+ self.tt.prev_state_id(next_id),
+ );
+ }
+
+ // Finally remap all transitions in our DFA.
+ remapper.remap(self);
+ self.set_pattern_map(&matches)?;
+ self.special.set_max();
+ self.special.validate().expect("special state ranges should validate");
+ self.special
+ .validate_state_len(self.state_len(), self.stride2())
+ .expect(
+ "special state ranges should be consistent with state length",
+ );
+ Ok(())
+ }
+
+ /// Checks whether there are universal start states (both anchored and
+ /// unanchored), and if so, sets the relevant fields to the start state
+ /// IDs.
+ ///
+ /// For each supported mode, a universal start state is recorded only when
+ /// all six start configurations resolve to the same state ID. Universal
+ /// start states occur precisely when all of the patterns in the DFA have
+ /// no look-around assertions in their prefix.
+ fn set_universal_starts(&mut self) {
+     assert_eq!(6, Start::len(), "expected 6 start configurations");
+
+     // Resolve the start state for a single configuration. This is OK
+     // because we only call 'start' under conditions in which we know it
+     // will succeed.
+     let lookup = |dfa: &mut OwnedDFA, inp: &Input<'_>, start: Start| {
+         dfa.st.start(inp, start).expect("valid Input configuration")
+     };
+     // Yields the shared start state ID when every start configuration
+     // agrees for the given mode, and `None` otherwise. The comparisons
+     // short-circuit, so lookups stop at the first disagreement.
+     let universal = |dfa: &mut OwnedDFA, mode: Anchored| {
+         let inp = Input::new("").anchored(mode);
+         let sid = lookup(dfa, &inp, Start::NonWordByte);
+         let all_agree = sid == lookup(dfa, &inp, Start::WordByte)
+             && sid == lookup(dfa, &inp, Start::Text)
+             && sid == lookup(dfa, &inp, Start::LineLF)
+             && sid == lookup(dfa, &inp, Start::LineCR)
+             && sid == lookup(dfa, &inp, Start::CustomLineTerminator);
+         if all_agree {
+             Some(sid)
+         } else {
+             None
+         }
+     };
+
+     if self.start_kind().has_unanchored() {
+         if let Some(sid) = universal(self, Anchored::No) {
+             self.st.universal_start_unanchored = Some(sid);
+         }
+     }
+     if self.start_kind().has_anchored() {
+         if let Some(sid) = universal(self, Anchored::Yes) {
+             self.st.universal_start_anchored = Some(sid);
+         }
+     }
+ }
+}
+
+// A variety of generic internal methods for accessing DFA internals.
+impl<T: AsRef<[u32]>> DFA<T> {
+ /// Return a shared borrow of the info about this DFA's special states.
+ pub(crate) fn special(&self) -> &Special {
+ &self.special
+ }
+
+ /// Return a mutable borrow of the info about this DFA's special states.
+ #[cfg(feature = "dfa-build")]
+ pub(crate) fn special_mut(&mut self) -> &mut Special {
+ &mut self.special
+ }
+
+ /// Returns a borrow of the quit set (may be empty) used by this DFA.
+ pub(crate) fn quitset(&self) -> &ByteSet {
+ &self.quitset
+ }
+
+ /// Returns a borrow of the flags for this DFA.
+ pub(crate) fn flags(&self) -> &Flags {
+ &self.flags
+ }
+
+ /// Returns an iterator over all states in this DFA's transition table.
+ ///
+ /// NOTE(review): the text below describes the items as tuples, but callers
+ /// in this file use `state.id()` on each item; confirm against
+ /// `StateIter` before relying on the tuple description.
+ pub(crate) fn states(&self) -> StateIter<'_, T> {
+ self.tt.states()
+ }
+
+ /// Return the total number of states in this DFA's transition table. Every
+ /// DFA has at least 1 state, even the empty DFA.
+ pub(crate) fn state_len(&self) -> usize {
+ self.tt.len()
+ }
+
+ /// Return the slice of pattern IDs for the given match state.
+ ///
+ /// If the given state is not a match state, then this panics.
+ #[cfg(feature = "dfa-build")]
+ pub(crate) fn pattern_id_slice(&self, id: StateID) -> &[PatternID] {
+     assert!(self.is_match_state(id));
+     // Pattern IDs are looked up by the state's position among the match
+     // states, not by its state ID.
+     let match_index = self.match_state_index(id);
+     self.ms.pattern_id_slice(match_index)
+ }
+
+ /// Return how many pattern IDs are associated with the given match state.
+ ///
+ /// If the given state is not a match state, then this panics.
+ pub(crate) fn match_pattern_len(&self, id: StateID) -> usize {
+     assert!(self.is_match_state(id));
+     // Pattern IDs are looked up by the state's position among the match
+     // states, not by its state ID.
+     let match_index = self.match_state_index(id);
+     self.ms.pattern_len(match_index)
+ }
+
+ /// Returns the total number of patterns recorded in the match states.
+ pub(crate) fn pattern_len(&self) -> usize {
+ self.ms.pattern_len
+ }
+
+ /// Returns a newly built map from match state ID to the list of pattern
+ /// IDs that match in that state.
+ #[cfg(feature = "dfa-build")]
+ pub(crate) fn pattern_map(&self) -> BTreeMap<StateID, Vec<PatternID>> {
+ self.ms.to_map(self)
+ }
+
+ /// Returns the ID of the quit state, which is the state at index 1.
+ #[cfg(feature = "dfa-build")]
+ pub(crate) fn quit_id(&self) -> StateID {
+ self.to_state_id(1)
+ }
+
+ /// Convert the given state identifier to the state's index. The state's
+ /// index corresponds to the position in which it appears in the transition
+ /// table. When a DFA is NOT premultiplied, then a state's identifier is
+ /// also its index. When a DFA is premultiplied, then a state's identifier
+ /// is equal to `index * alphabet_len`. This routine reverses that mapping.
+ pub(crate) fn to_index(&self, id: StateID) -> usize {
+ self.tt.to_index(id)
+ }
+
+ /// Convert an index to a state (in the range 0..self.state_len()) to an
+ /// actual state identifier.
+ ///
+ /// This is useful when using a `Vec<T>` as an efficient map keyed by state
+ /// to some other information (such as a remapped state ID).
+ ///
+ /// This is the inverse of `to_index`. The conversion is delegated to the
+ /// transition table, which does not validate the index given.
+ #[cfg(feature = "dfa-build")]
+ pub(crate) fn to_state_id(&self, index: usize) -> StateID {
+ self.tt.to_state_id(index)
+ }
+
+ /// Return an iterator over the state IDs in this DFA's start table.
+ ///
+ /// Each item is a triple of the start state's ID, the anchored mode it
+ /// belongs to and the kind of start configuration it corresponds to.
+ pub(crate) fn starts(&self) -> StartStateIter<'_> {
+ self.st.iter()
+ }
+
+ /// Maps the ID of a match state to its position among the match states.
+ ///
+ /// When `id` does not identify a match state, the result is unspecified:
+ /// this may panic or return a meaningless index.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn match_state_index(&self, id: StateID) -> usize {
+     debug_assert!(self.is_match_state(id));
+     // Match states occupy one contiguous run of the transition table that
+     // begins at dfa.special.min_match. The distance from that first match
+     // state is therefore itself a premultiplied offset, and converting it
+     // to an index (via the stride) gives the match state's position.
+     let first_match = self.special().min_match.as_usize();
+     let distance = id.as_usize() - first_match;
+     // CORRECTNESS: The contract above permits a panic or a wrong answer
+     // for non-match states, so neither the subtraction nor the unchecked
+     // StateID construction needs to be guarded.
+     let relative = StateID::new_unchecked(distance);
+     self.to_index(relative)
+ }
+
+ /// Maps the ID of an accelerated state to its position among the
+ /// accelerated states.
+ ///
+ /// When `id` does not identify an accelerated state, the result is
+ /// unspecified: this may panic or return a meaningless index.
+ fn accelerator_index(&self, id: StateID) -> usize {
+     let first_accel = self.special().min_accel.as_usize();
+     let distance = id.as_usize() - first_accel;
+     // CORRECTNESS: The contract above permits a panic or a wrong answer
+     // for non-accelerated states, so neither the subtraction nor the
+     // unchecked StateID construction needs to be guarded.
+     let relative = StateID::new_unchecked(distance);
+     self.to_index(relative)
+ }
+
+ /// Return the accelerators for this DFA as a borrowed value.
+ fn accels(&self) -> Accels<&[u32]> {
+ self.accels.as_ref()
+ }
+
+ /// Return this DFA's transition table as a slice of state IDs.
+ ///
+ /// Since state IDs are premultiplied, a state ID plus an equivalence class
+ /// is directly usable as an index into the slice returned.
+ fn trans(&self) -> &[StateID] {
+ self.tt.table()
+ }
+}
+
+ impl<T: AsRef<[u32]>> fmt::Debug for DFA<T> {
+     /// Writes a human readable, multi-line description of this DFA.
+     ///
+     /// The output consists of, in order: one line per state, the start
+     /// states grouped by anchored mode, the pattern IDs of every match state
+     /// (only when the DFA has more than one pattern) and finally a few
+     /// summary lines.
+     ///
+     /// States are printed using their index by default. When the alternate
+     /// flag (`{:#?}`) is set, the raw (premultiplied) state ID is printed
+     /// instead.
+     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+         writeln!(f, "dense::DFA(")?;
+         for state in self.states() {
+             fmt_state_indicator(f, self, state.id())?;
+             let id = if f.alternate() {
+                 state.id().as_usize()
+             } else {
+                 self.to_index(state.id())
+             };
+             write!(f, "{:06?}: ", id)?;
+             state.fmt(f)?;
+             writeln!(f)?;
+         }
+         writeln!(f)?;
+         for (i, (start_id, anchored, sty)) in self.starts().enumerate() {
+             let id = if f.alternate() {
+                 start_id.as_usize()
+             } else {
+                 self.to_index(start_id)
+             };
+             // Every `stride` entries in the start table begin a new group,
+             // so emit a header naming the group before its first entry.
+             if i % self.st.stride == 0 {
+                 match anchored {
+                     Anchored::No => writeln!(f, "START-GROUP(unanchored)")?,
+                     Anchored::Yes => writeln!(f, "START-GROUP(anchored)")?,
+                     Anchored::Pattern(pid) => {
+                         writeln!(f, "START-GROUP(pattern: {:?})", pid)?
+                     }
+                 }
+             }
+             writeln!(f, " {:?} => {:06?}", sty, id)?;
+         }
+         if self.pattern_len() > 1 {
+             writeln!(f)?;
+             for index in 0..self.ms.len() {
+                 let id = self.ms.match_state_id(self, index);
+                 let id = if f.alternate() {
+                     id.as_usize()
+                 } else {
+                     self.to_index(id)
+                 };
+                 write!(f, "MATCH({:06?}): ", id)?;
+                 let pids = self.ms.pattern_id_slice(index);
+                 for (j, &pid) in pids.iter().enumerate() {
+                     if j > 0 {
+                         write!(f, ", ")?;
+                     }
+                     write!(f, "{:?}", pid)?;
+                 }
+                 writeln!(f)?;
+             }
+         }
+         writeln!(f, "state length: {:?}", self.state_len())?;
+         writeln!(f, "pattern length: {:?}", self.pattern_len())?;
+         writeln!(f, "flags: {:?}", self.flags)?;
+         writeln!(f, ")")?;
+         Ok(())
+     }
+ }
+
+ // SAFETY: We assert that our implementation of each method is correct.
+ unsafe impl<T: AsRef<[u32]>> Automaton for DFA<T> {
+ /// Returns true if `id` is any kind of special state, as determined by
+ /// this DFA's special state metadata.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn is_special_state(&self, id: StateID) -> bool {
+ self.special.is_special_state(id)
+ }
+
+ /// Returns true if `id` is a dead state, as determined by this DFA's
+ /// special state metadata.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn is_dead_state(&self, id: StateID) -> bool {
+ self.special.is_dead_state(id)
+ }
+
+ /// Returns true if `id` is a quit state, as determined by this DFA's
+ /// special state metadata.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn is_quit_state(&self, id: StateID) -> bool {
+ self.special.is_quit_state(id)
+ }
+
+ /// Returns true if `id` is a match state, as determined by this DFA's
+ /// special state metadata.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn is_match_state(&self, id: StateID) -> bool {
+ self.special.is_match_state(id)
+ }
+
+ /// Returns true if `id` is a start state, as determined by this DFA's
+ /// special state metadata.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn is_start_state(&self, id: StateID) -> bool {
+ self.special.is_start_state(id)
+ }
+
+ /// Returns true if `id` is an accelerated state, as determined by this
+ /// DFA's special state metadata.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn is_accel_state(&self, id: StateID) -> bool {
+ self.special.is_accel_state(id)
+ }
+
+ /// Follows the transition out of `current` for the byte `input`.
+ ///
+ /// The byte is first mapped to its equivalence class. Since state IDs are
+ /// premultiplied, adding the class to `current` directly gives the
+ /// position of the transition in the table. The table lookup is bounds
+ /// checked, so an out of range position results in a panic.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn next_state(&self, current: StateID, input: u8) -> StateID {
+ let input = self.byte_classes().get(input);
+ let o = current.as_usize() + usize::from(input);
+ self.trans()[o]
+ }
+
+ /// Like `next_state`, but the lookup in the transition table is not
+ /// bounds checked.
+ ///
+ /// # Safety
+ ///
+ /// `current` plus the equivalence class of `byte` must be a valid index
+ /// into this DFA's transition table. Otherwise, the unchecked slice access
+ /// below is undefined behavior.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ unsafe fn next_state_unchecked(
+ &self,
+ current: StateID,
+ byte: u8,
+ ) -> StateID {
+ // We don't (or shouldn't) need an unchecked variant for the byte
+ // class mapping, since bound checks should be omitted automatically
+ // by virtue of its representation. If this ends up not being true as
+ // confirmed by codegen, please file an issue. ---AG
+ let class = self.byte_classes().get(byte);
+ let o = current.as_usize() + usize::from(class);
+ let next = *self.trans().get_unchecked(o);
+ next
+ }
+
+ /// Follows the special end-of-input (EOI) transition out of `current`.
+ ///
+ /// The EOI sentinel has its own equivalence class, which is used as the
+ /// offset from `current` in the same way a byte's class is used in
+ /// `next_state`. The table lookup is bounds checked.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn next_eoi_state(&self, current: StateID) -> StateID {
+ let eoi = self.byte_classes().eoi().as_usize();
+ let o = current.as_usize() + eoi;
+ self.trans()[o]
+ }
+
+ /// Returns the total number of patterns matched by this DFA.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn pattern_len(&self) -> usize {
+ self.ms.pattern_len
+ }
+
+ /// Returns the number of pattern IDs for the match state `id`.
+ ///
+ /// This panics if `id` is not a match state, since the routine this
+ /// delegates to asserts it.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn match_len(&self, id: StateID) -> usize {
+ self.match_pattern_len(id)
+ }
+
+ /// Returns the pattern ID at position `match_index` for the match state
+ /// `id`.
+ ///
+ /// Note that when this DFA has exactly one pattern, neither `id` nor
+ /// `match_index` is consulted and `PatternID::ZERO` is always returned.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn match_pattern(&self, id: StateID, match_index: usize) -> PatternID {
+ // This is an optimization for the very common case of a DFA with a
+ // single pattern. This conditional avoids a somewhat more costly path
+ // that finds the pattern ID from the state machine, which requires
+ // a bit of slicing/pointer-chasing. This optimization tends to only
+ // matter when matches are frequent.
+ if self.ms.pattern_len == 1 {
+ return PatternID::ZERO;
+ }
+ let state_index = self.match_state_index(id);
+ self.ms.pattern_id(state_index, match_index)
+ }
+
+ /// Returns the `has_empty` flag recorded for this DFA.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn has_empty(&self) -> bool {
+ self.flags.has_empty
+ }
+
+ /// Returns the `is_utf8` flag recorded for this DFA.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn is_utf8(&self) -> bool {
+ self.flags.is_utf8
+ }
+
+ /// Returns the `is_always_start_anchored` flag recorded for this DFA.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn is_always_start_anchored(&self) -> bool {
+ self.flags.is_always_start_anchored
+ }
+
+ /// Returns the start state for a forward search over `input`.
+ ///
+ /// # Errors
+ ///
+ /// When this DFA has a non-empty quit set and the search does not begin at
+ /// offset `0`, the byte immediately before `input.start()` is inspected.
+ /// If that byte is in the quit set, then a quit error for that byte and
+ /// offset is returned. Otherwise, any error from the start table lookup is
+ /// returned as is.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn start_state_forward(
+ &self,
+ input: &Input<'_>,
+ ) -> Result<StateID, MatchError> {
+ if !self.quitset.is_empty() && input.start() > 0 {
+ let offset = input.start() - 1;
+ let byte = input.haystack()[offset];
+ if self.quitset.contains(byte) {
+ return Err(MatchError::quit(byte, offset));
+ }
+ }
+ let start = self.st.start_map.fwd(&input);
+ self.st.start(input, start)
+ }
+
+ /// Returns the start state for a reverse search over `input`.
+ ///
+ /// # Errors
+ ///
+ /// When this DFA has a non-empty quit set and the search does not end at
+ /// the end of the haystack, the byte at `input.end()` (the byte just past
+ /// the search span) is inspected. If that byte is in the quit set, then
+ /// a quit error for that byte and offset is returned. Otherwise, any error
+ /// from the start table lookup is returned as is.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn start_state_reverse(
+ &self,
+ input: &Input<'_>,
+ ) -> Result<StateID, MatchError> {
+ if !self.quitset.is_empty() && input.end() < input.haystack().len() {
+ let offset = input.end();
+ let byte = input.haystack()[offset];
+ if self.quitset.contains(byte) {
+ return Err(MatchError::quit(byte, offset));
+ }
+ }
+ let start = self.st.start_map.rev(&input);
+ self.st.start(input, start)
+ }
+
+ /// Returns the universal start state for the given mode, if one exists.
+ ///
+ /// There is never a universal start state for `Anchored::Pattern`.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn universal_start_state(&self, mode: Anchored) -> Option<StateID> {
+ match mode {
+ Anchored::No => self.st.universal_start_unanchored,
+ Anchored::Yes => self.st.universal_start_anchored,
+ Anchored::Pattern(_) => None,
+ }
+ }
+
+ /// Returns the accelerator needles for the state `id`.
+ ///
+ /// If `id` is not an accelerated state, then an empty slice is returned.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn accelerator(&self, id: StateID) -> &[u8] {
+ if !self.is_accel_state(id) {
+ return &[];
+ }
+ self.accels.needles(self.accelerator_index(id))
+ }
+
+ /// Returns the prefilter attached to this DFA, if any.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn get_prefilter(&self) -> Option<&Prefilter> {
+ self.pre.as_ref()
+ }
+ }
+
+ /// The transition table portion of a dense DFA.
+ ///
+ /// The transition table is the core part of the DFA in that it describes how
+ /// to move from one state to another based on the input sequence observed.
+ #[derive(Clone)]
+ pub(crate) struct TransitionTable<T> {
+ /// A contiguous region of memory representing the transition table in
+ /// row-major order. The representation is dense. That is, every state
+ /// has precisely the same number of transitions. The maximum number of
+ /// transitions per state is 257 (256 for each possible byte value, plus 1
+ /// for the special EOI transition). If a DFA has been instructed to use
+ /// byte classes (the default), then the number of transitions is usually
+ /// substantially fewer.
+ ///
+ /// Each state occupies `1 << stride2` consecutive entries in this table,
+ /// of which only the first `alphabet_len` entries are transitions.
+ ///
+ /// In practice, T is either `Vec<u32>` or `&[u32]`.
+ table: T,
+ /// A set of equivalence classes, where a single equivalence class
+ /// represents a set of bytes that never discriminate between a match
+ /// and a non-match in the DFA. Each equivalence class corresponds to a
+ /// single character in this DFA's alphabet, where the maximum number of
+ /// characters is 257 (each possible value of a byte plus the special
+ /// EOI transition). Consequently, the number of equivalence classes
+ /// corresponds to the number of transitions for each DFA state. Note
+ /// though that the *space* used by each DFA state in the transition table
+ /// may be larger. The total space used by each DFA state is known as the
+ /// stride.
+ ///
+ /// The only time the number of equivalence classes is fewer than 257 is if
+ /// the DFA's kind uses byte classes (which is the default). Equivalence
+ /// classes should generally only be disabled when debugging, so that
+ /// the transitions themselves aren't obscured. Disabling them has no
+ /// other benefit, since the equivalence class map is always used while
+ /// searching. In the vast majority of cases, the number of equivalence
+ /// classes is substantially smaller than 257, particularly when large
+ /// Unicode classes aren't used.
+ classes: ByteClasses,
+ /// The stride of each DFA state, expressed as a power-of-two exponent.
+ ///
+ /// The stride of a DFA corresponds to the total amount of space used by
+ /// each DFA state in the transition table. This may be bigger than the
+ /// size of a DFA's alphabet, since the stride is always the smallest
+ /// power of two greater than or equal to the alphabet size.
+ ///
+ /// While this wastes space, this avoids the need for integer division
+ /// to convert between premultiplied state IDs and their corresponding
+ /// indices. Instead, we can use simple bit-shifts.
+ ///
+ /// See the docs for the `stride2` method for more details.
+ ///
+ /// The minimum `stride2` value is `1` (corresponding to a stride of `2`)
+ /// while the maximum `stride2` value is `9` (corresponding to a stride of
+ /// `512`). The maximum is not `8` since the maximum alphabet size is `257`
+ /// when accounting for the special EOI transition. However, an alphabet
+ /// length of that size is exceptionally rare since the alphabet is shrunk
+ /// into equivalence classes.
+ ///
+ /// Deserialization rejects any `stride2` value outside of this range.
+ stride2: usize,
+ }
+
+ impl<'a> TransitionTable<&'a [u32]> {
+ /// Deserialize a transition table starting at the beginning of `slice`.
+ /// Upon success, return the total number of bytes read along with the
+ /// transition table.
+ ///
+ /// The serialized form is read in this order: the number of states, the
+ /// stride (as a power-of-two exponent), the byte class map and finally
+ /// the transitions themselves. The transitions are borrowed from `slice`
+ /// and not copied.
+ ///
+ /// If there was a problem deserializing any part of the transition table,
+ /// then this returns an error. Notably, if the given slice does not have
+ /// the same alignment as `StateID`, then this will return an error (among
+ /// other possible errors).
+ ///
+ /// This is guaranteed to execute in constant time.
+ ///
+ /// # Safety
+ ///
+ /// This routine is not safe because it does not check the validity of the
+ /// transition table itself. In particular, the transition table can be
+ /// quite large, so checking its validity can be somewhat expensive. An
+ /// invalid transition table is not safe because other code may rely on the
+ /// transition table being correct (such as explicit bounds check elision).
+ /// Therefore, an invalid transition table can lead to undefined behavior.
+ ///
+ /// Callers that use this function must either pass on the safety invariant
+ /// or guarantee that the bytes given contain a valid transition table.
+ /// This guarantee is upheld by the bytes written by `write_to`.
+ unsafe fn from_bytes_unchecked(
+ mut slice: &'a [u8],
+ ) -> Result<(TransitionTable<&'a [u32]>, usize), DeserializeError> {
+ // Remember where we started so that the number of bytes consumed can
+ // be computed at the end.
+ let slice_start = slice.as_ptr().as_usize();
+
+ let (state_len, nr) =
+ wire::try_read_u32_as_usize(slice, "state length")?;
+ slice = &slice[nr..];
+
+ let (stride2, nr) = wire::try_read_u32_as_usize(slice, "stride2")?;
+ slice = &slice[nr..];
+
+ let (classes, nr) = ByteClasses::from_bytes(slice)?;
+ slice = &slice[nr..];
+
+ // stride2 is an exponent. The biggest possible alphabet has 257
+ // elements, for which a stride of 512 (stride2 == 9) suffices. So
+ // anything bigger than that is invalid.
+ if stride2 > 9 {
+ return Err(DeserializeError::generic(
+ "dense DFA has invalid stride2 (too big)",
+ ));
+ }
+ // It also cannot be zero, since even a DFA that never matches anything
+ // has a non-zero number of states with at least two equivalence
+ // classes: one for all 256 byte values and another for the EOI
+ // sentinel.
+ if stride2 < 1 {
+ return Err(DeserializeError::generic(
+ "dense DFA has invalid stride2 (too small)",
+ ));
+ }
+ // This is OK since 1 <= stride2 <= 9.
+ let stride =
+ 1usize.checked_shl(u32::try_from(stride2).unwrap()).unwrap();
+ // The alphabet length (determined by the byte class map) cannot be
+ // bigger than the stride (total space used by each DFA state).
+ if classes.alphabet_len() > stride {
+ return Err(DeserializeError::generic(
+ "alphabet size cannot be bigger than transition table stride",
+ ));
+ }
+
+ // Compute the number of transitions and the number of bytes they
+ // occupy. Both computations report an error instead of overflowing.
+ let trans_len =
+ wire::shl(state_len, stride2, "dense table transition length")?;
+ let table_bytes_len = wire::mul(
+ trans_len,
+ StateID::SIZE,
+ "dense table state byte length",
+ )?;
+ wire::check_slice_len(slice, table_bytes_len, "transition table")?;
+ wire::check_alignment::<StateID>(slice)?;
+ let table_bytes = &slice[..table_bytes_len];
+ slice = &slice[table_bytes_len..];
+ // SAFETY: Since StateID is always representable as a u32, all we need
+ // to do is ensure that we have the proper length and alignment. We've
+ // checked both above, so the cast below is safe.
+ //
+ // N.B. This is the only not-safe code in this function.
+ let table = core::slice::from_raw_parts(
+ table_bytes.as_ptr().cast::<u32>(),
+ trans_len,
+ );
+ let tt = TransitionTable { table, classes, stride2 };
+ // The number of bytes read is the distance between our current
+ // position and the position we started at.
+ Ok((tt, slice.as_ptr().as_usize() - slice_start))
+ }
+ }
+
+ #[cfg(feature = "dfa-build")]
+ impl TransitionTable<Vec<u32>> {
+     /// Build the smallest usable transition table, which has exactly two
+     /// states: the dead state followed by the quit state. Both the alphabet
+     /// length and the stride are derived from the equivalence classes given.
+     fn minimal(classes: ByteClasses) -> TransitionTable<Vec<u32>> {
+         let stride2 = classes.stride2();
+         let mut minimal = TransitionTable { table: vec![], classes, stride2 };
+         // Neither call can fail: two states fit into a u32 no matter how big
+         // the alphabet is.
+         minimal.add_empty_state().unwrap(); // dead state
+         minimal.add_empty_state().unwrap(); // quit state
+         minimal
+     }
+
+     /// Point the transition for `unit` out of `from` at `to`.
+     ///
+     /// This panics if either `from` or `to` is not a state that already
+     /// exists in this table.
+     fn set(&mut self, from: StateID, unit: alphabet::Unit, to: StateID) {
+         assert!(self.is_valid(from), "invalid 'from' state");
+         assert!(self.is_valid(to), "invalid 'to' state");
+         // State IDs are premultiplied, so the ID plus the unit's equivalence
+         // class is the position of the transition in the table.
+         let slot = from.as_usize() + self.classes.get_by_unit(unit);
+         self.table[slot] = to.as_u32();
+     }
+
+     /// Append a new state whose transitions all lead to the dead state and
+     /// return its identifier. The identifier never refers to a state that
+     /// existed before this call.
+     ///
+     /// An error is returned when the new state's identifier cannot be
+     /// represented, i.e., when the state identifier space is exhausted.
+     fn add_empty_state(&mut self) -> Result<StateID, BuildError> {
+         // A state's identifier is not its index, but rather the offset of
+         // its first transition in the table. That is, identifiers are
+         // premultiplied by the stride. e.g., With a stride of 64, the third
+         // state has ID 192 and not 2.
+         //
+         // The payoff is at search time. Without premultiplication, the hot
+         // loop has to compute the position of every transition itself:
+         //
+         // state = dfa.start
+         // for byte in haystack:
+         //     next = dfa.transitions[state * stride + byte]
+         //     if dfa.is_match(next):
+         //         return true
+         // return false
+         //
+         // With premultiplication, the multiplication disappears from the
+         // critical path, which is a decent performance win:
+         //
+         // state = dfa.start
+         // for byte in haystack:
+         //     next = dfa.transitions[state + byte]
+         //     if dfa.is_match(next):
+         //         return true
+         // return false
+         //
+         // The price is that identifiers grow faster and so may need a bigger
+         // representation, and that code which wants state indices (such as
+         // minimization or the reshuffling of states) has to convert back and
+         // forth between identifiers and indices.
+         let offset = self.table.len();
+         let sid = StateID::new(offset)
+             .map_err(|_| BuildError::too_many_states())?;
+         // A zero transition is a transition to the dead state. The whole
+         // stride is filled, not just the alphabet.
+         self.table.extend(iter::repeat(0).take(self.stride()));
+         Ok(sid)
+     }
+
+     /// Exchange the transitions of the two states given.
+     ///
+     /// Nothing else is fixed up. In particular, transitions in other states
+     /// that point to `id1` or `id2` are left alone, so callers are
+     /// responsible for updating them.
+     ///
+     /// This panics if either `id1` or `id2` is not a valid state.
+     fn swap(&mut self, id1: StateID, id2: StateID) {
+         assert!(self.is_valid(id1), "invalid 'id1' state: {:?}", id1);
+         assert!(self.is_valid(id2), "invalid 'id2' state: {:?}", id2);
+         let (base1, base2) = (id1.as_usize(), id2.as_usize());
+         // Only the used portion of each state is swapped. When the alphabet
+         // is a good deal smaller than the stride (say, 33 versus 64), this
+         // skips a lot of pointless work.
+         for offset in 0..self.classes.alphabet_len() {
+             self.table.swap(base1 + offset, base2 + offset);
+         }
+     }
+
+     /// Rewrite every transition of the state given, in place, by replacing
+     /// each target state ID with the result of calling `map` on it.
+     fn remap(&mut self, id: StateID, map: impl Fn(StateID) -> StateID) {
+         let base = id.as_usize();
+         for class in 0..self.alphabet_len() {
+             let slot = base + class;
+             let old = self.table()[slot];
+             self.table_mut()[slot] = map(old);
+         }
+     }
+
+     /// Shrink this transition table so that it has at most `len` states.
+     ///
+     /// Nothing else is fixed up. In particular, transitions that point to
+     /// states removed by this call are left alone, so callers are
+     /// responsible for updating them.
+     fn truncate(&mut self, len: usize) {
+         let entries = len << self.stride2;
+         self.table.truncate(entries);
+     }
+ }
+
+ impl<T: AsRef<[u32]>> TransitionTable<T> {
+ /// Writes a serialized form of this transition table to the buffer given.
+ /// If the buffer is too small, then an error is returned. To determine
+ /// how big the buffer must be, use `write_to_len`.
+ ///
+ /// Upon success, the number of bytes written is returned, which is always
+ /// equal to `write_to_len()`. Integers are written in the endianness `E`.
+ fn write_to<E: Endian>(
+ &self,
+ mut dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ let nwrite = self.write_to_len();
+ if dst.len() < nwrite {
+ return Err(SerializeError::buffer_too_small("transition table"));
+ }
+ // Restrict the destination to exactly the bytes we're going to write.
+ dst = &mut dst[..nwrite];
+
+ // write state length
+ // Unwrap is OK since number of states is guaranteed to fit in a u32.
+ E::write_u32(u32::try_from(self.len()).unwrap(), dst);
+ dst = &mut dst[size_of::<u32>()..];
+
+ // write state stride (as power of 2)
+ // Unwrap is OK since stride2 is guaranteed to be <= 9.
+ E::write_u32(u32::try_from(self.stride2).unwrap(), dst);
+ dst = &mut dst[size_of::<u32>()..];
+
+ // write byte class map
+ let n = self.classes.write_to(dst)?;
+ dst = &mut dst[n..];
+
+ // write actual transitions
+ for &sid in self.table() {
+ let n = wire::write_state_id::<E>(sid, &mut dst);
+ dst = &mut dst[n..];
+ }
+ Ok(nwrite)
+ }
+
+ /// Returns the number of bytes the serialized form of this transition
+ /// table will use.
+ ///
+ /// The terms of the sum below appear in the same order as the parts
+ /// written by `write_to`.
+ fn write_to_len(&self) -> usize {
+ size_of::<u32>() // state length
+ + size_of::<u32>() // stride2
+ + self.classes.write_to_len()
+ + (self.table().len() * StateID::SIZE)
+ }
+
+ /// Validates that every state ID in this transition table is valid.
+ ///
+ /// That is, every state ID can be used to correctly index a state in this
+ /// table.
+ ///
+ /// In addition, every state that `sp` considers special must actually be
+ /// one of the known kinds of special states. An error is returned for the
+ /// first violation found.
+ fn validate(&self, sp: &Special) -> Result<(), DeserializeError> {
+ for state in self.states() {
+ // We check that the ID itself is well formed. That is, if it's
+ // a special state then it must actually be a quit, dead, accel,
+ // match or start state.
+ if sp.is_special_state(state.id()) {
+ let is_actually_special = sp.is_dead_state(state.id())
+ || sp.is_quit_state(state.id())
+ || sp.is_match_state(state.id())
+ || sp.is_start_state(state.id())
+ || sp.is_accel_state(state.id());
+ if !is_actually_special {
+ // This is kind of a cryptic error message...
+ return Err(DeserializeError::generic(
+ "found dense state tagged as special but \
+ wasn't actually special",
+ ));
+ }
+ }
+ // Every transition must point to the beginning of some state in
+ // this table.
+ for (_, to) in state.transitions() {
+ if !self.is_valid(to) {
+ return Err(DeserializeError::generic(
+ "found invalid state ID in transition table",
+ ));
+ }
+ }
+ }
+ Ok(())
+ }
+
+ /// Converts this transition table to a borrowed value.
+ ///
+ /// The transitions are borrowed, while the byte class map and the stride
+ /// are copied.
+ fn as_ref(&self) -> TransitionTable<&'_ [u32]> {
+ TransitionTable {
+ table: self.table.as_ref(),
+ classes: self.classes.clone(),
+ stride2: self.stride2,
+ }
+ }
+
+ /// Converts this transition table to an owned value.
+ ///
+ /// This copies the transitions into a freshly allocated `Vec`.
+ #[cfg(feature = "alloc")]
+ fn to_owned(&self) -> TransitionTable<alloc::vec::Vec<u32>> {
+ TransitionTable {
+ table: self.table.as_ref().to_vec(),
+ classes: self.classes.clone(),
+ stride2: self.stride2,
+ }
+ }
+
+ /// Return the state for the given ID. If the given ID is not valid, then
+ /// this panics.
+ ///
+ /// The state returned only covers the first `alphabet_len` entries of the
+ /// state's stride, i.e., the entries that are actual transitions.
+ fn state(&self, id: StateID) -> State<'_> {
+ assert!(self.is_valid(id));
+
+ let i = id.as_usize();
+ State {
+ id,
+ stride2: self.stride2,
+ transitions: &self.table()[i..i + self.alphabet_len()],
+ }
+ }
+
+ /// Returns an iterator over all states in this transition table.
+ ///
+ /// Each item yielded is a single state value that exposes both the state's
+ /// identifier (via `id()`) and its transitions. The iterator is built by
+ /// splitting the table into chunks of `stride` entries, one per state.
+ fn states(&self) -> StateIter<'_, T> {
+ StateIter {
+ tt: self,
+ it: self.table().chunks(self.stride()).enumerate(),
+ }
+ }
+
+ /// Convert a state identifier to an index to a state (in the range
+ /// 0..self.len()).
+ ///
+ /// This is useful when using a `Vec<T>` as an efficient map keyed by state
+ /// to some other information (such as a remapped state ID).
+ ///
+ /// Since state identifiers are premultiplied by the stride, and the stride
+ /// is a power of two, the conversion is a right shift by `stride2`.
+ ///
+ /// If the given ID is not valid, then this may panic or produce an
+ /// incorrect index.
+ fn to_index(&self, id: StateID) -> usize {
+ id.as_usize() >> self.stride2
+ }
+
+ /// Convert an index to a state (in the range 0..self.len()) to an actual
+ /// state identifier.
+ ///
+ /// This is useful when using a `Vec<T>` as an efficient map keyed by state
+ /// to some other information (such as a remapped state ID).
+ ///
+ /// This is the inverse of `to_index`: a left shift by `stride2`.
+ ///
+ /// If the given index is not in the specified range, then this may panic
+ /// or produce an incorrect state ID.
+ fn to_state_id(&self, index: usize) -> StateID {
+ // CORRECTNESS: If the given index is not valid, then it is not
+ // required for this to panic or return a valid state ID.
+ StateID::new_unchecked(index << self.stride2)
+ }
+
+ /// Returns the state ID for the state immediately following the one given.
+ ///
+ /// This does not check whether the state ID returned is invalid. In fact,
+ /// if the state ID given is the last state in this DFA, then the state ID
+ /// returned is guaranteed to be invalid.
+ ///
+ /// This panics if incrementing the index of the state given overflows.
+ #[cfg(feature = "dfa-build")]
+ fn next_state_id(&self, id: StateID) -> StateID {
+ self.to_state_id(self.to_index(id).checked_add(1).unwrap())
+ }
+
+ /// Returns the state ID for the state immediately preceding the one given.
+ ///
+ /// If the dead state ID (which is zero) is given, then this panics, since
+ /// no state precedes it.
+ #[cfg(feature = "dfa-build")]
+ fn prev_state_id(&self, id: StateID) -> StateID {
+ self.to_state_id(self.to_index(id).checked_sub(1).unwrap())
+ }
+
+ /// Returns the table as a slice of state IDs.
+ ///
+ /// The underlying `u32` storage is viewed as `StateID`s via
+ /// `wire::u32s_to_state_ids`.
+ fn table(&self) -> &[StateID] {
+ wire::u32s_to_state_ids(self.table.as_ref())
+ }
+
+ /// Returns the total number of states in this transition table.
+ ///
+ /// Note that a DFA always has at least two states: the dead and quit
+ /// states. In particular, the dead state always has ID 0 and is
+ /// correspondingly always the first state. The dead state is never a match
+ /// state.
+ ///
+ /// This is computed by dividing the length of the table by the stride
+ /// (via a right shift by `stride2`).
+ fn len(&self) -> usize {
+ self.table().len() >> self.stride2
+ }
+
+ /// Returns the total stride for every state in this DFA. This corresponds
+ /// to the total number of transitions used by each state in this DFA's
+ /// transition table.
+ ///
+ /// This is always `2^stride2`.
+ fn stride(&self) -> usize {
+ 1 << self.stride2
+ }
+
+ /// Returns the total number of elements in the alphabet for this
+ /// transition table. This is always less than or equal to `self.stride()`.
+ /// It is only equal when the alphabet length is a power of 2. Otherwise,
+ /// it is always strictly less.
+ fn alphabet_len(&self) -> usize {
+ self.classes.alphabet_len()
+ }
+
+ /// Returns true if and only if the given state ID is valid for this
+ /// transition table. Validity in this context means that the given ID can
+ /// be used as a valid offset with `self.stride()` to index this transition
+ /// table.
+ ///
+ /// Concretely, the ID must be within the bounds of the table and must be
+ /// a multiple of the stride, i.e., it must point to the start of a state.
+ fn is_valid(&self, id: StateID) -> bool {
+ let id = id.as_usize();
+ id < self.table().len() && id % self.stride() == 0
+ }
+
+ /// Return the memory usage, in bytes, of this transition table.
+ ///
+ /// This does not include the size of a `TransitionTable` value itself.
+ /// Only the transitions are counted.
+ fn memory_usage(&self) -> usize {
+ self.table().len() * StateID::SIZE
+ }
+ }
+
+ #[cfg(feature = "dfa-build")]
+ impl<T: AsMut<[u32]>> TransitionTable<T> {
+ /// Returns the table as a mutable slice of state IDs.
+ ///
+ /// The underlying `u32` storage is viewed as `StateID`s via
+ /// `wire::u32s_to_state_ids_mut`.
+ fn table_mut(&mut self) -> &mut [StateID] {
+ wire::u32s_to_state_ids_mut(self.table.as_mut())
+ }
+ }
+
+ /// The set of all possible starting states in a DFA.
+ ///
+ /// The set of starting states corresponds to the possible choices one can make
+ /// in terms of starting a DFA. That is, before following the first transition,
+ /// you first need to select the state that you start in.
+ ///
+ /// Normally, a DFA converted from an NFA that has a single starting state
+ /// would itself just have one starting state. However, our support for look
+ /// around generally requires more starting states. The correct starting state
+ /// is chosen based on certain properties of the position at which we begin
+ /// our search.
+ ///
+ /// Before listing those properties, we first must define two terms:
+ ///
+ /// * `haystack` - The bytes to execute the search. The search always starts
+ /// at the beginning of `haystack` and ends before or at the end of
+ /// `haystack`.
+ /// * `context` - The (possibly empty) bytes surrounding `haystack`. `haystack`
+ /// must be contained within `context` such that `context` is at least as big
+ /// as `haystack`.
+ ///
+ /// This split is crucial for dealing with look-around. For example, consider
+ /// the context `foobarbaz`, the haystack `bar` and the regex `^bar$`. This
+ /// regex should _not_ match the haystack since `bar` does not appear at the
+ /// beginning of the input. Similarly, the regex `\Bbar\B` should match the
+ /// haystack because `bar` is not surrounded by word boundaries. But a search
+ /// that does not take context into account would not permit `\B` to match
+ /// since the beginning of any string matches a word boundary. Similarly, a
+ /// search that does not take context into account when searching `^bar$` in
+ /// the haystack `bar` would produce a match when it shouldn't.
+ ///
+ /// Thus, it follows that the starting state is chosen based on the following
+ /// criteria, derived from the position at which the search starts in the
+ /// `context` (corresponding to the start of `haystack`):
+ ///
+ /// 1. If the search starts at the beginning of `context`, then the `Text`
+ /// start state is used. (Since `^` corresponds to
+ /// `hir::Anchor::Start`.)
+ /// 2. If the search starts at a position immediately following a line
+ /// terminator, then the `Line` start state is used. (Since `(?m:^)`
+ /// corresponds to `hir::Anchor::StartLF`.)
+ /// 3. If the search starts at a position immediately following a byte
+ /// classified as a "word" character (`[_0-9a-zA-Z]`), then the `WordByte`
+ /// start state is used. (Since `(?-u:\b)` corresponds to a word boundary.)
+ /// 4. Otherwise, if the search starts at a position immediately following
+ /// a byte that is not classified as a "word" character (`[^_0-9a-zA-Z]`),
+ /// then the `NonWordByte` start state is used. (Since `(?-u:\B)`
+ /// corresponds to a not-word-boundary.)
+ ///
+ /// (N.B. Unicode word boundaries are not supported by the DFA because they
+ /// require multi-byte look-around and this is difficult to support in a DFA.)
+ ///
+ /// To further complicate things, we also support constructing individual
+ /// anchored start states for each pattern in the DFA. (Which is required to
+ /// implement overlapping regexes correctly, but is also generally useful.)
+ /// Thus, when individual start states for each pattern are enabled, then the
+ /// total number of start states represented is `4 + (4 * #patterns)`, where
+ /// the 4 comes from each of the 4 possibilities above. The first 4 represents
+ /// the starting states for the entire DFA, which support searching for
+ /// multiple patterns simultaneously (possibly unanchored).
+ ///
+ /// If individual start states are disabled, then this will only store 4
+ /// start states. Typically, individual start states are only enabled when
+ /// constructing the reverse DFA for regex matching. But they are also useful
+ /// for building DFAs that can search for a specific pattern or even to support
+ /// both anchored and unanchored searches with the same DFA.
+ ///
+ /// Note though that while the start table always has either `4` or
+ /// `4 + (4 * #patterns)` starting state *ids*, the total number of states
+ /// might be considerably smaller. That is, many of the IDs may be duplicative.
+ /// (For example, if a regex doesn't have a `\b` sub-pattern, then there's no
+ /// reason to generate a unique starting state for handling word boundaries.
+ /// Similarly for start/end anchors.)
+ ///
+ /// NOTE(review): The counts above (`4` and `4 + (4 * #patterns)`) do not
+ /// agree with the docs on the `table` field below, which say that the first
+ /// `2 * stride` entries (unanchored followed by anchored) are always present.
+ /// The field docs are presumably the more up to date of the two. TODO confirm.
+ #[derive(Clone)]
+ pub(crate) struct StartTable<T> {
+ /// The initial start state IDs.
+ ///
+ /// In practice, T is either `Vec<u32>` or `&[u32]`.
+ ///
+ /// The first `2 * stride` (currently always 8) entries always correspond
+ /// to the start states for the entire DFA, with the first 4 entries being
+ /// for unanchored searches and the second 4 entries being for anchored
+ /// searches. To keep things simple, we always use 8 entries even if the
+ /// `StartKind` is not both.
+ ///
+ /// After that, there are `stride * patterns` state IDs, where `patterns`
+ /// may be zero in the case of a DFA with no patterns or in the case where
+ /// the DFA was built without enabling starting states for each pattern.
+ table: T,
+ /// The starting state configuration supported. When 'both', both
+ /// unanchored and anchored searches work. When 'unanchored', anchored
+ /// searches panic. When 'anchored', unanchored searches panic.
+ kind: StartKind,
+ /// The start state configuration for every possible byte.
+ start_map: StartByteMap,
+ /// The number of starting state IDs per pattern.
+ stride: usize,
+ /// The total number of patterns for which starting states are encoded.
+ /// This is `None` for DFAs that were built without start states for each
+ /// pattern. Thus, one cannot use this field to say how many patterns
+ /// are in the DFA in all cases. It is specific to how many patterns are
+ /// represented in this start table.
+ pattern_len: Option<usize>,
+ /// The universal starting state for unanchored searches. This is only
+ /// present when the DFA supports unanchored searches and when all starting
+ /// state IDs for an unanchored search are equivalent.
+ universal_start_unanchored: Option<StateID>,
+ /// The universal starting state for anchored searches. This is only
+ /// present when the DFA supports anchored searches and when all starting
+ /// state IDs for an anchored search are equivalent.
+ universal_start_anchored: Option<StateID>,
+ }
+
+#[cfg(feature = "dfa-build")]
+impl StartTable<Vec<u32>> {
+ /// Create a valid set of start states all pointing to the dead state.
+ ///
+ /// When the corresponding DFA is constructed with start states for each
+ /// pattern, then `pattern_len` should be `Some` with the number of
+ /// patterns. Otherwise, it should be `None` (treated as zero patterns).
+ ///
+ /// If the total table size could exceed the allocatable limit, then this
+ /// returns an error. In practice, this is unlikely to be able to occur,
+ /// since it's likely that allocation would have failed long before it got
+ /// to this point.
+ fn dead(
+ kind: StartKind,
+ lookm: &LookMatcher,
+ pattern_len: Option<usize>,
+ ) -> Result<StartTable<Vec<u32>>, BuildError> {
+ if let Some(len) = pattern_len {
+ assert!(len <= PatternID::LIMIT);
+ }
+ let stride = Start::len();
+ // OK because 2*4 is never going to overflow anything.
+ let starts_len = stride.checked_mul(2).unwrap();
+ let pattern_starts_len =
+ match stride.checked_mul(pattern_len.unwrap_or(0)) {
+ Some(x) => x,
+ None => return Err(BuildError::too_many_start_states()),
+ };
+ let table_len = match starts_len.checked_add(pattern_starts_len) {
+ Some(x) => x,
+ None => return Err(BuildError::too_many_start_states()),
+ };
+ if let Err(_) = isize::try_from(table_len) {
+ return Err(BuildError::too_many_start_states());
+ }
+ let table = vec![DEAD.as_u32(); table_len];
+ let start_map = StartByteMap::new(lookm);
+ Ok(StartTable {
+ table,
+ kind,
+ start_map,
+ stride,
+ pattern_len,
+ universal_start_unanchored: None,
+ universal_start_anchored: None,
+ })
+ }
+}
+
+impl<'a> StartTable<&'a [u32]> {
+ /// Deserialize a table of start state IDs starting at the beginning of
+ /// `slice`. Upon success, return the total number of bytes read along with
+ /// the table of starting state IDs.
+ ///
+ /// If there was a problem deserializing any part of the starting IDs,
+ /// then this returns an error. Notably, if the given slice does not have
+ /// the same alignment as `StateID`, then this will return an error (among
+ /// other possible errors).
+ ///
+ /// This is guaranteed to execute in constant time.
+ ///
+ /// # Safety
+ ///
+ /// This routine is not safe because it does not check the validity of the
+ /// starting state IDs themselves. In particular, the number of starting
+ /// IDs can be of variable length, so it's possible that checking their
+ /// validity cannot be done in constant time. An invalid starting state
+ /// ID is not safe because other code may rely on the starting IDs being
+ /// correct (such as explicit bounds check elision). Therefore, an invalid
+ /// start ID can lead to undefined behavior.
+ ///
+ /// Callers that use this function must either pass on the safety invariant
+ /// or guarantee that the bytes given contain valid starting state IDs.
+ /// This guarantee is upheld by the bytes written by `write_to`.
+ unsafe fn from_bytes_unchecked(
+ mut slice: &'a [u8],
+ ) -> Result<(StartTable<&'a [u32]>, usize), DeserializeError> {
+ let slice_start = slice.as_ptr().as_usize();
+
+ let (kind, nr) = StartKind::from_bytes(slice)?;
+ slice = &slice[nr..];
+
+ let (start_map, nr) = StartByteMap::from_bytes(slice)?;
+ slice = &slice[nr..];
+
+ let (stride, nr) =
+ wire::try_read_u32_as_usize(slice, "start table stride")?;
+ slice = &slice[nr..];
+ if stride != Start::len() {
+ return Err(DeserializeError::generic(
+ "invalid starting table stride",
+ ));
+ }
+
+ let (maybe_pattern_len, nr) =
+ wire::try_read_u32_as_usize(slice, "start table patterns")?;
+ slice = &slice[nr..];
+ let pattern_len = if maybe_pattern_len.as_u32() == u32::MAX {
+ None
+ } else {
+ Some(maybe_pattern_len)
+ };
+ if pattern_len.map_or(false, |len| len > PatternID::LIMIT) {
+ return Err(DeserializeError::generic(
+ "invalid number of patterns",
+ ));
+ }
+
+ let (universal_unanchored, nr) =
+ wire::try_read_u32(slice, "universal unanchored start")?;
+ slice = &slice[nr..];
+ let universal_start_unanchored = if universal_unanchored == u32::MAX {
+ None
+ } else {
+ Some(StateID::try_from(universal_unanchored).map_err(|e| {
+ DeserializeError::state_id_error(
+ e,
+ "universal unanchored start",
+ )
+ })?)
+ };
+
+ let (universal_anchored, nr) =
+ wire::try_read_u32(slice, "universal anchored start")?;
+ slice = &slice[nr..];
+ let universal_start_anchored = if universal_anchored == u32::MAX {
+ None
+ } else {
+ Some(StateID::try_from(universal_anchored).map_err(|e| {
+ DeserializeError::state_id_error(e, "universal anchored start")
+ })?)
+ };
+
+ let pattern_table_size = wire::mul(
+ stride,
+ pattern_len.unwrap_or(0),
+ "invalid pattern length",
+ )?;
+ // Our start table always begins with two strides of start states for
+ // the entire automaton. The first stride is for unanchored starting
+ // states and the second stride is for anchored starting states. What
+ // follows it are an optional set of start states for each pattern.
+ let start_state_len = wire::add(
+ wire::mul(2, stride, "start state stride too big")?,
+ pattern_table_size,
+ "invalid 'any' pattern starts size",
+ )?;
+ let table_bytes_len = wire::mul(
+ start_state_len,
+ StateID::SIZE,
+ "pattern table bytes length",
+ )?;
+ wire::check_slice_len(slice, table_bytes_len, "start ID table")?;
+ wire::check_alignment::<StateID>(slice)?;
+ let table_bytes = &slice[..table_bytes_len];
+ slice = &slice[table_bytes_len..];
+ // SAFETY: Since StateID is always representable as a u32, all we need
+ // to do is ensure that we have the proper length and alignment. We've
+ // checked both above, so the cast below is safe.
+ //
+ // N.B. This is the only not-safe code in this function.
+ let table = core::slice::from_raw_parts(
+ table_bytes.as_ptr().cast::<u32>(),
+ start_state_len,
+ );
+ let st = StartTable {
+ table,
+ kind,
+ start_map,
+ stride,
+ pattern_len,
+ universal_start_unanchored,
+ universal_start_anchored,
+ };
+ Ok((st, slice.as_ptr().as_usize() - slice_start))
+ }
+}
+
+impl<T: AsRef<[u32]>> StartTable<T> {
+ /// Writes a serialized form of this start table to the buffer given. If
+ /// the buffer is too small, then an error is returned. To determine how
+ /// big the buffer must be, use `write_to_len`.
+ fn write_to<E: Endian>(
+ &self,
+ mut dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ let nwrite = self.write_to_len();
+ if dst.len() < nwrite {
+ return Err(SerializeError::buffer_too_small(
+ "starting table ids",
+ ));
+ }
+ dst = &mut dst[..nwrite];
+
+ // write start kind
+ let nw = self.kind.write_to::<E>(dst)?;
+ dst = &mut dst[nw..];
+ // write start byte map
+ let nw = self.start_map.write_to(dst)?;
+ dst = &mut dst[nw..];
+ // write stride
+ // Unwrap is OK since the stride is always 4 (currently).
+ E::write_u32(u32::try_from(self.stride).unwrap(), dst);
+ dst = &mut dst[size_of::<u32>()..];
+ // write pattern length, u32::MAX if absent
+ // Unwrap is OK since number of patterns is guaranteed to fit in a u32.
+ E::write_u32(
+ u32::try_from(self.pattern_len.unwrap_or(0xFFFF_FFFF)).unwrap(),
+ dst,
+ );
+ dst = &mut dst[size_of::<u32>()..];
+ // write universal start unanchored state id, u32::MAX if absent
+ E::write_u32(
+ self.universal_start_unanchored
+ .map_or(u32::MAX, |sid| sid.as_u32()),
+ dst,
+ );
+ dst = &mut dst[size_of::<u32>()..];
+ // write universal start anchored state id, u32::MAX if absent
+ E::write_u32(
+ self.universal_start_anchored.map_or(u32::MAX, |sid| sid.as_u32()),
+ dst,
+ );
+ dst = &mut dst[size_of::<u32>()..];
+ // write start IDs
+ for &sid in self.table() {
+ let n = wire::write_state_id::<E>(sid, &mut dst);
+ dst = &mut dst[n..];
+ }
+ Ok(nwrite)
+ }
+
+ /// Returns the number of bytes the serialized form of this start ID table
+ /// will use. `write_to` writes exactly this many bytes on success.
+ fn write_to_len(&self) -> usize {
+ self.kind.write_to_len()
+ + self.start_map.write_to_len()
+ + size_of::<u32>() // stride
+ + size_of::<u32>() // # patterns
+ + size_of::<u32>() // universal unanchored start
+ + size_of::<u32>() // universal anchored start
+ + (self.table().len() * StateID::SIZE)
+ }
+
+ /// Validates that every state ID in this start table is valid by checking
+ /// it against the given transition table (which must be for the same DFA).
+ ///
+ /// That is, every state ID (including any universal start state ID) can be used to correctly index a state.
+ fn validate(
+ &self,
+ tt: &TransitionTable<T>,
+ ) -> Result<(), DeserializeError> {
+ if !self.universal_start_unanchored.map_or(true, |s| tt.is_valid(s)) {
+ return Err(DeserializeError::generic(
+ "found invalid universal unanchored starting state ID",
+ ));
+ }
+ if !self.universal_start_anchored.map_or(true, |s| tt.is_valid(s)) {
+ return Err(DeserializeError::generic(
+ "found invalid universal anchored starting state ID",
+ ));
+ }
+ for &id in self.table() {
+ if !tt.is_valid(id) {
+ return Err(DeserializeError::generic(
+ "found invalid starting state ID",
+ ));
+ }
+ }
+ Ok(())
+ }
+
+ /// Converts this start table to a borrowed value.
+ fn as_ref(&self) -> StartTable<&'_ [u32]> {
+ StartTable {
+ table: self.table.as_ref(),
+ kind: self.kind,
+ start_map: self.start_map.clone(),
+ stride: self.stride,
+ pattern_len: self.pattern_len,
+ universal_start_unanchored: self.universal_start_unanchored,
+ universal_start_anchored: self.universal_start_anchored,
+ }
+ }
+
+ /// Converts this start table to an owned value.
+ #[cfg(feature = "alloc")]
+ fn to_owned(&self) -> StartTable<alloc::vec::Vec<u32>> {
+ StartTable {
+ table: self.table.as_ref().to_vec(),
+ kind: self.kind,
+ start_map: self.start_map.clone(),
+ stride: self.stride,
+ pattern_len: self.pattern_len,
+ universal_start_unanchored: self.universal_start_unanchored,
+ universal_start_anchored: self.universal_start_anchored,
+ }
+ }
+
+ /// Return the start state for the given input and starting configuration.
+ /// This returns an error if the input configuration is not supported by
+ /// this DFA. For example, requesting an unanchored search when the DFA was
+ /// not built with unanchored starting states, or asking for an anchored
+ /// pattern search on a DFA that was not built with start states for each
+ /// pattern. A pattern ID that is out of range yields the dead state instead.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn start(
+ &self,
+ input: &Input<'_>,
+ start: Start,
+ ) -> Result<StateID, MatchError> {
+ let start_index = start.as_usize();
+ let mode = input.get_anchored();
+ let index = match mode {
+ Anchored::No => {
+ if !self.kind.has_unanchored() {
+ return Err(MatchError::unsupported_anchored(mode));
+ }
+ start_index
+ }
+ Anchored::Yes => {
+ if !self.kind.has_anchored() {
+ return Err(MatchError::unsupported_anchored(mode));
+ }
+ self.stride + start_index
+ }
+ Anchored::Pattern(pid) => {
+ let len = match self.pattern_len {
+ None => {
+ return Err(MatchError::unsupported_anchored(mode))
+ }
+ Some(len) => len,
+ };
+ if pid.as_usize() >= len {
+ return Ok(DEAD);
+ }
+ (2 * self.stride)
+ + (self.stride * pid.as_usize())
+ + start_index
+ }
+ };
+ Ok(self.table()[index])
+ }
+
+ /// Returns an iterator over all start state IDs in this table.
+ ///
+ /// Each item is a triple of: start state ID, the anchored mode (which
+ /// carries the pattern ID, if any) and the start state type.
+ fn iter(&self) -> StartStateIter<'_> {
+ StartStateIter { st: self.as_ref(), i: 0 }
+ }
+
+ /// Returns the table of start states as a slice of state IDs.
+ fn table(&self) -> &[StateID] {
+ wire::u32s_to_state_ids(self.table.as_ref())
+ }
+
+ /// Return the memory usage, in bytes, of this start table.
+ ///
+ /// This does not include the size of a `StartTable` value itself.
+ fn memory_usage(&self) -> usize {
+ self.table().len() * StateID::SIZE
+ }
+}
+
+#[cfg(feature = "dfa-build")]
+impl<T: AsMut<[u32]>> StartTable<T> {
+ /// Set the start state for the given anchored mode and start kind to `id`.
+ /// This panics if a pattern ID is given while per-pattern start states are
+ /// disabled, or if the pattern ID is out of range.
+ fn set_start(&mut self, anchored: Anchored, start: Start, id: StateID) {
+ let start_index = start.as_usize();
+ let index = match anchored {
+ Anchored::No => start_index,
+ Anchored::Yes => self.stride + start_index,
+ Anchored::Pattern(pid) => {
+ let pid = pid.as_usize();
+ let len = self
+ .pattern_len
+ .expect("start states for each pattern enabled");
+ assert!(pid < len, "invalid pattern ID {:?}", pid);
+ self.stride
+ .checked_mul(pid)
+ .unwrap()
+ .checked_add(self.stride.checked_mul(2).unwrap())
+ .unwrap()
+ .checked_add(start_index)
+ .unwrap()
+ }
+ };
+ self.table_mut()[index] = id;
+ }
+
+ /// Returns the table as a mutable slice of state IDs.
+ fn table_mut(&mut self) -> &mut [StateID] {
+ wire::u32s_to_state_ids_mut(self.table.as_mut())
+ }
+}
+
+/// An iterator over the start state IDs of a `StartTable`.
+///
+/// This iterator yields a triple of start state ID, the anchored mode and the
+/// start state type. If a pattern ID is relevant, then the anchored mode will
+/// contain it. Start states with an anchored mode containing a pattern ID will
+/// only occur when the DFA was compiled with start states for each pattern
+/// (which is disabled by default).
+pub(crate) struct StartStateIter<'a> {
+ st: StartTable<&'a [u32]>,
+ i: usize,
+}
+
+impl<'a> Iterator for StartStateIter<'a> {
+ type Item = (StateID, Anchored, Start);
+
+ fn next(&mut self) -> Option<(StateID, Anchored, Start)> {
+ let i = self.i;
+ let table = self.st.table();
+ if i >= table.len() {
+ return None;
+ }
+ self.i += 1;
+
+ // This unwrap is okay since the stride of the starting state table
+ // must always match the number of start state types (`Start::len()`).
+ let start_type = Start::from_usize(i % self.st.stride).unwrap();
+ let anchored = if i < self.st.stride {
+ Anchored::No
+ } else if i < (2 * self.st.stride) {
+ Anchored::Yes
+ } else {
+ let pid = (i - (2 * self.st.stride)) / self.st.stride;
+ Anchored::Pattern(PatternID::new(pid).unwrap())
+ };
+ Some((table[i], anchored, start_type))
+ }
+}
+
+/// This type represents the patterns that should be reported whenever a DFA
+/// enters a match state. This structure exists to support DFAs that search for
+/// matches for multiple regexes.
+///
+/// This structure relies on the fact that all match states in a DFA occur
+/// contiguously in the DFA's transition table. (See dfa/special.rs for a more
+/// detailed breakdown of the representation.) Namely, when a match occurs, we
+/// know its state ID. Since we know the start and end of the contiguous region
+/// of match states, we can use that to compute the position at which the match
+/// state occurs. That in turn is used as an offset into this structure.
+#[derive(Clone, Debug)]
+struct MatchStates<T> {
+ /// Slices is a flattened sequence of pairs, where each pair points to a
+ /// sub-slice of pattern_ids. The first element of the pair is an offset
+ /// into pattern_ids and the second element of the pair is the number
+ /// of 32-bit pattern IDs starting at that position. That is, each pair
+ /// corresponds to a single DFA match state and its corresponding match
+ /// IDs. The number of pairs always corresponds to the number of distinct
+ /// DFA match states.
+ ///
+ /// In practice, T is either `Vec<u32>` or `&[u32]`.
+ slices: T,
+ /// A flattened sequence of pattern IDs for each DFA match state. The only
+ /// way to correctly read this sequence is indirectly via `slices`.
+ ///
+ /// In practice, T is either `Vec<u32>` or `&[u32]`.
+ pattern_ids: T,
+ /// The total number of unique patterns represented by these match states.
+ pattern_len: usize,
+}
+
+impl<'a> MatchStates<&'a [u32]> {
+ unsafe fn from_bytes_unchecked(
+ mut slice: &'a [u8],
+ ) -> Result<(MatchStates<&'a [u32]>, usize), DeserializeError> {
+ let slice_start = slice.as_ptr().as_usize();
+
+ // Read the total number of match states.
+ let (state_len, nr) =
+ wire::try_read_u32_as_usize(slice, "match state length")?;
+ slice = &slice[nr..];
+
+ // Read the slice start/length pairs.
+ let pair_len = wire::mul(2, state_len, "match state offset pairs")?;
+ let slices_bytes_len = wire::mul(
+ pair_len,
+ PatternID::SIZE,
+ "match state slice offset byte length",
+ )?;
+ wire::check_slice_len(slice, slices_bytes_len, "match state slices")?;
+ wire::check_alignment::<PatternID>(slice)?;
+ let slices_bytes = &slice[..slices_bytes_len];
+ slice = &slice[slices_bytes_len..];
+ // SAFETY: Since PatternID is always representable as a u32, all we
+ // need to do is ensure that we have the proper length and alignment.
+ // We've checked both above, so the cast below is safe.
+ // (The values themselves are not range-checked here; see `validate`.)
+ // N.B. This is one of the few not-safe snippets in this function,
+ // so we mark it explicitly to call it out.
+ let slices = core::slice::from_raw_parts(
+ slices_bytes.as_ptr().cast::<u32>(),
+ pair_len,
+ );
+
+ // Read the total number of unique pattern IDs (which is always 1 more
+ // than the maximum pattern ID in this automaton, since pattern IDs are
+ // handed out contiguously starting at 0).
+ let (pattern_len, nr) =
+ wire::try_read_u32_as_usize(slice, "pattern length")?;
+ slice = &slice[nr..];
+
+ // Now read the pattern ID length. We don't need to store this
+ // explicitly, but we need it to know how many pattern IDs to read.
+ let (idlen, nr) =
+ wire::try_read_u32_as_usize(slice, "pattern ID length")?;
+ slice = &slice[nr..];
+
+ // Read the actual pattern IDs.
+ let pattern_ids_len =
+ wire::mul(idlen, PatternID::SIZE, "pattern ID byte length")?;
+ wire::check_slice_len(slice, pattern_ids_len, "match pattern IDs")?;
+ wire::check_alignment::<PatternID>(slice)?;
+ let pattern_ids_bytes = &slice[..pattern_ids_len];
+ slice = &slice[pattern_ids_len..];
+ // SAFETY: Since PatternID is always representable as a u32, all we
+ // need to do is ensure that we have the proper length and alignment.
+ // We've checked both above, so the cast below is safe.
+ //
+ // N.B. This is one of the few not-safe snippets in this function,
+ // so we mark it explicitly to call it out.
+ let pattern_ids = core::slice::from_raw_parts(
+ pattern_ids_bytes.as_ptr().cast::<u32>(),
+ idlen,
+ );
+
+ let ms = MatchStates { slices, pattern_ids, pattern_len };
+ Ok((ms, slice.as_ptr().as_usize() - slice_start))
+ }
+}
+
+#[cfg(feature = "dfa-build")]
+impl MatchStates<Vec<u32>> {
+ fn empty(pattern_len: usize) -> MatchStates<Vec<u32>> {
+ assert!(pattern_len <= PatternID::LIMIT);
+ MatchStates { slices: vec![], pattern_ids: vec![], pattern_len }
+ }
+
+ fn new(
+ matches: &BTreeMap<StateID, Vec<PatternID>>,
+ pattern_len: usize,
+ ) -> Result<MatchStates<Vec<u32>>, BuildError> {
+ let mut m = MatchStates::empty(pattern_len);
+ for (_, pids) in matches.iter() {
+ let start = PatternID::new(m.pattern_ids.len())
+ .map_err(|_| BuildError::too_many_match_pattern_ids())?;
+ m.slices.push(start.as_u32());
+ // This is always correct since the number of patterns in a single
+ // match state can never exceed the maximum number of allowable
+ // patterns. Why? Because a pattern can only appear once in a
+ // particular match state, by construction. (And since our pattern
+ // ID limit is one less than u32::MAX, we're guaranteed that the
+ // length fits in a u32.)
+ m.slices.push(u32::try_from(pids.len()).unwrap());
+ for &pid in pids {
+ m.pattern_ids.push(pid.as_u32());
+ }
+ }
+ m.pattern_len = pattern_len;
+ Ok(m)
+ }
+
+ fn new_with_map(
+ &self,
+ matches: &BTreeMap<StateID, Vec<PatternID>>,
+ ) -> Result<MatchStates<Vec<u32>>, BuildError> {
+ MatchStates::new(matches, self.pattern_len)
+ }
+}
+
+impl<T: AsRef<[u32]>> MatchStates<T> {
+ /// Writes a serialized form of these match states to the buffer given. If
+ /// the buffer is too small, then an error is returned. To determine how
+ /// big the buffer must be, use `write_to_len`.
+ fn write_to<E: Endian>(
+ &self,
+ mut dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ let nwrite = self.write_to_len();
+ if dst.len() < nwrite {
+ return Err(SerializeError::buffer_too_small("match states"));
+ }
+ dst = &mut dst[..nwrite];
+
+ // write match state length (the number of match states)
+ // Unwrap is OK since number of states is guaranteed to fit in a u32.
+ E::write_u32(u32::try_from(self.len()).unwrap(), dst);
+ dst = &mut dst[size_of::<u32>()..];
+
+ // write slice offset pairs
+ for &pid in self.slices() {
+ let n = wire::write_pattern_id::<E>(pid, &mut dst);
+ dst = &mut dst[n..];
+ }
+
+ // write unique pattern ID length
+ // Unwrap is OK since number of patterns is guaranteed to fit in a u32.
+ E::write_u32(u32::try_from(self.pattern_len).unwrap(), dst);
+ dst = &mut dst[size_of::<u32>()..];
+
+ // write pattern ID length
+ // Unwrap is OK since we check at construction (and deserialization)
+ // that the number of patterns is representable as a u32.
+ E::write_u32(u32::try_from(self.pattern_ids().len()).unwrap(), dst);
+ dst = &mut dst[size_of::<u32>()..];
+
+ // write pattern IDs
+ for &pid in self.pattern_ids() {
+ let n = wire::write_pattern_id::<E>(pid, &mut dst);
+ dst = &mut dst[n..];
+ }
+
+ Ok(nwrite)
+ }
+
+ /// Returns the number of bytes the serialized form of these match states
+ /// will use. `write_to` writes exactly this many bytes on success.
+ fn write_to_len(&self) -> usize {
+ size_of::<u32>() // match state length
+ + (self.slices().len() * PatternID::SIZE)
+ + size_of::<u32>() // unique pattern ID length
+ + size_of::<u32>() // pattern ID length
+ + (self.pattern_ids().len() * PatternID::SIZE)
+ }
+
+ /// Validates that the match state info is itself internally consistent and
+ /// consistent with the recorded match state region in the given DFA.
+ fn validate(&self, dfa: &DFA<T>) -> Result<(), DeserializeError> {
+ if self.len() != dfa.special.match_len(dfa.stride()) {
+ return Err(DeserializeError::generic(
+ "match state length mismatch",
+ ));
+ }
+ for si in 0..self.len() {
+ let start = self.slices()[si * 2].as_usize();
+ let len = self.slices()[si * 2 + 1].as_usize();
+ if start >= self.pattern_ids().len() {
+ return Err(DeserializeError::generic(
+ "invalid pattern ID start offset",
+ ));
+ }
+ if start + len > self.pattern_ids().len() {
+ return Err(DeserializeError::generic(
+ "invalid pattern ID length",
+ ));
+ }
+ for mi in 0..len {
+ let pid = self.pattern_id(si, mi);
+ if pid.as_usize() >= self.pattern_len {
+ return Err(DeserializeError::generic(
+ "invalid pattern ID",
+ ));
+ }
+ }
+ }
+ Ok(())
+ }
+
+ /// Converts these match states back into their map form. This is useful
+ /// when shuffling states, as the normal `MatchStates` representation is not
+ /// amenable to easy state swapping. But with this map, to swap id1 and
+ /// id2, all you need to do is:
+ ///
+ /// if let Some(pids) = map.remove(&id1) {
+ /// map.insert(id2, pids);
+ /// }
+ ///
+ /// Once shuffling is done, use `MatchStates::new` to convert back.
+ #[cfg(feature = "dfa-build")]
+ fn to_map(&self, dfa: &DFA<T>) -> BTreeMap<StateID, Vec<PatternID>> {
+ let mut map = BTreeMap::new();
+ for i in 0..self.len() {
+ let mut pids = vec![];
+ for j in 0..self.pattern_len(i) {
+ pids.push(self.pattern_id(i, j));
+ }
+ map.insert(self.match_state_id(dfa, i), pids);
+ }
+ map
+ }
+
+ /// Converts these match states to a borrowed value (borrowing both slices).
+ fn as_ref(&self) -> MatchStates<&'_ [u32]> {
+ MatchStates {
+ slices: self.slices.as_ref(),
+ pattern_ids: self.pattern_ids.as_ref(),
+ pattern_len: self.pattern_len,
+ }
+ }
+
+ /// Converts these match states to an owned value (copying both slices).
+ #[cfg(feature = "alloc")]
+ fn to_owned(&self) -> MatchStates<alloc::vec::Vec<u32>> {
+ MatchStates {
+ slices: self.slices.as_ref().to_vec(),
+ pattern_ids: self.pattern_ids.as_ref().to_vec(),
+ pattern_len: self.pattern_len,
+ }
+ }
+
+ /// Returns the match state ID given the match state index. (Where the
+ /// first match state corresponds to index 0.)
+ ///
+ /// This panics if there is no match state at the given index.
+ fn match_state_id(&self, dfa: &DFA<T>, index: usize) -> StateID {
+ assert!(dfa.special.matches(), "no match states to index");
+ // This is one of the places where we rely on the fact that match
+ // states are contiguous in the transition table. Namely, that the
+ // first match state ID always corresponds to dfa.special.min_match.
+ // From there, since we know the stride, we can compute the ID of any
+ // match state given its index.
+ let stride2 = u32::try_from(dfa.stride2()).unwrap();
+ let offset = index.checked_shl(stride2).unwrap();
+ let id = dfa.special.min_match.as_usize().checked_add(offset).unwrap();
+ let sid = StateID::new(id).unwrap();
+ assert!(dfa.is_match_state(sid));
+ sid
+ }
+
+ /// Returns the pattern ID at the given match index for the given match
+ /// state.
+ ///
+ /// The match state index is the state index minus the state index of the
+ /// first match state in the DFA.
+ ///
+ /// The match index is the index of the pattern ID for the given state.
+ /// The index must be less than `self.pattern_len(state_index)`, or this panics.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn pattern_id(&self, state_index: usize, match_index: usize) -> PatternID {
+ self.pattern_id_slice(state_index)[match_index]
+ }
+
+ /// Returns the number of pattern IDs recorded for the given match state.
+ ///
+ /// The match state index is the state index minus the state index of the
+ /// first match state in the DFA. This panics if the index is out of bounds.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn pattern_len(&self, state_index: usize) -> usize {
+ self.slices()[state_index * 2 + 1].as_usize()
+ }
+
+ /// Returns all of the pattern IDs for the given match state index.
+ ///
+ /// The match state index is the state index minus the state index of the
+ /// first match state in the DFA. This panics if the index is out of bounds.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn pattern_id_slice(&self, state_index: usize) -> &[PatternID] {
+ let start = self.slices()[state_index * 2].as_usize();
+ let len = self.pattern_len(state_index);
+ &self.pattern_ids()[start..start + len]
+ }
+
+ /// Returns the offset/length pairs, stored as `u32`, as `PatternID`s.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn slices(&self) -> &[PatternID] {
+ wire::u32s_to_pattern_ids(self.slices.as_ref())
+ }
+
+ /// Returns the total number of match states (one per pair in `slices`).
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn len(&self) -> usize {
+ assert_eq!(0, self.slices().len() % 2);
+ self.slices().len() / 2
+ }
+
+ /// Returns the flattened pattern IDs, stored as `u32`, as `PatternID`s.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn pattern_ids(&self) -> &[PatternID] {
+ wire::u32s_to_pattern_ids(self.pattern_ids.as_ref())
+ }
+
+ /// Return the memory usage, in bytes, of the offset pairs and pattern IDs.
+ fn memory_usage(&self) -> usize {
+ (self.slices().len() + self.pattern_ids().len()) * PatternID::SIZE
+ }
+}
+
+/// A common set of flags for both dense and sparse DFAs. This primarily
+/// centralizes the serialization format of these flags as a bitset.
+#[derive(Clone, Copy, Debug)]
+pub(crate) struct Flags {
+ /// Whether the DFA can match the empty string. When this is false, all
+ /// matches returned by this DFA are guaranteed to have non-zero length.
+ pub(crate) has_empty: bool,
+ /// Whether the DFA should only produce matches with spans that correspond
+ /// to valid UTF-8. This also includes omitting any zero-width matches that
+ /// split the UTF-8 encoding of a codepoint.
+ pub(crate) is_utf8: bool,
+ /// Whether the DFA is always anchored or not, regardless of `Input`
+ /// configuration. This is useful for avoiding a reverse scan even when
+ /// executing unanchored searches.
+ pub(crate) is_always_start_anchored: bool,
+}
+
+impl Flags {
+ /// Creates a set of flags for a DFA from an NFA.
+ ///
+ /// N.B. This constructor was defined at the time of writing because all
+ /// of the flags are derived directly from the NFA. If this changes in the
+ /// future, we might be more thoughtful about how the `Flags` value is
+ /// itself built.
+ #[cfg(feature = "dfa-build")]
+ fn from_nfa(nfa: &thompson::NFA) -> Flags {
+ Flags {
+ has_empty: nfa.has_empty(),
+ is_utf8: nfa.is_utf8(),
+ is_always_start_anchored: nfa.is_always_start_anchored(),
+ }
+ }
+
+ /// Deserializes the flags from a u32 bitset at the start of the given slice.
+ /// On success, this also returns the number of bytes read from the slice.
+ pub(crate) fn from_bytes(
+ slice: &[u8],
+ ) -> Result<(Flags, usize), DeserializeError> {
+ let (bits, nread) = wire::try_read_u32(slice, "flag bitset")?;
+ let flags = Flags {
+ has_empty: bits & (1 << 0) != 0,
+ is_utf8: bits & (1 << 1) != 0,
+ is_always_start_anchored: bits & (1 << 2) != 0,
+ };
+ Ok((flags, nread))
+ }
+
+ /// Writes these flags to the given byte slice. If the buffer is too small,
+ /// then an error is returned. To determine how big the buffer must be,
+ /// use `write_to_len`.
+ pub(crate) fn write_to<E: Endian>(
+ &self,
+ dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ fn bool_to_int(b: bool) -> u32 {
+ if b {
+ 1
+ } else {
+ 0
+ }
+ }
+
+ let nwrite = self.write_to_len();
+ if dst.len() < nwrite {
+ return Err(SerializeError::buffer_too_small("flag bitset"));
+ }
+ let bits = (bool_to_int(self.has_empty) << 0)
+ | (bool_to_int(self.is_utf8) << 1)
+ | (bool_to_int(self.is_always_start_anchored) << 2);
+ E::write_u32(bits, dst);
+ Ok(nwrite)
+ }
+
+ /// Returns the number of bytes the serialized form of these flags
+ /// will use.
+ pub(crate) fn write_to_len(&self) -> usize {
+ size_of::<u32>()
+ }
+}
+
+/// An iterator over all states in a DFA.
+///
+/// This iterator yields a `State` for each state. The state's identifier
+/// is available via `State::id` and its transitions are available via
+/// `State::transitions`.
+///
+/// `'a` corresponds to the lifetime of original DFA, `T` corresponds to
+/// the type of the transition table itself.
+pub(crate) struct StateIter<'a, T> {
+ tt: &'a TransitionTable<T>,
+ it: iter::Enumerate<slice::Chunks<'a, StateID>>,
+}
+
+impl<'a, T: AsRef<[u32]>> Iterator for StateIter<'a, T> {
+ type Item = State<'a>;
+
+ fn next(&mut self) -> Option<State<'a>> {
+ self.it.next().map(|(index, _)| {
+ let id = self.tt.to_state_id(index);
+ self.tt.state(id)
+ })
+ }
+}
+
+/// An immutable representation of a single DFA state.
+///
+/// `'a` corresponds to the lifetime of a DFA's transition table.
+pub(crate) struct State<'a> {
+ id: StateID,
+ stride2: usize,
+ transitions: &'a [StateID],
+}
+
+impl<'a> State<'a> {
+ /// Return an iterator over all transitions in this state. This yields
+ /// a number of transitions equivalent to the alphabet length of the
+ /// corresponding DFA.
+ ///
+ /// Each transition is represented by a tuple. The first element is
+ /// the input byte for that transition and the second element is the
+ /// transition itself.
+ pub(crate) fn transitions(&self) -> StateTransitionIter<'_> {
+ StateTransitionIter {
+ len: self.transitions.len(),
+ it: self.transitions.iter().enumerate(),
+ }
+ }
+
+ /// Return an iterator over a sparse representation of the transitions in
+ /// this state. Only non-dead transitions are returned.
+ ///
+ /// The "sparse" representation in this case corresponds to a sequence of
+ /// triples. The first two elements of the triple comprise an inclusive
+ /// byte range while the last element corresponds to the transition taken
+ /// for all bytes in the range.
+ ///
+ /// This is somewhat more condensed than the classical sparse
+ /// representation (where you have an element for every non-dead
+ /// transition), but in practice, checking if a byte is in a range is very
+ /// cheap and using ranges tends to conserve quite a bit more space.
+ pub(crate) fn sparse_transitions(&self) -> StateSparseTransitionIter<'_> {
+ StateSparseTransitionIter { dense: self.transitions(), cur: None }
+ }
+
+ /// Returns the identifier for this state.
+ pub(crate) fn id(&self) -> StateID {
+ self.id
+ }
+
+ /// Analyzes this state to determine whether it can be accelerated. If so,
+ /// it returns an accelerator that contains at least one byte.
+ #[cfg(feature = "dfa-build")]
+ fn accelerate(&self, classes: &ByteClasses) -> Option<Accel> {
+ // We just try to add bytes to our accelerator. Once adding fails
+ // (because we've added too many bytes), then give up.
+ let mut accel = Accel::new();
+ for (class, id) in self.transitions() {
+ if id == self.id() {
+ continue;
+ }
+ for unit in classes.elements(class) {
+ if let Some(byte) = unit.as_u8() {
+ if !accel.add(byte) {
+ return None;
+ }
+ }
+ }
+ }
+ if accel.is_empty() {
+ None
+ } else {
+ Some(accel)
+ }
+ }
+}
+
+impl<'a> fmt::Debug for State<'a> {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ for (i, (start, end, sid)) in self.sparse_transitions().enumerate() {
+ let id = if f.alternate() {
+ sid.as_usize()
+ } else {
+ sid.as_usize() >> self.stride2
+ };
+ if i > 0 {
+ write!(f, ", ")?;
+ }
+ if start == end {
+ write!(f, "{:?} => {:?}", start, id)?;
+ } else {
+ write!(f, "{:?}-{:?} => {:?}", start, end, id)?;
+ }
+ }
+ Ok(())
+ }
+}
+
+/// An iterator over all transitions in a single DFA state. This yields
+/// a number of transitions equivalent to the alphabet length of the
+/// corresponding DFA.
+///
+/// Each transition is represented by a tuple. The first element is the input
+/// byte for that transition and the second element is the transition itself.
+#[derive(Debug)]
+pub(crate) struct StateTransitionIter<'a> {
+ len: usize,
+ it: iter::Enumerate<slice::Iter<'a, StateID>>,
+}
+
+impl<'a> Iterator for StateTransitionIter<'a> {
+ type Item = (alphabet::Unit, StateID);
+
+ fn next(&mut self) -> Option<(alphabet::Unit, StateID)> {
+ self.it.next().map(|(i, &id)| {
+ let unit = if i + 1 == self.len {
+ alphabet::Unit::eoi(i)
+ } else {
+ let b = u8::try_from(i)
+ .expect("raw byte alphabet is never exceeded");
+ alphabet::Unit::u8(b)
+ };
+ (unit, id)
+ })
+ }
+}
+
+/// An iterator over all non-DEAD transitions in a single DFA state using a
+/// sparse representation.
+///
+/// Each transition is represented by a triple. The first two elements of the
+/// triple comprise an inclusive byte range while the last element corresponds
+/// to the transition taken for all bytes in the range.
+///
+/// As a convenience, this always returns `alphabet::Unit` values of the same
+/// type. That is, you'll never get a (byte, EOI) or a (EOI, byte). Only (byte,
+/// byte) and (EOI, EOI) values are yielded.
+#[derive(Debug)]
+pub(crate) struct StateSparseTransitionIter<'a> {
+ dense: StateTransitionIter<'a>,
+ cur: Option<(alphabet::Unit, alphabet::Unit, StateID)>,
+}
+
+impl<'a> Iterator for StateSparseTransitionIter<'a> {
+ type Item = (alphabet::Unit, alphabet::Unit, StateID);
+
+ fn next(&mut self) -> Option<(alphabet::Unit, alphabet::Unit, StateID)> {
+ while let Some((unit, next)) = self.dense.next() {
+ let (prev_start, prev_end, prev_next) = match self.cur {
+ Some(t) => t,
+ None => {
+ self.cur = Some((unit, unit, next));
+ continue;
+ }
+ };
+ if prev_next == next && !unit.is_eoi() {
+ self.cur = Some((prev_start, unit, prev_next));
+ } else {
+ self.cur = Some((unit, unit, next));
+ if prev_next != DEAD {
+ return Some((prev_start, prev_end, prev_next));
+ }
+ }
+ }
+ if let Some((start, end, next)) = self.cur.take() {
+ if next != DEAD {
+ return Some((start, end, next));
+ }
+ }
+ None
+ }
+}
+
+/// An error that occurred during the construction of a DFA.
+///
+/// This error does not provide many introspection capabilities. There are
+/// generally only two things you can do with it:
+///
+/// * Obtain a human readable message via its `std::fmt::Display` impl.
+/// * Access an underlying [`nfa::thompson::BuildError`](thompson::BuildError)
+/// type from its `source` method via the `std::error::Error` trait. This error
+/// only occurs when using convenience routines for building a DFA directly
+/// from a pattern string.
+///
+/// When the `std` feature is enabled, this implements the `std::error::Error`
+/// trait.
+#[cfg(feature = "dfa-build")]
+#[derive(Clone, Debug)]
+pub struct BuildError {
+ kind: BuildErrorKind,
+}
+
+/// The kind of error that occurred during the construction of a DFA.
+///
+/// Note that this error is non-exhaustive. Adding new variants is not
+/// considered a breaking change.
+#[cfg(feature = "dfa-build")]
+#[derive(Clone, Debug)]
+enum BuildErrorKind {
+ /// An error that occurred while constructing an NFA as a precursor step
+ /// before a DFA is compiled.
+ NFA(thompson::BuildError),
+ /// An error that occurred because an unsupported regex feature was used.
+ /// The message string describes which unsupported feature was used.
+ ///
+ /// The primary regex feature that is unsupported by DFAs is the Unicode
+ /// word boundary look-around assertion (`\b`). This can be worked around
+ /// by either using an ASCII word boundary (`(?-u:\b)`) or by enabling
+ /// Unicode word boundaries when building a DFA.
+ Unsupported(&'static str),
+ /// An error that occurs if too many states are produced while building a
+ /// DFA.
+ TooManyStates,
+ /// An error that occurs if too many start states are needed while building
+ /// a DFA.
+ ///
+ /// This is a kind of oddball error that occurs when building a DFA with
+ /// start states enabled for each pattern and enough patterns to cause
+ /// the table of start states to overflow `usize`.
+ TooManyStartStates,
+ /// This is another oddball error that can occur if there are too many
+ /// patterns spread out across too many match states.
+ TooManyMatchPatternIDs,
+ /// An error that occurs if the DFA got too big during determinization.
+ DFAExceededSizeLimit { limit: usize },
+ /// An error that occurs if auxiliary storage (not the DFA) used during
+ /// determinization got too big.
+ DeterminizeExceededSizeLimit { limit: usize },
+}
+
+#[cfg(feature = "dfa-build")]
+impl BuildError {
+ /// Return the kind of this error.
+ fn kind(&self) -> &BuildErrorKind {
+ &self.kind
+ }
+
+ pub(crate) fn nfa(err: thompson::BuildError) -> BuildError {
+ BuildError { kind: BuildErrorKind::NFA(err) }
+ }
+
+ pub(crate) fn unsupported_dfa_word_boundary_unicode() -> BuildError {
+ let msg = "cannot build DFAs for regexes with Unicode word \
+ boundaries; switch to ASCII word boundaries, or \
+ heuristically enable Unicode word boundaries or use a \
+ different regex engine";
+ BuildError { kind: BuildErrorKind::Unsupported(msg) }
+ }
+
+ pub(crate) fn too_many_states() -> BuildError {
+ BuildError { kind: BuildErrorKind::TooManyStates }
+ }
+
+ pub(crate) fn too_many_start_states() -> BuildError {
+ BuildError { kind: BuildErrorKind::TooManyStartStates }
+ }
+
+ pub(crate) fn too_many_match_pattern_ids() -> BuildError {
+ BuildError { kind: BuildErrorKind::TooManyMatchPatternIDs }
+ }
+
+ pub(crate) fn dfa_exceeded_size_limit(limit: usize) -> BuildError {
+ BuildError { kind: BuildErrorKind::DFAExceededSizeLimit { limit } }
+ }
+
+ pub(crate) fn determinize_exceeded_size_limit(limit: usize) -> BuildError {
+ BuildError {
+ kind: BuildErrorKind::DeterminizeExceededSizeLimit { limit },
+ }
+ }
+}
+
+#[cfg(all(feature = "std", feature = "dfa-build"))]
+impl std::error::Error for BuildError {
+ fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+ match self.kind() {
+ BuildErrorKind::NFA(ref err) => Some(err),
+ _ => None,
+ }
+ }
+}
+
+#[cfg(feature = "dfa-build")]
+impl core::fmt::Display for BuildError {
+ /// Writes a human readable, single-line description of this error. The
+ /// message is selected by the error's kind.
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ match self.kind() {
+ // The wrapped NFA error itself is not printed here; only a
+ // generic message is written.
+ BuildErrorKind::NFA(_) => write!(f, "error building NFA"),
+ BuildErrorKind::Unsupported(ref msg) => {
+ write!(f, "unsupported regex feature for DFAs: {}", msg)
+ }
+ BuildErrorKind::TooManyStates => write!(
+ f,
+ "number of DFA states exceeds limit of {}",
+ StateID::LIMIT,
+ ),
+ BuildErrorKind::TooManyStartStates => {
+ let stride = Start::len();
+ // The start table has `stride` entries for starting states for
+ // the entire DFA, and then `stride` entries for each pattern
+ // if start states for each pattern are enabled (which is the
+ // only way this error can occur). Thus, the total number of
+ // patterns that can fit in the table is `stride` less than
+ // what we can allocate.
+ let max = usize::try_from(core::isize::MAX).unwrap();
+ let limit = (max - stride) / stride;
+ // NOTE: the trailing `\` joins the two literal lines into one
+ // message, so a word must not be repeated across the break.
+ write!(
+ f,
+ "compiling DFA with start states exceeds pattern \
+ limit of {}",
+ limit,
+ )
+ }
+ BuildErrorKind::TooManyMatchPatternIDs => write!(
+ f,
+ "compiling DFA with total patterns in all match states \
+ exceeds limit of {}",
+ PatternID::LIMIT,
+ ),
+ BuildErrorKind::DFAExceededSizeLimit { limit } => write!(
+ f,
+ "DFA exceeded size limit of {:?} during determinization",
+ limit,
+ ),
+ BuildErrorKind::DeterminizeExceededSizeLimit { limit } => {
+ write!(f, "determinization exceeded size limit of {:?}", limit)
+ }
+ }
+ }
+}
+
+#[cfg(all(test, feature = "syntax", feature = "dfa-build"))]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn errors_with_unicode_word_boundary() {
+ let pattern = r"\b";
+ assert!(Builder::new().build(pattern).is_err());
+ }
+
+ #[test]
+ fn roundtrip_never_match() {
+ let dfa = DFA::never_match().unwrap();
+ let (buf, _) = dfa.to_bytes_native_endian();
+ let dfa: DFA<&[u32]> = DFA::from_bytes(&buf).unwrap().0;
+
+ assert_eq!(None, dfa.try_search_fwd(&Input::new("foo12345")).unwrap());
+ }
+
+ #[test]
+ fn roundtrip_always_match() {
+ use crate::HalfMatch;
+
+ let dfa = DFA::always_match().unwrap();
+ let (buf, _) = dfa.to_bytes_native_endian();
+ let dfa: DFA<&[u32]> = DFA::from_bytes(&buf).unwrap().0;
+
+ assert_eq!(
+ Some(HalfMatch::must(0, 0)),
+ dfa.try_search_fwd(&Input::new("foo12345")).unwrap()
+ );
+ }
+
+ // See the analogous test in src/hybrid/dfa.rs.
+ #[test]
+ fn heuristic_unicode_reverse() {
+ let dfa = DFA::builder()
+ .configure(DFA::config().unicode_word_boundary(true))
+ .thompson(thompson::Config::new().reverse(true))
+ .build(r"\b[0-9]+\b")
+ .unwrap();
+
+ let input = Input::new("β123").range(2..);
+ let expected = MatchError::quit(0xB2, 1);
+ let got = dfa.try_search_rev(&input);
+ assert_eq!(Err(expected), got);
+
+ let input = Input::new("123β").range(..3);
+ let expected = MatchError::quit(0xCE, 3);
+ let got = dfa.try_search_rev(&input);
+ assert_eq!(Err(expected), got);
+ }
+}
diff --git a/third_party/rust/regex-automata/src/dfa/determinize.rs b/third_party/rust/regex-automata/src/dfa/determinize.rs
new file mode 100644
index 0000000000..19f99f5d64
--- /dev/null
+++ b/third_party/rust/regex-automata/src/dfa/determinize.rs
@@ -0,0 +1,599 @@
+use alloc::{collections::BTreeMap, vec::Vec};
+
+use crate::{
+ dfa::{
+ dense::{self, BuildError},
+ DEAD,
+ },
+ nfa::thompson,
+ util::{
+ self,
+ alphabet::{self, ByteSet},
+ determinize::{State, StateBuilderEmpty, StateBuilderNFA},
+ primitives::{PatternID, StateID},
+ search::{Anchored, MatchKind},
+ sparse_set::SparseSets,
+ start::Start,
+ },
+};
+
+/// A builder for configuring and running a DFA determinizer.
+#[derive(Clone, Debug)]
+pub(crate) struct Config {
+ match_kind: MatchKind,
+ quit: ByteSet,
+ dfa_size_limit: Option<usize>,
+ determinize_size_limit: Option<usize>,
+}
+
+impl Config {
+ /// Create a new default config for a determinizer. The determinizer may be
+ /// configured before calling `run`.
+ pub fn new() -> Config {
+ Config {
+ match_kind: MatchKind::LeftmostFirst,
+ quit: ByteSet::empty(),
+ dfa_size_limit: None,
+ determinize_size_limit: None,
+ }
+ }
+
+ /// Run determinization on the given NFA and write the resulting DFA into
+ /// the one given. The DFA given should be initialized but otherwise empty.
+ /// "Initialized" means that it is setup to handle the NFA's byte classes,
+ /// number of patterns and whether to build start states for each pattern.
+ pub fn run(
+ &self,
+ nfa: &thompson::NFA,
+ dfa: &mut dense::OwnedDFA,
+ ) -> Result<(), BuildError> {
+ let dead = State::dead();
+ let quit = State::dead();
+ let mut cache = StateMap::default();
+ // We only insert the dead state here since its representation is
+ // identical to the quit state. And we never want anything pointing
+ // to the quit state other than specific transitions derived from the
+ // determinizer's configured "quit" bytes.
+ //
+ // We do put the quit state into 'builder_states' below. This ensures
+ // that a proper DFA state ID is allocated for it, and that no other
+ // DFA state uses the "location after the DEAD state." That is, it
+ // is assumed that the quit state is always the state immediately
+ // following the DEAD state.
+ cache.insert(dead.clone(), DEAD);
+
+ let runner = Runner {
+ config: self.clone(),
+ nfa,
+ dfa,
+ builder_states: alloc::vec![dead, quit],
+ cache,
+ memory_usage_state: 0,
+ sparses: SparseSets::new(nfa.states().len()),
+ stack: alloc::vec![],
+ scratch_state_builder: StateBuilderEmpty::new(),
+ };
+ runner.run()
+ }
+
+ /// The match semantics to use for determinization.
+ ///
+ /// MatchKind::All corresponds to the standard textbook construction.
+ /// All possible match states are represented in the DFA.
+ /// MatchKind::LeftmostFirst permits greediness and otherwise tries to
+ /// simulate the match semantics of backtracking regex engines. Namely,
+ /// only a subset of match states are built, and dead states are used to
+ /// stop searches with an unanchored prefix.
+ ///
+ /// The default is MatchKind::LeftmostFirst.
+ pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config {
+ self.match_kind = kind;
+ self
+ }
+
+ /// The set of bytes to use that will cause the DFA to enter a quit state,
+ /// stop searching and return an error. By default, this is empty.
+ pub fn quit(&mut self, set: ByteSet) -> &mut Config {
+ self.quit = set;
+ self
+ }
+
+ /// The limit, in bytes of the heap, that the DFA is permitted to use. This
+ /// does not include the auxiliary heap storage used by determinization.
+ pub fn dfa_size_limit(&mut self, bytes: Option<usize>) -> &mut Config {
+ self.dfa_size_limit = bytes;
+ self
+ }
+
+ /// The limit, in bytes of the heap, that determinization itself is allowed
+ /// to use. This does not include the size of the DFA being built.
+ pub fn determinize_size_limit(
+ &mut self,
+ bytes: Option<usize>,
+ ) -> &mut Config {
+ self.determinize_size_limit = bytes;
+ self
+ }
+}
+
+/// The actual implementation of determinization that converts an NFA to a DFA
+/// through powerset construction.
+///
+/// This determinizer roughly follows the typical powerset construction, where
+/// each DFA state is comprised of one or more NFA states. In the worst case,
+/// there is one DFA state for every possible combination of NFA states. In
+/// practice, this only happens in certain conditions, typically when there are
+/// bounded repetitions.
+///
+/// The main differences between this implementation and typical
+/// determinization are that this implementation delays matches by one state
+/// and hackily makes look-around work. Comments below attempt to explain this.
+///
+/// The lifetime variable `'a` refers to the lifetime of the NFA or DFA,
+/// whichever is shorter.
+#[derive(Debug)]
+struct Runner<'a> {
+ /// The configuration used to initialize determinization.
+ config: Config,
+ /// The NFA we're converting into a DFA.
+ nfa: &'a thompson::NFA,
+ /// The DFA we're building.
+ dfa: &'a mut dense::OwnedDFA,
+ /// Each DFA state being built is defined as an *ordered* set of NFA
+ /// states, along with some meta facts about the ordered set of NFA states.
+ ///
+ /// This is never empty. The first state is always a dummy state such that
+ /// a state id == 0 corresponds to a dead state. The second state is always
+ /// the quit state.
+ ///
+ /// Why do we have states in both a `Vec` and in a cache map below?
+ /// Well, they serve two different roles based on access patterns.
+ /// `builder_states` is the canonical home of each state, and provides
+ /// constant random access by a DFA state's ID. The cache map below, on
+ /// the other hand, provides a quick way of searching for identical DFA
+ /// states by using the DFA state as a key in the map. Of course, we use
+ /// reference counting to avoid actually duplicating the state's data
+ /// itself. (Although this has never been benchmarked.) Note that the cache
+ /// map does not give us full minimization; it just lets us avoid some very
+ /// obvious redundant states.
+ ///
+ /// Note that the index into this Vec isn't quite the DFA's state ID.
+ /// Rather, it's just an index. To get the state ID, you have to multiply
+ /// it by the DFA's stride. That's done by self.dfa.from_index. And the
+ /// inverse is self.dfa.to_index.
+ ///
+ /// Moreover, DFA states don't usually retain the IDs assigned to them
+ /// by their position in this Vec. After determinization completes,
+ /// states are shuffled around to support other optimizations. See the
+ /// sibling 'special' module for more details on that. (The reason for
+ /// mentioning this is that if you print out the DFA for debugging during
+ /// determinization, and then print out the final DFA after it is fully
+ /// built, then the state IDs likely won't match up.)
+ builder_states: Vec<State>,
+ /// A cache of DFA states that already exist and can be easily looked up
+ /// via ordered sets of NFA states.
+ ///
+ /// See `builder_states` docs for why we store states in two different
+ /// ways.
+ cache: StateMap,
+ /// The memory usage, in bytes, used by builder_states and cache. We track
+ /// this as new states are added since states use a variable amount of
+ /// heap. Tracking this as we add states makes it possible to compute the
+ /// total amount of memory used by the determinizer in constant time.
+ memory_usage_state: usize,
+ /// A pair of sparse sets for tracking ordered sets of NFA state IDs.
+ /// These are reused throughout determinization. A bounded sparse set
+ /// gives us constant time insertion, membership testing and clearing.
+ sparses: SparseSets,
+ /// Scratch space for a stack of NFA states to visit, for depth first
+ /// visiting without recursion.
+ stack: Vec<StateID>,
+ /// Scratch space for storing an ordered sequence of NFA states, for
+ /// amortizing allocation. This is principally useful for when we avoid
+ /// adding a new DFA state since it already exists. In order to detect this
+ /// case though, we still need an ordered set of NFA state IDs. So we use
+ /// this space to stage that ordered set before we know whether we need to
+ /// create a new DFA state or not.
+ scratch_state_builder: StateBuilderEmpty,
+}
+
+/// A map from states to state identifiers. When using std, we use a standard
+/// hashmap, since it's a bit faster for this use case. (Other maps, like
+/// ones based on FNV, have not yet been benchmarked.)
+///
+/// The main purpose of this map is to reuse states where possible. This won't
+/// fully minimize the DFA, but it works well in a lot of cases.
+#[cfg(feature = "std")]
+type StateMap = std::collections::HashMap<State, StateID>;
+#[cfg(not(feature = "std"))]
+type StateMap = BTreeMap<State, StateID>;
+
+impl<'a> Runner<'a> {
+ /// Build the DFA. If there was a problem constructing the DFA (e.g., if
+ /// the chosen state identifier representation is too small), then an error
+ /// is returned.
+ fn run(mut self) -> Result<(), BuildError> {
+ if self.nfa.look_set_any().contains_word_unicode()
+ && !self.config.quit.contains_range(0x80, 0xFF)
+ {
+ return Err(BuildError::unsupported_dfa_word_boundary_unicode());
+ }
+
+ // A sequence of "representative" bytes drawn from each equivalence
+ // class. These representative bytes are fed to the NFA to compute
+ // state transitions. This allows us to avoid re-computing state
+ // transitions for bytes that are guaranteed to produce identical
+ // results. Since computing the representatives needs to do a little
+ // work, we do it once here because we'll be iterating over them a lot.
+ let representatives: Vec<alphabet::Unit> =
+ self.dfa.byte_classes().representatives(..).collect();
+ // The set of all DFA state IDs that still need to have their
+ // transitions set. We start by seeding this with all starting states.
+ let mut uncompiled = alloc::vec![];
+ self.add_all_starts(&mut uncompiled)?;
+ while let Some(dfa_id) = uncompiled.pop() {
+ for &unit in &representatives {
+ if unit.as_u8().map_or(false, |b| self.config.quit.contains(b))
+ {
+ continue;
+ }
+ // In many cases, the state we transition to has already been
+ // computed. 'cached_state' will do the minimal amount of work
+ // to check this, and if it exists, immediately return an
+ // already existing state ID.
+ let (next_dfa_id, is_new) = self.cached_state(dfa_id, unit)?;
+ self.dfa.set_transition(dfa_id, unit, next_dfa_id);
+ // If the state ID we got back is newly created, then we need
+ // to compile it, so add it to our uncompiled frontier.
+ if is_new {
+ uncompiled.push(next_dfa_id);
+ }
+ }
+ }
+ debug!(
+ "determinization complete, memory usage: {}, \
+ dense DFA size: {}, \
+ is reverse? {}",
+ self.memory_usage(),
+ self.dfa.memory_usage(),
+ self.nfa.is_reverse(),
+ );
+
+ // A map from DFA state ID to one or more NFA match IDs. Each NFA match
+ // ID corresponds to a distinct regex pattern that matches in the state
+ // corresponding to the key.
+ let mut matches: BTreeMap<StateID, Vec<PatternID>> = BTreeMap::new();
+ self.cache.clear();
+ #[cfg(feature = "logging")]
+ let mut total_pat_len = 0;
+ for (i, state) in self.builder_states.into_iter().enumerate() {
+ if let Some(pat_ids) = state.match_pattern_ids() {
+ let id = self.dfa.to_state_id(i);
+ log! {
+ total_pat_len += pat_ids.len();
+ }
+ matches.insert(id, pat_ids);
+ }
+ }
+ log! {
+ use core::mem::size_of;
+ let per_elem = size_of::<StateID>() + size_of::<Vec<PatternID>>();
+ let pats = total_pat_len * size_of::<PatternID>();
+ let mem = (matches.len() * per_elem) + pats;
+ log::debug!("matches map built, memory usage: {}", mem);
+ }
+ // At this point, we shuffle the "special" states in the final DFA.
+ // This permits a DFA's match loop to detect a match condition (among
+ // other things) by merely inspecting the current state's identifier,
+ // and avoids the need for any additional auxiliary storage.
+ self.dfa.shuffle(matches)?;
+ Ok(())
+ }
+
+ /// Return the identifier for the next DFA state given an existing DFA
+ /// state and an input byte. If the next DFA state already exists, then
+ /// return its identifier from the cache. Otherwise, build the state, cache
+ /// it and return its identifier.
+ ///
+ /// This routine returns a boolean indicating whether a new state was
+ /// built. If a new state is built, then the caller needs to add it to its
+ /// frontier of uncompiled DFA states to compute transitions for.
+ fn cached_state(
+ &mut self,
+ dfa_id: StateID,
+ unit: alphabet::Unit,
+ ) -> Result<(StateID, bool), BuildError> {
+ // Compute the set of all reachable NFA states, including epsilons.
+ let empty_builder = self.get_state_builder();
+ let builder = util::determinize::next(
+ self.nfa,
+ self.config.match_kind,
+ &mut self.sparses,
+ &mut self.stack,
+ &self.builder_states[self.dfa.to_index(dfa_id)],
+ unit,
+ empty_builder,
+ );
+ self.maybe_add_state(builder)
+ }
+
+ /// Compute the set of DFA start states and add their identifiers in
+ /// 'dfa_state_ids' (no duplicates are added).
+ fn add_all_starts(
+ &mut self,
+ dfa_state_ids: &mut Vec<StateID>,
+ ) -> Result<(), BuildError> {
+ // These should be the first states added.
+ assert!(dfa_state_ids.is_empty());
+ // We only want to add (un)anchored starting states that are consistent
+ // with our DFA's configuration. Unconditionally adding both (although
+ // it is the default) can make DFAs quite a bit bigger.
+ if self.dfa.start_kind().has_unanchored() {
+ self.add_start_group(Anchored::No, dfa_state_ids)?;
+ }
+ if self.dfa.start_kind().has_anchored() {
+ self.add_start_group(Anchored::Yes, dfa_state_ids)?;
+ }
+ // I previously had an 'assert' here checking that either
+ // 'dfa_state_ids' was non-empty, or the NFA had zero patterns. But it
+ // turns out this isn't always true. For example, the NFA might have
+ // one or more patterns but where all such patterns are just 'fail'
+ // states. These will ultimately just compile down to DFA dead states,
+ // and since the dead state was added earlier, no new DFA states are
+ // added. And thus, it is valid and okay for 'dfa_state_ids' to be
+ // empty even if there are a non-zero number of patterns in the NFA.
+
+ // We only need to compute anchored start states for each pattern if it
+ // was requested to do so.
+ if self.dfa.starts_for_each_pattern() {
+ for pid in self.nfa.patterns() {
+ self.add_start_group(Anchored::Pattern(pid), dfa_state_ids)?;
+ }
+ }
+ Ok(())
+ }
+
+ /// Add a group of start states for the given match pattern ID. Any new
+ /// DFA states added are pushed on to 'dfa_state_ids'. (No duplicates are
+ /// pushed.)
+ ///
+ /// When `anchored` is `Anchored::No` or `Anchored::Yes`, this compiles a
+ /// group of unanchored or anchored start states, respectively. When it is
+ /// `Anchored::Pattern`, this compiles a group of anchored start states that
+ /// only match the given pattern.
+ ///
+ /// This panics if `anchored` corresponds to an invalid pattern ID.
+ fn add_start_group(
+ &mut self,
+ anchored: Anchored,
+ dfa_state_ids: &mut Vec<StateID>,
+ ) -> Result<(), BuildError> {
+ let nfa_start = match anchored {
+ Anchored::No => self.nfa.start_unanchored(),
+ Anchored::Yes => self.nfa.start_anchored(),
+ Anchored::Pattern(pid) => {
+ self.nfa.start_pattern(pid).expect("valid pattern ID")
+ }
+ };
+
+ // When compiling start states, we're careful not to build additional
+ // states that aren't necessary. For example, if the NFA has no word
+ // boundary assertion, then there's no reason to have distinct start
+ // states for 'NonWordByte' and 'WordByte' starting configurations.
+ // Instead, the 'WordByte' starting configuration can just point
+ // directly to the start state for the 'NonWordByte' config.
+ //
+ // Note though that we only need to care about assertions in the prefix
+ // of an NFA since this only concerns the starting states. (Actually,
+ // the most precise thing we could do is look at the prefix
+ // assertions of each pattern when 'anchored == Anchored::Pattern',
+ // and then only compile extra states if the prefix is non-empty.) But
+ // we settle for simplicity here instead of absolute minimalism. It is
+ // somewhat rare, after all, for multiple patterns in the same regex to
+ // have different prefix look-arounds.
+
+ let (id, is_new) =
+ self.add_one_start(nfa_start, Start::NonWordByte)?;
+ self.dfa.set_start_state(anchored, Start::NonWordByte, id);
+ if is_new {
+ dfa_state_ids.push(id);
+ }
+
+ if !self.nfa.look_set_prefix_any().contains_word() {
+ self.dfa.set_start_state(anchored, Start::WordByte, id);
+ } else {
+ let (id, is_new) =
+ self.add_one_start(nfa_start, Start::WordByte)?;
+ self.dfa.set_start_state(anchored, Start::WordByte, id);
+ if is_new {
+ dfa_state_ids.push(id);
+ }
+ }
+ if !self.nfa.look_set_prefix_any().contains_anchor() {
+ self.dfa.set_start_state(anchored, Start::Text, id);
+ self.dfa.set_start_state(anchored, Start::LineLF, id);
+ self.dfa.set_start_state(anchored, Start::LineCR, id);
+ self.dfa.set_start_state(
+ anchored,
+ Start::CustomLineTerminator,
+ id,
+ );
+ } else {
+ let (id, is_new) = self.add_one_start(nfa_start, Start::Text)?;
+ self.dfa.set_start_state(anchored, Start::Text, id);
+ if is_new {
+ dfa_state_ids.push(id);
+ }
+
+ let (id, is_new) = self.add_one_start(nfa_start, Start::LineLF)?;
+ self.dfa.set_start_state(anchored, Start::LineLF, id);
+ if is_new {
+ dfa_state_ids.push(id);
+ }
+
+ let (id, is_new) = self.add_one_start(nfa_start, Start::LineCR)?;
+ self.dfa.set_start_state(anchored, Start::LineCR, id);
+ if is_new {
+ dfa_state_ids.push(id);
+ }
+
+ let (id, is_new) =
+ self.add_one_start(nfa_start, Start::CustomLineTerminator)?;
+ self.dfa.set_start_state(
+ anchored,
+ Start::CustomLineTerminator,
+ id,
+ );
+ if is_new {
+ dfa_state_ids.push(id);
+ }
+ }
+
+ Ok(())
+ }
+
+ /// Add a new DFA start state corresponding to the given starting NFA
+ /// state, and the starting search configuration. (The starting search
+ /// configuration essentially tells us which look-behind assertions are
+ /// true for this particular state.)
+ ///
+ /// The boolean returned indicates whether the state ID returned is a newly
+ /// created state, or a previously cached state.
+ fn add_one_start(
+ &mut self,
+ nfa_start: StateID,
+ start: Start,
+ ) -> Result<(StateID, bool), BuildError> {
+ // Compute the look-behind assertions that are true in this starting
+ // configuration, and then determine the epsilon closure. While
+ // computing the epsilon closure, we only follow conditional epsilon
+ // transitions that satisfy the look-behind assertions in 'look_have'.
+ let mut builder_matches = self.get_state_builder().into_matches();
+ util::determinize::set_lookbehind_from_start(
+ self.nfa,
+ &start,
+ &mut builder_matches,
+ );
+ self.sparses.set1.clear();
+ util::determinize::epsilon_closure(
+ self.nfa,
+ nfa_start,
+ builder_matches.look_have(),
+ &mut self.stack,
+ &mut self.sparses.set1,
+ );
+ let mut builder = builder_matches.into_nfa();
+ util::determinize::add_nfa_states(
+ &self.nfa,
+ &self.sparses.set1,
+ &mut builder,
+ );
+ self.maybe_add_state(builder)
+ }
+
+ /// Adds the given state to the DFA being built depending on whether it
+ /// already exists in this determinizer's cache.
+ ///
+ /// If it does exist, then the memory used by 'state' is put back into the
+ /// determinizer and the previously created state's ID is returned. (Along
+ /// with 'false', indicating that no new state was added.)
+ ///
+ /// If it does not exist, then the state is added to the DFA being built
+ /// and a fresh ID is allocated (if ID allocation fails, then an error is
+ /// returned) and returned. (Along with 'true', indicating that a new state
+ /// was added.)
+ fn maybe_add_state(
+ &mut self,
+ builder: StateBuilderNFA,
+ ) -> Result<(StateID, bool), BuildError> {
+ if let Some(&cached_id) = self.cache.get(builder.as_bytes()) {
+ // Since we have a cached state, put the constructed state's
+ // memory back into our scratch space, so that it can be reused.
+ self.put_state_builder(builder);
+ return Ok((cached_id, false));
+ }
+ self.add_state(builder).map(|sid| (sid, true))
+ }
+
+ /// Add the given state to the DFA and make it available in the cache.
+ ///
+ /// The state initially has no transitions. That is, it transitions to the
+ /// dead state for all possible inputs, and transitions to the quit state
+ /// for all quit bytes.
+ ///
+ /// If adding the state would exceed the maximum value for StateID, then an
+ /// error is returned.
+ fn add_state(
+ &mut self,
+ builder: StateBuilderNFA,
+ ) -> Result<StateID, BuildError> {
+ let id = self.dfa.add_empty_state()?;
+ if !self.config.quit.is_empty() {
+ for b in self.config.quit.iter() {
+ self.dfa.set_transition(
+ id,
+ alphabet::Unit::u8(b),
+ self.dfa.quit_id(),
+ );
+ }
+ }
+ let state = builder.to_state();
+ // States use reference counting internally, so we only need to count
+ // their memory usage once.
+ self.memory_usage_state += state.memory_usage();
+ self.builder_states.push(state.clone());
+ self.cache.insert(state, id);
+ self.put_state_builder(builder);
+ if let Some(limit) = self.config.dfa_size_limit {
+ if self.dfa.memory_usage() > limit {
+ return Err(BuildError::dfa_exceeded_size_limit(limit));
+ }
+ }
+ if let Some(limit) = self.config.determinize_size_limit {
+ if self.memory_usage() > limit {
+ return Err(BuildError::determinize_exceeded_size_limit(
+ limit,
+ ));
+ }
+ }
+ Ok(id)
+ }
+
+ /// Returns a state builder from this determinizer that might have existing
+ /// capacity. This helps avoid allocs in cases where a state is built that
+ /// turns out to already be cached.
+ ///
+ /// Callers must put the state builder back with 'put_state_builder',
+ /// otherwise the allocation reuse won't work.
+ fn get_state_builder(&mut self) -> StateBuilderEmpty {
+ core::mem::replace(
+ &mut self.scratch_state_builder,
+ StateBuilderEmpty::new(),
+ )
+ }
+
+ /// Puts the given state builder back into this determinizer for reuse.
+ ///
+ /// Note that building a 'State' from a builder always creates a new
+ /// alloc, so callers should always put the builder back.
+ fn put_state_builder(&mut self, builder: StateBuilderNFA) {
+ let _ = core::mem::replace(
+ &mut self.scratch_state_builder,
+ builder.clear(),
+ );
+ }
+
+ /// Return the memory usage, in bytes, of this determinizer at the current
+ /// point in time. This does not include memory used by the NFA or the
+ /// dense DFA itself.
+ fn memory_usage(&self) -> usize {
+ use core::mem::size_of;
+
+ self.builder_states.len() * size_of::<State>()
+ // Maps likely use more memory than this, but it's probably close.
+ + self.cache.len() * (size_of::<State>() + size_of::<StateID>())
+ + self.memory_usage_state
+ + self.stack.capacity() * size_of::<StateID>()
+ + self.scratch_state_builder.capacity()
+ }
+}
diff --git a/third_party/rust/regex-automata/src/dfa/minimize.rs b/third_party/rust/regex-automata/src/dfa/minimize.rs
new file mode 100644
index 0000000000..fea925bdc6
--- /dev/null
+++ b/third_party/rust/regex-automata/src/dfa/minimize.rs
@@ -0,0 +1,463 @@
+use core::{cell::RefCell, fmt, mem};
+
+use alloc::{collections::BTreeMap, rc::Rc, vec, vec::Vec};
+
+use crate::{
+ dfa::{automaton::Automaton, dense, DEAD},
+ util::{
+ alphabet,
+ primitives::{PatternID, StateID},
+ },
+};
+
+/// An implementation of Hopcroft's algorithm for minimizing DFAs.
+///
+/// The algorithm implemented here is mostly taken from Wikipedia:
+/// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm
+///
+/// This code has had some light optimization attention paid to it,
+/// particularly in the form of reducing allocation as much as possible.
+/// However, it is still generally slow. Future optimization work should
+/// probably focus on the bigger picture rather than micro-optimizations. For
+/// example:
+///
+/// 1. Figure out how to more intelligently create initial partitions. That is,
+/// Hopcroft's algorithm starts by creating two partitions of DFA states
+/// that are known to NOT be equivalent: match states and non-match states.
+/// The algorithm proceeds by progressively refining these partitions into
+/// smaller partitions. If we could start with more partitions, then we
+/// could reduce the amount of work that Hopcroft's algorithm needs to do.
+/// 2. For every partition that we visit, we find all incoming transitions to
+/// every state in the partition for *every* element in the alphabet. (This
+/// is why using byte classes can significantly decrease minimization times,
+/// since byte classes shrink the alphabet.) This is quite costly and there
+/// is perhaps some redundant work being performed depending on the specific
+/// states in the set. For example, we might be able to only visit some
+/// elements of the alphabet based on the transitions.
+/// 3. Move parts of minimization into determinization. If minimization has
+/// fewer states to deal with, then it should run faster. A prime example
+/// of this might be large Unicode classes, which are generated in a way that
+/// can create a lot of redundant states. (Some work has been done on this
+/// point during NFA compilation via the algorithm described in the
+/// "Incremental Construction of Minimal Acyclic Finite-State Automata"
+/// paper.)
+pub(crate) struct Minimizer<'a> {
+ dfa: &'a mut dense::OwnedDFA,
+ in_transitions: Vec<Vec<Vec<StateID>>>,
+ partitions: Vec<StateSet>,
+ waiting: Vec<StateSet>,
+}
+
+impl<'a> fmt::Debug for Minimizer<'a> {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ f.debug_struct("Minimizer")
+ .field("dfa", &self.dfa)
+ .field("in_transitions", &self.in_transitions)
+ .field("partitions", &self.partitions)
+ .field("waiting", &self.waiting)
+ .finish()
+ }
+}
+
+/// A set of states. A state set makes up a single partition in Hopcroft's
+/// algorithm.
+///
+/// It is represented by an ordered set of state identifiers. We use shared
+/// ownership so that a single state set can be in both the set of partitions
+/// and in the set of waiting sets simultaneously without an additional
+/// allocation. Generally, once a state set is built, it becomes immutable.
+///
+/// We use this representation because it avoids the overhead of more
+/// traditional set data structures (HashSet/BTreeSet), and also because
+/// computing intersection/subtraction on this representation is especially
+/// fast.
+#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
+struct StateSet {
+ ids: Rc<RefCell<Vec<StateID>>>,
+}
+
+impl<'a> Minimizer<'a> {
+ pub fn new(dfa: &'a mut dense::OwnedDFA) -> Minimizer<'a> {
+ let in_transitions = Minimizer::incoming_transitions(dfa);
+ let partitions = Minimizer::initial_partitions(dfa);
+ let waiting = partitions.clone();
+ Minimizer { dfa, in_transitions, partitions, waiting }
+ }
+
+ pub fn run(mut self) {
+ let stride2 = self.dfa.stride2();
+ let as_state_id = |index: usize| -> StateID {
+ StateID::new(index << stride2).unwrap()
+ };
+ let as_index = |id: StateID| -> usize { id.as_usize() >> stride2 };
+
+ let mut incoming = StateSet::empty();
+ let mut scratch1 = StateSet::empty();
+ let mut scratch2 = StateSet::empty();
+ let mut newparts = vec![];
+
+ // This loop is basically Hopcroft's algorithm. Everything else is just
+ // shuffling data around to fit our representation.
+ while let Some(set) = self.waiting.pop() {
+ for b in self.dfa.byte_classes().iter() {
+ self.find_incoming_to(b, &set, &mut incoming);
+ // If incoming is empty, then the intersection with any other
+ // set must also be empty. So 'newparts' just ends up being
+ // 'self.partitions'. So there's no need to go through the loop
+ // below.
+ //
+ // This actually turns out to be a rather large optimization. On
+ // the order of making minimization 4-5x faster. It's likely
+ // that the vast majority of all states have very few incoming
+ // transitions.
+ if incoming.is_empty() {
+ continue;
+ }
+
+ for p in 0..self.partitions.len() {
+ self.partitions[p].intersection(&incoming, &mut scratch1);
+ if scratch1.is_empty() {
+ newparts.push(self.partitions[p].clone());
+ continue;
+ }
+
+ self.partitions[p].subtract(&incoming, &mut scratch2);
+ if scratch2.is_empty() {
+ newparts.push(self.partitions[p].clone());
+ continue;
+ }
+
+ let (x, y) =
+ (scratch1.deep_clone(), scratch2.deep_clone());
+ newparts.push(x.clone());
+ newparts.push(y.clone());
+ match self.find_waiting(&self.partitions[p]) {
+ Some(i) => {
+ self.waiting[i] = x;
+ self.waiting.push(y);
+ }
+ None => {
+ if x.len() <= y.len() {
+ self.waiting.push(x);
+ } else {
+ self.waiting.push(y);
+ }
+ }
+ }
+ }
+ newparts = mem::replace(&mut self.partitions, newparts);
+ newparts.clear();
+ }
+ }
+
+ // At this point, we now have a minimal partitioning of states, where
+ // each partition is an equivalence class of DFA states. Now we need to
+ // use this partitioning to update the DFA to only contain one state for
+ // each partition.
+
+ // Create a map from DFA state ID to the representative ID of the
+ // equivalence class to which it belongs. The representative ID of an
+ // equivalence class of states is the minimum ID in that class.
+ let mut state_to_part = vec![DEAD; self.dfa.state_len()];
+ for p in &self.partitions {
+ p.iter(|id| state_to_part[as_index(id)] = p.min());
+ }
+
+ // Generate a new contiguous sequence of IDs for minimal states, and
+ // create a map from equivalence IDs to the new IDs. Thus, the new
+ // minimal ID of *any* state in the unminimized DFA can be obtained
+ // with minimal_ids[state_to_part[old_id]].
+ let mut minimal_ids = vec![DEAD; self.dfa.state_len()];
+ let mut new_index = 0;
+ for state in self.dfa.states() {
+ if state_to_part[as_index(state.id())] == state.id() {
+ minimal_ids[as_index(state.id())] = as_state_id(new_index);
+ new_index += 1;
+ }
+ }
+ // The total number of states in the minimal DFA.
+ let minimal_count = new_index;
+ // Convenience function for remapping state IDs. This takes an old ID,
+ // looks up its Hopcroft partition and then maps that to the new ID
+ // range.
+ let remap = |old| minimal_ids[as_index(state_to_part[as_index(old)])];
+
+ // Re-map this DFA in place such that the only states remaining
+ // correspond to the representative states of every equivalence class.
+ for id in (0..self.dfa.state_len()).map(as_state_id) {
+ // If this state isn't a representative for an equivalence class,
+ // then we skip it since it won't appear in the minimal DFA.
+ if state_to_part[as_index(id)] != id {
+ continue;
+ }
+ self.dfa.remap_state(id, remap);
+ self.dfa.swap_states(id, minimal_ids[as_index(id)]);
+ }
+ // Trim off all unused states from the pre-minimized DFA. This
+ // represents all states that were merged into a non-singleton
+ // equivalence class of states, and appeared after the first state
+ // in each such class. (Because the state with the smallest ID in each
+ // equivalence class is its representative ID.)
+ self.dfa.truncate_states(minimal_count);
+
+ // Update the new start states, which is now just the minimal ID of
+ // whatever state the old start state was collapsed into. Also, we
+ // collect everything beforehand to work around the borrow checker.
+ // We're already allocating so much that this is probably fine. If this
+ // turns out to be costly, then I guess add a `starts_mut` iterator.
+ let starts: Vec<_> = self.dfa.starts().collect();
+ for (old_start_id, anchored, start_type) in starts {
+ self.dfa.set_start_state(
+ anchored,
+ start_type,
+ remap(old_start_id),
+ );
+ }
+
+ // Update the match state pattern ID list for multi-regexes. All we
+ // need to do is remap the match state IDs. The pattern ID lists are
+ // always the same as they were since match states with distinct
+ // pattern ID lists are always considered distinct states.
+ let mut pmap = BTreeMap::new();
+ for (match_id, pattern_ids) in self.dfa.pattern_map() {
+ let new_id = remap(match_id);
+ pmap.insert(new_id, pattern_ids);
+ }
+ // This unwrap is OK because minimization never increases the number of
+ // match states or patterns in those match states. Since minimization
+ // runs after the pattern map has already been set at least once, we
+ // know that our match states cannot error.
+ self.dfa.set_pattern_map(&pmap).unwrap();
+
+ // In order to update the ID of the maximum match state, we need to
+ // find the maximum ID among all of the match states in the minimized
+ // DFA. This is not necessarily the new ID of the unminimized maximum
+ // match state, since that could have been collapsed with a much
+ // earlier match state. Therefore, to find the new max match state,
+ // we iterate over all previous match states, find their corresponding
+ // new minimal ID, and take the maximum of those.
+ let old = self.dfa.special().clone();
+ let new = self.dfa.special_mut();
+ // ... but only remap if we had match states.
+ if old.matches() {
+ new.min_match = StateID::MAX;
+ new.max_match = StateID::ZERO;
+ for i in as_index(old.min_match)..=as_index(old.max_match) {
+ let new_id = remap(as_state_id(i));
+ if new_id < new.min_match {
+ new.min_match = new_id;
+ }
+ if new_id > new.max_match {
+ new.max_match = new_id;
+ }
+ }
+ }
+ // ... same, but for start states.
+ if old.starts() {
+ new.min_start = StateID::MAX;
+ new.max_start = StateID::ZERO;
+ for i in as_index(old.min_start)..=as_index(old.max_start) {
+ let new_id = remap(as_state_id(i));
+ if new_id == DEAD {
+ continue;
+ }
+ if new_id < new.min_start {
+ new.min_start = new_id;
+ }
+ if new_id > new.max_start {
+ new.max_start = new_id;
+ }
+ }
+ if new.max_start == DEAD {
+ new.min_start = DEAD;
+ }
+ }
+ new.quit_id = remap(new.quit_id);
+ new.set_max();
+ }
+
+ fn find_waiting(&self, set: &StateSet) -> Option<usize> {
+ self.waiting.iter().position(|s| s == set)
+ }
+
+ fn find_incoming_to(
+ &self,
+ b: alphabet::Unit,
+ set: &StateSet,
+ incoming: &mut StateSet,
+ ) {
+ incoming.clear();
+ set.iter(|id| {
+ for &inid in
+ &self.in_transitions[self.dfa.to_index(id)][b.as_usize()]
+ {
+ incoming.add(inid);
+ }
+ });
+ incoming.canonicalize();
+ }
+
+ fn initial_partitions(dfa: &dense::OwnedDFA) -> Vec<StateSet> {
+ // For match states, we know that two match states with different
+ // pattern ID lists will *always* be distinct, so we can partition them
+ // initially based on that.
+ let mut matching: BTreeMap<Vec<PatternID>, StateSet> = BTreeMap::new();
+ let mut is_quit = StateSet::empty();
+ let mut no_match = StateSet::empty();
+ for state in dfa.states() {
+ if dfa.is_match_state(state.id()) {
+ let mut pids = vec![];
+ for i in 0..dfa.match_len(state.id()) {
+ pids.push(dfa.match_pattern(state.id(), i));
+ }
+ matching
+ .entry(pids)
+ .or_insert(StateSet::empty())
+ .add(state.id());
+ } else if dfa.is_quit_state(state.id()) {
+ is_quit.add(state.id());
+ } else {
+ no_match.add(state.id());
+ }
+ }
+
+ let mut sets: Vec<StateSet> =
+ matching.into_iter().map(|(_, set)| set).collect();
+ sets.push(no_match);
+ sets.push(is_quit);
+ sets
+ }
+
+ fn incoming_transitions(dfa: &dense::OwnedDFA) -> Vec<Vec<Vec<StateID>>> {
+ let mut incoming = vec![];
+ for _ in dfa.states() {
+ incoming.push(vec![vec![]; dfa.alphabet_len()]);
+ }
+ for state in dfa.states() {
+ for (b, next) in state.transitions() {
+ incoming[dfa.to_index(next)][b.as_usize()].push(state.id());
+ }
+ }
+ incoming
+ }
+}
+
+impl StateSet {
+ fn empty() -> StateSet {
+ StateSet { ids: Rc::new(RefCell::new(vec![])) }
+ }
+
+ fn add(&mut self, id: StateID) {
+ self.ids.borrow_mut().push(id);
+ }
+
+ fn min(&self) -> StateID {
+ self.ids.borrow()[0]
+ }
+
+ fn canonicalize(&mut self) {
+ self.ids.borrow_mut().sort();
+ self.ids.borrow_mut().dedup();
+ }
+
+ fn clear(&mut self) {
+ self.ids.borrow_mut().clear();
+ }
+
+ fn len(&self) -> usize {
+ self.ids.borrow().len()
+ }
+
+ fn is_empty(&self) -> bool {
+ self.len() == 0
+ }
+
+ fn deep_clone(&self) -> StateSet {
+ let ids = self.ids.borrow().iter().cloned().collect();
+ StateSet { ids: Rc::new(RefCell::new(ids)) }
+ }
+
+ fn iter<F: FnMut(StateID)>(&self, mut f: F) {
+ for &id in self.ids.borrow().iter() {
+ f(id);
+ }
+ }
+
+ fn intersection(&self, other: &StateSet, dest: &mut StateSet) {
+ dest.clear();
+ if self.is_empty() || other.is_empty() {
+ return;
+ }
+
+ let (seta, setb) = (self.ids.borrow(), other.ids.borrow());
+ let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
+ let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
+ loop {
+ if a == b {
+ dest.add(a);
+ a = match ita.next() {
+ None => break,
+ Some(a) => a,
+ };
+ b = match itb.next() {
+ None => break,
+ Some(b) => b,
+ };
+ } else if a < b {
+ a = match ita.next() {
+ None => break,
+ Some(a) => a,
+ };
+ } else {
+ b = match itb.next() {
+ None => break,
+ Some(b) => b,
+ };
+ }
+ }
+ }
+
+ fn subtract(&self, other: &StateSet, dest: &mut StateSet) {
+ dest.clear();
+ if self.is_empty() || other.is_empty() {
+ self.iter(|s| dest.add(s));
+ return;
+ }
+
+ let (seta, setb) = (self.ids.borrow(), other.ids.borrow());
+ let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
+ let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
+ loop {
+ if a == b {
+ a = match ita.next() {
+ None => break,
+ Some(a) => a,
+ };
+ b = match itb.next() {
+ None => {
+ dest.add(a);
+ break;
+ }
+ Some(b) => b,
+ };
+ } else if a < b {
+ dest.add(a);
+ a = match ita.next() {
+ None => break,
+ Some(a) => a,
+ };
+ } else {
+ b = match itb.next() {
+ None => {
+ dest.add(a);
+ break;
+ }
+ Some(b) => b,
+ };
+ }
+ }
+ for a in ita {
+ dest.add(a);
+ }
+ }
+}
diff --git a/third_party/rust/regex-automata/src/dfa/mod.rs b/third_party/rust/regex-automata/src/dfa/mod.rs
new file mode 100644
index 0000000000..4bb8704352
--- /dev/null
+++ b/third_party/rust/regex-automata/src/dfa/mod.rs
@@ -0,0 +1,360 @@
+/*!
+A module for building and searching with deterministic finite automata (DFAs).
+
+Like other modules in this crate, DFAs support a rich regex syntax with Unicode
+features. DFAs also have extensive options for configuring the best space vs
+time trade off for your use case and provide support for cheap deserialization
+of automata for use in `no_std` environments.
+
+If you're looking for lazy DFAs that build themselves incrementally during
+search, then please see the top-level [`hybrid` module](crate::hybrid).
+
+# Overview
+
+This section gives a brief overview of the primary types in this module:
+
+* A [`regex::Regex`] provides a way to search for matches of a regular
+expression using DFAs. This includes iterating over matches with both the start
+and end positions of each match.
+* A [`dense::DFA`] provides low level access to a DFA that uses a dense
+representation (uses lots of space, but fast searching).
+* A [`sparse::DFA`] provides the same API as a `dense::DFA`, but uses a sparse
+representation (uses less space, but slower searching).
+* An [`Automaton`] trait that defines an interface that both dense and sparse
+DFAs implement. (A `regex::Regex` is generic over this trait.)
+* Both dense DFAs and sparse DFAs support serialization to raw bytes (e.g.,
+[`dense::DFA::to_bytes_little_endian`]) and cheap deserialization (e.g.,
+[`dense::DFA::from_bytes`]).
+
+There is also a [`onepass`] module that provides a [one-pass
+DFA](onepass::DFA). The unique advantage of this DFA is that, for the class
+of regexes it can be built with, it supports reporting the spans of matching
+capturing groups. It is the only DFA in this crate capable of such a thing.
+
+# Example: basic regex searching
+
+This example shows how to compile a regex using the default configuration
+and then use it to find matches in a byte string:
+
+```
+use regex_automata::{Match, dfa::regex::Regex};
+
+let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?;
+let text = b"2018-12-24 2016-10-08";
+let matches: Vec<Match> = re.find_iter(text).collect();
+assert_eq!(matches, vec![
+ Match::must(0, 0..10),
+ Match::must(0, 11..21),
+]);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+# Example: searching with regex sets
+
+The DFAs in this module all fully support searching with multiple regexes
+simultaneously. You can use this support with standard leftmost-first style
+searching to find non-overlapping matches:
+
+```
+# if cfg!(miri) { return Ok(()); } // miri takes too long
+use regex_automata::{Match, dfa::regex::Regex};
+
+let re = Regex::new_many(&[r"\w+", r"\S+"])?;
+let text = b"@foo bar";
+let matches: Vec<Match> = re.find_iter(text).collect();
+assert_eq!(matches, vec![
+ Match::must(1, 0..4),
+ Match::must(0, 5..8),
+]);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+# Example: use sparse DFAs
+
+By default, compiling a regex will use dense DFAs internally. This uses more
+memory, but executes searches more quickly. If you can abide slower searches
+(somewhere around 3-5x), then sparse DFAs might make more sense since they can
+use significantly less space.
+
+Using sparse DFAs is as easy as using `Regex::new_sparse` instead of
+`Regex::new`:
+
+```
+use regex_automata::{Match, dfa::regex::Regex};
+
+let re = Regex::new_sparse(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
+let text = b"2018-12-24 2016-10-08";
+let matches: Vec<Match> = re.find_iter(text).collect();
+assert_eq!(matches, vec![
+ Match::must(0, 0..10),
+ Match::must(0, 11..21),
+]);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+If you already have dense DFAs for some reason, they can be converted to sparse
+DFAs and used to build a new `Regex`. For example:
+
+```
+use regex_automata::{Match, dfa::regex::Regex};
+
+let dense_re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
+let sparse_re = Regex::builder().build_from_dfas(
+ dense_re.forward().to_sparse()?,
+ dense_re.reverse().to_sparse()?,
+);
+let text = b"2018-12-24 2016-10-08";
+let matches: Vec<Match> = sparse_re.find_iter(text).collect();
+assert_eq!(matches, vec![
+ Match::must(0, 0..10),
+ Match::must(0, 11..21),
+]);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+# Example: deserialize a DFA
+
+This shows how to first serialize a DFA into raw bytes, and then deserialize
+those raw bytes back into a DFA. While this particular example is a
+bit contrived, this same technique can be used in your program to
+deserialize a DFA at start up time or by memory mapping a file.
+
+```
+use regex_automata::{Match, dfa::{dense, regex::Regex}};
+
+let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
+// serialize both the forward and reverse DFAs, see note below
+let (fwd_bytes, fwd_pad) = re1.forward().to_bytes_native_endian();
+let (rev_bytes, rev_pad) = re1.reverse().to_bytes_native_endian();
+// now deserialize both---we need to specify the correct type!
+let fwd: dense::DFA<&[u32]> = dense::DFA::from_bytes(&fwd_bytes[fwd_pad..])?.0;
+let rev: dense::DFA<&[u32]> = dense::DFA::from_bytes(&rev_bytes[rev_pad..])?.0;
+// finally, reconstruct our regex
+let re2 = Regex::builder().build_from_dfas(fwd, rev);
+
+// we can use it like normal
+let text = b"2018-12-24 2016-10-08";
+let matches: Vec<Match> = re2.find_iter(text).collect();
+assert_eq!(matches, vec![
+ Match::must(0, 0..10),
+ Match::must(0, 11..21),
+]);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+There are a few points worth noting here:
+
+* We need to extract the raw DFAs used by the regex and serialize those. You
+can build the DFAs manually yourself using [`dense::Builder`], but using
+the DFAs from a `Regex` guarantees that the DFAs are built correctly. (In
+particular, a `Regex` constructs a reverse DFA for finding the starting
+location of matches.)
+* To convert the DFA to raw bytes, we use the `to_bytes_native_endian` method.
+In practice, you'll want to use either [`dense::DFA::to_bytes_little_endian`]
+or [`dense::DFA::to_bytes_big_endian`], depending on which platform you're
+deserializing your DFA from. If you intend to deserialize on either platform,
+then you'll need to serialize both and deserialize the right one depending on
+your target's endianness.
+* Safely deserializing a DFA requires verifying the raw bytes, particularly if
+they are untrusted, since an invalid DFA could cause logical errors, panics
+or even undefined behavior. This verification step requires visiting all of
+the transitions in the DFA, which can be costly. If cheaper verification is
+desired, then [`dense::DFA::from_bytes_unchecked`] is available that only does
+verification that can be performed in constant time. However, one can only use
+this routine if the caller can guarantee that the bytes provided encode a
+valid DFA.
+
+The same process can be achieved with sparse DFAs as well:
+
+```
+use regex_automata::{Match, dfa::{sparse, regex::Regex}};
+
+let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
+// serialize both
+let fwd_bytes = re1.forward().to_sparse()?.to_bytes_native_endian();
+let rev_bytes = re1.reverse().to_sparse()?.to_bytes_native_endian();
+// now deserialize both---we need to specify the correct type!
+let fwd: sparse::DFA<&[u8]> = sparse::DFA::from_bytes(&fwd_bytes)?.0;
+let rev: sparse::DFA<&[u8]> = sparse::DFA::from_bytes(&rev_bytes)?.0;
+// finally, reconstruct our regex
+let re2 = Regex::builder().build_from_dfas(fwd, rev);
+
+// we can use it like normal
+let text = b"2018-12-24 2016-10-08";
+let matches: Vec<Match> = re2.find_iter(text).collect();
+assert_eq!(matches, vec![
+ Match::must(0, 0..10),
+ Match::must(0, 11..21),
+]);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+Note that unlike dense DFAs, sparse DFAs have no alignment requirements.
+Conversely, dense DFAs must be aligned to the same alignment as a
+[`StateID`](crate::util::primitives::StateID).
+
+# Support for `no_std` and `alloc`-only
+
+This crate comes with `alloc` and `std` features that are enabled by default.
+When the `alloc` or `std` features are enabled, the API of this module will
+include the facilities necessary for compiling, serializing, deserializing
+and searching with DFAs. When only the `alloc` feature is enabled, then
+implementations of the `std::error::Error` trait are dropped, but everything
+else generally remains the same. When both the `alloc` and `std` features are
+disabled, the API of this module will shrink such that it only includes the
+facilities necessary for deserializing and searching with DFAs.
+
+The intended workflow for `no_std` environments is thus as follows:
+
+* Write a program with the `alloc` or `std` features that compiles and
+serializes a regular expression. You may need to serialize both little and big
+endian versions of each DFA. (So that's 4 DFAs in total for each regex.)
+* In your `no_std` environment, follow the examples above for deserializing
+your previously serialized DFAs into regexes. You can then search with them as
+you would any regex.
+
+Deserialization can happen anywhere. For example, with bytes embedded into a
+binary or with a file memory mapped at runtime.
+
+The `regex-cli` command (found in the same repository as this crate) can be
+used to serialize DFAs to files and generate Rust code to read them.
+
+# Syntax
+
+This module supports the same syntax as the `regex` crate, since they share the
+same parser. You can find an exhaustive list of supported syntax in the
+[documentation for the `regex` crate](https://docs.rs/regex/1/regex/#syntax).
+
+There are two things that are not supported by the DFAs in this module:
+
+* Capturing groups. The DFAs (and [`Regex`](regex::Regex)es built on top
+of them) can only find the offsets of an entire match, but cannot resolve
+the offsets of each capturing group. This is because DFAs do not have the
+expressive power necessary.
+* Unicode word boundaries. These present particularly difficult challenges for
+DFA construction and would result in an explosion in the number of states.
+One can enable [`dense::Config::unicode_word_boundary`] though, which provides
+heuristic support for Unicode word boundaries that only works on ASCII text.
+Otherwise, one can use `(?-u:\b)` for an ASCII word boundary, which will work
+on any input.
+
+There are no plans to lift either of these limitations.
+
+Note that these restrictions are identical to the restrictions on lazy DFAs.
+
+# Differences with general purpose regexes
+
+The main goal of the [`regex`](https://docs.rs/regex) crate is to serve as a
+general purpose regular expression engine. It aims to automatically balance low
+compile times, fast search times and low memory usage, while also providing
+a convenient API for users. In contrast, this module provides a lower level
+regular expression interface based exclusively on DFAs that is a bit less
+convenient while providing more explicit control over memory usage and search
+times.
+
+Here are some specific negative differences:
+
+* **Compilation can take an exponential amount of time and space** in the size
+of the regex pattern. While most patterns do not exhibit worst case exponential
+time, such patterns do exist. For example, `[01]*1[01]{N}` will build a DFA
+with approximately `2^(N+2)` states. For this reason, untrusted patterns should
+not be compiled with this module. (In the future, the API may expose an option
+to return an error if the DFA gets too big.)
+* This module does not support sub-match extraction via capturing groups, which
+can be achieved with the regex crate's "captures" API.
+* While the regex crate doesn't necessarily sport fast compilation times,
+the regexes in this module are almost universally slow to compile, especially
+when they contain large Unicode character classes. For example, on my system,
+compiling `\w{50}` takes about 1 second and almost 15MB of memory! (Compiling
+a sparse regex takes about the same time but only uses about 1.2MB of
+memory.) Conversely, compiling the same regex without Unicode support, e.g.,
+`(?-u)\w{50}`, takes under 1 millisecond and about 15KB of memory. For this
+reason, you should only use Unicode character classes if you absolutely need
+them! (They are enabled by default though.)
+* This module does not support Unicode word boundaries. ASCII word boundaries
+may be used though by disabling Unicode or selectively doing so in the syntax,
+e.g., `(?-u:\b)`. There is also an option to
+[heuristically enable Unicode word boundaries](crate::dfa::dense::Config::unicode_word_boundary),
+where the corresponding DFA will give up if any non-ASCII byte is seen.
+* As a lower level API, this module does not do literal optimizations
+automatically. Although it does provide hooks in its API to make use of the
+[`Prefilter`](crate::util::prefilter::Prefilter) trait. Missing literal
+optimizations means that searches may run much slower than what you're
+accustomed to, although, it does provide more predictable and consistent
+performance.
+* There is no `&str` API like in the regex crate. In this module, all APIs
+operate on `&[u8]`. By default, match indices are
+guaranteed to fall on UTF-8 boundaries, unless either of
+[`syntax::Config::utf8`](crate::util::syntax::Config::utf8) or
+[`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) are disabled.
+
+With some of the downsides out of the way, here are some positive differences:
+
+* Both dense and sparse DFAs can be serialized to raw bytes, and then cheaply
+deserialized. Deserialization can be done in constant time with the unchecked
+APIs, since searching can be performed directly on the raw serialized bytes of
+a DFA.
+* This module was specifically designed so that the searching phase of a
+DFA has minimal runtime requirements, and can therefore be used in `no_std`
+environments. While `no_std` environments cannot compile regexes, they can
+deserialize pre-compiled regexes.
+* Since this module builds DFAs ahead of time, it will generally out-perform
+the `regex` crate on equivalent tasks. The performance difference is likely
+not large. However, because of a complex set of optimizations in the regex
+crate (like literal optimizations), an accurate performance comparison may be
+difficult to do.
+* Sparse DFAs provide a way to build a DFA ahead of time that sacrifices search
+performance a small amount, but uses much less storage space. Potentially even
+less than what the regex crate uses.
+* This module exposes DFAs directly, such as [`dense::DFA`] and
+[`sparse::DFA`], which enables one to do less work in some cases. For example,
+if you only need the end of a match and not the start of a match, then you can
+use a DFA directly without building a `Regex`, which always requires a second
+DFA to find the start of a match.
+* This module provides more control over memory usage. Aside from choosing
+between dense and sparse DFAs, one can also choose a smaller state identifier
+representation to use less space. Also, one can enable DFA minimization
+via [`dense::Config::minimize`], but it can increase compilation times
+dramatically.
+*/
+
+#[cfg(feature = "dfa-search")]
+pub use crate::dfa::{
+ automaton::{Automaton, OverlappingState},
+ start::StartKind,
+};
+
+/// This is an alias for a state ID of zero. It has special significance
+/// because it always corresponds to the first state in a DFA, and the first
+/// state in a DFA is always "dead." That is, the dead state always has all
+/// of its transitions set to itself. Moreover, the dead state is used as a
+/// sentinel for various things. e.g., In search, reaching a dead state means
+/// that the search must stop.
+const DEAD: crate::util::primitives::StateID =
+ crate::util::primitives::StateID::ZERO;
+
+#[cfg(feature = "dfa-search")]
+pub mod dense;
+#[cfg(feature = "dfa-onepass")]
+pub mod onepass;
+#[cfg(feature = "dfa-search")]
+pub mod regex;
+#[cfg(feature = "dfa-search")]
+pub mod sparse;
+
+#[cfg(feature = "dfa-search")]
+pub(crate) mod accel;
+#[cfg(feature = "dfa-search")]
+mod automaton;
+#[cfg(feature = "dfa-build")]
+mod determinize;
+#[cfg(feature = "dfa-build")]
+mod minimize;
+#[cfg(any(feature = "dfa-build", feature = "dfa-onepass"))]
+mod remapper;
+#[cfg(feature = "dfa-search")]
+mod search;
+#[cfg(feature = "dfa-search")]
+mod special;
+#[cfg(feature = "dfa-search")]
+mod start;
diff --git a/third_party/rust/regex-automata/src/dfa/onepass.rs b/third_party/rust/regex-automata/src/dfa/onepass.rs
new file mode 100644
index 0000000000..44691d0c8a
--- /dev/null
+++ b/third_party/rust/regex-automata/src/dfa/onepass.rs
@@ -0,0 +1,3188 @@
+/*!
+A DFA that can return spans for matching capturing groups.
+
+This module is the home of a [one-pass DFA](DFA).
+
+This module also contains a [`Builder`] and a [`Config`] for building and
+configuring a one-pass DFA.
+*/
+
+// A note on naming and credit:
+//
+// As far as I know, Russ Cox came up with the practical vision and
+// implementation of a "one-pass regex engine." He mentions and describes it
+// briefly in the third article of his regexp article series:
+// https://swtch.com/~rsc/regexp/regexp3.html
+//
+// Cox's implementation is in RE2, and the implementation below is most
+// heavily inspired by RE2's. The key thing they have in common is that
+// their transitions are defined over an alphabet of bytes. In contrast,
+// Go's regex engine also has a one-pass engine, but its transitions are
+// more firmly rooted on Unicode codepoints. The ideas are the same, but the
+// implementations are different.
+//
+// RE2 tends to call this a "one-pass NFA." Here, we call it a "one-pass DFA."
+// They're both true in their own ways:
+//
+// * The "one-pass" criterion is generally a property of the NFA itself. In
+// particular, it is said that an NFA is one-pass if, after each byte of input
+// during a search, there is at most one "VM thread" remaining to take for the
+// next byte of input. That is, there is never any ambiguity as to the path to
+// take through the NFA during a search.
+//
+// * On the other hand, once a one-pass NFA has its representation converted
+// to something where a constant number of instructions is used for each byte
+// of input, the implementation looks a lot more like a DFA. It's technically
+// more powerful than a DFA since it has side effects (storing offsets inside
+// of slots activated by a transition), but it is far closer to a DFA than an
+// NFA simulation.
+//
+// Thus, in this crate, we call it a one-pass DFA.
+
+use alloc::{vec, vec::Vec};
+
+use crate::{
+ dfa::{remapper::Remapper, DEAD},
+ nfa::thompson::{self, NFA},
+ util::{
+ alphabet::ByteClasses,
+ captures::Captures,
+ escape::DebugByte,
+ int::{Usize, U32, U64, U8},
+ look::{Look, LookSet, UnicodeWordBoundaryError},
+ primitives::{NonMaxUsize, PatternID, StateID},
+ search::{Anchored, Input, Match, MatchError, MatchKind, Span},
+ sparse_set::SparseSet,
+ },
+};
+
+/// The configuration used for building a [one-pass DFA](DFA).
+///
+/// A one-pass DFA configuration is a simple data object that is typically used
+/// with [`Builder::configure`]. It can be cheaply cloned.
+///
+/// A default configuration can be created either with `Config::new`, or
+/// perhaps more conveniently, with [`DFA::config`].
+#[derive(Clone, Debug, Default)]
+pub struct Config {
+ match_kind: Option<MatchKind>,
+ starts_for_each_pattern: Option<bool>,
+ byte_classes: Option<bool>,
+ size_limit: Option<Option<usize>>,
+}
+
+impl Config {
+ /// Return a new default one-pass DFA configuration.
+ pub fn new() -> Config {
+ Config::default()
+ }
+
+ /// Set the desired match semantics.
+ ///
+ /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the
+ /// match semantics of Perl-like regex engines. That is, when multiple
+ /// patterns would match at the same leftmost position, the pattern that
+ /// appears first in the concrete syntax is chosen.
+ ///
+ /// Currently, the only other kind of match semantics supported is
+ /// [`MatchKind::All`]. This corresponds to "classical DFA" construction
+ /// where all possible matches are visited.
+ ///
+ /// When it comes to the one-pass DFA, it is rarer for preference order and
+ /// "longest match" to actually disagree, since if they did disagree, then
+ /// the regex typically isn't one-pass. For example, searching `Samwise`
+ /// for `Sam|Samwise` will report `Sam` for leftmost-first matching and
+ /// `Samwise` for "longest match" or "all" matching. However, this regex is
+ /// not one-pass if taken literally. The equivalent regex, `Sam(?:|wise)`
+ /// is one-pass and `Sam|Samwise` may be optimized to it.
+ ///
+ /// The other main difference is that "all" match semantics don't support
+ /// non-greedy matches. "All" match semantics always try to match as much
+ /// as possible.
+ pub fn match_kind(mut self, kind: MatchKind) -> Config {
+ self.match_kind = Some(kind);
+ self
+ }
+
+ /// Whether to compile a separate start state for each pattern in the
+ /// one-pass DFA.
+ ///
+ /// When enabled, a separate **anchored** start state is added for each
+ /// pattern in the DFA. When this start state is used, then the DFA will
+ /// only search for matches for the pattern specified, even if there are
+ /// other patterns in the DFA.
+ ///
+ /// The main downside of this option is that it can potentially increase
+ /// the size of the DFA and/or increase the time it takes to build the DFA.
+ ///
+ /// You might want to enable this option when you want to both search for
+ /// anchored matches of any pattern and to search for anchored matches of
+ /// one particular pattern while using the same DFA. (Otherwise, you would
+ /// need to compile a new DFA for each pattern.)
+ ///
+ /// By default this is disabled.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to build a multi-regex and then search for
+ /// matches for any of the patterns or matches for a specific pattern.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::onepass::DFA, Anchored, Input, Match, PatternID,
+ /// };
+ ///
+ /// let re = DFA::builder()
+ /// .configure(DFA::config().starts_for_each_pattern(true))
+ /// .build_many(&["[a-z]+", "[0-9]+"])?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ /// let haystack = "123abc";
+ /// let input = Input::new(haystack).anchored(Anchored::Yes);
+ ///
+ /// // A normal multi-pattern search will show pattern 1 matches.
+ /// re.try_search(&mut cache, &input, &mut caps)?;
+ /// assert_eq!(Some(Match::must(1, 0..3)), caps.get_match());
+ ///
+ /// // If we only want to report pattern 0 matches, then we'll get no
+ /// // match here.
+ /// let input = input.anchored(Anchored::Pattern(PatternID::must(0)));
+ /// re.try_search(&mut cache, &input, &mut caps)?;
+ /// assert_eq!(None, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn starts_for_each_pattern(mut self, yes: bool) -> Config {
+ self.starts_for_each_pattern = Some(yes);
+ self
+ }
+
+ /// Whether to attempt to shrink the size of the DFA's alphabet or not.
+ ///
+ /// This option is enabled by default and should never be disabled unless
+ /// one is debugging a one-pass DFA.
+ ///
+ /// When enabled, the DFA will use a map from all possible bytes to their
+ /// corresponding equivalence class. Each equivalence class represents a
+ /// set of bytes that does not discriminate between a match and a non-match
+ /// in the DFA. For example, the pattern `[ab]+` has at least two
+ /// equivalence classes: a set containing `a` and `b` and a set containing
+ /// every byte except for `a` and `b`. `a` and `b` are in the same
+ /// equivalence class because they never discriminate between a match and a
+ /// non-match.
+ ///
+ /// The advantage of this map is that the size of the transition table
+ /// can be reduced drastically from (approximately) `#states * 256 *
+ /// sizeof(StateID)` to `#states * k * sizeof(StateID)` where `k` is the
+ /// number of equivalence classes (rounded up to the nearest power of 2).
+ /// As a result, total space usage can decrease substantially. Moreover,
+ /// since a smaller alphabet is used, DFA compilation becomes faster as
+ /// well.
+ ///
+ /// **WARNING:** This is only useful for debugging DFAs. Disabling this
+ /// does not yield any speed advantages. Namely, even when this is
+ /// disabled, a byte class map is still used while searching. The only
+ /// difference is that every byte will be forced into its own distinct
+ /// equivalence class. This is useful for debugging the actual generated
+ /// transitions because it lets one see the transitions defined on actual
+ /// bytes instead of the equivalence classes.
+ pub fn byte_classes(mut self, yes: bool) -> Config {
+ self.byte_classes = Some(yes);
+ self
+ }
+
+ /// Set a size limit on the total heap used by a one-pass DFA.
+ ///
+ /// This size limit is expressed in bytes and is applied during
+ /// construction of a one-pass DFA. If the DFA's heap usage exceeds
+ /// this configured limit, then construction is stopped and an error is
+ /// returned.
+ ///
+ /// The default is no limit.
+ ///
+ /// # Example
+ ///
+ /// This example shows a one-pass DFA that fails to build because of
+ /// a configured size limit. This particular example also serves as a
+ /// cautionary tale demonstrating just how big DFAs with large Unicode
+ /// character classes can get.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{dfa::onepass::DFA, Match};
+ ///
+ /// // 6MB isn't enough!
+ /// DFA::builder()
+ /// .configure(DFA::config().size_limit(Some(6_000_000)))
+ /// .build(r"\w{20}")
+ /// .unwrap_err();
+ ///
+ /// // ... but 7MB probably is!
+ /// // (Note that DFA sizes aren't necessarily stable between releases.)
+ /// let re = DFA::builder()
+ /// .configure(DFA::config().size_limit(Some(7_000_000)))
+ /// .build(r"\w{20}")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ /// let haystack = "A".repeat(20);
+ /// re.captures(&mut cache, &haystack, &mut caps);
+ /// assert_eq!(Some(Match::must(0, 0..20)), caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// While one needs more than 6MB to represent `\w{20}`, it
+ /// turns out that you only need a little more than 4KB to represent
+ /// `(?-u:\w{20})`. So only use Unicode if you need it!
+ pub fn size_limit(mut self, limit: Option<usize>) -> Config {
+ self.size_limit = Some(limit);
+ self
+ }
+
+ /// Returns the match semantics set in this configuration.
+ pub fn get_match_kind(&self) -> MatchKind {
+ self.match_kind.unwrap_or(MatchKind::LeftmostFirst)
+ }
+
+ /// Returns whether this configuration has enabled anchored starting states
+ /// for every pattern in the DFA.
+ pub fn get_starts_for_each_pattern(&self) -> bool {
+ self.starts_for_each_pattern.unwrap_or(false)
+ }
+
+ /// Returns whether this configuration has enabled byte classes or not.
+ /// This is typically a debugging oriented option, as disabling it confers
+ /// no speed benefit.
+ pub fn get_byte_classes(&self) -> bool {
+ self.byte_classes.unwrap_or(true)
+ }
+
+ /// Returns the DFA size limit of this configuration if one was set.
+ /// The size limit is total number of bytes on the heap that a DFA is
+ /// permitted to use. If the DFA exceeds this limit during construction,
+ /// then construction is stopped and an error is returned.
+ pub fn get_size_limit(&self) -> Option<usize> {
+ self.size_limit.unwrap_or(None)
+ }
+
+ /// Overwrite the default configuration such that the options in `o` are
+ /// always used. If an option in `o` is not set, then the corresponding
+ /// option in `self` is used. If it's not set in `self` either, then it
+ /// remains not set.
+ pub(crate) fn overwrite(&self, o: Config) -> Config {
+ Config {
+ match_kind: o.match_kind.or(self.match_kind),
+ starts_for_each_pattern: o
+ .starts_for_each_pattern
+ .or(self.starts_for_each_pattern),
+ byte_classes: o.byte_classes.or(self.byte_classes),
+ size_limit: o.size_limit.or(self.size_limit),
+ }
+ }
+}
+
+/// A builder for a [one-pass DFA](DFA).
+///
+/// This builder permits configuring options for the syntax of a pattern, the
+/// NFA construction and the DFA construction. This builder is different from a
+/// general purpose regex builder in that it permits fine grain configuration
+/// of the construction process. The trade off for this is complexity, and
+/// the possibility of setting a configuration that might not make sense. For
+/// example, there are two different UTF-8 modes:
+///
+/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls
+/// whether the pattern itself can contain sub-expressions that match invalid
+/// UTF-8.
+/// * [`thompson::Config::utf8`] controls whether empty matches that split a
+/// Unicode codepoint are reported or not.
+///
+/// Generally speaking, callers will want to either enable all of these or
+/// disable all of these.
+///
+/// # Example
+///
+/// This example shows how to disable UTF-8 mode in the syntax and the NFA.
+/// This is generally what you want for matching on arbitrary bytes.
+///
+/// ```
+/// # if cfg!(miri) { return Ok(()); } // miri takes too long
+/// use regex_automata::{
+/// dfa::onepass::DFA,
+/// nfa::thompson,
+/// util::syntax,
+/// Match,
+/// };
+///
+/// let re = DFA::builder()
+/// .syntax(syntax::Config::new().utf8(false))
+/// .thompson(thompson::Config::new().utf8(false))
+/// .build(r"foo(?-u:[^b])ar.*")?;
+/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+///
+/// let haystack = b"foo\xFFarzz\xE2\x98\xFF\n";
+/// re.captures(&mut cache, haystack, &mut caps);
+/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
+/// // but the subsequent `.*` does not! Disabling UTF-8
+/// // on the syntax permits this.
+/// //
+/// // N.B. This example does not show the impact of
+/// // disabling UTF-8 mode on a one-pass DFA Config,
+/// // since that only impacts regexes that can
+/// // produce matches of length 0.
+/// assert_eq!(Some(Match::must(0, 0..8)), caps.get_match());
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone, Debug)]
+pub struct Builder {
+ config: Config,
+ #[cfg(feature = "syntax")]
+ thompson: thompson::Compiler,
+}
+
+impl Builder {
+ /// Create a new one-pass DFA builder with the default configuration.
+ pub fn new() -> Builder {
+ Builder {
+ config: Config::default(),
+ #[cfg(feature = "syntax")]
+ thompson: thompson::Compiler::new(),
+ }
+ }
+
+ /// Build a one-pass DFA from the given pattern.
+ ///
+ /// If there was a problem parsing or compiling the pattern, then an error
+ /// is returned.
+ #[cfg(feature = "syntax")]
+ pub fn build(&self, pattern: &str) -> Result<DFA, BuildError> {
+ self.build_many(&[pattern])
+ }
+
+ /// Build a one-pass DFA from the given patterns.
+ ///
+ /// When matches are returned, the pattern ID corresponds to the index of
+ /// the pattern in the slice given.
+ #[cfg(feature = "syntax")]
+ pub fn build_many<P: AsRef<str>>(
+ &self,
+ patterns: &[P],
+ ) -> Result<DFA, BuildError> {
+ let nfa =
+ self.thompson.build_many(patterns).map_err(BuildError::nfa)?;
+ self.build_from_nfa(nfa)
+ }
+
+ /// Build a DFA from the given NFA.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to build a DFA if you already have an NFA in
+ /// hand.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::onepass::DFA, nfa::thompson::NFA, Match};
+ ///
+ /// // This shows how to set non-default options for building an NFA.
+ /// let nfa = NFA::compiler()
+ /// .configure(NFA::config().shrink(true))
+ /// .build(r"[a-z0-9]+")?;
+ /// let re = DFA::builder().build_from_nfa(nfa)?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ /// re.captures(&mut cache, "foo123bar", &mut caps);
+ /// assert_eq!(Some(Match::must(0, 0..9)), caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn build_from_nfa(&self, nfa: NFA) -> Result<DFA, BuildError> {
+ // Why take ownership if we're just going to pass a reference to the
+ // NFA to our internal builder? Well, the first thing to note is that
+ // an NFA uses reference counting internally, so either choice is going
+ // to be cheap. So there isn't much cost either way.
+ //
+ // The real reason is that a one-pass DFA, semantically, shares
+ // ownership of an NFA. This is unlike other DFAs that don't share
+ // ownership of an NFA at all, primarily because they want to be
+ // self-contained in order to support cheap (de)serialization.
+ //
+ // But then why pass a '&nfa' below if we want to share ownership?
+ // Well, it turns out that using a '&NFA' in our internal builder
+ // separates its lifetime from the DFA we're building, and this turns
+ // out to make code a bit more composable. e.g., We can iterate over
+ // things inside the NFA while borrowing the builder as mutable because
+ // we know the NFA cannot be mutated. So TL;DR --- this weirdness is
+ // "because borrow checker."
+ InternalBuilder::new(self.config.clone(), &nfa).build()
+ }
+
+ /// Apply the given one-pass DFA configuration options to this builder.
+ pub fn configure(&mut self, config: Config) -> &mut Builder {
+ self.config = self.config.overwrite(config);
+ self
+ }
+
+ /// Set the syntax configuration for this builder using
+ /// [`syntax::Config`](crate::util::syntax::Config).
+ ///
+ /// This permits setting things like case insensitivity, Unicode and multi
+ /// line mode.
+ ///
+ /// These settings only apply when constructing a one-pass DFA directly
+ /// from a pattern.
+ #[cfg(feature = "syntax")]
+ pub fn syntax(
+ &mut self,
+ config: crate::util::syntax::Config,
+ ) -> &mut Builder {
+ self.thompson.syntax(config);
+ self
+ }
+
+ /// Set the Thompson NFA configuration for this builder using
+ /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
+ ///
+ /// This permits setting things like whether additional time should be
+ /// spent shrinking the size of the NFA.
+ ///
+ /// These settings only apply when constructing a DFA directly from a
+ /// pattern.
+ #[cfg(feature = "syntax")]
+ pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
+ self.thompson.configure(config);
+ self
+ }
+}
+
+/// An internal builder for encapsulating the state necessary to build a
+/// one-pass DFA. Typical use is just `InternalBuilder::new(..).build()`.
+///
+/// There is no separate pass for determining whether the NFA is one-pass or
+/// not. We just try to build the DFA. If during construction we discover that
+/// it is not one-pass, we bail out. This is likely to lead to some undesirable
+/// expense in some cases, so it might make sense to try and identify common
+/// patterns in the NFA that make it definitively not one-pass. That way, we
+/// can avoid ever trying to build a one-pass DFA in the first place. For
+/// example, '\w*\s' is not one-pass, and since '\w' is Unicode-aware by
+/// default, it's probably not a trivial cost to try and build a one-pass DFA
+/// for it and then fail.
+///
+/// Note that some (immutable) fields are duplicated here. For example, the
+/// 'nfa' and 'classes' fields are both in the 'DFA'. They are the same thing,
+/// but we duplicate them because it makes composition easier below. Otherwise,
+/// since the borrow checker can't see through method calls, the mutable borrow
+/// we use to mutate the DFA winds up preventing borrowing from any other part
+/// of the DFA, even though we aren't mutating those parts. We only do this
+/// because the duplication is cheap.
+#[derive(Debug)]
+struct InternalBuilder<'a> {
+ /// The DFA we're building.
+ dfa: DFA,
+ /// An unordered collection of NFA state IDs that we haven't yet tried to
+ /// build into a DFA state.
+ ///
+ /// This collection does not ultimately wind up including every NFA state
+ /// ID. Instead, each ID represents a "start" state for a sub-graph of the
+ /// NFA. The set of NFA states we then use to build a DFA state consists
+ /// of that "start" state and all states reachable from it via epsilon
+ /// transitions.
+ uncompiled_nfa_ids: Vec<StateID>,
+ /// A map from NFA state ID to DFA state ID. This is useful for easily
+ /// determining whether an NFA state has been used as a "starting" point
+ /// to build a DFA state yet. If it hasn't, then it is mapped to DEAD,
+ /// and since DEAD is specially added and never corresponds to any NFA
+ /// state, it follows that a mapping to DEAD implies the NFA state has
+ /// no corresponding DFA state yet.
+ nfa_to_dfa_id: Vec<StateID>,
+ /// A stack used to traverse the NFA states that make up a single DFA
+ /// state. Traversal occurs until the stack is empty, and we only push to
+ /// the stack when the state ID isn't in 'seen'. Actually, even more than
+ /// that, if we try to push something on to this stack that is already in
+ /// 'seen', then we bail out on construction completely, since it implies
+ /// that the NFA is not one-pass.
+ stack: Vec<(StateID, Epsilons)>,
+ /// The set of NFA states that we've visited via 'stack'.
+ seen: SparseSet,
+ /// Whether a match NFA state has been observed while constructing a
+ /// one-pass DFA state. Once a match state is seen, assuming we are using
+ /// leftmost-first match semantics, then we don't add any more transitions
+ /// to the DFA state we're building.
+ matched: bool,
+ /// The config passed to the builder.
+ ///
+ /// This is duplicated in dfa.config.
+ config: Config,
+ /// The NFA we're building a one-pass DFA from.
+ ///
+ /// This is duplicated in dfa.nfa.
+ nfa: &'a NFA,
+ /// The equivalence classes that make up the alphabet for this DFA.
+ ///
+ /// This is duplicated in dfa.classes.
+ classes: ByteClasses,
+}
+
+impl<'a> InternalBuilder<'a> {
+ /// Create a new builder with an initial empty DFA.
+ fn new(config: Config, nfa: &'a NFA) -> InternalBuilder {
+ let classes = if !config.get_byte_classes() {
+ // A one-pass DFA will always use the equivalence class map, but
+ // disabling this option is useful for debugging. Namely, this will
+ // cause all transitions to be defined over their actual bytes
+ // instead of an opaque equivalence class identifier. The former is
+ // much easier to grok as a human.
+ ByteClasses::singletons()
+ } else {
+ nfa.byte_classes().clone()
+ };
+ // Normally a DFA alphabet includes the EOI symbol, but we don't need
+ // that in the one-pass DFA since we handle look-around explicitly
+ // without encoding it into the DFA. Thus, we don't need to delay
+ // matches by 1 byte. However, we reuse the space that *would* be used
+ // by the EOI transition by putting match information there (like which
+ // pattern matches and which look-around assertions need to hold). So
+ // this means our real alphabet length is 1 fewer than what the byte
+ // classes report, since we don't use EOI.
+ let alphabet_len = classes.alphabet_len().checked_sub(1).unwrap();
+ let stride2 = classes.stride2();
+ let dfa = DFA {
+ config: config.clone(),
+ nfa: nfa.clone(),
+ table: vec![],
+ starts: vec![],
+ // Since one-pass DFAs have a smaller state ID max than
+ // StateID::MAX, it follows that StateID::MAX is a valid initial
+ // value for min_match_id since no state ID can ever be greater
+ // than it. In the case of a one-pass DFA with no match states, the
+ // min_match_id will keep this sentinel value.
+ min_match_id: StateID::MAX,
+ classes: classes.clone(),
+ alphabet_len,
+ stride2,
+ pateps_offset: alphabet_len,
+ // OK because PatternID::MAX*2 is guaranteed not to overflow.
+ explicit_slot_start: nfa.pattern_len().checked_mul(2).unwrap(),
+ };
+ InternalBuilder {
+ dfa,
+ uncompiled_nfa_ids: vec![],
+ nfa_to_dfa_id: vec![DEAD; nfa.states().len()],
+ stack: vec![],
+ seen: SparseSet::new(nfa.states().len()),
+ matched: false,
+ config,
+ nfa,
+ classes,
+ }
+ }
+
+ /// Build the DFA from the NFA given to this builder. If the NFA is not
+ /// one-pass, then return an error. An error may also be returned if a
+ /// particular limit is exceeded. (Some limits, like the total heap memory
+ /// used, are configurable. Others, like the total patterns or slots, are
+ /// hard-coded based on representational limitations.)
+ fn build(mut self) -> Result<DFA, BuildError> {
+ self.nfa.look_set_any().available().map_err(BuildError::word)?;
+ for look in self.nfa.look_set_any().iter() {
+ // This is a future incompatibility check where if we add any
+ // more look-around assertions, then the one-pass DFA either
+ // needs to reject them (what we do here) or it needs to have its
+ // Transition representation modified to be capable of storing the
+ // new assertions.
+ if look.as_repr() > Look::WordUnicodeNegate.as_repr() {
+ return Err(BuildError::unsupported_look(look));
+ }
+ }
+ if self.nfa.pattern_len().as_u64() > PatternEpsilons::PATTERN_ID_LIMIT
+ {
+ return Err(BuildError::too_many_patterns(
+ PatternEpsilons::PATTERN_ID_LIMIT,
+ ));
+ }
+ if self.nfa.group_info().explicit_slot_len() > Slots::LIMIT {
+ return Err(BuildError::not_one_pass(
+ "too many explicit capturing groups (max is 16)",
+ ));
+ }
+ assert_eq!(DEAD, self.add_empty_state()?);
+
+ // This is where the explicit slots start. We care about this because
+ // we only need to track explicit slots. The implicit slots---two for
+ // each pattern---are tracked as part of the search routine itself.
+ let explicit_slot_start = self.nfa.pattern_len() * 2;
+ self.add_start_state(None, self.nfa.start_anchored())?;
+ if self.config.get_starts_for_each_pattern() {
+ for pid in self.nfa.patterns() {
+ self.add_start_state(
+ Some(pid),
+ self.nfa.start_pattern(pid).unwrap(),
+ )?;
+ }
+ }
+ // NOTE: One wonders what the effects of treating 'uncompiled_nfa_ids'
+ // as a stack are. It is really an unordered *set* of NFA state IDs.
+ // If it, for example, in practice led to discovering whether a regex
+ // was or wasn't one-pass later than if we processed NFA state IDs in
+ // ascending order, then that would make this routine more costly in
+ // the somewhat common case of a regex that isn't one-pass.
+ while let Some(nfa_id) = self.uncompiled_nfa_ids.pop() {
+ let dfa_id = self.nfa_to_dfa_id[nfa_id];
+ // Once we see a match, we keep going, but don't add any new
+ // transitions. Normally we'd just stop, but we have to keep
+ // going in order to verify that our regex is actually one-pass.
+ self.matched = false;
+ // The NFA states we've already explored for this DFA state.
+ self.seen.clear();
+ // The NFA states to explore via epsilon transitions. If we ever
+ // try to push an NFA state that we've already seen, then the NFA
+ // is not one-pass because it implies there are multiple epsilon
+ // transition paths that lead to the same NFA state. In other
+ // words, there is ambiguity.
+ self.stack_push(nfa_id, Epsilons::empty())?;
+ while let Some((id, epsilons)) = self.stack.pop() {
+ match *self.nfa.state(id) {
+ thompson::State::ByteRange { ref trans } => {
+ self.compile_transition(dfa_id, trans, epsilons)?;
+ }
+ thompson::State::Sparse(ref sparse) => {
+ for trans in sparse.transitions.iter() {
+ self.compile_transition(dfa_id, trans, epsilons)?;
+ }
+ }
+ thompson::State::Dense(ref dense) => {
+ for trans in dense.iter() {
+ self.compile_transition(dfa_id, &trans, epsilons)?;
+ }
+ }
+ thompson::State::Look { look, next } => {
+ let looks = epsilons.looks().insert(look);
+ self.stack_push(next, epsilons.set_looks(looks))?;
+ }
+ thompson::State::Union { ref alternates } => {
+ for &sid in alternates.iter().rev() {
+ self.stack_push(sid, epsilons)?;
+ }
+ }
+ thompson::State::BinaryUnion { alt1, alt2 } => {
+ self.stack_push(alt2, epsilons)?;
+ self.stack_push(alt1, epsilons)?;
+ }
+ thompson::State::Capture { next, slot, .. } => {
+ let slot = slot.as_usize();
+ let epsilons = if slot < explicit_slot_start {
+ // If this is an implicit slot, we don't care
+ // about it, since we handle implicit slots in
+ // the search routine. We can get away with that
+ // because there are 2 implicit slots for every
+ // pattern.
+ epsilons
+ } else {
+ // Offset our explicit slots so that they start
+ // at index 0.
+ let offset = slot - explicit_slot_start;
+ epsilons.set_slots(epsilons.slots().insert(offset))
+ };
+ self.stack_push(next, epsilons)?;
+ }
+ thompson::State::Fail => {
+ continue;
+ }
+ thompson::State::Match { pattern_id } => {
+ // If we found two different paths to a match state
+ // for the same DFA state, then we have ambiguity.
+ // Thus, it's not one-pass.
+ if self.matched {
+ return Err(BuildError::not_one_pass(
+ "multiple epsilon transitions to match state",
+ ));
+ }
+ self.matched = true;
+ // Shove the matching pattern ID and the 'epsilons'
+ // into the current DFA state's pattern epsilons. The
+ // 'epsilons' includes the slots we need to capture
+ // before reporting the match and also the conditional
+ // epsilon transitions we need to check before we can
+ // report a match.
+ self.dfa.set_pattern_epsilons(
+ dfa_id,
+ PatternEpsilons::empty()
+ .set_pattern_id(pattern_id)
+ .set_epsilons(epsilons),
+ );
+ // N.B. It is tempting to just bail out here when
+ // compiling a leftmost-first DFA, since we will never
+ // compile any more transitions in that case. But we
+ // actually need to keep going in order to verify that
+ // we actually have a one-pass regex. e.g., We might
+ // see more Match states (e.g., for other patterns)
+ // that imply that we don't have a one-pass regex.
+ // So instead, we mark that we've found a match and
+ // continue on. When we go to compile a new DFA state,
+ // we just skip that part. But otherwise check that the
+ // one-pass property is upheld.
+ }
+ }
+ }
+ }
+ self.shuffle_states();
+ Ok(self.dfa)
+ }
+
+ /// Shuffle all match states to the end of the transition table and set
+ /// 'min_match_id' to the ID of the first such match state.
+ ///
+ /// The point of this is to make it extremely cheap to determine whether
+ /// a state is a match state or not. We need to check on this on every
+ /// transition during a search, so it being cheap is important. This
+ /// permits us to check it by simply comparing two state identifiers, as
+ /// opposed to looking for the pattern ID in the state's `PatternEpsilons`.
+ /// (Which requires a memory load and some light arithmetic.)
+ fn shuffle_states(&mut self) {
+ let mut remapper = Remapper::new(&self.dfa);
+ let mut next_dest = self.dfa.last_state_id();
+ for i in (0..self.dfa.state_len()).rev() {
+ let id = StateID::must(i);
+ let is_match =
+ self.dfa.pattern_epsilons(id).pattern_id().is_some();
+ if !is_match {
+ continue;
+ }
+ remapper.swap(&mut self.dfa, next_dest, id);
+ self.dfa.min_match_id = next_dest;
+ next_dest = self.dfa.prev_state_id(next_dest).expect(
+ "match states should be a proper subset of all states",
+ );
+ }
+ remapper.remap(&mut self.dfa);
+ }
+
+ /// Compile the given NFA transition into the DFA state given.
+ ///
+ /// 'Epsilons' corresponds to any conditional epsilon transitions that need
+ /// to be satisfied to follow this transition, and any slots that need to
+ /// be saved if the transition is followed.
+ ///
+ /// If this transition indicates that the NFA is not one-pass, then
+ /// this returns an error. (This occurs, for example, if the DFA state
+ /// already has a transition defined for the same input symbols as the
+ /// given transition, *and* the result of the old and new transitions is
+ /// different.)
+ fn compile_transition(
+ &mut self,
+ dfa_id: StateID,
+ trans: &thompson::Transition,
+ epsilons: Epsilons,
+ ) -> Result<(), BuildError> {
+ let next_dfa_id = self.add_dfa_state_for_nfa_state(trans.next)?;
+ for byte in self
+ .classes
+ .representatives(trans.start..=trans.end)
+ .filter_map(|r| r.as_u8())
+ {
+ let oldtrans = self.dfa.transition(dfa_id, byte);
+ let newtrans =
+ Transition::new(self.matched, next_dfa_id, epsilons);
+ // If the old transition points to the DEAD state, then we know
+ // 'byte' has not been mapped to any transition for this DFA state
+ // yet. So set it unconditionally. Otherwise, we require that the
+ // old and new transitions are equivalent. If they aren't, there is
+ // ambiguity and thus the regex is not one-pass.
+ if oldtrans.state_id() == DEAD {
+ self.dfa.set_transition(dfa_id, byte, newtrans);
+ } else if oldtrans != newtrans {
+ return Err(BuildError::not_one_pass(
+ "conflicting transition",
+ ));
+ }
+ }
+ Ok(())
+ }
+
+ /// Register a DFA start state for the given NFA starting state ID and
+ /// return the ID of that DFA state.
+ ///
+ /// Start states must be added in a fixed order: first the start state for
+ /// all patterns (`pid` is `None`), then, if needed, the anchored start
+ /// state of each pattern in pattern ID order (`pid` is `Some(..)`).
+ ///
+ /// # Errors
+ ///
+ /// If adding a state would blow any limits (configured or hard-coded),
+ /// then an error is returned.
+ ///
+ /// # Panics
+ ///
+ /// This panics when the ordering described above is not respected.
+ fn add_start_state(
+     &mut self,
+     pid: Option<PatternID>,
+     nfa_id: StateID,
+ ) -> Result<StateID, BuildError> {
+     if let Some(pid) = pid {
+         // The start state for 'pid' must land at self.dfa.starts[pid+1].
+         assert!(self.dfa.starts.len() == pid.one_more());
+     } else {
+         // The start state for all patterns must be the very first one.
+         assert!(self.dfa.starts.is_empty());
+     }
+     let start_id = self.add_dfa_state_for_nfa_state(nfa_id)?;
+     self.dfa.starts.push(start_id);
+     Ok(start_id)
+ }
+
+ /// Return the DFA state ID associated with the given NFA state, creating
+ /// a new empty DFA state first if none exists yet.
+ ///
+ /// When a new DFA state is created, the NFA state is also queued in
+ /// `uncompiled_nfa_ids` so that it gets explored later. When a DFA state
+ /// already exists, its ID is returned and nothing else changes.
+ ///
+ /// It is not expected that this routine is called for every NFA state.
+ /// Instead, an NFA state ID will usually correspond to the "start" state
+ /// for a sub-graph of the NFA, where all states in the sub-graph are
+ /// reachable via epsilon transitions (conditional or unconditional). That
+ /// sub-graph of NFA states is ultimately what produces a single DFA state.
+ ///
+ /// # Errors
+ ///
+ /// If adding a state would blow any limits (configured or hard-coded),
+ /// then an error is returned.
+ fn add_dfa_state_for_nfa_state(
+     &mut self,
+     nfa_id: StateID,
+ ) -> Result<StateID, BuildError> {
+     let mapped = self.nfa_to_dfa_id[nfa_id];
+     if mapped == DEAD {
+         // No DFA state exists for this NFA state yet: create one, record
+         // the mapping and queue the NFA state for exploration.
+         let fresh = self.add_empty_state()?;
+         self.nfa_to_dfa_id[nfa_id] = fresh;
+         self.uncompiled_nfa_ids.push(nfa_id);
+         return Ok(fresh);
+     }
+     // Reuse the existing DFA state. We definitely do not want more than
+     // one DFA state for the same NFA state, since all but one of them
+     // will likely become unreachable, and at least some of them are
+     // likely to wind up being incomplete.
+     Ok(mapped)
+ }
+
+ /// Append a brand new, empty DFA state to the transition table and return
+ /// its ID.
+ ///
+ /// The added state is *not* a match state.
+ ///
+ /// # Errors
+ ///
+ /// Returns an error if the new state's ID would exceed the state ID limit,
+ /// or if a size limit is configured and the DFA's memory usage exceeds it
+ /// once the state has been added.
+ fn add_empty_state(&mut self) -> Result<StateID, BuildError> {
+     let state_limit = Transition::STATE_ID_LIMIT;
+     // Note that unlike dense and lazy DFAs, we specifically do NOT
+     // premultiply our state IDs here. The reason is that we want to pack
+     // our state IDs into 64-bit transitions with other info, so the fewer
+     // the bits we use for state IDs the better. If we premultiply, then
+     // our state ID space shrinks. We justify this by the assumption that
+     // a one-pass DFA is just already doing a fair bit more work than a
+     // normal DFA anyway, so an extra multiplication to compute a state
+     // transition doesn't seem like a huge deal.
+     let next_id = self.dfa.table.len() >> self.dfa.stride2();
+     let id = match StateID::new(next_id) {
+         Ok(id) if id.as_u64() <= state_limit => id,
+         _ => return Err(BuildError::too_many_states(state_limit)),
+     };
+     // Grow the table by exactly one state's worth of zeroed transitions.
+     let stride = self.dfa.stride();
+     self.dfa
+         .table
+         .extend(core::iter::repeat(Transition(0)).take(stride));
+     // The default empty value for 'PatternEpsilons' is sadly not all
+     // zeroes. Instead, a special sentinel is used to indicate that there
+     // is no pattern. So the "empty" PatternEpsilons is written explicitly.
+     self.dfa.set_pattern_epsilons(id, PatternEpsilons::empty());
+     // The size limit, if any, is checked after the state has been added.
+     match self.config.get_size_limit() {
+         Some(size_limit) if self.dfa.memory_usage() > size_limit => {
+             Err(BuildError::exceeded_size_limit(size_limit))
+         }
+         _ => Ok(id),
+     }
+ }
+
+ /// Schedule the given NFA state, together with its epsilons (slots and
+ /// conditional epsilon transitions), for a visit during the depth first
+ /// traversal of a sub-graph of the NFA.
+ ///
+ /// # Errors
+ ///
+ /// If the given NFA state ID was already recorded in the `seen` set, then
+ /// the regex is not one-pass and a corresponding error is returned. In
+ /// that case nothing is pushed on to the stack.
+ fn stack_push(
+     &mut self,
+     nfa_id: StateID,
+     epsilons: Epsilons,
+ ) -> Result<(), BuildError> {
+     // If we already have seen a match and we are compiling a leftmost
+     // first DFA, then we shouldn't add any more states to look at. This is
+     // effectively how preference order and non-greediness is implemented.
+     //
+     // NOTE(review): the early exit below is commented out and therefore
+     // has no effect; confirm whether it can be removed entirely.
+     //
+     // if !self.config.get_match_kind().continue_past_first_match()
+     // && self.matched
+     // {
+     // return Ok(());
+     // }
+     let first_visit = self.seen.insert(nfa_id);
+     if first_visit {
+         self.stack.push((nfa_id, epsilons));
+         Ok(())
+     } else {
+         Err(BuildError::not_one_pass(
+             "multiple epsilon transitions to same state",
+         ))
+     }
+ }
+}
+
+/// A one-pass DFA for executing a subset of anchored regex searches while
+/// resolving capturing groups.
+///
+/// A one-pass DFA can be built from an NFA that is one-pass. An NFA is
+/// one-pass when there is never any ambiguity about how to continue a search.
+ /// For example, `a*a` is not one-pass because during a search, it's not
+/// possible to know whether to continue matching the `a*` or to move on to
+/// the single `a`. However, `a*b` is one-pass, because for every byte in the
+/// input, it's always clear when to move on from `a*` to `b`.
+///
+/// # Only anchored searches are supported
+///
+/// In this crate, especially for DFAs, unanchored searches are implemented by
+/// treating the pattern as if it had a `(?s-u:.)*?` prefix. While the prefix
+/// is one-pass on its own, adding anything after it, e.g., `(?s-u:.)*?a` will
+/// make the overall pattern not one-pass. Why? Because the `(?s-u:.)` matches
+/// any byte, and there is therefore ambiguity as to when the prefix should
+/// stop matching and something else should start matching.
+///
+/// Therefore, one-pass DFAs do not support unanchored searches. In addition
+/// to many regexes simply not being one-pass, it implies that one-pass DFAs
+/// have limited utility. With that said, when a one-pass DFA can be used, it
+/// can potentially provide a dramatic speed up over alternatives like the
+/// [`BoundedBacktracker`](crate::nfa::thompson::backtrack::BoundedBacktracker)
+/// and the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM). In particular,
+/// a one-pass DFA is the only DFA capable of reporting the spans of matching
+/// capturing groups.
+///
+/// To clarify, when we say that unanchored searches are not supported, what
+/// that actually means is:
+///
+/// * The high level routines, [`DFA::is_match`] and [`DFA::captures`], always
+/// do anchored searches.
+/// * Since iterators are most useful in the context of unanchored searches,
+/// there is no `DFA::captures_iter` method.
+/// * For lower level routines like [`DFA::try_search`], an error will be
+/// returned if the given [`Input`] is configured to do an unanchored search or
+/// search for an invalid pattern ID. (Note that an [`Input`] is configured to
+ /// do an unanchored search by default, so just giving an `Input::new` is
+/// guaranteed to return an error.)
+///
+/// # Other limitations
+///
+/// In addition to the [configurable heap limit](Config::size_limit) and
+/// the requirement that a regex pattern be one-pass, there are some other
+/// limitations:
+///
+/// * There is an internal limit on the total number of explicit capturing
+/// groups that appear across all patterns. It is somewhat small and there is
+/// no way to configure it. If your pattern(s) exceed this limit, then building
+/// a one-pass DFA will fail.
+/// * If the number of patterns exceeds an internal unconfigurable limit, then
+/// building a one-pass DFA will fail. This limit is quite large and you're
+/// unlikely to hit it.
+/// * If the total number of states exceeds an internal unconfigurable limit,
+/// then building a one-pass DFA will fail. This limit is quite large and
+/// you're unlikely to hit it.
+///
+/// # Other examples of regexes that aren't one-pass
+///
+/// One particularly unfortunate example is that enabling Unicode can cause
+/// regexes that were one-pass to no longer be one-pass. Consider the regex
+/// `(?-u)\w*\s` for example. It is one-pass because there is exactly no
+/// overlap between the ASCII definitions of `\w` and `\s`. But `\w*\s`
+/// (i.e., with Unicode enabled) is *not* one-pass because `\w` and `\s` get
+/// translated to UTF-8 automatons. And while the *codepoints* in `\w` and `\s`
+/// do not overlap, the underlying UTF-8 encodings do. Indeed, because of the
+/// overlap between UTF-8 automata, the use of Unicode character classes will
+/// tend to vastly increase the likelihood of a regex not being one-pass.
+///
+/// # How does one know if a regex is one-pass or not?
+///
+/// At the time of writing, the only way to know is to try and build a one-pass
+/// DFA. The one-pass property is checked while constructing the DFA.
+///
+/// This does mean that you might potentially waste some CPU cycles and memory
+/// by optimistically trying to build a one-pass DFA. But this is currently the
+/// only way. In the future, building a one-pass DFA might be able to use some
+/// heuristics to detect common violations of the one-pass property and bail
+/// more quickly.
+///
+/// # Resource usage
+///
+/// Unlike a general DFA, a one-pass DFA has stricter bounds on its resource
+/// usage. Namely, construction of a one-pass DFA has a time and space
+/// complexity of `O(n)`, where `n ~ nfa.states().len()`. (A general DFA's time
+/// and space complexity is `O(2^n)`.) This smaller time bound is achieved
+/// because there is at most one DFA state created for each NFA state. If
+/// additional DFA states would be required, then the pattern is not one-pass
+/// and construction will fail.
+///
+/// Note though that currently, this DFA uses a fully dense representation.
+/// This means that while its space complexity is no worse than an NFA, it may
+/// in practice use more memory because of higher constant factors. The reason
+/// for this trade off is two-fold. Firstly, a dense representation makes the
+/// search faster. Secondly, the bigger an NFA, the more unlikely it is to be
+/// one-pass. Therefore, most one-pass DFAs are usually pretty small.
+///
+/// # Example
+///
+/// This example shows that the one-pass DFA implements Unicode word boundaries
+/// correctly while simultaneously reporting spans for capturing groups that
+/// participate in a match. (This is the only DFA that implements full support
+/// for Unicode word boundaries.)
+///
+/// ```
+/// # if cfg!(miri) { return Ok(()); } // miri takes too long
+/// use regex_automata::{dfa::onepass::DFA, Match, Span};
+///
+/// let re = DFA::new(r"\b(?P<first>\w+)[[:space:]]+(?P<last>\w+)\b")?;
+/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+///
+/// re.captures(&mut cache, "Шерлок Холмс", &mut caps);
+/// assert_eq!(Some(Match::must(0, 0..23)), caps.get_match());
+/// assert_eq!(Some(Span::from(0..12)), caps.get_group_by_name("first"));
+/// assert_eq!(Some(Span::from(13..23)), caps.get_group_by_name("last"));
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// # Example: iteration
+///
+/// Unlike other regex engines in this crate, this one does not provide
+/// iterator search functions. This is because a one-pass DFA only supports
+/// anchored searches, and so iterator functions are generally not applicable.
+///
+/// However, if you know that all of your matches are
+/// directly adjacent, then an iterator can be used. The
+/// [`util::iter::Searcher`](crate::util::iter::Searcher) type can be used for
+/// this purpose:
+///
+/// ```
+/// # if cfg!(miri) { return Ok(()); } // miri takes too long
+/// use regex_automata::{
+/// dfa::onepass::DFA,
+/// util::iter::Searcher,
+/// Anchored, Input, Span,
+/// };
+///
+/// let re = DFA::new(r"\w(\d)\w")?;
+/// let (mut cache, caps) = (re.create_cache(), re.create_captures());
+/// let input = Input::new("a1zb2yc3x").anchored(Anchored::Yes);
+///
+/// let mut it = Searcher::new(input).into_captures_iter(caps, |input, caps| {
+/// Ok(re.try_search(&mut cache, input, caps)?)
+/// }).infallible();
+/// let caps0 = it.next().unwrap();
+/// assert_eq!(Some(Span::from(1..2)), caps0.get_group(1));
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone)]
+pub struct DFA {
+ /// The configuration provided by the caller.
+ config: Config,
+ /// The NFA used to build this DFA.
+ ///
+ /// NOTE: We probably don't need to store the NFA here, but we use enough
+ /// bits from it that it's convenient to do so. And there really isn't much
+ /// cost to doing so either, since an NFA is reference counted internally.
+ nfa: NFA,
+ /// The transition table. Given a state ID 's' and a byte of haystack 'b',
+ /// the next state is `table[(s << stride2) + classes[b]]`. (State IDs in
+ /// this DFA are state indices, i.e., they are not premultiplied by the
+ /// stride.)
+ ///
+ /// The stride of this table (i.e., the number of columns) is always
+ /// a power of 2, even if the alphabet length is smaller. This makes
+ /// converting between state IDs and state indices very cheap.
+ ///
+ /// Note that the stride always includes room for one extra "transition"
+ /// that isn't actually a transition. It is a 'PatternEpsilons' that is
+ /// used for match states only. Because of this, the maximum number of
+ /// active columns in the transition table is 257, which means the maximum
+ /// stride is 512 (the next power of 2 greater than or equal to 257).
+ table: Vec<Transition>,
+ /// The DFA state IDs of the starting states.
+ ///
+ /// `starts[0]` is always present and corresponds to the starting state
+ /// when searching for matches of any pattern in the DFA.
+ ///
+ /// `starts[i]` where i>0 corresponds to the starting state for the pattern
+ /// ID 'i-1'. These starting states are optional.
+ starts: Vec<StateID>,
+ /// Every state ID >= this value corresponds to a match state.
+ ///
+ /// This is what a search uses to detect whether a state is a match state
+ /// or not. It requires only a simple comparison instead of bit-unpacking
+ /// the PatternEpsilons from every state.
+ min_match_id: StateID,
+ /// The alphabet of this DFA, split into equivalence classes. Bytes in the
+ /// same equivalence class can never discriminate between a match and a
+ /// non-match.
+ classes: ByteClasses,
+ /// The number of elements in each state in the transition table. This may
+ /// be less than the stride, since the stride is always a power of 2 and
+ /// the alphabet length can be anything up to and including 256.
+ alphabet_len: usize,
+ /// The number of columns in the transition table, expressed as a power of
+ /// 2.
+ stride2: usize,
+ /// The offset at which the PatternEpsilons for a match state is stored in
+ /// the transition table.
+ ///
+ /// PERF: One wonders whether it would be better to put this in a separate
+ /// allocation, since only match states have a non-empty PatternEpsilons
+ /// and the number of match states tends to be dwarfed by the number of
+ /// non-match states. So this would save '8*len(non_match_states)' for each
+ /// DFA. The question is whether moving this to a different allocation will
+ /// lead to a perf hit during searches. You might think dealing with match
+ /// states is rare, but some regexes spend a lot of time in match states
+ /// gobbling up input. But... match state handling is already somewhat
+ /// expensive, so maybe this wouldn't do much? Either way, it's worth
+ /// experimenting.
+ pateps_offset: usize,
+ /// The first explicit slot index. This refers to the first slot appearing
+ /// immediately after the last implicit slot. It is always 'patterns.len()
+ /// * 2'.
+ ///
+ /// We record this because we only store the explicit slots in our DFA
+ /// transition table that need to be saved. Implicit slots are handled
+ /// automatically as part of the search.
+ explicit_slot_start: usize,
+}
+
+impl DFA {
+ /// Parse the given regular expression using the default configuration and
+ /// return the corresponding one-pass DFA.
+ ///
+ /// If you want a non-default configuration, then use the [`Builder`] to
+ /// set your own configuration.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error when a one-pass DFA could not be built for the
+ /// given pattern, for example because the pattern is not one-pass.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{dfa::onepass::DFA, Match};
+ ///
+ /// let re = DFA::new("foo[0-9]+bar")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// re.captures(&mut cache, "foo12345barzzz", &mut caps);
+ /// assert_eq!(Some(Match::must(0, 0..11)), caps.get_match());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "syntax")]
+ #[inline]
+ pub fn new(pattern: &str) -> Result<DFA, BuildError> {
+ DFA::builder().build(pattern)
+ }
+
+ /// Like `new`, but parses multiple patterns into a single "multi regex."
+ /// This similarly uses the default regex configuration.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error when a one-pass DFA could not be built for the
+ /// given patterns, for example because they are not one-pass.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{dfa::onepass::DFA, Match};
+ ///
+ /// let re = DFA::new_many(&["[a-z]+", "[0-9]+"])?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// re.captures(&mut cache, "abc123", &mut caps);
+ /// assert_eq!(Some(Match::must(0, 0..3)), caps.get_match());
+ ///
+ /// re.captures(&mut cache, "123abc", &mut caps);
+ /// assert_eq!(Some(Match::must(1, 0..3)), caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "syntax")]
+ #[inline]
+ pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<DFA, BuildError> {
+ DFA::builder().build_many(patterns)
+ }
+
+ /// Like `new`, but builds a one-pass DFA directly from an NFA. This is
+ /// useful if you already have an NFA, or even if you hand-assembled the
+ /// NFA.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error when a one-pass DFA could not be built from the
+ /// given NFA, for example because the NFA is not one-pass.
+ ///
+ /// # Example
+ ///
+ /// This shows how to hand assemble a regular expression via its HIR,
+ /// compile an NFA from it and build a one-pass DFA from the NFA.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::onepass::DFA,
+ /// nfa::thompson::NFA,
+ /// Match,
+ /// };
+ /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
+ ///
+ /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
+ /// ClassBytesRange::new(b'0', b'9'),
+ /// ClassBytesRange::new(b'A', b'Z'),
+ /// ClassBytesRange::new(b'_', b'_'),
+ /// ClassBytesRange::new(b'a', b'z'),
+ /// ])));
+ ///
+ /// let config = NFA::config().nfa_size_limit(Some(1_000));
+ /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
+ ///
+ /// let re = DFA::new_from_nfa(nfa)?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ /// let expected = Some(Match::must(0, 0..1));
+ /// re.captures(&mut cache, "A", &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn new_from_nfa(nfa: NFA) -> Result<DFA, BuildError> {
+ DFA::builder().build_from_nfa(nfa)
+ }
+
+ /// Create a new one-pass DFA that matches every input.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{dfa::onepass::DFA, Match};
+ ///
+ /// let dfa = DFA::always_match()?;
+ /// let mut cache = dfa.create_cache();
+ /// let mut caps = dfa.create_captures();
+ ///
+ /// let expected = Match::must(0, 0..0);
+ /// dfa.captures(&mut cache, "", &mut caps);
+ /// assert_eq!(Some(expected), caps.get_match());
+ /// dfa.captures(&mut cache, "foo", &mut caps);
+ /// assert_eq!(Some(expected), caps.get_match());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn always_match() -> Result<DFA, BuildError> {
+     // Feed the NFA returned by `thompson::NFA::always_match` through a
+     // default builder, like the other constructors on this type do.
+     DFA::builder().build_from_nfa(thompson::NFA::always_match())
+ }
+
+ /// Create a new one-pass DFA that never matches any input.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::dfa::onepass::DFA;
+ ///
+ /// let dfa = DFA::never_match()?;
+ /// let mut cache = dfa.create_cache();
+ /// let mut caps = dfa.create_captures();
+ ///
+ /// dfa.captures(&mut cache, "", &mut caps);
+ /// assert_eq!(None, caps.get_match());
+ /// dfa.captures(&mut cache, "foo", &mut caps);
+ /// assert_eq!(None, caps.get_match());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn never_match() -> Result<DFA, BuildError> {
+     // Feed the NFA returned by `thompson::NFA::never_match` through a
+     // default builder, like the other constructors on this type do.
+     DFA::builder().build_from_nfa(thompson::NFA::never_match())
+ }
+
+ /// Return a default configuration for a DFA.
+ ///
+ /// This is a convenience routine to avoid needing to import the `Config`
+ /// type when customizing the construction of a DFA. It is equivalent to
+ /// calling `Config::new()`.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to change the match semantics of this DFA from
+ /// its default "leftmost first" to "all." When using "all," non-greediness
+ /// doesn't apply and neither does preference order matching. Instead, the
+ /// longest match possible is always returned. (Although, by construction,
+ /// it's impossible for a one-pass DFA to have a different answer for
+ /// "preference order" vs "longest match.")
+ ///
+ /// ```
+ /// use regex_automata::{dfa::onepass::DFA, Match, MatchKind};
+ ///
+ /// let re = DFA::builder()
+ /// .configure(DFA::config().match_kind(MatchKind::All))
+ /// .build(r"(abc)+?")?;
+ /// let mut cache = re.create_cache();
+ /// let mut caps = re.create_captures();
+ ///
+ /// re.captures(&mut cache, "abcabc", &mut caps);
+ /// // Normally, the non-greedy repetition would give us a 0..3 match.
+ /// assert_eq!(Some(Match::must(0, 0..6)), caps.get_match());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn config() -> Config {
+ Config::new()
+ }
+
+ /// Return a builder for configuring the construction of a DFA.
+ ///
+ /// This is a convenience routine to avoid needing to import the
+ /// [`Builder`] type in common cases. It is equivalent to calling
+ /// `Builder::new()`.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to use the builder to disable UTF-8 mode.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{
+ /// dfa::onepass::DFA,
+ /// nfa::thompson,
+ /// util::syntax,
+ /// Match,
+ /// };
+ ///
+ /// let re = DFA::builder()
+ /// .syntax(syntax::Config::new().utf8(false))
+ /// .thompson(thompson::Config::new().utf8(false))
+ /// .build(r"foo(?-u:[^b])ar.*")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// let haystack = b"foo\xFFarzz\xE2\x98\xFF\n";
+ /// let expected = Some(Match::must(0, 0..8));
+ /// re.captures(&mut cache, haystack, &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn builder() -> Builder {
+ Builder::new()
+ }
+
+ /// Create a new empty set of capturing groups that is guaranteed to be
+ /// valid for the search APIs on this DFA.
+ ///
+ /// A `Captures` value created for a specific DFA cannot be used with any
+ /// other DFA.
+ ///
+ /// This is a convenience function for [`Captures::all`]. See the
+ /// [`Captures`] documentation for an explanation of its alternative
+ /// constructors that permit the DFA to do less work during a search, and
+ /// thus might make it faster.
+ #[inline]
+ pub fn create_captures(&self) -> Captures {
+     // `Captures::all` receives the group info by value, so give it a
+     // clone of the one held by this DFA's NFA.
+     let group_info = self.nfa.group_info().clone();
+     Captures::all(group_info)
+ }
+
+ /// Create a new cache for this DFA.
+ ///
+ /// This is equivalent to calling `Cache::new` with this DFA.
+ ///
+ /// The cache returned should only be used for searches for this
+ /// DFA. If you want to reuse the cache for another DFA, then you
+ /// must call [`Cache::reset`] with that DFA (or, equivalently,
+ /// [`DFA::reset_cache`]).
+ #[inline]
+ pub fn create_cache(&self) -> Cache {
+ Cache::new(self)
+ }
+
+ /// Reset the given cache such that it can be used for searching with
+ /// this DFA (and only this DFA).
+ ///
+ /// A cache reset permits reusing memory already allocated in this cache
+ /// with a different DFA.
+ ///
+ /// # Example
+ ///
+ /// This shows how to re-purpose a cache for use with a different DFA.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{dfa::onepass::DFA, Match};
+ ///
+ /// let re1 = DFA::new(r"\w")?;
+ /// let re2 = DFA::new(r"\W")?;
+ /// let mut caps1 = re1.create_captures();
+ /// let mut caps2 = re2.create_captures();
+ ///
+ /// let mut cache = re1.create_cache();
+ /// assert_eq!(
+ /// Some(Match::must(0, 0..2)),
+ /// { re1.captures(&mut cache, "Δ", &mut caps1); caps1.get_match() },
+ /// );
+ ///
+ /// // Using 'cache' with re2 is not allowed. It may result in panics or
+ /// // incorrect results. In order to re-purpose the cache, we must reset
+ /// // it with the one-pass DFA we'd like to use it with.
+ /// //
+ /// // Similarly, after this reset, using the cache with 're1' is also not
+ /// // allowed.
+ /// re2.reset_cache(&mut cache);
+ /// assert_eq!(
+ /// Some(Match::must(0, 0..3)),
+ /// { re2.captures(&mut cache, "☃", &mut caps2); caps2.get_match() },
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn reset_cache(&self, cache: &mut Cache) {
+ cache.reset(self);
+ }
+
+ /// Returns the configuration of this one-pass DFA, i.e., the
+ /// configuration that was provided by the caller when it was built.
+ #[inline]
+ pub fn get_config(&self) -> &Config {
+ &self.config
+ }
+
+ /// Returns a reference to the underlying NFA, i.e., the NFA that was
+ /// used to build this DFA.
+ #[inline]
+ pub fn get_nfa(&self) -> &NFA {
+ &self.nfa
+ }
+
+ /// Returns the number of patterns compiled into this DFA, as reported by
+ /// the underlying NFA.
+ ///
+ /// In the case of a DFA that contains no patterns, this returns `0`.
+ #[inline]
+ pub fn pattern_len(&self) -> usize {
+     self.nfa.pattern_len()
+ }
+
+ /// Returns the number of states in this one-pass DFA.
+ ///
+ /// Note that unlike dense or sparse DFAs, a one-pass DFA does not expose
+ /// a low level DFA API. Therefore, this routine has little use other than
+ /// being informational.
+ #[inline]
+ pub fn state_len(&self) -> usize {
+     // Each state takes up '1 << stride2' entries in the transition table.
+     let table_len = self.table.len();
+     table_len >> self.stride2
+ }
+
+ /// Returns the total number of elements in the alphabet for this DFA.
+ ///
+ /// That is, this returns the total number of transitions that each
+ /// state in this DFA must have. The maximum alphabet size is 256, which
+ /// corresponds to each possible byte value.
+ ///
+ /// The alphabet size may be less than 256 though, and unless
+ /// [`Config::byte_classes`] is disabled, it is typically much less than
+ /// 256. Namely, bytes are grouped into equivalence classes such that no
+ /// two bytes in the same class can distinguish a match from a non-match.
+ /// For example, in the regex `^[a-z]+$`, the ASCII bytes `a-z` could
+ /// all be in the same equivalence class. This leads to a massive space
+ /// savings.
+ ///
+ /// Note though that the alphabet length does _not_ necessarily equal the
+ /// total stride space taken up by a single DFA state in the transition
+ /// table. Namely, for performance reasons, the stride is always the
+ /// smallest power of two that is greater than or equal to the alphabet
+ /// length. For this reason, [`DFA::stride`] or [`DFA::stride2`] are
+ /// often more useful. The alphabet length is typically useful only for
+ /// informational purposes.
+ ///
+ /// Note also that unlike dense or sparse DFAs, a one-pass DFA does
+ /// not have a special end-of-input (EOI) transition. This is because
+ /// a one-pass DFA handles look-around assertions explicitly (like the
+ /// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM)) and does not build
+ /// them into the transitions of the DFA.
+ #[inline]
+ pub fn alphabet_len(&self) -> usize {
+ self.alphabet_len
+ }
+
+ /// Returns the total stride for every state in this DFA, expressed as the
+ /// exponent of a power of 2. The stride is the amount of space each state
+ /// takes up in the transition table, expressed as a number of transitions.
+ /// (Unused transitions map to dead states.)
+ ///
+ /// The stride of a DFA is always equivalent to the smallest power of
+ /// 2 that is greater than or equal to the DFA's alphabet length. This
+ /// definition uses extra space, but possibly permits faster translation
+ /// between state identifiers and their corresponding offsets in this DFA's
+ /// transition table.
+ ///
+ /// For example, if the DFA's stride is 16 transitions, then its `stride2`
+ /// is `4` since `2^4 = 16`.
+ ///
+ /// The minimum `stride2` value is `1` (corresponding to a stride of `2`)
+ /// while the maximum `stride2` value is `9` (corresponding to a stride
+ /// of `512`). The maximum in theory should be `8`, but because of some
+ /// implementation quirks that may be relaxed in the future, it is one more
+ /// than `8`. (Do note that a maximal stride is incredibly rare, as it
+ /// would imply that there is almost no redundancy in the regex pattern.)
+ ///
+ /// Note that unlike dense or sparse DFAs, a one-pass DFA does not expose
+ /// a low level DFA API. Therefore, this routine has little use other than
+ /// being informational.
+ #[inline]
+ pub fn stride2(&self) -> usize {
+ self.stride2
+ }
+
+ /// Returns the stride of every state in this DFA, i.e., the number of
+ /// transition table entries taken up by each state.
+ ///
+ /// Please see [`DFA::stride2`] for more information. In particular, this
+ /// returns the stride as the number of transitions, whereas `stride2`
+ /// returns it as the exponent of a power of 2.
+ ///
+ /// Note that unlike dense or sparse DFAs, a one-pass DFA does not expose
+ /// a low level DFA API. Therefore, this routine has little use other than
+ /// being informational.
+ #[inline]
+ pub fn stride(&self) -> usize {
+     let exponent = self.stride2;
+     1 << exponent
+ }
+
+ /// Returns the memory usage, in bytes, of this DFA.
+ ///
+ /// The figure is the sum of the bytes taken up by the entries of the
+ /// transition table and by the entries of the list of start states.
+ ///
+ /// This does **not** include the stack size used up by this DFA. To
+ /// compute that, use `std::mem::size_of::<onepass::DFA>()`.
+ #[inline]
+ pub fn memory_usage(&self) -> usize {
+     use core::mem::size_of;
+
+     let table_bytes = self.table.len() * size_of::<Transition>();
+     let starts_bytes = self.starts.len() * size_of::<StateID>();
+     table_bytes + starts_bytes
+ }
+}
+
+impl DFA {
+ /// Executes an anchored leftmost forward search, and returns true if and
+ /// only if this one-pass DFA matches the given haystack.
+ ///
+ /// This routine may short circuit if it knows that scanning future
+ /// input will never lead to a different result. In particular, if the
+ /// underlying DFA enters a match state, then this routine will return
+ /// `true` immediately without inspecting any future input. (Consider how
+ /// this might make a difference given the regex `a+` on the haystack
+ /// `aaaaaaaaaaaaaaa`. This routine can stop after it sees the first `a`,
+ /// but routines like `find` need to continue searching because `+` is
+ /// greedy by default.)
+ ///
+ /// The given `Input` is forcefully set to use [`Anchored::Yes`] if the
+ /// given configuration was [`Anchored::No`] (which is the default).
+ ///
+ /// # Panics
+ ///
+ /// This routine panics if the search could not complete. This can occur
+ /// in the following circumstances:
+ ///
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode. Concretely,
+ /// this occurs when using [`Anchored::Pattern`] without enabling
+ /// [`Config::starts_for_each_pattern`].
+ ///
+ /// When a search panics, callers cannot know whether a match exists or
+ /// not.
+ ///
+ /// Use [`DFA::try_search`] if you want to handle these panics as error
+ /// values instead.
+ ///
+ /// # Example
+ ///
+ /// This shows basic usage:
+ ///
+ /// ```
+ /// use regex_automata::dfa::onepass::DFA;
+ ///
+ /// let re = DFA::new("foo[0-9]+bar")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// assert!(re.is_match(&mut cache, "foo12345bar"));
+ /// assert!(!re.is_match(&mut cache, "foobar"));
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: consistency with search APIs
+ ///
+ /// `is_match` is guaranteed to return `true` whenever `captures` returns
+ /// a match. This includes searches that are executed entirely within a
+ /// codepoint:
+ ///
+ /// ```
+ /// use regex_automata::{dfa::onepass::DFA, Input};
+ ///
+ /// let re = DFA::new("a*")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// assert!(!re.is_match(&mut cache, Input::new("☃").span(1..2)));
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// Notice that when UTF-8 mode is disabled, then the above reports a
+ /// match because the restriction against zero-width matches that split a
+ /// codepoint has been lifted:
+ ///
+ /// ```
+ /// use regex_automata::{dfa::onepass::DFA, nfa::thompson::NFA, Input};
+ ///
+ /// let re = DFA::builder()
+ /// .thompson(NFA::config().utf8(false))
+ /// .build("a*")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// assert!(re.is_match(&mut cache, Input::new("☃").span(1..2)));
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn is_match<'h, I: Into<Input<'h>>>(
+ &self,
+ cache: &mut Cache,
+ input: I,
+ ) -> bool {
+ let mut input = input.into().earliest(true); // any match answers the question, so ask for the earliest
+ if matches!(input.get_anchored(), Anchored::No) { // only the default mode is overridden
+ input.set_anchored(Anchored::Yes);
+ }
+ self.try_search_slots(cache, &input, &mut []).unwrap().is_some() // no offsets wanted; a search error panics here
+ }
+
+ /// Executes an anchored leftmost forward search, and returns a `Match` if
+ /// and only if this one-pass DFA matches the given haystack.
+ ///
+ /// This routine only includes the overall match span. To get access to the
+ /// individual spans of each capturing group, use [`DFA::captures`].
+ ///
+ /// The given `Input` is forcefully set to use [`Anchored::Yes`] if the
+ /// given configuration was [`Anchored::No`] (which is the default).
+ ///
+ /// # Panics
+ ///
+ /// This routine panics if the search could not complete. This can occur
+ /// in the following circumstances:
+ ///
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode. Concretely,
+ /// this occurs when using [`Anchored::Pattern`] without enabling
+ /// [`Config::starts_for_each_pattern`].
+ ///
+ /// When a search panics, callers cannot know whether a match exists or
+ /// not.
+ ///
+ /// Use [`DFA::try_search`] if you want to handle these panics as error
+ /// values instead.
+ ///
+ /// # Example
+ ///
+ /// Leftmost first match semantics corresponds to the match with the
+ /// smallest starting offset, but where the end offset is determined by
+ /// preferring earlier branches in the original regular expression. For
+ /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam`
+ /// will match `Samwise` in `Samwise`.
+ ///
+ /// Generally speaking, the "leftmost first" match is how most backtracking
+ /// regular expressions tend to work. This is in contrast to POSIX-style
+ /// regular expressions that yield "leftmost longest" matches. Namely,
+ /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using
+ /// leftmost longest semantics. (This crate does not currently support
+ /// leftmost longest semantics.)
+ ///
+ /// ```
+ /// use regex_automata::{dfa::onepass::DFA, Match};
+ ///
+ /// let re = DFA::new("foo[0-9]+")?;
+ /// let mut cache = re.create_cache();
+ /// let expected = Match::must(0, 0..8);
+ /// assert_eq!(Some(expected), re.find(&mut cache, "foo12345"));
+ ///
+ /// // Even though a match is found after reading the first byte (`a`),
+ /// // the leftmost first match semantics demand that we find the earliest
+ /// // match that prefers earlier parts of the pattern over later parts.
+ /// let re = DFA::new("abc|a")?;
+ /// let mut cache = re.create_cache();
+ /// let expected = Match::must(0, 0..3);
+ /// assert_eq!(Some(expected), re.find(&mut cache, "abc"));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn find<'h, I: Into<Input<'h>>>(
+ &self,
+ cache: &mut Cache,
+ input: I,
+ ) -> Option<Match> {
+ let mut input = input.into();
+ if matches!(input.get_anchored(), Anchored::No) { // only the default mode is overridden
+ input.set_anchored(Anchored::Yes);
+ }
+ if self.get_nfa().pattern_len() == 1 { // single pattern: start/end slots fit in a fixed array
+ let mut slots = [None, None];
+ let pid =
+ self.try_search_slots(cache, &input, &mut slots).unwrap()?; // panics on error; `?` returns None on no match
+ let start = slots[0].unwrap().get();
+ let end = slots[1].unwrap().get();
+ return Some(Match::new(pid, Span { start, end }));
+ }
+ let ginfo = self.get_nfa().group_info();
+ let slots_len = ginfo.implicit_slot_len(); // presumably two slots (start, end) per pattern — see `GroupInfo`
+ let mut slots = vec![None; slots_len];
+ let pid = self.try_search_slots(cache, &input, &mut slots).unwrap()?; // panics on error; `?` returns None on no match
+ let start = slots[pid.as_usize() * 2].unwrap().get(); // overall match start lives at 'pid * 2'
+ let end = slots[pid.as_usize() * 2 + 1].unwrap().get(); // overall match end lives at 'pid * 2 + 1'
+ Some(Match::new(pid, Span { start, end }))
+ }
+
+ /// Executes an anchored leftmost forward search and writes the spans
+ /// of capturing groups that participated in a match into the provided
+ /// [`Captures`] value. If no match was found, then [`Captures::is_match`]
+ /// is guaranteed to return `false`.
+ ///
+ /// The given `Input` is forcefully set to use [`Anchored::Yes`] if the
+ /// given configuration was [`Anchored::No`] (which is the default).
+ ///
+ /// # Panics
+ ///
+ /// This routine panics if the search could not complete. This can occur
+ /// in the following circumstances:
+ ///
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode. Concretely,
+ /// this occurs when using [`Anchored::Pattern`] without enabling
+ /// [`Config::starts_for_each_pattern`].
+ ///
+ /// When a search panics, callers cannot know whether a match exists or
+ /// not.
+ ///
+ /// Use [`DFA::try_search`] if you want to handle these panics as error
+ /// values instead.
+ ///
+ /// # Example
+ ///
+ /// This shows a simple example of a one-pass regex that extracts
+ /// capturing group spans.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::onepass::DFA, Match, Span};
+ ///
+ /// let re = DFA::new(
+ /// // Notice that we use ASCII here. The corresponding Unicode regex
+ /// // is sadly not one-pass.
+ /// "(?P<first>[[:alpha:]]+)[[:space:]]+(?P<last>[[:alpha:]]+)",
+ /// )?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// re.captures(&mut cache, "Bruce Springsteen", &mut caps);
+ /// assert_eq!(Some(Match::must(0, 0..17)), caps.get_match());
+ /// assert_eq!(Some(Span::from(0..5)), caps.get_group(1));
+ /// assert_eq!(Some(Span::from(6..17)), caps.get_group_by_name("last"));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn captures<'h, I: Into<Input<'h>>>(
+ &self,
+ cache: &mut Cache,
+ input: I,
+ caps: &mut Captures,
+ ) {
+ let mut input = input.into();
+ if matches!(input.get_anchored(), Anchored::No) { // only the default mode is overridden
+ input.set_anchored(Anchored::Yes);
+ }
+ self.try_search(cache, &input, caps).unwrap(); // the fallible variant does the work; an error panics here
+ }
+
+ /// Executes an anchored leftmost forward search and writes the spans
+ /// of capturing groups that participated in a match into the provided
+ /// [`Captures`] value. If no match was found, then [`Captures::is_match`]
+ /// is guaranteed to return `false`.
+ ///
+ /// The differences with [`DFA::captures`] are:
+ ///
+ /// 1. This returns an error instead of panicking if the search fails.
+ /// 2. Accepts an `&Input` instead of a `Into<Input>`. This permits reusing
+ /// the same input for multiple searches, which _may_ be important for
+ /// latency.
+ /// 3. This does not automatically change the [`Anchored`] mode from `No`
+ /// to `Yes`. Instead, if [`Input::anchored`] is `Anchored::No`, then an
+ /// error is returned.
+ ///
+ /// # Errors
+ ///
+ /// This routine errors if the search could not complete. This can occur
+ /// in the following circumstances:
+ ///
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode. Concretely,
+ /// this occurs when using [`Anchored::Pattern`] without enabling
+ /// [`Config::starts_for_each_pattern`].
+ ///
+ /// When a search returns an error, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Example: specific pattern search
+ ///
+ /// This example shows how to build a multi-regex that permits searching
+ /// for specific patterns. Note that this is somewhat less useful than
+ /// in other regex engines, since a one-pass DFA by definition has no
+ /// ambiguity about which pattern can match at a position. That is, if it
+ /// were possible for two different patterns to match at the same starting
+ /// position, then the multi-regex would not be one-pass and construction
+ /// would have failed.
+ ///
+ /// Nevertheless, this can still be useful if you only care about matches
+ /// for a specific pattern, and want the DFA to report "no match" even if
+ /// some other pattern would have matched.
+ ///
+ /// Note that in order to make use of this functionality,
+ /// [`Config::starts_for_each_pattern`] must be enabled. It is disabled
+ /// by default since it may result in higher memory usage.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::onepass::DFA, Anchored, Input, Match, PatternID,
+ /// };
+ ///
+ /// let re = DFA::builder()
+ /// .configure(DFA::config().starts_for_each_pattern(true))
+ /// .build_many(&["[a-z]+", "[0-9]+"])?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ /// let haystack = "123abc";
+ /// let input = Input::new(haystack).anchored(Anchored::Yes);
+ ///
+ /// // A normal multi-pattern search will show pattern 1 matches.
+ /// re.try_search(&mut cache, &input, &mut caps)?;
+ /// assert_eq!(Some(Match::must(1, 0..3)), caps.get_match());
+ ///
+ /// // If we only want to report pattern 0 matches, then we'll get no
+ /// // match here.
+ /// let input = input.anchored(Anchored::Pattern(PatternID::must(0)));
+ /// re.try_search(&mut cache, &input, &mut caps)?;
+ /// assert_eq!(None, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: specifying the bounds of a search
+ ///
+ /// This example shows how providing the bounds of a search can produce
+ /// different results than simply sub-slicing the haystack.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{dfa::onepass::DFA, Anchored, Input, Match};
+ ///
+ /// // one-pass DFAs fully support Unicode word boundaries!
+ /// // A sad joke is that a Unicode aware regex like \w+\s is not one-pass.
+ /// // :-(
+ /// let re = DFA::new(r"\b[0-9]{3}\b")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ /// let haystack = "foo123bar";
+ ///
+ /// // Since we sub-slice the haystack, the search doesn't know about
+ /// // the larger context and assumes that `123` is surrounded by word
+ /// // boundaries. And of course, the match position is reported relative
+ /// // to the sub-slice as well, which means we get `0..3` instead of
+ /// // `3..6`.
+ /// let expected = Some(Match::must(0, 0..3));
+ /// let input = Input::new(&haystack[3..6]).anchored(Anchored::Yes);
+ /// re.try_search(&mut cache, &input, &mut caps)?;
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// // But if we provide the bounds of the search within the context of the
+ /// // entire haystack, then the search can take the surrounding context
+ /// // into account. (And if we did find a match, it would be reported
+ /// // as a valid offset into `haystack` instead of its sub-slice.)
+ /// let expected = None;
+ /// let input = Input::new(haystack).range(3..6).anchored(Anchored::Yes);
+ /// re.try_search(&mut cache, &input, &mut caps)?;
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn try_search(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ caps: &mut Captures,
+ ) -> Result<(), MatchError> {
+ let pid = self.try_search_slots(cache, input, caps.slots_mut())?; // the search writes straight into the slots owned by 'caps'
+ caps.set_pattern(pid); // 'pid' is an Option: None here means no match was found
+ Ok(())
+ }
+
+ /// Executes an anchored leftmost forward search and writes the spans
+ /// of capturing groups that participated in a match into the provided
+ /// `slots`, and returns the matching pattern ID. The contents of the
+ /// slots for patterns other than the matching pattern are unspecified. If
+ /// no match was found, then `None` is returned and the contents of all
+ /// `slots` is unspecified.
+ ///
+ /// This is like [`DFA::try_search`], but it accepts a raw slots slice
+ /// instead of a `Captures` value. This is useful in contexts where you
+ /// don't want or need to allocate a `Captures`.
+ ///
+ /// It is legal to pass _any_ number of slots to this routine. If the regex
+ /// engine would otherwise write a slot offset that doesn't fit in the
+ /// provided slice, then it is simply skipped. In general though, there are
+ /// usually three slice lengths you might want to use:
+ ///
+ /// * An empty slice, if you only care about which pattern matched.
+ /// * A slice with
+ /// [`pattern_len() * 2`](crate::dfa::onepass::DFA::pattern_len)
+ /// slots, if you only care about the overall match spans for each matching
+ /// pattern.
+ /// * A slice with
+ /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which
+ /// permits recording match offsets for every capturing group in every
+ /// pattern.
+ ///
+ /// # Errors
+ ///
+ /// This routine errors if the search could not complete. This can occur
+ /// in the following circumstances:
+ ///
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode. Concretely,
+ /// this occurs when using [`Anchored::Pattern`] without enabling
+ /// [`Config::starts_for_each_pattern`].
+ ///
+ /// When a search returns an error, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to find the overall match offsets in a
+ /// multi-pattern search without allocating a `Captures` value. Indeed, we
+ /// can put our slots right on the stack.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::onepass::DFA, Anchored, Input, PatternID};
+ ///
+ /// let re = DFA::new_many(&[
+ /// r"[a-zA-Z]+",
+ /// r"[0-9]+",
+ /// ])?;
+ /// let mut cache = re.create_cache();
+ /// let input = Input::new("123").anchored(Anchored::Yes);
+ ///
+ /// // We only care about the overall match offsets here, so we just
+ /// // allocate two slots for each pattern. Each slot records the start
+ /// // and end of the match.
+ /// let mut slots = [None; 4];
+ /// let pid = re.try_search_slots(&mut cache, &input, &mut slots)?;
+ /// assert_eq!(Some(PatternID::must(1)), pid);
+ ///
+ /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'.
+ /// // See 'GroupInfo' for more details on the mapping between groups and
+ /// // slot indices.
+ /// let slot_start = pid.unwrap().as_usize() * 2;
+ /// let slot_end = slot_start + 1;
+ /// assert_eq!(Some(0), slots[slot_start].map(|s| s.get()));
+ /// assert_eq!(Some(3), slots[slot_end].map(|s| s.get()));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn try_search_slots(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ slots: &mut [Option<NonMaxUsize>],
+ ) -> Result<Option<PatternID>, MatchError> {
+ let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
+ if !utf8empty {
+ return self.try_search_slots_imp(cache, input, slots);
+ }
+ // The empty-match check in the implementation reads the match span from the slots, so all implicit slots are needed.
+ let min = self.get_nfa().group_info().implicit_slot_len(); // see also PikeVM::try_search_slots
+ if slots.len() >= min {
+ return self.try_search_slots_imp(cache, input, slots);
+ }
+ if self.get_nfa().pattern_len() == 1 { // single pattern: the scratch slots fit in a fixed array
+ let mut enough = [None, None];
+ let got = self.try_search_slots_imp(cache, input, &mut enough)?;
+ // This is OK because we know `enough` is strictly bigger
+ // than `slots`, otherwise this special case isn't reached.
+ slots.copy_from_slice(&enough[..slots.len()]);
+ return Ok(got);
+ }
+ let mut enough = vec![None; min];
+ let got = self.try_search_slots_imp(cache, input, &mut enough)?;
+ // This is OK because we know `enough` is strictly bigger than
+ // `slots`, otherwise this special case isn't reached.
+ slots.copy_from_slice(&enough[..slots.len()]);
+ Ok(got)
+ }
+
+ #[inline(never)]
+ fn try_search_slots_imp(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ slots: &mut [Option<NonMaxUsize>],
+ ) -> Result<Option<PatternID>, MatchError> {
+ let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
+ match self.search_imp(cache, input, slots)? {
+ None => return Ok(None),
+ Some(pid) if !utf8empty => return Ok(Some(pid)), // no codepoint-splitting check needed
+ Some(pid) => {
+ // These slot indices are always correct because we know our
+ // 'pid' is valid and thus we know that the slot indices for it
+ // are valid.
+ let slot_start = pid.as_usize().wrapping_mul(2);
+ let slot_end = slot_start.wrapping_add(1);
+ // OK because we know we have a match and we know our caller
+ // provided slots are big enough (which we make true above if
+ // the caller didn't). Namely, we're only here when 'utf8empty'
+ // is true, and when that's true, we require slots for every
+ // pattern.
+ let start = slots[slot_start].unwrap().get();
+ let end = slots[slot_end].unwrap().get();
+ // If our match splits a codepoint, then we cannot report it
+ // as a match. And since one-pass DFAs only support anchored
+ // searches, we don't try to skip ahead to find the next match.
+ // We can just quit with nothing.
+ if start == end && !input.is_char_boundary(start) { // only empty matches are subject to this check
+ return Ok(None);
+ }
+ Ok(Some(pid))
+ }
+ }
+ }
+}
+
+impl DFA {
+ fn search_imp(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ slots: &mut [Option<NonMaxUsize>],
+ ) -> Result<Option<PatternID>, MatchError> {
+ // PERF: Some ideas. I ran out of steam after my initial impl to try
+ // many of these.
+ //
+ // 1) Try doing more state shuffling. Right now, all we do is push
+ // match states to the end of the transition table so that we can do
+ // 'if sid >= self.min_match_id' to know whether we're in a match
+ // state or not. But what about doing something like dense DFAs and
+ // pushing dead, match and states with captures/looks all toward the
+ // beginning of the transition table. Then we could do 'if sid <=
+ // self.max_special_id', in which case, we need to do some special
+ // handling of some sort. Otherwise, we get the happy path, just
+ // like in a DFA search. The main argument against this is that the
+ // one-pass DFA is likely to be used most often with capturing groups
+ // and if capturing groups are common, then this might wind up being a
+ // pessimization.
+ //
+ // 2) Consider moving 'PatternEpsilons' out of the transition table.
+ // It is only needed for match states and usually a small minority of
+ // states are match states. Therefore, we're using an extra 'u64' for
+ // most states.
+ //
+ // 3) I played around with the match state handling and it seems like
+ // there is probably a lot left on the table for improvement. The
+ // key tension is that the 'find_match' routine is a giant mess, but
+ // splitting it out into a non-inlineable function is a non-starter
+ // because the match state might consume input, so 'find_match' COULD
+ // be called quite a lot, and a function call at that point would trash
+ // perf. In theory, we could detect whether a match state consumes
+ // input and then specialize our search routine based on that. In that
+ // case, maybe an extra function call is OK, but even then, it might be
+ // too much of a latency hit. Another idea is to just try and figure
+ // out how to reduce the code size of 'find_match'. RE2 has a trick
+ // here where the match handling isn't done if we know the next byte of
+ // input yields a match too. Maybe we adopt that?
+ //
+ // This just might be a tricky DFA to optimize.
+
+ if input.is_done() {
+ return Ok(None);
+ }
+ // We unfortunately have a bit of book-keeping to do to set things
+ // up. We do have to set up our cache and clear all of our slots. In
+ // particular, clearing the slots is necessary for the case where we
+ // report a match, but one of the capturing groups didn't participate
+ // in the match but had a span set from a previous search. That would
+ // be bad. In theory, we could avoid all this slot clearing if we knew
+ // that every slot was always activated for every match. Then we would
+ // know they would always be overwritten when a match is found.
+ let explicit_slots_len = core::cmp::min(
+ Slots::LIMIT,
+ slots.len().saturating_sub(self.explicit_slot_start), // zero when the caller gave no room for explicit slots
+ );
+ cache.setup_search(explicit_slots_len);
+ for slot in cache.explicit_slots() {
+ *slot = None;
+ }
+ for slot in slots.iter_mut() {
+ *slot = None;
+ }
+ // We set the starting slots for every pattern up front. This does
+ // increase our latency somewhat, but it avoids having to do it every
+ // time we see a match state (which could be many times in a single
+ // search if the match state consumes input).
+ for pid in self.nfa.patterns() {
+ let i = pid.as_usize() * 2; // index of this pattern's implicit 'start' slot
+ if i >= slots.len() {
+ break;
+ }
+ slots[i] = NonMaxUsize::new(input.start());
+ }
+ let mut pid = None; // most recent match recorded by 'find_match', if any
+ let mut next_sid = match input.get_anchored() {
+ Anchored::Yes => self.start(),
+ Anchored::Pattern(pid) => self.start_pattern(pid)?, // errors unless per-pattern starts were enabled
+ Anchored::No => {
+ // If the regex is itself always anchored, then we're fine,
+ // even if the search is configured to be unanchored.
+ if !self.nfa.is_always_start_anchored() {
+ return Err(MatchError::unsupported_anchored(
+ Anchored::No,
+ ));
+ }
+ self.start()
+ }
+ };
+ let leftmost_first =
+ matches!(self.config.get_match_kind(), MatchKind::LeftmostFirst);
+ for at in input.start()..input.end() {
+ let sid = next_sid;
+ let trans = self.transition(sid, input.haystack()[at]);
+ next_sid = trans.state_id();
+ let epsilons = trans.epsilons();
+ if sid >= self.min_match_id { // match states have the largest IDs
+ if self.find_match(cache, input, at, sid, slots, &mut pid) {
+ if input.get_earliest()
+ || (leftmost_first && trans.match_wins())
+ {
+ return Ok(pid);
+ }
+ }
+ }
+ if sid == DEAD
+ || (!epsilons.looks().is_empty()
+ && !self.nfa.look_matcher().matches_set_inline(
+ epsilons.looks(),
+ input.haystack(),
+ at,
+ ))
+ {
+ return Ok(pid); // cannot advance: report the last match recorded, if any
+ }
+ epsilons.slots().apply(at, cache.explicit_slots()); // record capture offsets in scratch space, not in 'slots'
+ }
+ if next_sid >= self.min_match_id { // the state reached at the end of the span may itself be a match state
+ self.find_match(
+ cache,
+ input,
+ input.end(),
+ next_sid,
+ slots,
+ &mut pid,
+ );
+ }
+ Ok(pid)
+ }
+
+ /// Assumes 'sid' is a match state and looks for whether a match can be
+ /// reported at position 'at'. If so, appropriate offsets are written to
+ /// 'slots', 'matched_pid' is set to the matching pattern ID and 'true' is
+ /// returned.
+ ///
+ /// Even when 'sid' is a match state, it's possible that a match won't be
+ /// reported: when the state's conditional epsilon transitions ('looks') aren't
+ /// satisfied at 'at', 'false' is returned and nothing is written.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn find_match(
+ &self,
+ cache: &mut Cache,
+ input: &Input<'_>,
+ at: usize,
+ sid: StateID,
+ slots: &mut [Option<NonMaxUsize>],
+ matched_pid: &mut Option<PatternID>,
+ ) -> bool {
+ debug_assert!(sid >= self.min_match_id);
+ let pateps = self.pattern_epsilons(sid);
+ let epsilons = pateps.epsilons();
+ if !epsilons.looks().is_empty()
+ && !self.nfa.look_matcher().matches_set_inline(
+ epsilons.looks(),
+ input.haystack(),
+ at,
+ )
+ {
+ return false; // bail out before touching 'slots' or 'matched_pid'
+ }
+ let pid = pateps.pattern_id_unchecked();
+ // This calculation is always correct because we know our 'pid' is
+ // valid and thus we know that the slot indices for it are valid.
+ let slot_end = pid.as_usize().wrapping_mul(2).wrapping_add(1);
+ // Set the implicit 'end' slot for the matching pattern. (The 'start'
+ // slot was set at the beginning of the search.)
+ if slot_end < slots.len() { // skipped when the caller did not provide room for it
+ slots[slot_end] = NonMaxUsize::new(at);
+ }
+ // If the caller provided enough room, copy the previously recorded
+ // explicit slots from our scratch space to the caller provided slots.
+ // We *also* need to set any explicit slots that are active as part of
+ // the path to the match state.
+ if self.explicit_slot_start < slots.len() {
+ // NOTE: The 'cache.explicit_slots()' slice is set up at the
+ // beginning of every search such that it is guaranteed to return a
+ // slice of length equivalent to 'slots[explicit_slot_start..]'.
+ slots[self.explicit_slot_start..]
+ .copy_from_slice(cache.explicit_slots());
+ epsilons.slots().apply(at, &mut slots[self.explicit_slot_start..]);
+ }
+ *matched_pid = Some(pid);
+ true
+ }
+}
+
+impl DFA {
+ /// Returns the anchored start state for matching any pattern in this DFA.
+ fn start(&self) -> StateID {
+ self.starts[0] // the first entry is the start state shared by all patterns
+ }
+
+ /// Returns the anchored start state for matching the given pattern. If
+ /// 'starts_for_each_pattern' was not enabled, then this returns a
+ /// 'MatchError::unsupported_anchored' error. If the given pattern is not in
+ /// this DFA, then `Ok(DEAD)` is returned (the return type carries no Option).
+ fn start_pattern(&self, pid: PatternID) -> Result<StateID, MatchError> {
+ if !self.config.get_starts_for_each_pattern() {
+ return Err(MatchError::unsupported_anchored(Anchored::Pattern(
+ pid,
+ )));
+ }
+ // 'starts' always has non-zero length. The first entry is always the
+ // anchored starting state for all patterns, and the following entries
+ // are optional and correspond to the anchored starting states for
+ // patterns at pid+1. Thus, starts.len()-1 corresponds to the total
+ // number of patterns that one can explicitly search for. (And it may
+ // be zero.)
+ Ok(self.starts.get(pid.one_more()).copied().unwrap_or(DEAD)) // an out-of-range pattern maps to the DEAD state
+ }
+
+ /// Returns the transition from the given state ID and byte of input. The
+ /// transition includes the next state ID, the slots that should be saved
+ /// and any conditional epsilon transitions that must be satisfied in order
+ /// to take this transition.
+ fn transition(&self, sid: StateID, byte: u8) -> Transition {
+ let offset = sid.as_usize() << self.stride2(); // start of this state's row in the table
+ let class = self.classes.get(byte).as_usize(); // column: the class the byte maps to
+ self.table[offset + class]
+ }
+
+ /// Set the transition from the given state ID and byte of input to the
+ /// transition given. The entry is addressed exactly as in 'transition'.
+ fn set_transition(&mut self, sid: StateID, byte: u8, to: Transition) {
+ let offset = sid.as_usize() << self.stride2(); // start of this state's row in the table
+ let class = self.classes.get(byte).as_usize(); // column: the class the byte maps to
+ self.table[offset + class] = to;
+ }
+
+ /// Return an iterator of "sparse" transitions for the given state ID.
+ /// "sparse" in this context means that consecutive transitions that are
+ /// equivalent are returned as one group, and transitions to the DEAD state
+ /// are ignored.
+ ///
+ /// This winds up being useful for debug printing, since it's much terser
+ /// to display runs of equivalent transitions than the transition for every
+ /// possible byte value. Indeed, in practice, it's very common for runs
+ /// of equivalent transitions to appear.
+ fn sparse_transitions(&self, sid: StateID) -> SparseTransitionIter<'_> {
+ let start = sid.as_usize() << self.stride2(); // start of this state's row in the table
+ let end = start + self.alphabet_len(); // only the first alphabet_len() entries of the row are visited
+ SparseTransitionIter {
+ it: self.table[start..end].iter().enumerate(),
+ cur: None, // no run has been started yet
+ }
+ }
+
+ /// Return the pattern epsilons for the given state ID.
+ ///
+ /// If the given state ID does not correspond to a match state ID, then the
+ /// pattern epsilons returned is empty.
+ fn pattern_epsilons(&self, sid: StateID) -> PatternEpsilons {
+ let offset = sid.as_usize() << self.stride2(); // start of this state's row in the table
+ PatternEpsilons(self.table[offset + self.pateps_offset].0) // read from the state's own row; the entry's raw value is re-wrapped
+ }
+
+ /// Set the pattern epsilons for the given state ID, in the state's own row.
+ fn set_pattern_epsilons(&mut self, sid: StateID, pateps: PatternEpsilons) {
+ let offset = sid.as_usize() << self.stride2(); // start of this state's row in the table
+ self.table[offset + self.pateps_offset] = Transition(pateps.0); // the raw value is stored wrapped as a 'Transition'
+ }
+
+ /// Returns the state ID prior to the one given. This returns None if the
+ /// given ID is the first DFA state, which this routine takes to be DEAD.
+ fn prev_state_id(&self, id: StateID) -> Option<StateID> {
+ if id == DEAD {
+ None
+ } else {
+ // CORRECTNESS: Since 'id' is not the first state, subtracting 1
+ // is always valid.
+ Some(StateID::new_unchecked(id.as_usize().checked_sub(1).unwrap())) // would panic rather than wrap if 'id' were zero
+ }
+ }
+
+ /// Returns the state ID of the last state in this DFA's transition table.
+ /// "last" in this context means the last state to appear in memory, i.e.,
+ /// the one with the greatest ID.
+ fn last_state_id(&self) -> StateID {
+ // CORRECTNESS: A DFA table is always non-empty since it always at
+ // least contains a DEAD state. Since every state has the same stride,
+ // we can just compute what the "next" state ID would have been and
+ // then subtract 1 from it.
+ StateID::new_unchecked(
+ (self.table.len() >> self.stride2()).checked_sub(1).unwrap(), // table length in rows, minus one; panics on an empty table
+ )
+ }
+
+ /// Move the transitions from 'id1' to 'id2' and vice versa.
+ ///
+ /// WARNING: This does not update the rest of the transition table to have
+ /// transitions to 'id1' changed to 'id2' and vice versa. This merely moves
+ /// the states in memory.
+ pub(super) fn swap_states(&mut self, id1: StateID, id2: StateID) {
+ let o1 = id1.as_usize() << self.stride2(); // start of the first state's row
+ let o2 = id2.as_usize() << self.stride2(); // start of the second state's row
+ for b in 0..self.stride() { // whole rows are exchanged, stride() entries each
+ self.table.swap(o1 + b, o2 + b);
+ }
+ }
+
+ /// Map all state IDs in this DFA (transition table + start states)
+ /// according to the closure given.
+ pub(super) fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
+ for i in 0..self.state_len() {
+ let offset = i << self.stride2(); // start of state i's row
+ for b in 0..self.alphabet_len() { // only the first alphabet_len() entries of each row are rewritten
+ let next = self.table[offset + b].state_id();
+ self.table[offset + b].set_state_id(map(next)); // only the target state ID of the entry is replaced
+ }
+ }
+ for i in 0..self.starts.len() {
+ self.starts[i] = map(self.starts[i]);
+ }
+ }
+}
+
+impl core::fmt::Debug for DFA {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ fn debug_state_transitions( // writes one state's transitions as a comma separated list of runs
+ f: &mut core::fmt::Formatter,
+ dfa: &DFA,
+ sid: StateID,
+ ) -> core::fmt::Result {
+ for (i, (start, end, trans)) in
+ dfa.sparse_transitions(sid).enumerate()
+ {
+ let next = trans.state_id();
+ if i > 0 {
+ write!(f, ", ")?;
+ }
+ if start == end { // a run of length one is printed without a range
+ write!(
+ f,
+ "{:?} => {:?}",
+ DebugByte(start),
+ next.as_usize(),
+ )?;
+ } else {
+ write!(
+ f,
+ "{:?}-{:?} => {:?}",
+ DebugByte(start),
+ DebugByte(end),
+ next.as_usize(),
+ )?;
+ }
+ if trans.match_wins() { // "MW" marks a transition whose 'match_wins' flag is set
+ write!(f, " (MW)")?;
+ }
+ if !trans.epsilons().is_empty() {
+ write!(f, " ({:?})", trans.epsilons())?;
+ }
+ }
+ Ok(())
+ }
+
+ writeln!(f, "onepass::DFA(")?;
+ for index in 0..self.state_len() { // one output line per state, in state ID order
+ let sid = StateID::must(index);
+ let pateps = self.pattern_epsilons(sid);
+ if sid == DEAD { // "D" marks the DEAD state
+ write!(f, "D ")?;
+ } else if pateps.pattern_id().is_some() { // "*" marks a state that carries a pattern ID
+ write!(f, "* ")?;
+ } else {
+ write!(f, " ")?;
+ }
+ write!(f, "{:06?}", sid.as_usize())?;
+ if !pateps.is_empty() {
+ write!(f, " ({:?})", pateps)?;
+ }
+ write!(f, ": ")?;
+ debug_state_transitions(f, self, sid)?;
+ write!(f, "\n")?;
+ }
+ writeln!(f, "")?;
+ for (i, &sid) in self.starts.iter().enumerate() {
+ if i == 0 { // the first entry is the start state for all patterns
+ writeln!(f, "START(ALL): {:?}", sid.as_usize())?;
+ } else {
+ writeln!(
+ f,
+ "START(pattern: {:?}): {:?}",
+ i - 1, // entries after the first are offset by one from their pattern
+ sid.as_usize(),
+ )?;
+ }
+ }
+ writeln!(f, "state length: {:?}", self.state_len())?;
+ writeln!(f, "pattern length: {:?}", self.pattern_len())?;
+ writeln!(f, ")")?;
+ Ok(())
+ }
+}
+
+/// An iterator over groups of consecutive equivalent transitions in a single
+/// state. Each item is '(first index, last index, transition)' for one run.
+#[derive(Debug)]
+struct SparseTransitionIter<'a> {
+ it: core::iter::Enumerate<core::slice::Iter<'a, Transition>>, // the state's transitions paired with their index in the row
+ cur: Option<(u8, u8, Transition)>, // the run currently being extended, if any
+}
+
+impl<'a> Iterator for SparseTransitionIter<'a> {
+ type Item = (u8, u8, Transition);
+
+ fn next(&mut self) -> Option<(u8, u8, Transition)> {
+ while let Some((b, &trans)) = self.it.next() {
+ // Fine because we'll never have more than u8::MAX transitions in
+ // one state.
+ let b = b.as_u8();
+ let (prev_start, prev_end, prev_trans) = match self.cur {
+ Some(t) => t,
+ None => {
+ self.cur = Some((b, b, trans)); // first transition seen: start a run of length one
+ continue;
+ }
+ };
+ if prev_trans == trans {
+ self.cur = Some((prev_start, b, prev_trans)); // same transition: extend the current run
+ } else {
+ self.cur = Some((b, b, trans)); // different transition: start a new run
+ if prev_trans.state_id() != DEAD { // finished runs that lead to DEAD are dropped
+ return Some((prev_start, prev_end, prev_trans));
+ }
+ }
+ }
+ if let Some((start, end, trans)) = self.cur.take() { // input exhausted: emit the final run once
+ if trans.state_id() != DEAD {
+ return Some((start, end, trans));
+ }
+ }
+ None
+ }
+}
+
+/// A cache represents mutable state that a one-pass [`DFA`] requires during a
+/// search.
+///
+/// For a given one-pass DFA, its corresponding cache may be created either via
+/// [`DFA::create_cache`], or via [`Cache::new`]. They are equivalent in every
+/// way, except the former does not require explicitly importing `Cache`.
+///
+/// A particular `Cache` is coupled with the one-pass DFA from which it was
+/// created. It may only be used with that one-pass DFA. A cache and its
+/// allocations may be re-purposed via [`Cache::reset`], in which case, it can
+/// only be used with the new one-pass DFA (and not the old one).
+#[derive(Clone, Debug)]
+pub struct Cache {
+ /// Scratch space used to store slots during a search. Basically, we use
+ /// the caller provided slots to store slots known when a match occurs.
+ /// But after a match occurs, we might continue a search but ultimately
+ /// fail to extend the match. When continuing the search, we need some
+ /// place to store candidate capture offsets without overwriting the slot
+ /// offsets recorded for the most recently seen match.
+ explicit_slots: Vec<Option<NonMaxUsize>>,
+ /// The number of slots in the caller-provided 'Captures' value for the
+ /// current search. This is always at most 'explicit_slots.len()', but
+ /// might be less than it, if the caller provided fewer slots to fill.
+ explicit_slot_len: usize,
+}
+
+impl Cache {
+ /// Create a new [`onepass::DFA`](DFA) cache.
+ ///
+ /// A potentially more convenient routine to create a cache is
+ /// [`DFA::create_cache`], as it does not require also importing the
+ /// `Cache` type.
+ ///
+ /// If you want to reuse the returned `Cache` with some other one-pass DFA,
+ /// then you must call [`Cache::reset`] with the desired one-pass DFA.
+ pub fn new(re: &DFA) -> Cache {
+ let mut cache = Cache { explicit_slots: vec![], explicit_slot_len: 0 };
+ cache.reset(re);
+ cache
+ }
+
+ /// Reset this cache such that it can be used for searching with a
+ /// different [`onepass::DFA`](DFA).
+ ///
+ /// A cache reset permits reusing memory already allocated in this cache
+ /// with a different one-pass DFA.
+ ///
+ /// # Example
+ ///
+ /// This shows how to re-purpose a cache for use with a different one-pass
+ /// DFA.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{dfa::onepass::DFA, Match};
+ ///
+ /// let re1 = DFA::new(r"\w")?;
+ /// let re2 = DFA::new(r"\W")?;
+ /// let mut caps1 = re1.create_captures();
+ /// let mut caps2 = re2.create_captures();
+ ///
+ /// let mut cache = re1.create_cache();
+ /// assert_eq!(
+ /// Some(Match::must(0, 0..2)),
+ /// { re1.captures(&mut cache, "Δ", &mut caps1); caps1.get_match() },
+ /// );
+ ///
+ /// // Using 'cache' with re2 is not allowed. It may result in panics or
+ /// // incorrect results. In order to re-purpose the cache, we must reset
+ /// // it with the one-pass DFA we'd like to use it with.
+ /// //
+ /// // Similarly, after this reset, using the cache with 're1' is also not
+ /// // allowed.
+ /// re2.reset_cache(&mut cache);
+ /// assert_eq!(
+ /// Some(Match::must(0, 0..3)),
+ /// { re2.captures(&mut cache, "☃", &mut caps2); caps2.get_match() },
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn reset(&mut self, re: &DFA) {
+ let explicit_slot_len = re.get_nfa().group_info().explicit_slot_len();
+ self.explicit_slots.resize(explicit_slot_len, None);
+ self.explicit_slot_len = explicit_slot_len;
+ }
+
+ /// Returns the heap memory usage, in bytes, of this cache.
+ ///
+ /// This does **not** include the stack size used up by this cache. To
+ /// compute that, use `std::mem::size_of::<Cache>()`.
+ pub fn memory_usage(&self) -> usize {
+ self.explicit_slots.len() * core::mem::size_of::<Option<NonMaxUsize>>()
+ }
+
+ fn explicit_slots(&mut self) -> &mut [Option<NonMaxUsize>] {
+ &mut self.explicit_slots[..self.explicit_slot_len]
+ }
+
+ fn setup_search(&mut self, explicit_slot_len: usize) {
+ self.explicit_slot_len = explicit_slot_len;
+ }
+}
+
+/// Represents a single transition in a one-pass DFA.
+///
+/// The high 21 bits hold the state ID and the next bit holds the "match
+/// wins" flag. The low 42 bits hold the transition epsilons: the slots to
+/// save when this transition is followed and the conditional epsilon
+/// transitions that must be satisfied in order to follow this transition.
+#[derive(Clone, Copy, Eq, PartialEq)]
+struct Transition(u64);
+
+impl Transition {
+ const STATE_ID_BITS: u64 = 21;
+ const STATE_ID_SHIFT: u64 = 64 - Transition::STATE_ID_BITS;
+ const STATE_ID_LIMIT: u64 = 1 << Transition::STATE_ID_BITS;
+ const MATCH_WINS_SHIFT: u64 = 64 - (Transition::STATE_ID_BITS + 1);
+ const INFO_MASK: u64 = 0x000003FF_FFFFFFFF;
+
+ /// Return a new transition to the given state ID with the given epsilons.
+ fn new(match_wins: bool, sid: StateID, epsilons: Epsilons) -> Transition {
+ let match_wins =
+ if match_wins { 1 << Transition::MATCH_WINS_SHIFT } else { 0 };
+ let sid = sid.as_u64() << Transition::STATE_ID_SHIFT;
+ Transition(sid | match_wins | epsilons.0)
+ }
+
+ /// Returns true if and only if this transition points to the DEAD state.
+ fn is_dead(self) -> bool {
+ self.state_id() == DEAD
+ }
+
+ /// Return whether this transition has a "match wins" property.
+ ///
+ /// When a transition has this property, it means that if a match has been
+ /// found and the search uses leftmost-first semantics, then that match
+ /// should be returned immediately instead of continuing on.
+ ///
+ /// The "match wins" name comes from RE2, which uses a pretty much
+ /// identical mechanism for implementing leftmost-first semantics.
+ fn match_wins(&self) -> bool {
+ (self.0 >> Transition::MATCH_WINS_SHIFT & 1) == 1
+ }
+
+ /// Return the "next" state ID that this transition points to.
+ fn state_id(&self) -> StateID {
+ // OK because a Transition has a valid StateID in its upper bits by
+ // construction. The cast to usize is also correct, even on 16-bit
+ // targets because, again, we know the upper bits is a valid StateID,
+ // which can never overflow usize on any supported target.
+ StateID::new_unchecked(
+ (self.0 >> Transition::STATE_ID_SHIFT).as_usize(),
+ )
+ }
+
+ /// Set the "next" state ID in this transition.
+ fn set_state_id(&mut self, sid: StateID) {
+ *self = Transition::new(self.match_wins(), sid, self.epsilons());
+ }
+
+ /// Return the epsilons embedded in this transition.
+ fn epsilons(&self) -> Epsilons {
+ Epsilons(self.0 & Transition::INFO_MASK)
+ }
+}
+
+impl core::fmt::Debug for Transition {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ if self.is_dead() {
+ return write!(f, "0");
+ }
+ write!(f, "{}", self.state_id().as_usize())?;
+ if self.match_wins() {
+ write!(f, "-MW")?;
+ }
+ if !self.epsilons().is_empty() {
+ write!(f, "-{:?}", self.epsilons())?;
+ }
+ Ok(())
+ }
+}
+
+/// A representation of a match state's pattern ID along with the epsilons for
+/// when a match occurs.
+///
+/// A match state in a one-pass DFA, unlike in a more general DFA, has exactly
+/// one pattern ID. If it had more, then the original NFA would not have been
+/// one-pass.
+///
+/// The "epsilons" part of this corresponds to what was found in the epsilon
+/// transitions between the transition taken in the last byte of input and the
+/// ultimate match state. This might include saving slots and/or conditional
+/// epsilon transitions that must be satisfied before one can report the match.
+///
+/// Technically, every state has room for a 'PatternEpsilons', but it is only
+/// ever non-empty for match states.
+#[derive(Clone, Copy)]
+struct PatternEpsilons(u64);
+
+impl PatternEpsilons {
+ const PATTERN_ID_BITS: u64 = 22;
+ const PATTERN_ID_SHIFT: u64 = 64 - PatternEpsilons::PATTERN_ID_BITS;
+ // A sentinel value indicating that this is not a match state. We don't
+ // use 0 since 0 is a valid pattern ID.
+ const PATTERN_ID_NONE: u64 = 0x00000000_003FFFFF;
+ const PATTERN_ID_LIMIT: u64 = PatternEpsilons::PATTERN_ID_NONE;
+ const PATTERN_ID_MASK: u64 = 0xFFFFFC00_00000000;
+ const EPSILONS_MASK: u64 = 0x000003FF_FFFFFFFF;
+
+ /// Return a new empty pattern epsilons that has no pattern ID and has no
+ /// epsilons. This is suitable for non-match states.
+ fn empty() -> PatternEpsilons {
+ PatternEpsilons(
+ PatternEpsilons::PATTERN_ID_NONE
+ << PatternEpsilons::PATTERN_ID_SHIFT,
+ )
+ }
+
+ /// Whether this pattern epsilons is empty or not. It's empty when it has
+ /// no pattern ID and an empty epsilons.
+ fn is_empty(self) -> bool {
+ self.pattern_id().is_none() && self.epsilons().is_empty()
+ }
+
+ /// Return the pattern ID in this pattern epsilons if one exists.
+ fn pattern_id(self) -> Option<PatternID> {
+ let pid = self.0 >> PatternEpsilons::PATTERN_ID_SHIFT;
+ if pid == PatternEpsilons::PATTERN_ID_LIMIT {
+ None
+ } else {
+ Some(PatternID::new_unchecked(pid.as_usize()))
+ }
+ }
+
+ /// Returns the pattern ID without checking whether it's valid. If this is
+ /// called and there is no pattern ID in this `PatternEpsilons`, then this
+ /// will likely produce an incorrect result or possibly even a panic or
+ /// an overflow. But safety will not be violated.
+ ///
+ /// This is useful when you know a particular state is a match state. If
+ /// it's a match state, then it must have a pattern ID.
+ fn pattern_id_unchecked(self) -> PatternID {
+ let pid = self.0 >> PatternEpsilons::PATTERN_ID_SHIFT;
+ PatternID::new_unchecked(pid.as_usize())
+ }
+
+ /// Return a new pattern epsilons with the given pattern ID, but the same
+ /// epsilons.
+ fn set_pattern_id(self, pid: PatternID) -> PatternEpsilons {
+ PatternEpsilons(
+ (pid.as_u64() << PatternEpsilons::PATTERN_ID_SHIFT)
+ | (self.0 & PatternEpsilons::EPSILONS_MASK),
+ )
+ }
+
+ /// Return the epsilons part of this pattern epsilons.
+ fn epsilons(self) -> Epsilons {
+ Epsilons(self.0 & PatternEpsilons::EPSILONS_MASK)
+ }
+
+ /// Return a new pattern epsilons with the given epsilons, but the same
+ /// pattern ID.
+ fn set_epsilons(self, epsilons: Epsilons) -> PatternEpsilons {
+ PatternEpsilons(
+ (self.0 & PatternEpsilons::PATTERN_ID_MASK)
+ | u64::from(epsilons.0),
+ )
+ }
+}
+
+impl core::fmt::Debug for PatternEpsilons {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ if self.is_empty() {
+ return write!(f, "N/A");
+ }
+ if let Some(pid) = self.pattern_id() {
+ write!(f, "{}", pid.as_usize())?;
+ }
+ if !self.epsilons().is_empty() {
+ if self.pattern_id().is_some() {
+ write!(f, "/")?;
+ }
+ write!(f, "{:?}", self.epsilons())?;
+ }
+ Ok(())
+ }
+}
+
+/// Epsilons represents all of the NFA epsilons transitions that went into a
+/// single transition in a single DFA state. In this case, it only represents
+/// the epsilon transitions that have some kind of non-consuming side effect:
+/// either the transition requires storing the current position of the search
+/// into a slot, or the transition is conditional and requires the current
+/// position in the input to satisfy an assertion before the transition may be
+/// taken.
+///
+/// This folds the cumulative effect of a group of NFA states (all connected
+/// by epsilon transitions) down into a single set of bits. While these bits
+/// can represent all possible conditional epsilon transitions, it only permits
+/// storing up to a somewhat small number of slots.
+///
+/// Epsilons is represented as a 42-bit integer. For example, it is packed into
+/// the lower 42 bits of a `Transition`. (Where the high 22 bits contains a
+/// `StateID` and a special "match wins" property.)
+#[derive(Clone, Copy)]
+struct Epsilons(u64);
+
+impl Epsilons {
+ const SLOT_MASK: u64 = 0x000003FF_FFFFFC00;
+ const SLOT_SHIFT: u64 = 10;
+ const LOOK_MASK: u64 = 0x00000000_000003FF;
+
+ /// Create a new empty epsilons. It has no slots and no assertions that
+ /// need to be satisfied.
+ fn empty() -> Epsilons {
+ Epsilons(0)
+ }
+
+ /// Returns true if this epsilons contains no slots and no assertions.
+ fn is_empty(self) -> bool {
+ self.0 == 0
+ }
+
+ /// Returns the slot epsilon transitions.
+ fn slots(self) -> Slots {
+ Slots((self.0 >> Epsilons::SLOT_SHIFT).low_u32())
+ }
+
+ /// Set the slot epsilon transitions.
+ fn set_slots(self, slots: Slots) -> Epsilons {
+ Epsilons(
+ (u64::from(slots.0) << Epsilons::SLOT_SHIFT)
+ | (self.0 & Epsilons::LOOK_MASK),
+ )
+ }
+
+ /// Return the set of look-around assertions in these epsilon transitions.
+ fn looks(self) -> LookSet {
+ LookSet { bits: (self.0 & Epsilons::LOOK_MASK).low_u16() }
+ }
+
+ /// Set the look-around assertions on these epsilon transitions.
+ fn set_looks(self, look_set: LookSet) -> Epsilons {
+ Epsilons((self.0 & Epsilons::SLOT_MASK) | u64::from(look_set.bits))
+ }
+}
+
+impl core::fmt::Debug for Epsilons {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ let mut wrote = false;
+ if !self.slots().is_empty() {
+ write!(f, "{:?}", self.slots())?;
+ wrote = true;
+ }
+ if !self.looks().is_empty() {
+ if wrote {
+ write!(f, "/")?;
+ }
+ write!(f, "{:?}", self.looks())?;
+ wrote = true;
+ }
+ if !wrote {
+ write!(f, "N/A")?;
+ }
+ Ok(())
+ }
+}
+
+/// The set of epsilon transitions indicating that the current position in a
+/// search should be saved to a slot.
+///
+/// This *only* represents explicit slots. So for example, the pattern
+/// `[a-z]+([0-9]+)([a-z]+)` has:
+///
+/// * 3 capturing groups, thus 6 slots.
+/// * 1 implicit capturing group, thus 2 implicit slots.
+/// * 2 explicit capturing groups, thus 4 explicit slots.
+///
+/// While implicit slots are represented by epsilon transitions in an NFA, we
+/// do not explicitly represent them here. Instead, implicit slots are assumed
+/// to be present and handled automatically in the search code. Therefore,
+/// that means we only need to represent explicit slots in our epsilon
+/// transitions.
+///
+/// Its representation is a bit set. The bit 'i' is set if and only if there
+/// exists an explicit slot at index 'c', where 'c = (#patterns * 2) + i'. That
+/// is, bit '0' corresponds to the first explicit slot, and the first
+/// explicit slot appears immediately following the last implicit slot. (If
+/// this is confusing, see `GroupInfo` for more details on how slots work.)
+///
+/// A single `Slots` represents all the active slots in a sub-graph of an NFA,
+/// where all the states are connected by epsilon transitions. In effect, when
+/// traversing the one-pass DFA during a search, all slots set in a particular
+/// transition must be captured by recording the current search position.
+///
+/// The API of `Slots` requires the caller to handle the explicit slot offset.
+/// That is, a `Slots` doesn't know where the explicit slots start for a
+/// particular NFA. Thus, if the caller sees that bit 'i' is set, then they
+/// need to do the arithmetic above to find 'c', which is the real actual slot
+/// index in the corresponding NFA.
+#[derive(Clone, Copy)]
+struct Slots(u32);
+
+impl Slots {
+ const LIMIT: usize = 32;
+
+ /// Insert the slot at the given bit index.
+ fn insert(self, slot: usize) -> Slots {
+ debug_assert!(slot < Slots::LIMIT);
+ Slots(self.0 | (1 << slot.as_u32()))
+ }
+
+ /// Remove the slot at the given bit index.
+ fn remove(self, slot: usize) -> Slots {
+ debug_assert!(slot < Slots::LIMIT);
+ Slots(self.0 & !(1 << slot.as_u32()))
+ }
+
+ /// Returns true if and only if this set contains no slots.
+ fn is_empty(self) -> bool {
+ self.0 == 0
+ }
+
+ /// Returns an iterator over all of the set bits in this set.
+ fn iter(self) -> SlotsIter {
+ SlotsIter { slots: self }
+ }
+
+ /// For the position `at` in the current haystack, copy it to
+ /// `caller_explicit_slots` for all slots that are in this set.
+ ///
+ /// Callers may pass a slice of any length. Slots in this set bigger than
+ /// the length of the given explicit slots are simply skipped.
+ ///
+ /// The slice *must* correspond only to the explicit slots and the first
+ /// element of the slice must always correspond to the first explicit slot
+ /// in the corresponding NFA.
+ fn apply(
+ self,
+ at: usize,
+ caller_explicit_slots: &mut [Option<NonMaxUsize>],
+ ) {
+ if self.is_empty() {
+ return;
+ }
+ let at = NonMaxUsize::new(at);
+ for slot in self.iter() {
+ if slot >= caller_explicit_slots.len() {
+ break;
+ }
+ caller_explicit_slots[slot] = at;
+ }
+ }
+}
+
+impl core::fmt::Debug for Slots {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(f, "S")?;
+ for slot in self.iter() {
+ write!(f, "-{:?}", slot)?;
+ }
+ Ok(())
+ }
+}
+
+/// An iterator over all of the bits set in a slot set.
+///
+/// This returns the bit index that is set, so callers may need to offset it
+/// to get the actual NFA slot index.
+#[derive(Debug)]
+struct SlotsIter {
+ slots: Slots,
+}
+
+impl Iterator for SlotsIter {
+ type Item = usize;
+
+ fn next(&mut self) -> Option<usize> {
+ // Number of zeroes here is always <= u8::MAX, and so fits in a usize.
+ let slot = self.slots.0.trailing_zeros().as_usize();
+ if slot >= Slots::LIMIT {
+ return None;
+ }
+ self.slots = self.slots.remove(slot);
+ Some(slot)
+ }
+}
+
+/// An error that occurred during the construction of a one-pass DFA.
+///
+/// This error does not provide many introspection capabilities. There are
+/// generally only two things you can do with it:
+///
+/// * Obtain a human readable message via its `std::fmt::Display` impl.
+/// * Access an underlying [`thompson::BuildError`] type from its `source`
+/// method via the `std::error::Error` trait. This error only occurs when using
+/// convenience routines for building a one-pass DFA directly from a pattern
+/// string.
+///
+/// When the `std` feature is enabled, this implements the `std::error::Error`
+/// trait.
+#[derive(Clone, Debug)]
+pub struct BuildError {
+ kind: BuildErrorKind,
+}
+
+/// The kind of error that occurred during the construction of a one-pass DFA.
+#[derive(Clone, Debug)]
+enum BuildErrorKind {
+ NFA(crate::nfa::thompson::BuildError),
+ Word(UnicodeWordBoundaryError),
+ TooManyStates { limit: u64 },
+ TooManyPatterns { limit: u64 },
+ UnsupportedLook { look: Look },
+ ExceededSizeLimit { limit: usize },
+ NotOnePass { msg: &'static str },
+}
+
+impl BuildError {
+ fn nfa(err: crate::nfa::thompson::BuildError) -> BuildError {
+ BuildError { kind: BuildErrorKind::NFA(err) }
+ }
+
+ fn word(err: UnicodeWordBoundaryError) -> BuildError {
+ BuildError { kind: BuildErrorKind::Word(err) }
+ }
+
+ fn too_many_states(limit: u64) -> BuildError {
+ BuildError { kind: BuildErrorKind::TooManyStates { limit } }
+ }
+
+ fn too_many_patterns(limit: u64) -> BuildError {
+ BuildError { kind: BuildErrorKind::TooManyPatterns { limit } }
+ }
+
+ fn unsupported_look(look: Look) -> BuildError {
+ BuildError { kind: BuildErrorKind::UnsupportedLook { look } }
+ }
+
+ fn exceeded_size_limit(limit: usize) -> BuildError {
+ BuildError { kind: BuildErrorKind::ExceededSizeLimit { limit } }
+ }
+
+ fn not_one_pass(msg: &'static str) -> BuildError {
+ BuildError { kind: BuildErrorKind::NotOnePass { msg } }
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for BuildError {
+ fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+ use self::BuildErrorKind::*;
+
+ match self.kind {
+ NFA(ref err) => Some(err),
+ Word(ref err) => Some(err),
+ _ => None,
+ }
+ }
+}
+
+impl core::fmt::Display for BuildError {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ use self::BuildErrorKind::*;
+
+ match self.kind {
+ NFA(_) => write!(f, "error building NFA"),
+ Word(_) => write!(f, "NFA contains Unicode word boundary"),
+ TooManyStates { limit } => write!(
+ f,
+ "one-pass DFA exceeded a limit of {:?} for number of states",
+ limit,
+ ),
+ TooManyPatterns { limit } => write!(
+ f,
+ "one-pass DFA exceeded a limit of {:?} for number of patterns",
+ limit,
+ ),
+ UnsupportedLook { look } => write!(
+ f,
+ "one-pass DFA does not support the {:?} assertion",
+ look,
+ ),
+ ExceededSizeLimit { limit } => write!(
+ f,
+ "one-pass DFA exceeded size limit of {:?} during building",
+ limit,
+ ),
+ NotOnePass { msg } => write!(
+ f,
+ "one-pass DFA could not be built because \
+ pattern is not one-pass: {}",
+ msg,
+ ),
+ }
+ }
+}
+
+#[cfg(all(test, feature = "syntax"))]
+mod tests {
+ use alloc::string::ToString;
+
+ use super::*;
+
+ #[test]
+ fn fail_conflicting_transition() {
+ let predicate = |err: &str| err.contains("conflicting transition");
+
+ let err = DFA::new(r"a*[ab]").unwrap_err().to_string();
+ assert!(predicate(&err), "{}", err);
+ }
+
+ #[test]
+ fn fail_multiple_epsilon() {
+ let predicate = |err: &str| {
+ err.contains("multiple epsilon transitions to same state")
+ };
+
+ let err = DFA::new(r"(^|$)a").unwrap_err().to_string();
+ assert!(predicate(&err), "{}", err);
+ }
+
+ #[test]
+ fn fail_multiple_match() {
+ let predicate = |err: &str| {
+ err.contains("multiple epsilon transitions to match state")
+ };
+
+ let err = DFA::new_many(&[r"^", r"$"]).unwrap_err().to_string();
+ assert!(predicate(&err), "{}", err);
+ }
+
+ // This test is meant to build a one-pass regex with the maximum number of
+ // possible slots.
+ //
+ // NOTE: Remember that the slot limit only applies to explicit capturing
+ // groups. Any number of implicit capturing groups is supported (up to the
+ // maximum number of supported patterns), since implicit groups are handled
+ // by the search loop itself.
+ #[test]
+ fn max_slots() {
+ // One too many...
+ let pat = r"(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)(m)(n)(o)(p)(q)";
+ assert!(DFA::new(pat).is_err());
+ // Just right.
+ let pat = r"(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)(m)(n)(o)(p)";
+ assert!(DFA::new(pat).is_ok());
+ }
+
+ // This test ensures that the one-pass DFA works with all look-around
+ // assertions that we expect it to work with.
+ //
+ // The utility of this test is that each one-pass transition has a small
+ // amount of space to store look-around assertions. Currently, there is
+ // logic in the one-pass constructor to ensure there aren't more than ten
+ // possible assertions. And indeed, there are only ten possible assertions
+ // (at time of writing), so this is okay. But conceivably, more assertions
+ // could be added. So we check that things at least work with what we
+ // expect them to work with.
+ #[test]
+ fn assertions() {
+ // haystack anchors
+ assert!(DFA::new(r"^").is_ok());
+ assert!(DFA::new(r"$").is_ok());
+
+ // line anchors
+ assert!(DFA::new(r"(?m)^").is_ok());
+ assert!(DFA::new(r"(?m)$").is_ok());
+ assert!(DFA::new(r"(?Rm)^").is_ok());
+ assert!(DFA::new(r"(?Rm)$").is_ok());
+
+ // word boundaries
+ if cfg!(feature = "unicode-word-boundary") {
+ assert!(DFA::new(r"\b").is_ok());
+ assert!(DFA::new(r"\B").is_ok());
+ }
+ assert!(DFA::new(r"(?-u)\b").is_ok());
+ assert!(DFA::new(r"(?-u)\B").is_ok());
+ }
+
+ #[cfg(not(miri))] // takes too long on miri
+ #[test]
+ fn is_one_pass() {
+ use crate::util::syntax;
+
+ assert!(DFA::new(r"a*b").is_ok());
+ if cfg!(feature = "unicode-perl") {
+ assert!(DFA::new(r"\w").is_ok());
+ }
+ assert!(DFA::new(r"(?-u)\w*\s").is_ok());
+ assert!(DFA::new(r"(?s:.)*?").is_ok());
+ assert!(DFA::builder()
+ .syntax(syntax::Config::new().utf8(false))
+ .build(r"(?s-u:.)*?")
+ .is_ok());
+ }
+
+ #[test]
+ fn is_not_one_pass() {
+ assert!(DFA::new(r"a*a").is_err());
+ assert!(DFA::new(r"(?s-u:.)*?").is_err());
+ assert!(DFA::new(r"(?s:.)*?a").is_err());
+ }
+
+ #[cfg(not(miri))]
+ #[test]
+ fn is_not_one_pass_bigger() {
+ assert!(DFA::new(r"\w*\s").is_err());
+ }
+}
diff --git a/third_party/rust/regex-automata/src/dfa/regex.rs b/third_party/rust/regex-automata/src/dfa/regex.rs
new file mode 100644
index 0000000000..f39c1c055c
--- /dev/null
+++ b/third_party/rust/regex-automata/src/dfa/regex.rs
@@ -0,0 +1,871 @@
+/*!
+A DFA-backed `Regex`.
+
+This module provides [`Regex`], which is defined generically over the
+[`Automaton`] trait. A `Regex` implements convenience routines you might have
+come to expect, such as finding the start/end of a match and iterating over
+all non-overlapping matches. This `Regex` type is limited in its capabilities
+to what a DFA can provide. Therefore, APIs involving capturing groups, for
+example, are not provided.
+
+Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that
+finds the end offset of a match, whereas the other is a "reverse" DFA that
+finds the start offset of a match.
+
+See the [parent module](crate::dfa) for examples.
+*/
+
+#[cfg(feature = "alloc")]
+use alloc::vec::Vec;
+
+#[cfg(feature = "dfa-build")]
+use crate::dfa::dense::BuildError;
+use crate::{
+ dfa::{automaton::Automaton, dense},
+ util::{iter, search::Input},
+ Anchored, Match, MatchError,
+};
+#[cfg(feature = "alloc")]
+use crate::{
+ dfa::{sparse, StartKind},
+ util::search::MatchKind,
+};
+
+// When the alloc feature is enabled, the regex type sets its A type parameter
+// to default to an owned dense DFA. But without alloc, we set no default. This
+// makes things a lot more convenient in the common case, since writing out the
+// DFA types is pretty annoying.
+//
+// Since we have two different definitions but only want to write one doc
+// string, we use a macro to capture the doc and other attributes once and then
+// repeat them for each definition.
+macro_rules! define_regex_type {
+ ($(#[$doc:meta])*) => {
+ #[cfg(feature = "alloc")]
+ $(#[$doc])*
+ pub struct Regex<A = dense::OwnedDFA> {
+ forward: A,
+ reverse: A,
+ }
+
+ #[cfg(not(feature = "alloc"))]
+ $(#[$doc])*
+ pub struct Regex<A> {
+ forward: A,
+ reverse: A,
+ }
+ };
+}
+
+define_regex_type!(
+ /// A regular expression that uses deterministic finite automata for fast
+ /// searching.
+ ///
+ /// A regular expression is comprised of two DFAs, a "forward" DFA and a
+ /// "reverse" DFA. The forward DFA is responsible for detecting the end of
+ /// a match while the reverse DFA is responsible for detecting the start
+ /// of a match. Thus, in order to find the bounds of any given match, a
+ /// forward search must first be run followed by a reverse search. A match
+ /// found by the forward DFA guarantees that the reverse DFA will also find
+ /// a match.
+ ///
+ /// The type of the DFA used by a `Regex` corresponds to the `A` type
+ /// parameter, which must satisfy the [`Automaton`] trait. Typically,
+ /// `A` is either a [`dense::DFA`](crate::dfa::dense::DFA) or a
+ /// [`sparse::DFA`](crate::dfa::sparse::DFA), where dense DFAs use more
+ /// memory but search faster, while sparse DFAs use less memory but search
+ /// more slowly.
+ ///
+ /// # Crate features
+ ///
+ /// Note that despite what the documentation auto-generates, the _only_
+ /// crate feature needed to use this type is `dfa-search`. You do _not_
+ /// need to enable the `alloc` feature.
+ ///
+ /// By default, a regex's automaton type parameter is set to
+ /// `dense::DFA<Vec<u32>>` when the `alloc` feature is enabled. For most
+ /// in-memory work loads, this is the most convenient type that gives the
+ /// best search performance. When the `alloc` feature is disabled, no
+ /// default type is used.
+ ///
+ /// # When should I use this?
+ ///
+ /// Generally speaking, if you can afford the overhead of building a full
+ /// DFA for your regex, and you don't need things like capturing groups,
+ /// then this is a good choice if you're looking to optimize for matching
+ /// speed. Note however that its speed may be worse than a general purpose
+ /// regex engine if you don't provide a [`dense::Config::prefilter`] to the
+ /// underlying DFA.
+ ///
+ /// # Sparse DFAs
+ ///
+ /// Since a `Regex` is generic over the [`Automaton`] trait, it can be
+ /// used with any kind of DFA. While this crate constructs dense DFAs by
+ /// default, it is easy enough to build corresponding sparse DFAs, and then
+ /// build a regex from them:
+ ///
+ /// ```
+ /// use regex_automata::dfa::regex::Regex;
+ ///
+ /// // First, build a regex that uses dense DFAs.
+ /// let dense_re = Regex::new("foo[0-9]+")?;
+ ///
+ /// // Second, build sparse DFAs from the forward and reverse dense DFAs.
+ /// let fwd = dense_re.forward().to_sparse()?;
+ /// let rev = dense_re.reverse().to_sparse()?;
+ ///
+ /// // Third, build a new regex from the constituent sparse DFAs.
+ /// let sparse_re = Regex::builder().build_from_dfas(fwd, rev);
+ ///
+ /// // A regex that uses sparse DFAs can be used just like with dense DFAs.
+ /// assert_eq!(true, sparse_re.is_match(b"foo123"));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// Alternatively, one can use a [`Builder`] to construct a sparse DFA
+ /// more succinctly. (Note though that dense DFAs are still constructed
+ /// first internally, and then converted to sparse DFAs, as in the example
+ /// above.)
+ ///
+ /// ```
+ /// use regex_automata::dfa::regex::Regex;
+ ///
+ /// let sparse_re = Regex::builder().build_sparse(r"foo[0-9]+")?;
+ /// // A regex that uses sparse DFAs can be used just like with dense DFAs.
+ /// assert!(sparse_re.is_match(b"foo123"));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Fallibility
+ ///
+ /// Most of the search routines defined on this type will _panic_ when the
+ /// underlying search fails. This might be because the DFA gave up because
+ /// it saw a quit byte, whether configured explicitly or via heuristic
+ /// Unicode word boundary support, although neither are enabled by default.
+ /// Or it might fail because an invalid `Input` configuration is given,
+ /// for example, with an unsupported [`Anchored`] mode.
+ ///
+ /// If you need to handle these error cases instead of allowing them to
+ /// trigger a panic, then the lower level [`Regex::try_search`] provides
+ /// a fallible API that never panics.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to cause a search to terminate if it sees a
+ /// `\n` byte, and handle the error returned. This could be useful if, for
+ /// example, you wanted to prevent a user supplied pattern from matching
+ /// across a line boundary.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{dfa::{self, regex::Regex}, Input, MatchError};
+ ///
+ /// let re = Regex::builder()
+ /// .dense(dfa::dense::Config::new().quit(b'\n', true))
+ /// .build(r"foo\p{any}+bar")?;
+ ///
+ /// let input = Input::new("foo\nbar");
+ /// // Normally this would produce a match, since \p{any} contains '\n'.
+ /// // But since we instructed the automaton to enter a quit state if a
+ /// // '\n' is observed, this produces a match error instead.
+ /// let expected = MatchError::quit(b'\n', 3);
+ /// let got = re.try_search(&input).unwrap_err();
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[derive(Clone, Debug)]
+);
+
+#[cfg(all(feature = "syntax", feature = "dfa-build"))]
+impl Regex {
+ /// Parse the given regular expression using the default configuration and
+ /// return the corresponding regex.
+ ///
+ /// If you want a non-default configuration, then use the [`Builder`] to
+ /// set your own configuration.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{Match, dfa::regex::Regex};
+ ///
+ /// let re = Regex::new("foo[0-9]+bar")?;
+ /// assert_eq!(
+ /// Some(Match::must(0, 3..14)),
+ /// re.find(b"zzzfoo12345barzzz"),
+ /// );
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn new(pattern: &str) -> Result<Regex, BuildError> {
+ Builder::new().build(pattern)
+ }
+
+ /// Like `new`, but parses multiple patterns into a single "regex set."
+ /// This similarly uses the default regex configuration.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{Match, dfa::regex::Regex};
+ ///
+ /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?;
+ ///
+ /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux");
+ /// assert_eq!(Some(Match::must(0, 0..3)), it.next());
+ /// assert_eq!(Some(Match::must(1, 4..5)), it.next());
+ /// assert_eq!(Some(Match::must(0, 6..9)), it.next());
+ /// assert_eq!(Some(Match::must(1, 10..14)), it.next());
+ /// assert_eq!(Some(Match::must(1, 15..16)), it.next());
+ /// assert_eq!(Some(Match::must(0, 17..21)), it.next());
+ /// assert_eq!(None, it.next());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn new_many<P: AsRef<str>>(
+ patterns: &[P],
+ ) -> Result<Regex, BuildError> {
+ Builder::new().build_many(patterns)
+ }
+}
+
+#[cfg(all(feature = "syntax", feature = "dfa-build"))]
+impl Regex<sparse::DFA<Vec<u8>>> {
+ /// Parse the given regular expression using the default configuration,
+ /// except using sparse DFAs, and return the corresponding regex.
+ ///
+ /// If you want a non-default configuration, then use the [`Builder`] to
+ /// set your own configuration.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{Match, dfa::regex::Regex};
+ ///
+ /// let re = Regex::new_sparse("foo[0-9]+bar")?;
+ /// assert_eq!(
+ /// Some(Match::must(0, 3..14)),
+ /// re.find(b"zzzfoo12345barzzz"),
+ /// );
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn new_sparse(
+ pattern: &str,
+ ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
+ Builder::new().build_sparse(pattern)
+ }
+
+ /// Like `new`, but parses multiple patterns into a single "regex set"
+ /// using sparse DFAs. This otherwise similarly uses the default regex
+ /// configuration.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{Match, dfa::regex::Regex};
+ ///
+ /// let re = Regex::new_many_sparse(&["[a-z]+", "[0-9]+"])?;
+ ///
+ /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux");
+ /// assert_eq!(Some(Match::must(0, 0..3)), it.next());
+ /// assert_eq!(Some(Match::must(1, 4..5)), it.next());
+ /// assert_eq!(Some(Match::must(0, 6..9)), it.next());
+ /// assert_eq!(Some(Match::must(1, 10..14)), it.next());
+ /// assert_eq!(Some(Match::must(1, 15..16)), it.next());
+ /// assert_eq!(Some(Match::must(0, 17..21)), it.next());
+ /// assert_eq!(None, it.next());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn new_many_sparse<P: AsRef<str>>(
+ patterns: &[P],
+ ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
+ Builder::new().build_many_sparse(patterns)
+ }
+}
+
+/// Convenience routines for regex construction.
+impl Regex<dense::DFA<&'static [u32]>> {
+ /// Return a builder for configuring the construction of a `Regex`.
+ ///
+ /// This is a convenience routine to avoid needing to import the
+ /// [`Builder`] type in common cases.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to use the builder to disable UTF-8 mode
+ /// everywhere.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{
+ /// dfa::regex::Regex, nfa::thompson, util::syntax, Match,
+ /// };
+ ///
+ /// let re = Regex::builder()
+ /// .syntax(syntax::Config::new().utf8(false))
+ /// .thompson(thompson::Config::new().utf8(false))
+ /// .build(r"foo(?-u:[^b])ar.*")?;
+ /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
+ /// let expected = Some(Match::must(0, 1..9));
+ /// let got = re.find(haystack);
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn builder() -> Builder {
+ Builder::new()
+ }
+}
+
+/// Standard search routines for finding and iterating over matches.
+impl<A: Automaton> Regex<A> {
+ /// Returns true if and only if this regex matches the given haystack.
+ ///
+ /// This routine may short circuit if it knows that scanning future input
+ /// will never lead to a different result. In particular, if the underlying
+ /// DFA enters a match state or a dead state, then this routine will return
+ /// `true` or `false`, respectively, without inspecting any future input.
+ ///
+ /// # Panics
+ ///
+ /// This routine panics if the search could not complete. This can occur
+ /// in a number of circumstances:
+ ///
+ /// * The configuration of the DFA may permit it to "quit" the search.
+ /// For example, setting quit bytes or enabling heuristic support for
+ /// Unicode word boundaries. The default configuration does not enable any
+ /// option that could result in the DFA quitting.
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode.
+ ///
+ /// When a search panics, callers cannot know whether a match exists or
+ /// not.
+ ///
+ /// Use [`Regex::try_search`] if you want to handle these error conditions.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::dfa::regex::Regex;
+ ///
+ /// let re = Regex::new("foo[0-9]+bar")?;
+ /// assert_eq!(true, re.is_match("foo12345bar"));
+ /// assert_eq!(false, re.is_match("foobar"));
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn is_match<'h, I: Into<Input<'h>>>(&self, input: I) -> bool {
+ // Not only can we do an "earliest" search, but we can avoid doing a
+ // reverse scan too.
+ let input = input.into().earliest(true);
+ self.forward().try_search_fwd(&input).map(|x| x.is_some()).unwrap()
+ }
+
+ /// Returns the start and end offset of the leftmost match. If no match
+ /// exists, then `None` is returned.
+ ///
+ /// # Panics
+ ///
+ /// This routine panics if the search could not complete. This can occur
+ /// in a number of circumstances:
+ ///
+ /// * The configuration of the DFA may permit it to "quit" the search.
+ /// For example, setting quit bytes or enabling heuristic support for
+ /// Unicode word boundaries. The default configuration does not enable any
+ /// option that could result in the DFA quitting.
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode.
+ ///
+ /// When a search panics, callers cannot know whether a match exists or
+ /// not.
+ ///
+ /// Use [`Regex::try_search`] if you want to handle these error conditions.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{Match, dfa::regex::Regex};
+ ///
+ /// // Greediness is applied appropriately.
+ /// let re = Regex::new("foo[0-9]+")?;
+ /// assert_eq!(Some(Match::must(0, 3..11)), re.find("zzzfoo12345zzz"));
+ ///
+ /// // Even though a match is found after reading the first byte (`a`),
+ /// // the default leftmost-first match semantics demand that we find the
+ /// // earliest match that prefers earlier parts of the pattern over latter
+ /// // parts.
+ /// let re = Regex::new("abc|a")?;
+ /// assert_eq!(Some(Match::must(0, 0..3)), re.find("abc"));
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn find<'h, I: Into<Input<'h>>>(&self, input: I) -> Option<Match> {
+ self.try_search(&input.into()).unwrap()
+ }
+
+ /// Returns an iterator over all non-overlapping leftmost matches in the
+ /// given bytes. If no match exists, then the iterator yields no elements.
+ ///
+ /// This corresponds to the "standard" regex search iterator.
+ ///
+ /// # Panics
+ ///
+ /// If the search returns an error during iteration, then iteration
+ /// panics. See [`Regex::find`] for the panic conditions.
+ ///
+ /// Use [`Regex::try_search`] with
+ /// [`util::iter::Searcher`](crate::util::iter::Searcher) if you want to
+ /// handle these error conditions.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{Match, dfa::regex::Regex};
+ ///
+ /// let re = Regex::new("foo[0-9]+")?;
+ /// let text = "foo1 foo12 foo123";
+ /// let matches: Vec<Match> = re.find_iter(text).collect();
+ /// assert_eq!(matches, vec![
+ /// Match::must(0, 0..4),
+ /// Match::must(0, 5..10),
+ /// Match::must(0, 11..17),
+ /// ]);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn find_iter<'r, 'h, I: Into<Input<'h>>>(
+ &'r self,
+ input: I,
+ ) -> FindMatches<'r, 'h, A> {
+ let it = iter::Searcher::new(input.into());
+ FindMatches { re: self, it }
+ }
+}
+
+/// Lower level fallible search routines that permit controlling where the
+/// search starts and ends in a particular sequence.
+impl<A: Automaton> Regex<A> {
+ /// Returns the start and end offset of the leftmost match. If no match
+ /// exists, then `None` is returned.
+ ///
+ /// This is like [`Regex::find`] but with two differences:
+ ///
+ /// 1. It is not generic over `Into<Input>` and instead accepts a
+ /// `&Input`. This permits reusing the same `Input` for multiple searches
+ /// without needing to create a new one. This _may_ help with latency.
+ /// 2. It returns an error if the search could not complete, whereas
+ /// [`Regex::find`] will panic.
+ ///
+ /// # Errors
+ ///
+ /// This routine errors if the search could not complete. This can occur
+ /// in the following circumstances:
+ ///
+ /// * The configuration of the DFA may permit it to "quit" the search.
+ /// For example, setting quit bytes or enabling heuristic support for
+ /// Unicode word boundaries. The default configuration does not enable any
+ /// option that could result in the DFA quitting.
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode.
+ ///
+ /// When a search returns an error, callers cannot know whether a match
+ /// exists or not.
+ #[inline]
+ pub fn try_search(
+ &self,
+ input: &Input<'_>,
+ ) -> Result<Option<Match>, MatchError> {
+ let (fwd, rev) = (self.forward(), self.reverse());
+ let end = match fwd.try_search_fwd(input)? {
+ None => return Ok(None),
+ Some(end) => end,
+ };
+ // This special cases an empty match at the beginning of the search. If
+ // our end matches our start, then since a reverse DFA can't match past
+ // the start, it must follow that our starting position is also our end
+ // position. So short circuit and skip the reverse search.
+ if input.start() == end.offset() {
+ return Ok(Some(Match::new(
+ end.pattern(),
+ end.offset()..end.offset(),
+ )));
+ }
+ // We can also skip the reverse search if we know our search was
+ // anchored. This occurs either when the input config is anchored or
+ // when we know the regex itself is anchored. In this case, we know the
+ // start of the match, if one is found, must be the start of the
+ // search.
+ if self.is_anchored(input) {
+ return Ok(Some(Match::new(
+ end.pattern(),
+ input.start()..end.offset(),
+ )));
+ }
+ // N.B. I have tentatively convinced myself that it isn't necessary
+ // to specify the specific pattern for the reverse search since the
+ // reverse search will always find the same pattern to match as the
+ // forward search. But I lack a rigorous proof. Why not just provide
+ // the pattern anyway? Well, if it is needed, then leaving it out
+ // gives us a chance to find a witness. (Also, if we don't need to
+ // specify the pattern, then we don't need to build the reverse DFA
+ // with 'starts_for_each_pattern' enabled.)
+ //
+ // We also need to be careful to disable 'earliest' for the reverse
+ // search, since it could be enabled for the forward search. In the
+ // reverse case, to satisfy "leftmost" criteria, we need to match
+ // as much as we can. We also need to be careful to make the search
+ // anchored. We don't want the reverse search to report any matches
+ // other than the one beginning at the end of our forward search.
+ let revsearch = input
+ .clone()
+ .span(input.start()..end.offset())
+ .anchored(Anchored::Yes)
+ .earliest(false);
+ let start = rev
+ .try_search_rev(&revsearch)?
+ .expect("reverse search must match if forward search does");
+ assert_eq!(
+ start.pattern(),
+ end.pattern(),
+ "forward and reverse search must match same pattern",
+ );
+ assert!(start.offset() <= end.offset());
+ Ok(Some(Match::new(end.pattern(), start.offset()..end.offset())))
+ }
+
+ /// Returns true if the given input requests an anchored search (of either
+ /// kind) or if the forward DFA reports that it is always start-anchored.
+ fn is_anchored(&self, input: &Input<'_>) -> bool {
+ match input.get_anchored() {
+ Anchored::No => self.forward().is_always_start_anchored(), // defer to the DFA
+ Anchored::Yes | Anchored::Pattern(_) => true, // caller asked for anchoring
+ }
+ }
+}
+
+/// Non-search APIs for querying information about the regex and accessing
+/// its underlying DFAs.
+impl<A: Automaton> Regex<A> {
+ /// Return the underlying DFA responsible for forward matching.
+ ///
+ /// This is useful for accessing the underlying DFA and converting it to
+ /// some other format or size. See the [`Builder::build_from_dfas`] docs
+ /// for an example of where this might be useful.
+ pub fn forward(&self) -> &A {
+ &self.forward
+ }
+
+ /// Return the underlying DFA responsible for reverse matching.
+ ///
+ /// This is useful for accessing the underlying DFA and converting it to
+ /// some other format or size. See the [`Builder::build_from_dfas`] docs
+ /// for an example of where this might be useful.
+ pub fn reverse(&self) -> &A {
+ &self.reverse
+ }
+
+ /// Returns the total number of patterns matched by this regex.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::dfa::regex::Regex;
+ ///
+ /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?;
+ /// assert_eq!(3, re.pattern_len());
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn pattern_len(&self) -> usize {
+ assert_eq!(self.forward().pattern_len(), self.reverse().pattern_len());
+ self.forward().pattern_len()
+ }
+}
+
+/// An iterator over all non-overlapping matches for an infallible search.
+///
+/// The iterator yields a [`Match`] value until no more matches could be found.
+/// If the underlying regex engine returns an error, then a panic occurs.
+///
+/// The type parameters are as follows:
+///
+/// * `A` represents the type of the underlying DFA that implements the
+/// [`Automaton`] trait.
+///
+/// The lifetime parameters are as follows:
+///
+/// * `'h` represents the lifetime of the haystack being searched.
+/// * `'r` represents the lifetime of the regex object itself.
+///
+/// This iterator can be created with the [`Regex::find_iter`] method.
+#[derive(Debug)]
+pub struct FindMatches<'r, 'h, A> {
+ re: &'r Regex<A>,
+ it: iter::Searcher<'h>,
+}
+
+impl<'r, 'h, A: Automaton> Iterator for FindMatches<'r, 'h, A> {
+ type Item = Match;
+
+ #[inline]
+ fn next(&mut self) -> Option<Match> {
+ let FindMatches { re, ref mut it } = *self; // shared regex + mutable searcher
+ it.advance(|input| re.try_search(input)) // each step runs one `try_search`
+ }
+}
+
+/// A builder for a regex based on deterministic finite automatons.
+///
+/// This builder permits configuring options for the syntax of a pattern, the
+/// NFA construction, the DFA construction and finally the regex searching
+/// itself. This builder is different from a general purpose regex builder in
+/// that it permits fine grain configuration of the construction process. The
+/// trade off for this is complexity, and the possibility of setting a
+/// configuration that might not make sense. For example, there are two
+/// different UTF-8 modes:
+///
+/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls
+/// whether the pattern itself can contain sub-expressions that match invalid
+/// UTF-8.
+/// * [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) controls
+/// how the regex iterators themselves advance the starting position of the
+/// next search when a match with zero length is found.
+///
+/// Generally speaking, callers will want to either enable all of these or
+/// disable all of these.
+///
+/// Internally, building a regex requires building two DFAs, where one is
+/// responsible for finding the end of a match and the other is responsible
+/// for finding the start of a match. If you only need to detect whether
+/// something matched, or only the end of a match, then you should use a
+/// [`dense::Builder`] to construct a single DFA, which is cheaper than
+/// building two DFAs.
+///
+/// # Build methods
+///
+/// This builder has a few "build" methods. In general, it's the result of
+/// combining the following parameters:
+///
+/// * Building one or many regexes.
+/// * Building a regex with dense or sparse DFAs.
+///
+/// The simplest "build" method is [`Builder::build`]. It accepts a single
+/// pattern and builds a dense DFA using `usize` for the state identifier
+/// representation.
+///
+/// The most general "build" method is [`Builder::build_many`], which permits
+/// building a regex that searches for multiple patterns simultaneously while
+/// using a specific state identifier representation.
+///
+/// The most flexible "build" method, but hardest to use, is
+/// [`Builder::build_from_dfas`]. This exposes the fact that a [`Regex`] is
+/// just a pair of DFAs, and this method allows you to specify those DFAs
+/// exactly.
+///
+/// # Example
+///
+/// This example shows how to disable UTF-8 mode in the syntax and the regex
+/// itself. This is generally what you want for matching on arbitrary bytes.
+///
+/// ```
+/// # if cfg!(miri) { return Ok(()); } // miri takes too long
+/// use regex_automata::{
+/// dfa::regex::Regex, nfa::thompson, util::syntax, Match,
+/// };
+///
+/// let re = Regex::builder()
+/// .syntax(syntax::Config::new().utf8(false))
+/// .thompson(thompson::Config::new().utf8(false))
+/// .build(r"foo(?-u:[^b])ar.*")?;
+/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
+/// let expected = Some(Match::must(0, 1..9));
+/// let got = re.find(haystack);
+/// assert_eq!(expected, got);
+/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
+/// // but the subsequent `.*` does not! Disabling UTF-8
+/// // on the syntax permits this.
+/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone, Debug)]
+pub struct Builder {
+ #[cfg(feature = "dfa-build")]
+ dfa: dense::Builder,
+}
+
+impl Builder {
+ /// Create a new regex builder with the default configuration.
+ pub fn new() -> Builder {
+ Builder {
+ #[cfg(feature = "dfa-build")]
+ dfa: dense::Builder::new(),
+ }
+ }
+
+ /// Build a regex from the given pattern.
+ ///
+ /// If there was a problem parsing or compiling the pattern, then an error
+ /// is returned.
+ #[cfg(all(feature = "syntax", feature = "dfa-build"))]
+ pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> {
+ self.build_many(&[pattern])
+ }
+
+ /// Build a regex from the given pattern using sparse DFAs.
+ ///
+ /// If there was a problem parsing or compiling the pattern, then an error
+ /// is returned.
+ #[cfg(all(feature = "syntax", feature = "dfa-build"))]
+ pub fn build_sparse(
+ &self,
+ pattern: &str,
+ ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
+ self.build_many_sparse(&[pattern])
+ }
+
+ /// Build a regex (a forward and a reverse DFA) from the given patterns.
+ #[cfg(all(feature = "syntax", feature = "dfa-build"))]
+ pub fn build_many<P: AsRef<str>>(
+ &self,
+ patterns: &[P],
+ ) -> Result<Regex, BuildError> {
+ let forward = self.dfa.build_many(patterns)?; // builder's config, as-is
+ let reverse = self
+ .dfa
+ .clone() // overrides below go on a copy, not on `self.dfa`
+ .configure(
+ dense::Config::new()
+ .prefilter(None)
+ .specialize_start_states(false)
+ .start_kind(StartKind::Anchored) // reverse DFA must be anchored
+ .match_kind(MatchKind::All), // reverse DFA uses `All` semantics
+ )
+ .thompson(crate::nfa::thompson::Config::new().reverse(true)) // match in reverse
+ .build_many(patterns)?;
+ Ok(self.build_from_dfas(forward, reverse))
+ }
+
+ /// Build a sparse regex from the given patterns (built dense, then converted).
+ #[cfg(all(feature = "syntax", feature = "dfa-build"))]
+ pub fn build_many_sparse<P: AsRef<str>>(
+ &self,
+ patterns: &[P],
+ ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
+ let re = self.build_many(patterns)?; // dense regex first
+ let forward = re.forward().to_sparse()?;
+ let reverse = re.reverse().to_sparse()?;
+ Ok(self.build_from_dfas(forward, reverse))
+ }
+
+ /// Build a regex from its component forward and reverse DFAs.
+ ///
+ /// This is useful when deserializing a regex from some arbitrary
+ /// memory region. This is also useful for building regexes from other
+ /// types of DFAs.
+ ///
+ /// If you're building the DFAs from scratch instead of building new DFAs
+ /// from other DFAs, then you'll need to make sure that the reverse DFA is
+ /// configured correctly to match the intended semantics. Namely:
+ ///
+ /// * It should be anchored.
+ /// * It should use [`MatchKind::All`] semantics.
+ /// * It should match in reverse.
+ /// * Otherwise, its configuration should match the forward DFA.
+ ///
+ /// If these conditions aren't satisfied, then the behavior of searches is
+ /// unspecified.
+ ///
+ /// Note that when using this constructor, no configuration is applied.
+ /// Since this routine provides the DFAs to the builder, there is no
+ /// opportunity to apply other configuration options.
+ ///
+ /// # Example
+ ///
+ /// This example is a bit contrived. The usual use of these methods
+ /// would involve serializing `initial_re` somewhere and then deserializing
+ /// it later to build a regex. But in this case, we do everything in
+ /// memory.
+ ///
+ /// ```
+ /// use regex_automata::dfa::regex::Regex;
+ ///
+ /// let initial_re = Regex::new("foo[0-9]+")?;
+ /// assert_eq!(true, initial_re.is_match(b"foo123"));
+ ///
+ /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse());
+ /// let re = Regex::builder().build_from_dfas(fwd, rev);
+ /// assert_eq!(true, re.is_match(b"foo123"));
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// This example shows how to build a `Regex` that uses sparse DFAs instead
+ /// of dense DFAs without using one of the convenience `build_sparse`
+ /// routines:
+ ///
+ /// ```
+ /// use regex_automata::dfa::regex::Regex;
+ ///
+ /// let initial_re = Regex::new("foo[0-9]+")?;
+ /// assert_eq!(true, initial_re.is_match(b"foo123"));
+ ///
+ /// let fwd = initial_re.forward().to_sparse()?;
+ /// let rev = initial_re.reverse().to_sparse()?;
+ /// let re = Regex::builder().build_from_dfas(fwd, rev);
+ /// assert_eq!(true, re.is_match(b"foo123"));
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn build_from_dfas<A: Automaton>(
+ &self,
+ forward: A,
+ reverse: A,
+ ) -> Regex<A> {
+ Regex { forward, reverse }
+ }
+
+ /// Set the syntax configuration for this builder using
+ /// [`syntax::Config`](crate::util::syntax::Config).
+ ///
+ /// This permits setting things like case insensitivity, Unicode and multi
+ /// line mode.
+ #[cfg(all(feature = "syntax", feature = "dfa-build"))]
+ pub fn syntax(
+ &mut self,
+ config: crate::util::syntax::Config,
+ ) -> &mut Builder {
+ self.dfa.syntax(config);
+ self
+ }
+
+ /// Set the Thompson NFA configuration for this builder using
+ /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
+ ///
+ /// This permits setting things like whether additional time should be
+ /// spent shrinking the size of the NFA.
+ #[cfg(all(feature = "syntax", feature = "dfa-build"))]
+ pub fn thompson(
+ &mut self,
+ config: crate::nfa::thompson::Config,
+ ) -> &mut Builder {
+ self.dfa.thompson(config);
+ self
+ }
+
+ /// Set the dense DFA compilation configuration for this builder using
+ /// [`dense::Config`](dense::Config).
+ ///
+ /// This permits setting things like whether the underlying DFAs should
+ /// be minimized.
+ #[cfg(feature = "dfa-build")]
+ pub fn dense(&mut self, config: dense::Config) -> &mut Builder {
+ self.dfa.configure(config);
+ self
+ }
+}
+
+impl Default for Builder {
+ fn default() -> Builder {
+ Builder::new()
+ }
+}
diff --git a/third_party/rust/regex-automata/src/dfa/remapper.rs b/third_party/rust/regex-automata/src/dfa/remapper.rs
new file mode 100644
index 0000000000..6e49646721
--- /dev/null
+++ b/third_party/rust/regex-automata/src/dfa/remapper.rs
@@ -0,0 +1,242 @@
+use alloc::vec::Vec;
+
+use crate::util::primitives::StateID;
+
+/// Remappable is a tightly coupled abstraction that facilitates remapping
+/// state identifiers in DFAs.
+///
+/// The main idea behind remapping state IDs is that DFAs often need to check
+/// if a certain state is a "special" state of some kind (like a match state)
+/// during a search. Since this is extremely perf critical code, we want this
+/// check to be as fast as possible. Partitioning state IDs into, for example,
+/// "non-match" and "match" states means one can tell if a state is a
+/// match state via a simple comparison of the state ID.
+///
+/// The issue is that during the DFA construction process, it's not
+/// particularly easy to partition the states. Instead, the simplest thing is
+/// to often just do a pass over all of the states and shuffle them into their
+/// desired partitionings. To do that, we need a mechanism for swapping states.
+/// Hence, this abstraction.
+///
+/// Normally, for such little code, I would just duplicate it. But this is a
+/// key optimization and the implementation is a bit subtle. So the abstraction
+/// is basically a ham-fisted attempt at DRY. The only place we use this is in
+/// the dense and one-pass DFAs.
+///
+/// See also src/dfa/special.rs for a more detailed explanation of how dense
+/// DFAs are partitioned.
+pub(super) trait Remappable: core::fmt::Debug {
+ /// Return the total number of states.
+ fn state_len(&self) -> usize;
+ /// Return the power-of-2 exponent that yields the stride. The pertinent
+ /// laws here are, where N=stride2: 2^N=stride and len(alphabet) <= stride.
+ fn stride2(&self) -> usize;
+ /// Swap the states pointed to by the given IDs. The underlying finite
+ /// state machine should be mutated such that all of the transitions in
+ /// `id1` are now in the memory region where the transitions for `id2`
+ /// were, and all of the transitions in `id2` are now in the memory region
+ /// where the transitions for `id1` were.
+ ///
+ /// Essentially, this "moves" `id1` to `id2` and `id2` to `id1`.
+ ///
+ /// It is expected that, after calling this, the underlying value will be
+ /// left in an inconsistent state, since any other transitions pointing to,
+ /// e.g., `id1` need to be updated to point to `id2`, since that's where
+ /// `id1` moved to.
+ ///
+ /// In order to "fix" the underlying inconsistent state, a `Remapper`
+ /// should be used to guarantee that `remap` is called at the appropriate
+ /// time.
+ fn swap_states(&mut self, id1: StateID, id2: StateID);
+ /// This must remap every single state ID in the underlying value according
+ /// to the function given. For example, in a DFA, this should remap every
+ /// transition and every starting state ID.
+ fn remap(&mut self, map: impl Fn(StateID) -> StateID);
+}
+
+/// Remapper is an abstraction that manages the remapping of state IDs in a
+/// finite state machine. This is useful when one wants to shuffle states into
+/// different positions in the machine.
+///
+/// One of the key complexities this manages is the ability to correctly move
+/// one state multiple times.
+///
+/// Once shuffling is complete, `remap` must be called, which will rewrite
+/// all pertinent transitions to updated state IDs. Neglecting to call `remap`
+/// will almost certainly result in a corrupt machine.
+#[derive(Debug)]
+pub(super) struct Remapper {
+ /// A map from the index of a state to its pre-multiplied identifier.
+ ///
+ /// When a state is swapped with another, then their corresponding
+ /// locations in this map are also swapped. Thus, its new position will
+ /// still point to its old pre-multiplied StateID.
+ ///
+ /// While there is a bit more to it, this then allows us to rewrite the
+ /// state IDs in a DFA's transition table in a single pass. This is done
+ /// by iterating over every ID in this map, then iterating over each
+ /// transition for the state at that ID and re-mapping the transition from
+ /// `old_id` to `map[dfa.to_index(old_id)]`. That is, we find the position
+ /// in this map where `old_id` *started*, and set it to where it ended up
+ /// after all swaps have been completed.
+ map: Vec<StateID>,
+ /// A mapper from state index to state ID (and back).
+ idxmap: IndexMapper,
+}
+
+impl Remapper {
+ /// Create a new remapper from the given remappable implementation. The
+ /// remapper can then be used to swap states. The remappable value given
+ /// here must be the same one given to `swap` and `remap`.
+ pub(super) fn new(r: &impl Remappable) -> Remapper {
+ let idxmap = IndexMapper { stride2: r.stride2() };
+ let map = (0..r.state_len()).map(|i| idxmap.to_state_id(i)).collect(); // identity map
+ Remapper { map, idxmap }
+ }
+
+ /// Swap two states. Once this is called, callers must follow through to
+ /// call `remap`, or else it's possible for the underlying remappable
+ /// value to be in a corrupt state.
+ pub(super) fn swap(
+ &mut self,
+ r: &mut impl Remappable,
+ id1: StateID,
+ id2: StateID,
+ ) {
+ if id1 == id2 {
+ return; // swapping a state with itself changes nothing
+ }
+ r.swap_states(id1, id2);
+ self.map.swap(self.idxmap.to_index(id1), self.idxmap.to_index(id2)); // mirror the swap
+ }
+
+ /// Complete the remapping process by rewriting all state IDs in the
+ /// remappable value according to the swaps performed.
+ pub(super) fn remap(mut self, r: &mut impl Remappable) {
+ // Update the map to account for states that have been swapped
+ // multiple times. For example, if (A, C) and (C, G) are swapped, then
+ // transitions previously pointing to A should now point to G. But if
+ // we don't update our map, they will erroneously be set to C. All we
+ // do is follow the swaps in our map until we see our original state
+ // ID.
+ //
+ // The intuition here is to think about how changes are made to the
+ // map: only through pairwise swaps. That means that starting at any
+ // given state, it is always possible to find the loop back to that
+ // state by following the swaps represented in the map (which might be
+ // 0 swaps).
+ //
+ // We are also careful to clone the map before starting in order to
+ // freeze it. We use the frozen map to find our loops, since we need to
+ // update our map as well. Without freezing it, our updates could break
+ // the loops referenced above and produce incorrect results.
+ let oldmap = self.map.clone();
+ for i in 0..r.state_len() {
+ let cur_id = self.idxmap.to_state_id(i);
+ let mut new_id = oldmap[i];
+ if cur_id == new_id {
+ continue; // entry still holds its own ID: nothing to resolve
+ }
+ loop {
+ let id = oldmap[self.idxmap.to_index(new_id)];
+ if cur_id == id {
+ self.map[i] = new_id; // last ID seen before the loop closes
+ break;
+ }
+ new_id = id;
+ }
+ }
+ r.remap(|next| self.map[self.idxmap.to_index(next)]);
+ }
+}
+
+/// A simple type for mapping between state indices and state IDs.
+///
+/// The reason why this exists is because state IDs are "premultiplied." That
+/// is, in order to get to the transitions for a particular state, one need
+/// only use the state ID as-is, instead of having to multiply it by the
+/// transition table's stride.
+///
+/// The downside of this is that it's inconvenient to map between state IDs
+/// using a dense map, e.g., Vec<StateID>. That's because state IDs look like
+/// `0`, `0+stride`, `0+2*stride`, `0+3*stride`, etc., instead of `0`, `1`,
+/// `2`, `3`, etc.
+///
+/// Since our state IDs are premultiplied, we can convert back-and-forth
+/// between IDs and indices by simply unmultiplying the IDs and multiplying the
+/// indices.
+#[derive(Debug)]
+struct IndexMapper {
+ /// The power of 2 corresponding to the stride of the corresponding
+ /// transition table. 'id >> stride2' de-multiplies an ID while 'index <<
+ /// stride2' pre-multiplies an index to an ID.
+ stride2: usize,
+}
+
+impl IndexMapper {
+ /// Convert a state ID to a state index (`id >> stride2`).
+ fn to_index(&self, id: StateID) -> usize {
+ id.as_usize() >> self.stride2
+ }
+
+ /// Convert a state index to a state ID (`index << stride2`).
+ fn to_state_id(&self, index: usize) -> StateID {
+ // CORRECTNESS: If the given index is not valid, then it is not
+ // required for this to panic or return a valid state ID. We'll "just"
+ // wind up with panics or silent logic errors at some other point.
+ StateID::new_unchecked(index << self.stride2)
+ }
+}
+
+#[cfg(feature = "dfa-build")]
+mod dense {
+ use crate::{dfa::dense::OwnedDFA, util::primitives::StateID};
+
+ use super::Remappable;
+
+ impl Remappable for OwnedDFA {
+ fn state_len(&self) -> usize {
+ OwnedDFA::state_len(self)
+ }
+
+ fn stride2(&self) -> usize {
+ OwnedDFA::stride2(self)
+ }
+
+ fn swap_states(&mut self, id1: StateID, id2: StateID) {
+ OwnedDFA::swap_states(self, id1, id2)
+ }
+
+ fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
+ OwnedDFA::remap(self, map)
+ }
+ }
+}
+
+#[cfg(feature = "dfa-onepass")]
+mod onepass {
+ use crate::{dfa::onepass::DFA, util::primitives::StateID};
+
+ use super::Remappable;
+
+ impl Remappable for DFA {
+ fn state_len(&self) -> usize {
+ DFA::state_len(self)
+ }
+
+ fn stride2(&self) -> usize {
+ // We don't do pre-multiplication for the one-pass DFA, so
+ // returning 0 has the effect of making state IDs and state indices
+ // equivalent.
+ 0
+ }
+
+ fn swap_states(&mut self, id1: StateID, id2: StateID) {
+ DFA::swap_states(self, id1, id2)
+ }
+
+ fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
+ DFA::remap(self, map)
+ }
+ }
+}
diff --git a/third_party/rust/regex-automata/src/dfa/search.rs b/third_party/rust/regex-automata/src/dfa/search.rs
new file mode 100644
index 0000000000..8c012a5944
--- /dev/null
+++ b/third_party/rust/regex-automata/src/dfa/search.rs
@@ -0,0 +1,654 @@
+use crate::{
+ dfa::{
+ accel,
+ automaton::{Automaton, OverlappingState},
+ },
+ util::{
+ prefilter::Prefilter,
+ primitives::StateID,
+ search::{Anchored, HalfMatch, Input, Span},
+ },
+ MatchError,
+};
+
+#[inline(never)]
+pub fn find_fwd<A: Automaton + ?Sized>(
+ dfa: &A,
+ input: &Input<'_>,
+) -> Result<Option<HalfMatch>, MatchError> {
+ if input.is_done() {
+ return Ok(None);
+ }
+ let pre = if input.get_anchored().is_anchored() {
+ None
+ } else {
+ dfa.get_prefilter()
+ };
+ // Searching with a pattern ID is always anchored, so we should never use
+ // a prefilter.
+ if pre.is_some() {
+ if input.get_earliest() {
+ find_fwd_imp(dfa, input, pre, true)
+ } else {
+ find_fwd_imp(dfa, input, pre, false)
+ }
+ } else {
+ if input.get_earliest() {
+ find_fwd_imp(dfa, input, None, true)
+ } else {
+ find_fwd_imp(dfa, input, None, false)
+ }
+ }
+}
+
+#[cfg_attr(feature = "perf-inline", inline(always))]
+fn find_fwd_imp<A: Automaton + ?Sized>(
+ dfa: &A,
+ input: &Input<'_>,
+ pre: Option<&'_ Prefilter>,
+ earliest: bool,
+) -> Result<Option<HalfMatch>, MatchError> {
+ // See 'prefilter_restart' docs for explanation.
+ let universal_start = dfa.universal_start_state(Anchored::No).is_some();
+ let mut mat = None;
+ let mut sid = init_fwd(dfa, input)?;
+ let mut at = input.start();
+ // This could just be a closure, but then I think it would be unsound
+ // because it would need to be safe to invoke. This way, the lack of safety
+ // is clearer in the code below.
+ macro_rules! next_unchecked {
+ ($sid:expr, $at:expr) => {{
+ let byte = *input.haystack().get_unchecked($at);
+ dfa.next_state_unchecked($sid, byte)
+ }};
+ }
+
+ if let Some(ref pre) = pre {
+ let span = Span::from(at..input.end());
+ // Run the prefilter once up front to skip to the first candidate. Note
+ // that a hit is only ever treated as a candidate here: even if the
+ // prefilter reports no false positives, we still run the DFA from the
+ // candidate's start, since matches include a pattern ID and the
+ // prefilter infrastructure doesn't report pattern IDs.
+ match pre.find(input.haystack(), span) {
+ None => return Ok(mat),
+ Some(ref span) => {
+ at = span.start;
+ if !universal_start {
+ sid = prefilter_restart(dfa, &input, at)?;
+ }
+ }
+ }
+ }
+ while at < input.end() {
+ // SAFETY: There are two safety invariants we need to uphold here in
+ // the loops below: that 'sid' and 'prev_sid' are valid state IDs
+ // for this DFA, and that 'at' is a valid index into 'haystack'.
+ // For the former, we rely on the invariant that next_state* and
+ // start_state_forward always return a valid state ID (given a valid
+ // state ID in the former case). For the latter safety invariant, we
+ // always guard unchecked access with a check that 'at' is less than
+ // 'end', where 'end <= haystack.len()'. In the unrolled loop below, we
+ // ensure that 'at' is always in bounds.
+ //
+ // PERF: See a similar comment in src/hybrid/search.rs that justifies
+ // this extra work to make the search loop fast. The same reasoning and
+ // benchmarks apply here.
+ let mut prev_sid;
+ while at < input.end() {
+ prev_sid = unsafe { next_unchecked!(sid, at) };
+ if dfa.is_special_state(prev_sid) || at + 3 >= input.end() {
+ core::mem::swap(&mut prev_sid, &mut sid);
+ break;
+ }
+ at += 1;
+
+ sid = unsafe { next_unchecked!(prev_sid, at) };
+ if dfa.is_special_state(sid) {
+ break;
+ }
+ at += 1;
+
+ prev_sid = unsafe { next_unchecked!(sid, at) };
+ if dfa.is_special_state(prev_sid) {
+ core::mem::swap(&mut prev_sid, &mut sid);
+ break;
+ }
+ at += 1;
+
+ sid = unsafe { next_unchecked!(prev_sid, at) };
+ if dfa.is_special_state(sid) {
+ break;
+ }
+ at += 1;
+ }
+ if dfa.is_special_state(sid) {
+ if dfa.is_start_state(sid) {
+ if let Some(ref pre) = pre {
+ let span = Span::from(at..input.end());
+ match pre.find(input.haystack(), span) {
+ None => return Ok(mat),
+ Some(ref span) => {
+ // We want to skip any update to 'at' below
+ // at the end of this iteration and just
+ // jump immediately back to the next state
+ // transition at the leading position of the
+ // candidate match.
+ //
+ // ... but only if we actually made progress
+ // with our prefilter, otherwise if the start
+ // state has a self-loop, we can get stuck.
+ if span.start > at {
+ at = span.start;
+ if !universal_start {
+ sid = prefilter_restart(dfa, &input, at)?;
+ }
+ continue;
+ }
+ }
+ }
+ } else if dfa.is_accel_state(sid) {
+ let needles = dfa.accelerator(sid);
+ at = accel::find_fwd(needles, input.haystack(), at + 1)
+ .unwrap_or(input.end());
+ continue;
+ }
+ } else if dfa.is_match_state(sid) {
+ let pattern = dfa.match_pattern(sid, 0);
+ mat = Some(HalfMatch::new(pattern, at));
+ if earliest {
+ return Ok(mat);
+ }
+ if dfa.is_accel_state(sid) {
+ let needles = dfa.accelerator(sid);
+ at = accel::find_fwd(needles, input.haystack(), at + 1)
+ .unwrap_or(input.end());
+ continue;
+ }
+ } else if dfa.is_accel_state(sid) {
+ let needs = dfa.accelerator(sid);
+ at = accel::find_fwd(needs, input.haystack(), at + 1)
+ .unwrap_or(input.end());
+ continue;
+ } else if dfa.is_dead_state(sid) {
+ return Ok(mat);
+ } else {
+ // It's important that this is a debug_assert, since this can
+ // actually be tripped even if DFA::from_bytes succeeds and
+ // returns a supposedly valid DFA.
+ debug_assert!(dfa.is_quit_state(sid));
+ return Err(MatchError::quit(input.haystack()[at], at));
+ }
+ }
+ at += 1;
+ }
+ eoi_fwd(dfa, input, &mut sid, &mut mat)?;
+ Ok(mat)
+}
+
+#[inline(never)]
+pub fn find_rev<A: Automaton + ?Sized>(
+ dfa: &A,
+ input: &Input<'_>,
+) -> Result<Option<HalfMatch>, MatchError> {
+ if input.is_done() {
+ return Ok(None);
+ }
+ if input.get_earliest() {
+ find_rev_imp(dfa, input, true)
+ } else {
+ find_rev_imp(dfa, input, false)
+ }
+}
+
+#[cfg_attr(feature = "perf-inline", inline(always))]
+fn find_rev_imp<A: Automaton + ?Sized>(
+ dfa: &A,
+ input: &Input<'_>,
+ earliest: bool,
+) -> Result<Option<HalfMatch>, MatchError> {
+ let mut mat = None;
+ let mut sid = init_rev(dfa, input)?;
+ // In reverse search, the loop below can't handle the case of searching an
+ // empty slice. Ideally we could write something congruent to the forward
+ // search, i.e., 'while at >= start', but 'start' might be 0. Since we use
+ // an unsigned offset, 'at >= 0' is trivially always true. We could avoid
+ // this extra case handling by using a signed offset, but Rust makes it
+ // annoying to do. So... We just handle the empty case separately.
+ if input.start() == input.end() {
+ eoi_rev(dfa, input, &mut sid, &mut mat)?;
+ return Ok(mat);
+ }
+
+ let mut at = input.end() - 1;
+ macro_rules! next_unchecked {
+ ($sid:expr, $at:expr) => {{
+ let byte = *input.haystack().get_unchecked($at);
+ dfa.next_state_unchecked($sid, byte)
+ }};
+ }
+ loop {
+ // SAFETY: See comments in 'find_fwd_imp' for a safety argument.
+ let mut prev_sid;
+ while at >= input.start() {
+ prev_sid = unsafe { next_unchecked!(sid, at) };
+ if dfa.is_special_state(prev_sid)
+ || at <= input.start().saturating_add(3)
+ {
+ core::mem::swap(&mut prev_sid, &mut sid);
+ break;
+ }
+ at -= 1;
+
+ sid = unsafe { next_unchecked!(prev_sid, at) };
+ if dfa.is_special_state(sid) {
+ break;
+ }
+ at -= 1;
+
+ prev_sid = unsafe { next_unchecked!(sid, at) };
+ if dfa.is_special_state(prev_sid) {
+ core::mem::swap(&mut prev_sid, &mut sid);
+ break;
+ }
+ at -= 1;
+
+ sid = unsafe { next_unchecked!(prev_sid, at) };
+ if dfa.is_special_state(sid) {
+ break;
+ }
+ at -= 1;
+ }
+ if dfa.is_special_state(sid) {
+ if dfa.is_start_state(sid) {
+ if dfa.is_accel_state(sid) {
+ let needles = dfa.accelerator(sid);
+ at = accel::find_rev(needles, input.haystack(), at)
+ .map(|i| i + 1)
+ .unwrap_or(input.start());
+ }
+ } else if dfa.is_match_state(sid) {
+ let pattern = dfa.match_pattern(sid, 0);
+ // Since reverse searches report the beginning of a match
+ // and the beginning is inclusive (not exclusive like the
+ // end of a match), we add 1 to make it inclusive.
+ mat = Some(HalfMatch::new(pattern, at + 1));
+ if earliest {
+ return Ok(mat);
+ }
+ if dfa.is_accel_state(sid) {
+ let needles = dfa.accelerator(sid);
+ at = accel::find_rev(needles, input.haystack(), at)
+ .map(|i| i + 1)
+ .unwrap_or(input.start());
+ }
+ } else if dfa.is_accel_state(sid) {
+ let needles = dfa.accelerator(sid);
+ // If the accelerator returns nothing, why don't we quit the
+ // search? Well, if the accelerator doesn't find anything, that
+ // doesn't mean we don't have a match. It just means that none
+ // of the remaining bytes can take us out of the current state.
+ // However, there might be an EOI transition. So we set 'at' to
+ // the start of the search, which will cause this loop to stop
+ // and fall down into the EOI transition.
+ at = accel::find_rev(needles, input.haystack(), at)
+ .map(|i| i + 1)
+ .unwrap_or(input.start());
+ } else if dfa.is_dead_state(sid) {
+ return Ok(mat);
+ } else {
+ debug_assert!(dfa.is_quit_state(sid));
+ return Err(MatchError::quit(input.haystack()[at], at));
+ }
+ }
+ if at == input.start() {
+ break;
+ }
+ at -= 1;
+ }
+ eoi_rev(dfa, input, &mut sid, &mut mat)?;
+ Ok(mat)
+}
+
+#[inline(never)]
+pub fn find_overlapping_fwd<A: Automaton + ?Sized>(
+ dfa: &A,
+ input: &Input<'_>,
+ state: &mut OverlappingState,
+) -> Result<(), MatchError> {
+ state.mat = None;
+ if input.is_done() {
+ return Ok(());
+ }
+ let pre = if input.get_anchored().is_anchored() {
+ None
+ } else {
+ dfa.get_prefilter()
+ };
+ if pre.is_some() {
+ find_overlapping_fwd_imp(dfa, input, pre, state)
+ } else {
+ find_overlapping_fwd_imp(dfa, input, None, state)
+ }
+}
+
+#[cfg_attr(feature = "perf-inline", inline(always))]
+fn find_overlapping_fwd_imp<A: Automaton + ?Sized>(
+ dfa: &A,
+ input: &Input<'_>,
+ pre: Option<&'_ Prefilter>,
+ state: &mut OverlappingState,
+) -> Result<(), MatchError> {
+ // See 'prefilter_restart' docs for explanation.
+ let universal_start = dfa.universal_start_state(Anchored::No).is_some();
+ let mut sid = match state.id {
+ None => {
+ state.at = input.start();
+ init_fwd(dfa, input)?
+ }
+ Some(sid) => {
+ if let Some(match_index) = state.next_match_index {
+ let match_len = dfa.match_len(sid);
+ if match_index < match_len {
+ state.next_match_index = Some(match_index + 1);
+ let pattern = dfa.match_pattern(sid, match_index);
+ state.mat = Some(HalfMatch::new(pattern, state.at));
+ return Ok(());
+ }
+ }
+ // Once we've reported all matches at a given position, we need to
+ // advance the search to the next position.
+ state.at += 1;
+ if state.at > input.end() {
+ return Ok(());
+ }
+ sid
+ }
+ };
+
+ // NOTE: We don't optimize the crap out of this routine primarily because
+ // it seems like most find_overlapping searches will have higher match
+ // counts, and thus, throughput is perhaps not as important. But if you
+ // have a use case for something faster, feel free to file an issue.
+ while state.at < input.end() {
+ sid = dfa.next_state(sid, input.haystack()[state.at]);
+ if dfa.is_special_state(sid) {
+ state.id = Some(sid);
+ if dfa.is_start_state(sid) {
+ if let Some(ref pre) = pre {
+ let span = Span::from(state.at..input.end());
+ match pre.find(input.haystack(), span) {
+ None => return Ok(()),
+ Some(ref span) => {
+ if span.start > state.at {
+ state.at = span.start;
+ if !universal_start {
+ sid = prefilter_restart(
+ dfa, &input, state.at,
+ )?;
+ }
+ continue;
+ }
+ }
+ }
+ } else if dfa.is_accel_state(sid) {
+ let needles = dfa.accelerator(sid);
+ state.at = accel::find_fwd(
+ needles,
+ input.haystack(),
+ state.at + 1,
+ )
+ .unwrap_or(input.end());
+ continue;
+ }
+ } else if dfa.is_match_state(sid) {
+ state.next_match_index = Some(1);
+ let pattern = dfa.match_pattern(sid, 0);
+ state.mat = Some(HalfMatch::new(pattern, state.at));
+ return Ok(());
+ } else if dfa.is_accel_state(sid) {
+ let needs = dfa.accelerator(sid);
+ // If the accelerator returns nothing, why don't we quit the
+ // search? Well, if the accelerator doesn't find anything, that
+ // doesn't mean we don't have a match. It just means that none
+ // of the remaining bytes can take us out of the current state.
+ // However, there might be an EOI transition. So we set 'at' to
+ // the end of the search, which will cause this loop to stop
+ // and fall down into the EOI transition.
+ state.at =
+ accel::find_fwd(needs, input.haystack(), state.at + 1)
+ .unwrap_or(input.end());
+ continue;
+ } else if dfa.is_dead_state(sid) {
+ return Ok(());
+ } else {
+ debug_assert!(dfa.is_quit_state(sid));
+ return Err(MatchError::quit(
+ input.haystack()[state.at],
+ state.at,
+ ));
+ }
+ }
+ state.at += 1;
+ }
+
+ let result = eoi_fwd(dfa, input, &mut sid, &mut state.mat);
+ state.id = Some(sid);
+ if state.mat.is_some() {
+ // '1' is always correct here since if we get to this point, this
+ // always corresponds to the first (index '0') match discovered at
+ // this position. So the next match to report at this position (if
+ // it exists) is at index '1'.
+ state.next_match_index = Some(1);
+ }
+ result
+}
+
+#[inline(never)]
+pub(crate) fn find_overlapping_rev<A: Automaton + ?Sized>(
+ dfa: &A,
+ input: &Input<'_>,
+ state: &mut OverlappingState,
+) -> Result<(), MatchError> {
+ state.mat = None;
+ if input.is_done() {
+ return Ok(());
+ }
+ let mut sid = match state.id {
+ None => {
+ let sid = init_rev(dfa, input)?;
+ state.id = Some(sid);
+ if input.start() == input.end() {
+ state.rev_eoi = true;
+ } else {
+ state.at = input.end() - 1;
+ }
+ sid
+ }
+ Some(sid) => {
+ if let Some(match_index) = state.next_match_index {
+ let match_len = dfa.match_len(sid);
+ if match_index < match_len {
+ state.next_match_index = Some(match_index + 1);
+ let pattern = dfa.match_pattern(sid, match_index);
+ state.mat = Some(HalfMatch::new(pattern, state.at));
+ return Ok(());
+ }
+ }
+ // Once we've reported all matches at a given position, we need
+ // to advance the search to the next position. However, if we've
+ // already followed the EOI transition, then we know we're done
+ // with the search and there cannot be any more matches to report.
+ if state.rev_eoi {
+ return Ok(());
+ } else if state.at == input.start() {
+ // At this point, we should follow the EOI transition. This
+ // will cause us to skip the main loop below and fall through
+ // to the final 'eoi_rev' transition.
+ state.rev_eoi = true;
+ } else {
+ // We haven't hit the end of the search yet, so move on.
+ state.at -= 1;
+ }
+ sid
+ }
+ };
+ while !state.rev_eoi {
+ sid = dfa.next_state(sid, input.haystack()[state.at]);
+ if dfa.is_special_state(sid) {
+ state.id = Some(sid);
+ if dfa.is_start_state(sid) {
+ if dfa.is_accel_state(sid) {
+ let needles = dfa.accelerator(sid);
+ state.at =
+ accel::find_rev(needles, input.haystack(), state.at)
+ .map(|i| i + 1)
+ .unwrap_or(input.start());
+ }
+ } else if dfa.is_match_state(sid) {
+ state.next_match_index = Some(1);
+ let pattern = dfa.match_pattern(sid, 0);
+ state.mat = Some(HalfMatch::new(pattern, state.at + 1));
+ return Ok(());
+ } else if dfa.is_accel_state(sid) {
+ let needles = dfa.accelerator(sid);
+ // If the accelerator returns nothing, why don't we quit the
+ // search? Well, if the accelerator doesn't find anything, that
+ // doesn't mean we don't have a match. It just means that none
+ // of the remaining bytes can take us out of the current state.
+ // However, there might be an EOI transition. So we set 'at' to
+ // the start of the search, which will cause this loop to stop
+ // and fall down into the EOI transition.
+ state.at =
+ accel::find_rev(needles, input.haystack(), state.at)
+ .map(|i| i + 1)
+ .unwrap_or(input.start());
+ } else if dfa.is_dead_state(sid) {
+ return Ok(());
+ } else {
+ debug_assert!(dfa.is_quit_state(sid));
+ return Err(MatchError::quit(
+ input.haystack()[state.at],
+ state.at,
+ ));
+ }
+ }
+ if state.at == input.start() {
+ break;
+ }
+ state.at -= 1;
+ }
+
+ let result = eoi_rev(dfa, input, &mut sid, &mut state.mat);
+ state.rev_eoi = true;
+ state.id = Some(sid);
+ if state.mat.is_some() {
+ // '1' is always correct here since if we get to this point, this
+ // always corresponds to the first (index '0') match discovered at
+ // this position. So the next match to report at this position (if
+ // it exists) is at index '1'.
+ state.next_match_index = Some(1);
+ }
+ result
+}
+
+#[cfg_attr(feature = "perf-inline", inline(always))]
+fn init_fwd<A: Automaton + ?Sized>(
+ dfa: &A,
+ input: &Input<'_>,
+) -> Result<StateID, MatchError> {
+ let sid = dfa.start_state_forward(input)?;
+ // Start states can never be match states, since all matches are delayed
+ // by 1 byte.
+ debug_assert!(!dfa.is_match_state(sid));
+ Ok(sid)
+}
+
+#[cfg_attr(feature = "perf-inline", inline(always))]
+fn init_rev<A: Automaton + ?Sized>(
+ dfa: &A,
+ input: &Input<'_>,
+) -> Result<StateID, MatchError> {
+ let sid = dfa.start_state_reverse(input)?;
+ // Start states can never be match states, since all matches are delayed
+ // by 1 byte.
+ debug_assert!(!dfa.is_match_state(sid));
+ Ok(sid)
+}
+
+#[cfg_attr(feature = "perf-inline", inline(always))]
+fn eoi_fwd<A: Automaton + ?Sized>(
+ dfa: &A,
+ input: &Input<'_>,
+ sid: &mut StateID,
+ mat: &mut Option<HalfMatch>,
+) -> Result<(), MatchError> {
+ let sp = input.get_span();
+ match input.haystack().get(sp.end) {
+ Some(&b) => {
+ *sid = dfa.next_state(*sid, b);
+ if dfa.is_match_state(*sid) {
+ let pattern = dfa.match_pattern(*sid, 0);
+ *mat = Some(HalfMatch::new(pattern, sp.end));
+ } else if dfa.is_quit_state(*sid) {
+ return Err(MatchError::quit(b, sp.end));
+ }
+ }
+ None => {
+ *sid = dfa.next_eoi_state(*sid);
+ if dfa.is_match_state(*sid) {
+ let pattern = dfa.match_pattern(*sid, 0);
+ *mat = Some(HalfMatch::new(pattern, input.haystack().len()));
+ }
+ // N.B. We don't have to check 'is_quit' here because the EOI
+ // transition can never lead to a quit state.
+ debug_assert!(!dfa.is_quit_state(*sid));
+ }
+ }
+ Ok(())
+}
+
+#[cfg_attr(feature = "perf-inline", inline(always))]
+fn eoi_rev<A: Automaton + ?Sized>(
+ dfa: &A,
+ input: &Input<'_>,
+ sid: &mut StateID,
+ mat: &mut Option<HalfMatch>,
+) -> Result<(), MatchError> {
+ let sp = input.get_span();
+ if sp.start > 0 {
+ let byte = input.haystack()[sp.start - 1];
+ *sid = dfa.next_state(*sid, byte);
+ if dfa.is_match_state(*sid) {
+ let pattern = dfa.match_pattern(*sid, 0);
+ *mat = Some(HalfMatch::new(pattern, sp.start));
+ } else if dfa.is_quit_state(*sid) {
+ return Err(MatchError::quit(byte, sp.start - 1));
+ }
+ } else {
+ *sid = dfa.next_eoi_state(*sid);
+ if dfa.is_match_state(*sid) {
+ let pattern = dfa.match_pattern(*sid, 0);
+ *mat = Some(HalfMatch::new(pattern, 0));
+ }
+ // N.B. We don't have to check 'is_quit' here because the EOI
+ // transition can never lead to a quit state.
+ debug_assert!(!dfa.is_quit_state(*sid));
+ }
+ Ok(())
+}
+
+/// Re-compute the starting state that a DFA should be in after finding a
+/// prefilter candidate match at the position `at`.
+///
+/// The function with the same name has a bit more docs in hybrid/search.rs.
+#[cfg_attr(feature = "perf-inline", inline(always))]
+fn prefilter_restart<A: Automaton + ?Sized>(
+ dfa: &A,
+ input: &Input<'_>,
+ at: usize,
+) -> Result<StateID, MatchError> {
+ let mut input = input.clone();
+ input.set_start(at);
+ init_fwd(dfa, &input)
+}
diff --git a/third_party/rust/regex-automata/src/dfa/sparse.rs b/third_party/rust/regex-automata/src/dfa/sparse.rs
new file mode 100644
index 0000000000..5d8ec23408
--- /dev/null
+++ b/third_party/rust/regex-automata/src/dfa/sparse.rs
@@ -0,0 +1,2656 @@
+/*!
+Types and routines specific to sparse DFAs.
+
+This module is the home of [`sparse::DFA`](DFA).
+
+Unlike the [`dense`](super::dense) module, this module does not contain a
+builder or configuration specific for sparse DFAs. Instead, the intended
+way to build a sparse DFA is either by using a default configuration with
+its constructor [`sparse::DFA::new`](DFA::new), or by first configuring the
+construction of a dense DFA with [`dense::Builder`](super::dense::Builder)
+and then calling [`dense::DFA::to_sparse`](super::dense::DFA::to_sparse). For
+example, this configures a sparse DFA to do an overlapping search:
+
+```
+use regex_automata::{
+ dfa::{Automaton, OverlappingState, dense},
+ HalfMatch, Input, MatchKind,
+};
+
+let dense_re = dense::Builder::new()
+ .configure(dense::Config::new().match_kind(MatchKind::All))
+ .build(r"Samwise|Sam")?;
+let sparse_re = dense_re.to_sparse()?;
+
+// Setup our haystack and initial start state.
+let input = Input::new("Samwise");
+let mut state = OverlappingState::start();
+
+// First, 'Sam' will match.
+sparse_re.try_search_overlapping_fwd(&input, &mut state)?;
+assert_eq!(Some(HalfMatch::must(0, 3)), state.get_match());
+
+// And now 'Samwise' will match.
+sparse_re.try_search_overlapping_fwd(&input, &mut state)?;
+assert_eq!(Some(HalfMatch::must(0, 7)), state.get_match());
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+*/
+
+#[cfg(feature = "dfa-build")]
+use core::iter;
+use core::{
+ convert::{TryFrom, TryInto},
+ fmt,
+ mem::size_of,
+};
+
+#[cfg(feature = "dfa-build")]
+use alloc::{vec, vec::Vec};
+
+#[cfg(feature = "dfa-build")]
+use crate::dfa::dense::{self, BuildError};
+use crate::{
+ dfa::{
+ automaton::{fmt_state_indicator, Automaton},
+ dense::Flags,
+ special::Special,
+ StartKind, DEAD,
+ },
+ util::{
+ alphabet::{ByteClasses, ByteSet},
+ escape::DebugByte,
+ int::{Pointer, Usize, U16, U32},
+ prefilter::Prefilter,
+ primitives::{PatternID, StateID},
+ search::{Anchored, Input, MatchError},
+ start::{Start, StartByteMap},
+ wire::{self, DeserializeError, Endian, SerializeError},
+ },
+};
+
+const LABEL: &str = "rust-regex-automata-dfa-sparse";
+const VERSION: u32 = 2;
+
+/// A sparse deterministic finite automaton (DFA) with variable sized states.
+///
+/// In contrast to a [dense::DFA](crate::dfa::dense::DFA), a sparse DFA uses
+/// a more space efficient representation for its transitions. Consequently,
+/// sparse DFAs may use much less memory than dense DFAs, but this comes at a
+/// price. In particular, reading the more space efficient transitions takes
+/// more work, and consequently, searching using a sparse DFA is typically
+/// slower than a dense DFA.
+///
+/// A sparse DFA can be built using the default configuration via the
+/// [`DFA::new`] constructor. Otherwise, one can configure various aspects
+/// of a dense DFA via [`dense::Builder`](crate::dfa::dense::Builder),
+/// and then convert a dense DFA to a sparse DFA using
+/// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse).
+///
+/// In general, a sparse DFA supports all the same search operations as a dense
+/// DFA.
+///
+/// Making the choice between a dense and sparse DFA depends on your specific
+/// work load. If you can sacrifice a bit of search time performance, then a
+/// sparse DFA might be the best choice. In particular, while sparse DFAs are
+/// probably always slower than dense DFAs, you may find that they are easily
+/// fast enough for your purposes!
+///
+/// # Type parameters
+///
+/// A `DFA` has one type parameter, `T`, which is used to represent the parts
+/// of a sparse DFA. `T` is typically a `Vec<u8>` or a `&[u8]`.
+///
+/// # The `Automaton` trait
+///
+/// This type implements the [`Automaton`] trait, which means it can be used
+/// for searching. For example:
+///
+/// ```
+/// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
+///
+/// let dfa = DFA::new("foo[0-9]+")?;
+/// let expected = Some(HalfMatch::must(0, 8));
+/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone)]
+pub struct DFA<T> {
+ // When compared to a dense DFA, a sparse DFA *looks* a lot simpler
+ // representation-wise. In reality, it is perhaps more complicated. Namely,
+ // in a dense DFA, all information needs to be very cheaply accessible
+ // using only state IDs. In a sparse DFA however, each state uses a
+ // variable amount of space because each state encodes more information
+ // than just its transitions. Each state also includes an accelerator if
+ // one exists, along with the matching pattern IDs if the state is a match
+ // state.
+ //
+ // That is, a lot of the complexity is pushed down into how each state
+ // itself is represented.
+ tt: Transitions<T>,
+ st: StartTable<T>,
+ special: Special,
+ pre: Option<Prefilter>,
+ quitset: ByteSet,
+ flags: Flags,
+}
+
+#[cfg(feature = "dfa-build")]
+impl DFA<Vec<u8>> {
+ /// Parse the given regular expression using a default configuration and
+ /// return the corresponding sparse DFA.
+ ///
+ /// If you want a non-default configuration, then use
+ /// the [`dense::Builder`](crate::dfa::dense::Builder)
+ /// to set your own configuration, and then call
+ /// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse) to create
+ /// a sparse DFA.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, sparse}, HalfMatch, Input};
+ ///
+ /// let dfa = sparse::DFA::new("foo[0-9]+bar")?;
+ ///
+ /// let expected = Some(HalfMatch::must(0, 11));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "syntax")]
+ pub fn new(pattern: &str) -> Result<DFA<Vec<u8>>, BuildError> {
+ dense::Builder::new()
+ .build(pattern)
+ .and_then(|dense| dense.to_sparse())
+ }
+
+ /// Parse the given regular expressions using a default configuration and
+ /// return the corresponding multi-DFA.
+ ///
+ /// If you want a non-default configuration, then use
+ /// the [`dense::Builder`](crate::dfa::dense::Builder)
+ /// to set your own configuration, and then call
+ /// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse) to create
+ /// a sparse DFA.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, sparse}, HalfMatch, Input};
+ ///
+ /// let dfa = sparse::DFA::new_many(&["[0-9]+", "[a-z]+"])?;
+ /// let expected = Some(HalfMatch::must(1, 3));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "syntax")]
+ pub fn new_many<P: AsRef<str>>(
+ patterns: &[P],
+ ) -> Result<DFA<Vec<u8>>, BuildError> {
+ dense::Builder::new()
+ .build_many(patterns)
+ .and_then(|dense| dense.to_sparse())
+ }
+}
+
+#[cfg(feature = "dfa-build")]
+impl DFA<Vec<u8>> {
+ /// Create a new sparse DFA that matches every input.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{Automaton, sparse},
+ /// HalfMatch, Input,
+ /// };
+ ///
+ /// let dfa = sparse::DFA::always_match()?;
+ ///
+ /// let expected = Some(HalfMatch::must(0, 0));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new(""))?);
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo"))?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn always_match() -> Result<DFA<Vec<u8>>, BuildError> {
+ dense::DFA::always_match()?.to_sparse()
+ }
+
+ /// Create a new sparse DFA that never matches any input.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, sparse}, Input};
+ ///
+ /// let dfa = sparse::DFA::never_match()?;
+ /// assert_eq!(None, dfa.try_search_fwd(&Input::new(""))?);
+ /// assert_eq!(None, dfa.try_search_fwd(&Input::new("foo"))?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn never_match() -> Result<DFA<Vec<u8>>, BuildError> {
+ dense::DFA::never_match()?.to_sparse()
+ }
+
+ /// The implementation for constructing a sparse DFA from a dense DFA.
+ pub(crate) fn from_dense<T: AsRef<[u32]>>(
+ dfa: &dense::DFA<T>,
+ ) -> Result<DFA<Vec<u8>>, BuildError> {
+ // In order to build the transition table, we need to be able to write
+ // state identifiers for each of the "next" transitions in each state.
+ // Our state identifiers correspond to the byte offset in the
+ // transition table at which the state is encoded. Therefore, we do not
+ // actually know what the state identifiers are until we've allocated
+ // exactly as much space as we need for each state. Thus, construction
+ // of the transition table happens in two passes.
+ //
+ // In the first pass, we fill out the shell of each state, which
+ // includes the transition length, the input byte ranges and
+ // zero-filled space for the transitions and accelerators, if present.
+ // In this first pass, we also build up a map from the state identifier
+ // index of the dense DFA to the state identifier in this sparse DFA.
+ //
+ // In the second pass, we fill in the transitions based on the map
+ // built in the first pass.
+
+ // The capacity given here reflects a minimum. (Well, the true minimum
+ // is likely even bigger, but hopefully this saves a few reallocs.)
+ let mut sparse = Vec::with_capacity(StateID::SIZE * dfa.state_len());
+ // This maps state indices from the dense DFA to StateIDs in the sparse
+ // DFA. We build out this map on the first pass, and then use it in the
+ // second pass to back-fill our transitions.
+ let mut remap: Vec<StateID> = vec![DEAD; dfa.state_len()];
+ for state in dfa.states() {
+ let pos = sparse.len();
+
+ remap[dfa.to_index(state.id())] = StateID::new(pos)
+ .map_err(|_| BuildError::too_many_states())?;
+ // zero-filled space for the transition length
+ sparse.push(0);
+ sparse.push(0);
+
+ let mut transition_len = 0;
+ for (unit1, unit2, _) in state.sparse_transitions() {
+ match (unit1.as_u8(), unit2.as_u8()) {
+ (Some(b1), Some(b2)) => {
+ transition_len += 1;
+ sparse.push(b1);
+ sparse.push(b2);
+ }
+ (None, None) => {}
+ (Some(_), None) | (None, Some(_)) => {
+ // can never occur because sparse_transitions never
+ // groups EOI with any other transition.
+ unreachable!()
+ }
+ }
+ }
+ // Add dummy EOI transition. This is never actually read while
+ // searching, but having space equivalent to the total number
+ // of transitions is convenient. Otherwise, we'd need to track
+ // a different number of transitions for the byte ranges as for
+ // the 'next' states.
+ //
+ // N.B. The loop above is not guaranteed to yield the EOI
+ // transition, since it may point to a DEAD state. By putting
+ // it here, we always write the EOI transition, and thus
+ // guarantee that our transition length is >0. Why do we always
+ // need the EOI transition? Because in order to implement
+ // Automaton::next_eoi_state, this lets us just ask for the last
+ // transition. There are probably other/better ways to do this.
+ transition_len += 1;
+ sparse.push(0);
+ sparse.push(0);
+
+ // Check some assumptions about transition length.
+ assert_ne!(
+ transition_len, 0,
+ "transition length should be non-zero",
+ );
+ assert!(
+ transition_len <= 257,
+ "expected transition length {} to be <= 257",
+ transition_len,
+ );
+
+ // Fill in the transition length.
+ // Since transition length is always <= 257, we use the most
+ // significant bit to indicate whether this is a match state or
+ // not.
+ let ntrans = if dfa.is_match_state(state.id()) {
+ transition_len | (1 << 15)
+ } else {
+ transition_len
+ };
+ wire::NE::write_u16(ntrans, &mut sparse[pos..]);
+
+ // zero-fill the actual transitions.
+ // Unwraps are OK since transition_len <= 257 and our minimum
+ // supported usize size is 16 bits.
+ let zeros = usize::try_from(transition_len)
+ .unwrap()
+ .checked_mul(StateID::SIZE)
+ .unwrap();
+ sparse.extend(iter::repeat(0).take(zeros));
+
+ // If this is a match state, write the pattern IDs matched by this
+ // state.
+ if dfa.is_match_state(state.id()) {
+ let plen = dfa.match_pattern_len(state.id());
+ // Write the actual pattern IDs with a u32 length prefix.
+ // First, zero-fill space.
+ let mut pos = sparse.len();
+ // Unwraps are OK since it's guaranteed that plen <=
+ // PatternID::LIMIT, which is in turn guaranteed to fit into a
+ // u32.
+ let zeros = size_of::<u32>()
+ .checked_mul(plen)
+ .unwrap()
+ .checked_add(size_of::<u32>())
+ .unwrap();
+ sparse.extend(iter::repeat(0).take(zeros));
+
+ // Now write the length prefix.
+ wire::NE::write_u32(
+ // Will never fail since u32::MAX is invalid pattern ID.
+ // Thus, the number of pattern IDs is representable by a
+ // u32.
+ plen.try_into().expect("pattern ID length fits in u32"),
+ &mut sparse[pos..],
+ );
+ pos += size_of::<u32>();
+
+ // Now write the pattern IDs.
+ for &pid in dfa.pattern_id_slice(state.id()) {
+ pos += wire::write_pattern_id::<wire::NE>(
+ pid,
+ &mut sparse[pos..],
+ );
+ }
+ }
+
+ // And now add the accelerator, if one exists. An accelerator is
+ // at most 4 bytes and at least 1 byte. The first byte is the
+ // length, N. N bytes follow the length. The set of bytes that
+ // follow correspond (exhaustively) to the bytes that must be seen
+ // to leave this state.
+ let accel = dfa.accelerator(state.id());
+ sparse.push(accel.len().try_into().unwrap());
+ sparse.extend_from_slice(accel);
+ }
+
+ let mut new = DFA {
+ tt: Transitions {
+ sparse,
+ classes: dfa.byte_classes().clone(),
+ state_len: dfa.state_len(),
+ pattern_len: dfa.pattern_len(),
+ },
+ st: StartTable::from_dense_dfa(dfa, &remap)?,
+ special: dfa.special().remap(|id| remap[dfa.to_index(id)]),
+ pre: dfa.get_prefilter().map(|p| p.clone()),
+ quitset: dfa.quitset().clone(),
+ flags: dfa.flags().clone(),
+ };
+ // And here's our second pass. Iterate over all of the dense states
+ // again, and update the transitions in each of the states in the
+ // sparse DFA.
+ for old_state in dfa.states() {
+ let new_id = remap[dfa.to_index(old_state.id())];
+ let mut new_state = new.tt.state_mut(new_id);
+ let sparse = old_state.sparse_transitions();
+ for (i, (_, _, next)) in sparse.enumerate() {
+ let next = remap[dfa.to_index(next)];
+ new_state.set_next_at(i, next);
+ }
+ }
+ debug!(
+ "created sparse DFA, memory usage: {} (dense memory usage: {})",
+ new.memory_usage(),
+ dfa.memory_usage(),
+ );
+ Ok(new)
+ }
+}
+
+impl<T: AsRef<[u8]>> DFA<T> {
+ /// Cheaply borrow this sparse DFA. The DFA returned always uses `&[u8]`
+ /// for its transitions, whatever `T` is.
+ pub fn as_ref<'a>(&'a self) -> DFA<&'a [u8]> {
+ let tt = self.tt.as_ref();
+ let st = self.st.as_ref();
+ let pre = self.pre.clone();
+ DFA {
+ tt,
+ st,
+ special: self.special,
+ pre,
+ quitset: self.quitset,
+ flags: self.flags,
+ }
+ }
+
+ /// Produce an owned version of this sparse DFA. The DFA returned always
+ /// uses `Vec<u8>` for its transitions.
+ ///
+ /// In effect, the transitions of the returned sparse DFA live on the
+ /// heap.
+ #[cfg(feature = "alloc")]
+ pub fn to_owned(&self) -> DFA<alloc::vec::Vec<u8>> {
+ let tt = self.tt.to_owned();
+ let st = self.st.to_owned();
+ let pre = self.pre.clone();
+ DFA {
+ tt,
+ st,
+ special: self.special,
+ pre,
+ quitset: self.quitset,
+ flags: self.flags,
+ }
+ }
+
+ /// Returns the starting state configuration for this DFA.
+ ///
+ /// The default is [`StartKind::Both`], meaning the DFA supports both
+ /// unanchored and anchored searches. Since that can generally lead to
+ /// bigger DFAs, a DFA might instead be compiled with support for only
+ /// unanchored or only anchored searches. Running a search with a
+ /// configuration the DFA does not support will panic.
+ pub fn start_kind(&self) -> StartKind {
+ self.st.kind
+ }
+
+ /// Returns true only if this DFA has starting states for each pattern.
+ ///
+ /// A DFA with per-pattern starting states can be asked to look only for
+ /// anchored matches of one specific pattern. Specifically, APIs like
+ /// [`Automaton::try_search_fwd`] can accept an [`Anchored::Pattern`] if
+ /// and only if this method returns true. Otherwise, an error will be
+ /// returned.
+ ///
+ /// Note that if the DFA is empty, this always returns false.
+ pub fn starts_for_each_pattern(&self) -> bool {
+ self.st.pattern_len.is_some()
+ }
+
+ /// Returns the equivalence classes that make up the alphabet for this DFA.
+ ///
+ /// Unless [`dense::Config::byte_classes`] was disabled, multiple distinct
+ /// bytes may be grouped into the same equivalence class when it is
+ /// impossible for them to discriminate between a match and a non-match.
+ /// This shrinks the overall alphabet and in turn can substantially shrink
+ /// the DFA's transition table.
+ ///
+ /// The cost of equivalence classes is that every state transition uses
+ /// this map to convert an arbitrary byte to its corresponding equivalence
+ /// class. In practice this has a negligible impact on performance.
+ pub fn byte_classes(&self) -> &ByteClasses {
+ &self.tt.classes
+ }
+
+ /// Returns the memory usage, in bytes, of this DFA.
+ ///
+ /// The figure is computed from the number of bytes used to represent
+ /// this DFA.
+ ///
+ /// This does **not** include the stack size used up by this DFA. To
+ /// compute that, use `std::mem::size_of::<sparse::DFA>()`.
+ pub fn memory_usage(&self) -> usize {
+ let transitions = self.tt.memory_usage();
+ let starts = self.st.memory_usage();
+ transitions + starts
+ }
+}
+
+/// Routines for converting a sparse DFA to other representations, such as raw
+/// bytes suitable for persistent storage.
+impl<T: AsRef<[u8]>> DFA<T> {
+ /// Serialize this DFA as raw bytes to a `Vec<u8>` in little endian
+ /// format.
+ ///
+ /// The written bytes are guaranteed to be deserialized correctly and
+ /// without errors in a semver compatible release of this crate by a
+ /// `DFA`'s deserialization APIs (assuming all other criteria for the
+ /// deserialization APIs have been satisfied):
+ ///
+ /// * [`DFA::from_bytes`]
+ /// * [`DFA::from_bytes_unchecked`]
+ ///
+ /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s
+ /// serialization methods, this does not add any initial padding to the
+ /// returned bytes. Padding isn't required for sparse DFAs since they have
+ /// no alignment requirements.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize and deserialize a DFA:
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
+ ///
+ /// // Compile our original DFA.
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// // N.B. We use native endianness here to make the example work, but
+ /// // using to_bytes_little_endian would work on a little endian target.
+ /// let buf = original_dfa.to_bytes_native_endian();
+ /// // A sparse DFA has no alignment requirements and is serialized without
+ /// // padding, so buf can be given to DFA::from_bytes as-is.
+ /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
+ ///
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "dfa-build")]
+ pub fn to_bytes_little_endian(&self) -> Vec<u8> {
+ // Thin wrapper: the endian-generic `to_bytes` does the work; this
+ // method only selects `wire::LE`.
+ self.to_bytes::<wire::LE>()
+ }
+
+ /// Serialize this DFA as raw bytes to a `Vec<u8>` in big endian
+ /// format.
+ ///
+ /// The written bytes are guaranteed to be deserialized correctly and
+ /// without errors in a semver compatible release of this crate by a
+ /// `DFA`'s deserialization APIs (assuming all other criteria for the
+ /// deserialization APIs have been satisfied):
+ ///
+ /// * [`DFA::from_bytes`]
+ /// * [`DFA::from_bytes_unchecked`]
+ ///
+ /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s
+ /// serialization methods, this does not add any initial padding to the
+ /// returned bytes. Padding isn't required for sparse DFAs since they have
+ /// no alignment requirements.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize and deserialize a DFA:
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
+ ///
+ /// // Compile our original DFA.
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// // N.B. We use native endianness here to make the example work, but
+ /// // using to_bytes_big_endian would work on a big endian target.
+ /// let buf = original_dfa.to_bytes_native_endian();
+ /// // A sparse DFA has no alignment requirements and is serialized without
+ /// // padding, so buf can be given to DFA::from_bytes as-is.
+ /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
+ ///
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "dfa-build")]
+ pub fn to_bytes_big_endian(&self) -> Vec<u8> {
+ // Thin wrapper: the endian-generic `to_bytes` does the work; this
+ // method only selects `wire::BE`.
+ self.to_bytes::<wire::BE>()
+ }
+
+ /// Serialize this DFA as raw bytes to a `Vec<u8>` in native endian
+ /// format.
+ ///
+ /// The written bytes are guaranteed to be deserialized correctly and
+ /// without errors in a semver compatible release of this crate by a
+ /// `DFA`'s deserialization APIs (assuming all other criteria for the
+ /// deserialization APIs have been satisfied):
+ ///
+ /// * [`DFA::from_bytes`]
+ /// * [`DFA::from_bytes_unchecked`]
+ ///
+ /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s
+ /// serialization methods, this does not add any initial padding to the
+ /// returned bytes. Padding isn't required for sparse DFAs since they have
+ /// no alignment requirements.
+ ///
+ /// Generally speaking, native endian format should only be used when
+ /// you know that the target you're compiling the DFA for matches the
+ /// endianness of the target on which you're compiling the DFA. For example,
+ /// if serialization and deserialization happen in the same process or on
+ /// the same machine. Otherwise, when serializing a DFA for use in a
+ /// portable environment, you'll almost certainly want to serialize _both_
+ /// a little endian and a big endian version and then load the correct one
+ /// based on the target's configuration.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize and deserialize a DFA:
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
+ ///
+ /// // Compile our original DFA.
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// let buf = original_dfa.to_bytes_native_endian();
+ /// // A sparse DFA has no alignment requirements and is serialized without
+ /// // padding, so buf can be given to DFA::from_bytes as-is.
+ /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
+ ///
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "dfa-build")]
+ pub fn to_bytes_native_endian(&self) -> Vec<u8> {
+ // Thin wrapper: the endian-generic `to_bytes` does the work; this
+ // method only selects `wire::NE`.
+ self.to_bytes::<wire::NE>()
+ }
+
+ /// The implementation of the public `to_bytes` serialization methods,
+ /// which is generic over endianness.
+ #[cfg(feature = "dfa-build")]
+ fn to_bytes<E: Endian>(&self) -> Vec<u8> {
+ // Size the buffer from `write_to_len` before serializing into it.
+ let needed = self.write_to_len();
+ let mut bytes = vec![0; needed];
+ // The only possible serialization error is a destination that is too
+ // small, and `bytes` was just sized to be big enough, so this unwrap
+ // should never fire.
+ self.write_to::<E>(&mut bytes).unwrap();
+ bytes
+ }
+
+ /// Serialize this DFA as raw bytes to the given slice, in little endian
+ /// format. Upon success, the total number of bytes written to `dst` is
+ /// returned.
+ ///
+ /// The written bytes are guaranteed to be deserialized correctly and
+ /// without errors in a semver compatible release of this crate by a
+ /// `DFA`'s deserialization APIs (assuming all other criteria for the
+ /// deserialization APIs have been satisfied):
+ ///
+ /// * [`DFA::from_bytes`]
+ /// * [`DFA::from_bytes_unchecked`]
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the given destination slice is not big enough
+ /// to contain the full serialized DFA. If an error occurs, then nothing
+ /// is written to `dst`.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize and deserialize a DFA without
+ /// dynamic memory allocation.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
+ ///
+ /// // Compile our original DFA.
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// // Create a 4KB buffer on the stack to store our serialized DFA.
+ /// let mut buf = [0u8; 4 * (1<<10)];
+ /// // N.B. We use native endianness here to make the example work, but
+ /// // using write_to_little_endian would work on a little endian target.
+ /// let written = original_dfa.write_to_native_endian(&mut buf)?;
+ /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
+ ///
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn write_to_little_endian(
+ &self,
+ dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ // Thin wrapper: the endian-generic `write_to` does the work; this
+ // method only selects `wire::LE`.
+ self.write_to::<wire::LE>(dst)
+ }
+
+ /// Serialize this DFA as raw bytes to the given slice, in big endian
+ /// format. Upon success, the total number of bytes written to `dst` is
+ /// returned.
+ ///
+ /// The written bytes are guaranteed to be deserialized correctly and
+ /// without errors in a semver compatible release of this crate by a
+ /// `DFA`'s deserialization APIs (assuming all other criteria for the
+ /// deserialization APIs have been satisfied):
+ ///
+ /// * [`DFA::from_bytes`]
+ /// * [`DFA::from_bytes_unchecked`]
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the given destination slice is not big enough
+ /// to contain the full serialized DFA. If an error occurs, then nothing
+ /// is written to `dst`.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize and deserialize a DFA without
+ /// dynamic memory allocation.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
+ ///
+ /// // Compile our original DFA.
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// // Create a 4KB buffer on the stack to store our serialized DFA.
+ /// let mut buf = [0u8; 4 * (1<<10)];
+ /// // N.B. We use native endianness here to make the example work, but
+ /// // using write_to_big_endian would work on a big endian target.
+ /// let written = original_dfa.write_to_native_endian(&mut buf)?;
+ /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
+ ///
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn write_to_big_endian(
+ &self,
+ dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ // Thin wrapper: the endian-generic `write_to` does the work; this
+ // method only selects `wire::BE`.
+ self.write_to::<wire::BE>(dst)
+ }
+
+ /// Serialize this DFA as raw bytes to the given slice, in native endian
+ /// format. Upon success, the total number of bytes written to `dst` is
+ /// returned.
+ ///
+ /// The written bytes are guaranteed to be deserialized correctly and
+ /// without errors in a semver compatible release of this crate by a
+ /// `DFA`'s deserialization APIs (assuming all other criteria for the
+ /// deserialization APIs have been satisfied):
+ ///
+ /// * [`DFA::from_bytes`]
+ /// * [`DFA::from_bytes_unchecked`]
+ ///
+ /// Generally speaking, native endian format should only be used when
+ /// you know that the target you're compiling the DFA for matches the
+ /// endianness of the target on which you're compiling the DFA. For example,
+ /// if serialization and deserialization happen in the same process or on
+ /// the same machine. Otherwise, when serializing a DFA for use in a
+ /// portable environment, you'll almost certainly want to serialize _both_
+ /// a little endian and a big endian version and then load the correct one
+ /// based on the target's configuration.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the given destination slice is not big enough
+ /// to contain the full serialized DFA. If an error occurs, then nothing
+ /// is written to `dst`.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize and deserialize a DFA without
+ /// dynamic memory allocation.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
+ ///
+ /// // Compile our original DFA.
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// // Create a 4KB buffer on the stack to store our serialized DFA.
+ /// let mut buf = [0u8; 4 * (1<<10)];
+ /// let written = original_dfa.write_to_native_endian(&mut buf)?;
+ /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
+ ///
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn write_to_native_endian(
+ &self,
+ dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ // Thin wrapper: the endian-generic `write_to` does the work; this
+ // method only selects `wire::NE`.
+ self.write_to::<wire::NE>(dst)
+ }
+
+ /// The implementation of the public `write_to` serialization methods,
+ /// which is generic over endianness.
+ ///
+ /// On success, returns the total number of bytes written to `dst`. If
+ /// `dst` is shorter than [`DFA::write_to_len`], an error is returned
+ /// before anything is written to `dst`.
+ fn write_to<E: Endian>(
+ &self,
+ dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ // Check the total size up front. The reserved u32 below is written
+ // with `E::write_u32`, which has no way to report a short buffer, so a
+ // `dst` that ends inside that field must be rejected here. Checking
+ // first also leaves `dst` untouched on failure, as the public
+ // `write_to_*` docs promise.
+ if dst.len() < self.write_to_len() {
+ return Err(SerializeError::buffer_too_small("sparse DFA"));
+ }
+
+ // `nw` is the running count of bytes written; each section is written
+ // at the offset where the previous one ended.
+ let mut nw = 0;
+ nw += wire::write_label(LABEL, &mut dst[nw..])?;
+ nw += wire::write_endianness_check::<E>(&mut dst[nw..])?;
+ nw += wire::write_version::<E>(VERSION, &mut dst[nw..])?;
+ nw += {
+ // Currently unused, intended for future flexibility
+ E::write_u32(0, &mut dst[nw..]);
+ size_of::<u32>()
+ };
+ nw += self.flags.write_to::<E>(&mut dst[nw..])?;
+ nw += self.tt.write_to::<E>(&mut dst[nw..])?;
+ nw += self.st.write_to::<E>(&mut dst[nw..])?;
+ nw += self.special.write_to::<E>(&mut dst[nw..])?;
+ nw += self.quitset.write_to::<E>(&mut dst[nw..])?;
+ Ok(nw)
+ }
+
+ /// Return the total number of bytes required to serialize this DFA.
+ ///
+ /// This is useful for determining the size of the buffer required to pass
+ /// to one of the serialization routines:
+ ///
+ /// * [`DFA::write_to_little_endian`]
+ /// * [`DFA::write_to_big_endian`]
+ /// * [`DFA::write_to_native_endian`]
+ ///
+ /// Passing a buffer smaller than the size returned by this method will
+ /// result in a serialization error.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to dynamically allocate enough room to serialize
+ /// a sparse DFA.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
+ ///
+ /// // Compile our original DFA.
+ /// let original_dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// let mut buf = vec![0; original_dfa.write_to_len()];
+ /// let written = original_dfa.write_to_native_endian(&mut buf)?;
+ /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
+ ///
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn write_to_len(&self) -> usize {
+ // One entry per serialized section, listed in the order in which
+ // `write_to` emits them.
+ let sections = [
+ wire::write_label_len(LABEL),
+ wire::write_endianness_check_len(),
+ wire::write_version_len(),
+ size_of::<u32>(), // unused, intended for future flexibility
+ self.flags.write_to_len(),
+ self.tt.write_to_len(),
+ self.st.write_to_len(),
+ self.special.write_to_len(),
+ self.quitset.write_to_len(),
+ ];
+ sections.iter().sum()
+ }
+}
+
+impl<'a> DFA<&'a [u8]> {
+ /// Safely deserialize a sparse DFA with a specific state identifier
+ /// representation. Upon success, this returns both the deserialized DFA
+ /// and the number of bytes read from the given slice. Namely, the contents
+ /// of the slice beyond the DFA are not read.
+ ///
+ /// Deserializing a DFA using this routine will never allocate heap memory.
+ /// For safety purposes, the DFA's transitions will be verified such that
+ /// every transition points to a valid state. If this verification is too
+ /// costly, then a [`DFA::from_bytes_unchecked`] API is provided, which
+ /// will always execute in constant time.
+ ///
+ /// The bytes given must be generated by one of the serialization APIs
+ /// of a `DFA` using a semver compatible release of this crate. Those
+ /// include:
+ ///
+ /// * [`DFA::to_bytes_little_endian`]
+ /// * [`DFA::to_bytes_big_endian`]
+ /// * [`DFA::to_bytes_native_endian`]
+ /// * [`DFA::write_to_little_endian`]
+ /// * [`DFA::write_to_big_endian`]
+ /// * [`DFA::write_to_native_endian`]
+ ///
+ /// The `to_bytes` methods allocate and return a `Vec<u8>` for you. The
+ /// `write_to` methods do not allocate and write to an existing slice
+ /// (which may be on the stack). Since deserialization always uses the
+ /// native endianness of the target platform, the serialization API you use
+ /// should match the endianness of the target platform. (It's often a good
+ /// idea to generate serialized DFAs for both forms of endianness and then
+ /// load the correct one based on endianness.)
+ ///
+ /// # Errors
+ ///
+ /// Generally speaking, it's easier to state the conditions in which an
+ /// error is _not_ returned. All of the following must be true:
+ ///
+ /// * The bytes given must be produced by one of the serialization APIs
+ /// on this DFA, as mentioned above.
+ /// * The endianness of the target platform matches the endianness used to
+ /// serialize the provided DFA.
+ ///
+ /// If any of the above are not true, then an error will be returned.
+ ///
+ /// Note that unlike deserializing a
+ /// [`dense::DFA`](crate::dfa::dense::DFA), deserializing a sparse DFA has
+ /// no alignment requirements. That is, an alignment of `1` is valid.
+ ///
+ /// # Panics
+ ///
+ /// This routine will never panic for any input.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to serialize a DFA to raw bytes, deserialize it
+ /// and then use it for searching.
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
+ ///
+ /// let initial = DFA::new("foo[0-9]+")?;
+ /// let bytes = initial.to_bytes_native_endian();
+ /// let dfa: DFA<&[u8]> = DFA::from_bytes(&bytes)?.0;
+ ///
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: loading a DFA from static memory
+ ///
+ /// One use case this library supports is the ability to serialize a
+ /// DFA to disk and then use `include_bytes!` to store it in a compiled
+ /// Rust program. Those bytes can then be cheaply deserialized into a
+ /// `DFA` structure at runtime and used for searching without having to
+ /// re-compile the DFA (which can be quite costly).
+ ///
+ /// We can show this in two parts. The first part is serializing the DFA to
+ /// a file:
+ ///
+ /// ```no_run
+ /// use regex_automata::dfa::sparse::DFA;
+ ///
+ /// let dfa = DFA::new("foo[0-9]+")?;
+ ///
+ /// // Write a big endian serialized version of this DFA to a file.
+ /// let bytes = dfa.to_bytes_big_endian();
+ /// std::fs::write("foo.bigendian.dfa", &bytes)?;
+ ///
+ /// // Do it again, but this time for little endian.
+ /// let bytes = dfa.to_bytes_little_endian();
+ /// std::fs::write("foo.littleendian.dfa", &bytes)?;
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// And now the second part is embedding the DFA into the compiled program
+ /// and deserializing it at runtime on first use. We use conditional
+ /// compilation to choose the correct endianness. We do not need to employ
+ /// any special tricks to ensure a proper alignment, since a sparse DFA has
+ /// no alignment requirements.
+ ///
+ /// ```no_run
+ /// use regex_automata::{
+ /// dfa::{Automaton, sparse::DFA},
+ /// util::lazy::Lazy,
+ /// HalfMatch, Input,
+ /// };
+ ///
+ /// // This crate provides its own "lazy" type, kind of like
+ /// // lazy_static! or once_cell::sync::Lazy. But it works in no-alloc
+ /// // no-std environments and lets us write this using completely
+ /// // safe code.
+ /// static RE: Lazy<DFA<&'static [u8]>> = Lazy::new(|| {
+ /// # const _: &str = stringify! {
+ /// #[cfg(target_endian = "big")]
+ /// static BYTES: &[u8] = include_bytes!("foo.bigendian.dfa");
+ /// #[cfg(target_endian = "little")]
+ /// static BYTES: &[u8] = include_bytes!("foo.littleendian.dfa");
+ /// # };
+ /// # static BYTES: &[u8] = b"";
+ ///
+ /// let (dfa, _) = DFA::from_bytes(BYTES)
+ /// .expect("serialized DFA should be valid");
+ /// dfa
+ /// });
+ ///
+ /// let expected = Ok(Some(HalfMatch::must(0, 8)));
+ /// assert_eq!(expected, RE.try_search_fwd(&Input::new("foo12345")));
+ /// ```
+ ///
+ /// Alternatively, consider using
+ /// [`lazy_static`](https://crates.io/crates/lazy_static)
+ /// or
+ /// [`once_cell`](https://crates.io/crates/once_cell),
+ /// which will guarantee safety for you.
+ pub fn from_bytes(
+ slice: &'a [u8],
+ ) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> {
+ // SAFETY: This is safe because we validate both the sparse transitions
+ // (by trying to decode every state) and start state ID list below. If
+ // either validation fails, then we return an error.
+ let unchecked = unsafe { DFA::from_bytes_unchecked(slice) };
+ let (candidate, consumed) = unchecked?;
+ // Transitions are validated first; the start table is then validated
+ // against the transitions.
+ candidate.tt.validate(&candidate.special)?;
+ candidate.st.validate(&candidate.special, &candidate.tt)?;
+ // N.B. The special state info doesn't have a way to do unchecked
+ // deserialization, so it has already been validated.
+ Ok((candidate, consumed))
+ }
+
+ /// Deserialize a DFA with a specific state identifier representation in
+ /// constant time by omitting the verification of the validity of the
+ /// sparse transitions.
+ ///
+ /// This is just like [`DFA::from_bytes`], except it can potentially return
+ /// a DFA that exhibits undefined behavior if its transitions contain
+ /// invalid state identifiers.
+ ///
+ /// This routine is useful if you need to deserialize a DFA cheaply and
+ /// cannot afford the transition validation performed by `from_bytes`.
+ ///
+ /// # Safety
+ ///
+ /// This routine is not safe because it permits callers to provide
+ /// arbitrary transitions with possibly incorrect state identifiers. While
+ /// the various serialization routines will never return an incorrect
+ /// DFA, there is no guarantee that the bytes provided here are correct.
+ /// While `from_bytes_unchecked` will still do several forms of basic
+ /// validation, this routine does not check that the transitions themselves
+ /// are correct. Given an incorrect transition table, it is possible for
+ /// the search routines to access out-of-bounds memory because of explicit
+ /// bounds check elision.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
+ ///
+ /// let initial = DFA::new("foo[0-9]+")?;
+ /// let bytes = initial.to_bytes_native_endian();
+ /// // SAFETY: This is guaranteed to be safe since the bytes given come
+ /// // directly from a compatible serialization routine.
+ /// let dfa: DFA<&[u8]> = unsafe { DFA::from_bytes_unchecked(&bytes)?.0 };
+ ///
+ /// let expected = Some(HalfMatch::must(0, 8));
+ /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub unsafe fn from_bytes_unchecked(
+ slice: &'a [u8],
+ ) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> {
+ // Running count of bytes consumed from the front of `slice`. Every
+ // section is read at the offset where the previous one ended.
+ let mut offset = 0;
+
+ // Header: label, endianness check and format version.
+ offset += wire::read_label(&slice[offset..], LABEL)?;
+ offset += wire::read_endianness_check(&slice[offset..])?;
+ offset += wire::read_version(&slice[offset..], VERSION)?;
+
+ // Reserved u32: it has to be readable, but its value is ignored.
+ let _ = wire::try_read_u32(&slice[offset..], "unused space")?;
+ offset += size_of::<u32>();
+
+ let (flags, flags_len) = Flags::from_bytes(&slice[offset..])?;
+ offset += flags_len;
+
+ let (tt, tt_len) = Transitions::from_bytes_unchecked(&slice[offset..])?;
+ offset += tt_len;
+
+ let (st, st_len) = StartTable::from_bytes_unchecked(&slice[offset..])?;
+ offset += st_len;
+
+ let (special, special_len) = Special::from_bytes(&slice[offset..])?;
+ offset += special_len;
+ // The maximum special state ID must fall inside the sparse bytes.
+ if tt.sparse().len() <= special.max.as_usize() {
+ return Err(DeserializeError::generic(
+ "max should not be greater than or equal to sparse bytes",
+ ));
+ }
+
+ let (quitset, quitset_len) = ByteSet::from_bytes(&slice[offset..])?;
+ offset += quitset_len;
+
+ // Prefilters don't support serialization, so they're always absent.
+ let dfa = DFA { tt, st, special, pre: None, quitset, flags };
+ Ok((dfa, offset))
+ }
+}
+
+impl<T: AsRef<[u8]>> fmt::Debug for DFA<T> {
+ /// Writes a multi-line dump of the DFA: one line per state, then the
+ /// start states in groups, then the state length, pattern length and
+ /// flags.
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ writeln!(f, "sparse::DFA(")?;
+ // One line per state: its indicator, its ID and the state itself.
+ for state in self.tt.states() {
+ fmt_state_indicator(f, self, state.id())?;
+ writeln!(f, "{:06?}: {:?}", state.id().as_usize(), state)?;
+ }
+ writeln!(f)?;
+ for (i, (start_id, anchored, sty)) in self.st.iter().enumerate() {
+ // A group header is printed once every `stride` entries.
+ let starts_group = i % self.st.stride == 0;
+ if starts_group {
+ match anchored {
+ Anchored::Pattern(pid) => writeln!(
+ f,
+ "START_GROUP(pattern: {:?})",
+ pid.as_usize()
+ )?,
+ Anchored::Yes => writeln!(f, "START-GROUP(anchored)")?,
+ Anchored::No => writeln!(f, "START-GROUP(unanchored)")?,
+ }
+ }
+ writeln!(f, " {:?} => {:06?}", sty, start_id.as_usize())?;
+ }
+ writeln!(f, "state length: {:?}", self.tt.state_len)?;
+ writeln!(f, "pattern length: {:?}", self.pattern_len())?;
+ writeln!(f, "flags: {:?}", self.flags)?;
+ writeln!(f, ")")
+ }
+}
+
+// SAFETY: We assert that our implementation of each method is correct.
+unsafe impl<T: AsRef<[u8]>> Automaton for DFA<T> {
+ #[inline]
+ fn is_special_state(&self, id: StateID) -> bool {
+ self.special.is_special_state(id)
+ }
+
+ #[inline]
+ fn is_dead_state(&self, id: StateID) -> bool {
+ self.special.is_dead_state(id)
+ }
+
+ #[inline]
+ fn is_quit_state(&self, id: StateID) -> bool {
+ self.special.is_quit_state(id)
+ }
+
+ #[inline]
+ fn is_match_state(&self, id: StateID) -> bool {
+ self.special.is_match_state(id)
+ }
+
+ #[inline]
+ fn is_start_state(&self, id: StateID) -> bool {
+ self.special.is_start_state(id)
+ }
+
+ #[inline]
+ fn is_accel_state(&self, id: StateID) -> bool {
+ self.special.is_accel_state(id)
+ }
+
+ // This is marked as inline to help dramatically boost sparse searching,
+ // which decodes each state it enters to follow the next transition.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn next_state(&self, current: StateID, input: u8) -> StateID {
+ let input = self.tt.classes.get(input);
+ self.tt.state(current).next(input)
+ }
+
+ #[inline]
+ unsafe fn next_state_unchecked(
+ &self,
+ current: StateID,
+ input: u8,
+ ) -> StateID {
+ self.next_state(current, input)
+ }
+
+ #[inline]
+ fn next_eoi_state(&self, current: StateID) -> StateID {
+ self.tt.state(current).next_eoi()
+ }
+
+ #[inline]
+ fn pattern_len(&self) -> usize {
+ self.tt.pattern_len
+ }
+
+ #[inline]
+ fn match_len(&self, id: StateID) -> usize {
+ self.tt.state(id).pattern_len()
+ }
+
+ #[inline]
+ fn match_pattern(&self, id: StateID, match_index: usize) -> PatternID {
+ // This is an optimization for the very common case of a DFA with a
+ // single pattern. This conditional avoids a somewhat more costly path
+ // that finds the pattern ID from the state machine, which requires
+ // a bit of slicing/pointer-chasing. This optimization tends to only
+ // matter when matches are frequent.
+ if self.tt.pattern_len == 1 {
+ return PatternID::ZERO;
+ }
+ self.tt.state(id).pattern_id(match_index)
+ }
+
+ #[inline]
+ fn has_empty(&self) -> bool {
+ self.flags.has_empty
+ }
+
+ #[inline]
+ fn is_utf8(&self) -> bool {
+ self.flags.is_utf8
+ }
+
+ #[inline]
+ fn is_always_start_anchored(&self) -> bool {
+ self.flags.is_always_start_anchored
+ }
+
+ #[inline]
+ fn start_state_forward(
+ &self,
+ input: &Input<'_>,
+ ) -> Result<StateID, MatchError> {
+ if !self.quitset.is_empty() && input.start() > 0 {
+ let offset = input.start() - 1;
+ let byte = input.haystack()[offset];
+ if self.quitset.contains(byte) {
+ return Err(MatchError::quit(byte, offset));
+ }
+ }
+ let start = self.st.start_map.fwd(&input);
+ self.st.start(input, start)
+ }
+
+ #[inline]
+ fn start_state_reverse(
+ &self,
+ input: &Input<'_>,
+ ) -> Result<StateID, MatchError> {
+ if !self.quitset.is_empty() && input.end() < input.haystack().len() {
+ let offset = input.end();
+ let byte = input.haystack()[offset];
+ if self.quitset.contains(byte) {
+ return Err(MatchError::quit(byte, offset));
+ }
+ }
+ let start = self.st.start_map.rev(&input);
+ self.st.start(input, start)
+ }
+
+ /// Returns the universal start state recorded for `mode`, if any.
+ ///
+ /// Pattern-specific anchored modes never have a universal start state.
+ #[inline]
+ fn universal_start_state(&self, mode: Anchored) -> Option<StateID> {
+     let starts = &self.st;
+     match mode {
+         Anchored::Pattern(_) => None,
+         Anchored::Yes => starts.universal_start_anchored,
+         Anchored::No => starts.universal_start_unanchored,
+     }
+ }
+
+ /// Returns the accelerator bytes stored on the state `id`.
+ #[inline]
+ fn accelerator(&self, id: StateID) -> &[u8] {
+     let state = self.tt.state(id);
+     state.accelerator()
+ }
+
+ /// Returns a reference to the prefilter attached to this DFA, if one is set.
+ #[inline]
+ fn get_prefilter(&self) -> Option<&Prefilter> {
+     match self.pre {
+         Some(ref pre) => Some(pre),
+         None => None,
+     }
+ }
+}
+
+ /// The transition table portion of a sparse DFA.
+ ///
+ /// The transition table is the core part of the DFA in that it describes how
+ /// to move from one state to another based on the input sequence observed.
+ ///
+ /// Unlike a typical dense table based DFA, states in a sparse transition
+ /// table have variable size. That is, states with more transitions use more
+ /// space than states with fewer transitions. This means that finding the next
+ /// transition takes more work than with a dense DFA, but also typically uses
+ /// much less space.
+ #[derive(Clone)]
+ struct Transitions<T> {
+ /// The raw encoding of each state in this DFA.
+ ///
+ /// Each state has the following information:
+ ///
+ /// * A set of transitions to subsequent states. Transitions to the dead
+ /// state are omitted.
+ /// * If the state can be accelerated, then any additional accelerator
+ /// information.
+ /// * If the state is a match state, then the state contains all pattern
+ /// IDs that match when in that state.
+ ///
+ /// To decode a state, use `Transitions::state` (or `Transitions::try_state`
+ /// when the encoding has not been validated yet).
+ ///
+ /// In practice, T is either Vec<u8> or &[u8].
+ sparse: T,
+ /// A set of equivalence classes, where a single equivalence class
+ /// represents a set of bytes that never discriminate between a match
+ /// and a non-match in the DFA. Each equivalence class corresponds to a
+ /// single character in this DFA's alphabet, where the maximum number of
+ /// characters is 257 (each possible value of a byte plus the special
+ /// EOI transition). Consequently, the number of equivalence classes
+ /// corresponds to the number of transitions for each DFA state. Note
+ /// though that the *space* used by each DFA state in the transition table
+ /// may be larger. The total space used by each DFA state is known as the
+ /// stride and is documented above.
+ ///
+ /// The only time the number of equivalence classes is fewer than 257 is
+ /// if the DFA's kind uses byte classes, which is the default. Equivalence
+ /// classes should generally only be disabled when debugging, so that
+ /// the transitions themselves aren't obscured. Disabling them has no
+ /// other benefit, since the equivalence class map is always used while
+ /// searching. In the vast majority of cases, the number of equivalence
+ /// classes is substantially smaller than 257, particularly when large
+ /// Unicode classes aren't used.
+ ///
+ /// N.B. Equivalence classes aren't particularly useful in a sparse DFA
+ /// in the current implementation, since equivalence classes generally tend
+ /// to correspond to contiguous ranges of bytes that map to the same
+ /// transition. So in a sparse DFA, equivalence classes don't really lead
+ /// to a space savings. In the future, it would be good to try and remove
+ /// them from sparse DFAs entirely, but that requires a bit of work since
+ /// sparse DFAs are built from dense DFAs, which are in turn built on top of
+ /// equivalence classes.
+ classes: ByteClasses,
+ /// The total number of states in this DFA. Note that a DFA always has at
+ /// least one state---the dead state---even the empty DFA. In particular,
+ /// the dead state always has ID 0 and is correspondingly always the first
+ /// state. The dead state is never a match state.
+ state_len: usize,
+ /// The total number of unique patterns represented by these match states.
+ pattern_len: usize,
+ }
+
+ impl<'a> Transitions<&'a [u8]> {
+ /// Reads a borrowed transition table from the beginning of `slice`.
+ ///
+ /// The fields are read in this order: state length (u32), pattern length
+ /// (u32), the byte class map, the byte length of the sparse state data
+ /// (u32) and finally that many bytes of sparse state data.
+ ///
+ /// On success, returns the table together with the number of bytes of
+ /// `slice` that were consumed.
+ ///
+ /// This routine only checks that the header fields can be read and that
+ /// enough bytes are present; it does not decode any of the sparse states.
+ ///
+ /// # Safety
+ ///
+ /// NOTE(review): presumably the caller must either trust the bytes or run
+ /// `validate` on the result before searching with it --- confirm against
+ /// the callers of this function.
+ unsafe fn from_bytes_unchecked(
+ mut slice: &'a [u8],
+ ) -> Result<(Transitions<&'a [u8]>, usize), DeserializeError> {
+ // Remember where we started so that the number of bytes consumed can
+ // be computed from the final position of `slice`.
+ let slice_start = slice.as_ptr().as_usize();
+
+ let (state_len, nr) =
+ wire::try_read_u32_as_usize(&slice, "state length")?;
+ slice = &slice[nr..];
+
+ let (pattern_len, nr) =
+ wire::try_read_u32_as_usize(&slice, "pattern length")?;
+ slice = &slice[nr..];
+
+ let (classes, nr) = ByteClasses::from_bytes(&slice)?;
+ slice = &slice[nr..];
+
+ let (len, nr) =
+ wire::try_read_u32_as_usize(&slice, "sparse transitions length")?;
+ slice = &slice[nr..];
+
+ // Make sure the advertised number of state bytes is actually present
+ // before slicing, so that the slicing below cannot panic.
+ wire::check_slice_len(slice, len, "sparse states byte length")?;
+ let sparse = &slice[..len];
+ slice = &slice[len..];
+
+ let trans = Transitions { sparse, classes, state_len, pattern_len };
+ Ok((trans, slice.as_ptr().as_usize() - slice_start))
+ }
+ }
+
+impl<T: AsRef<[u8]>> Transitions<T> {
+ /// Serializes this transition table into `dst` using the endianness `E`.
+ ///
+ /// On success, the number of bytes written is returned, which is always
+ /// equal to `write_to_len()`. If `dst` is shorter than that, an error is
+ /// returned.
+ fn write_to<E: Endian>(
+     &self,
+     mut dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+     let total = self.write_to_len();
+     if total > dst.len() {
+         return Err(SerializeError::buffer_too_small(
+             "sparse transition table",
+         ));
+     }
+     dst = &mut dst[..total];
+     let u32_len = size_of::<u32>();
+
+     // Header, part 1: number of states.
+     let state_len = u32::try_from(self.state_len).unwrap();
+     E::write_u32(state_len, dst);
+     dst = &mut dst[u32_len..];
+
+     // Header, part 2: number of patterns.
+     let pattern_len = u32::try_from(self.pattern_len).unwrap();
+     E::write_u32(pattern_len, dst);
+     dst = &mut dst[u32_len..];
+
+     // Header, part 3: the byte class map.
+     let classes_len = self.classes.write_to(dst)?;
+     dst = &mut dst[classes_len..];
+
+     // Header, part 4: size in bytes of the sparse state data that follows.
+     let sparse_len = u32::try_from(self.sparse().len()).unwrap();
+     E::write_u32(sparse_len, dst);
+     dst = &mut dst[u32_len..];
+
+     // Body: every state, one after another. A state's ID is its byte
+     // offset, so the next ID is the current one plus the state's size.
+     let mut sid = DEAD;
+     while sid.as_usize() < self.sparse().len() {
+         let state = self.state(sid);
+         let written = state.write_to::<E>(&mut dst)?;
+         dst = &mut dst[written..];
+         let next_offset = sid.as_usize() + state.write_to_len();
+         sid = StateID::new(next_offset).unwrap();
+     }
+     Ok(total)
+ }
+
+ /// Returns how many bytes `write_to` needs to serialize this transition
+ /// table.
+ fn write_to_len(&self) -> usize {
+     // Three u32 header fields: state length, pattern length and the byte
+     // length of the sparse transitions.
+     let fixed_header = 3 * size_of::<u32>();
+     let classes = self.classes.write_to_len();
+     let states = self.sparse().len();
+     fixed_header + classes + states
+ }
+
+ /// Validates that every state ID in this transition table is valid.
+ ///
+ /// That is, every state ID can be used to correctly index a state in this
+ /// table.
+ ///
+ /// The check runs in two passes: first every state is decoded in sequence
+ /// with `try_state` (counting the states as it goes), and then every
+ /// transition of every state is checked. Finally, the number of decoded
+ /// states is compared with the recorded `state_len`.
+ ///
+ /// An error is returned as soon as any of these checks fails.
+ fn validate(&self, sp: &Special) -> Result<(), DeserializeError> {
+ // In order to validate everything, we not only need to make sure we
+ // can decode every state, but that every transition in every state
+ // points to a valid state. There are many duplicative transitions, so
+ // we record state IDs that we've verified so that we don't redo the
+ // decoding work.
+ //
+ // Except, when in no_std mode, we don't have dynamic memory allocation
+ // available to us, so we skip this optimization. It's not clear
+ // whether doing something more clever is worth it just yet. If you're
+ // profiling this code and need it to run faster, please file an issue.
+ //
+ // OK, so we also use this to record the set of valid state IDs. Since
+ // it is possible for a transition to point to an invalid state ID that
+ // still (somehow) deserializes to a valid state. So we need to make
+ // sure our transitions are limited to actually correct state IDs.
+ // The problem is, I'm not sure how to do this verification step in
+ // no-std no-alloc mode. I think we'd *have* to store the set of valid
+ // state IDs in the DFA itself. For now, we don't do this verification
+ // in no-std no-alloc mode. The worst thing that can happen is an
+ // incorrect result. But no panics or memory safety problems should
+ // result. Because we still do validate that the state itself is
+ // "valid" in the sense that everything it points to actually exists.
+ //
+ // ---AG
+ //
+ // `Seen` is the set described above. With the `alloc` feature it is
+ // backed by a BTreeSet; without it, it is a stand-in that stores
+ // nothing and reports every ID as not seen.
+ struct Seen {
+ #[cfg(feature = "alloc")]
+ set: alloc::collections::BTreeSet<StateID>,
+ #[cfg(not(feature = "alloc"))]
+ set: core::marker::PhantomData<StateID>,
+ }
+
+ #[cfg(feature = "alloc")]
+ impl Seen {
+ fn new() -> Seen {
+ Seen { set: alloc::collections::BTreeSet::new() }
+ }
+ fn insert(&mut self, id: StateID) {
+ self.set.insert(id);
+ }
+ fn contains(&self, id: &StateID) -> bool {
+ self.set.contains(id)
+ }
+ }
+
+ #[cfg(not(feature = "alloc"))]
+ impl Seen {
+ fn new() -> Seen {
+ Seen { set: core::marker::PhantomData }
+ }
+ fn insert(&mut self, _id: StateID) {}
+ fn contains(&self, _id: &StateID) -> bool {
+ false
+ }
+ }
+
+ let mut verified: Seen = Seen::new();
+ // We need to make sure that we decode the correct number of states.
+ // Otherwise, an empty set of transitions would validate even if the
+ // recorded state length is non-empty.
+ let mut len = 0;
+ // We can't use the self.states() iterator because it assumes the state
+ // encodings are valid. It could panic if they aren't.
+ let mut id = DEAD;
+ while id.as_usize() < self.sparse().len() {
+ // Before we even decode the state, we check that the ID itself
+ // is well formed. That is, if it's a special state then it must
+ // actually be a quit, dead, accel, match or start state.
+ if sp.is_special_state(id) {
+ let is_actually_special = sp.is_dead_state(id)
+ || sp.is_quit_state(id)
+ || sp.is_match_state(id)
+ || sp.is_start_state(id)
+ || sp.is_accel_state(id);
+ if !is_actually_special {
+ // This is kind of a cryptic error message...
+ return Err(DeserializeError::generic(
+ "found sparse state tagged as special but \
+ wasn't actually special",
+ ));
+ }
+ }
+ let state = self.try_state(sp, id)?;
+ verified.insert(id);
+ // The next ID should be the offset immediately following `state`.
+ // Both the addition and the conversion back to a state ID are
+ // checked, since the sizes come from untrusted data.
+ id = StateID::new(wire::add(
+ id.as_usize(),
+ state.write_to_len(),
+ "next state ID offset",
+ )?)
+ .map_err(|err| {
+ DeserializeError::state_id_error(err, "next state ID offset")
+ })?;
+ len += 1;
+ }
+ // Now that we've checked that all top-level states are correct and
+ // importantly, collected a set of valid state IDs, we have all the
+ // information we need to check that all transitions are correct too.
+ //
+ // Note that we can't use `verified` to iterate because it will
+ // be empty in no-std no-alloc contexts. (And yes, that means our
+ // verification isn't quite as good.) We can use `self.states()`
+ // though at least, since we know that all states can at least be
+ // decoded and traversed correctly.
+ for state in self.states() {
+ // Check that all transitions in this state are correct.
+ for i in 0..state.ntrans {
+ let to = state.next_at(i);
+ // For no-alloc, we just check that the state can decode. It is
+ // technically possible that the state ID could still point to
+ // a non-existent state even if it decodes (fuzzing proved this
+ // to be true), but it shouldn't result in any memory unsafety
+ // or panics in non-debug mode.
+ #[cfg(not(feature = "alloc"))]
+ {
+ let _ = self.try_state(sp, to)?;
+ }
+ #[cfg(feature = "alloc")]
+ {
+ if !verified.contains(&to) {
+ return Err(DeserializeError::generic(
+ "found transition that points to a \
+ non-existent state",
+ ));
+ }
+ }
+ }
+ }
+ // The number of states we managed to decode must agree with the count
+ // recorded in the serialized header.
+ if len != self.state_len {
+ return Err(DeserializeError::generic(
+ "mismatching sparse state length",
+ ));
+ }
+ Ok(())
+ }
+
+ /// Returns a copy of this table whose sparse state data borrows from
+ /// `self`. The byte class map is cloned.
+ fn as_ref(&self) -> Transitions<&'_ [u8]> {
+     let sparse = self.sparse();
+     let classes = self.classes.clone();
+     Transitions {
+         sparse,
+         classes,
+         state_len: self.state_len,
+         pattern_len: self.pattern_len,
+     }
+ }
+
+ /// Returns a copy of this table that owns its sparse state data in a
+ /// freshly allocated `Vec<u8>`.
+ #[cfg(feature = "alloc")]
+ fn to_owned(&self) -> Transitions<alloc::vec::Vec<u8>> {
+     let sparse = self.sparse().to_vec();
+     let classes = self.classes.clone();
+     Transitions {
+         sparse,
+         classes,
+         state_len: self.state_len,
+         pattern_len: self.pattern_len,
+     }
+ }
+
+ /// Return a convenient representation of the given state.
+ ///
+ /// This panics if the state is invalid.
+ ///
+ /// The state ID is used as a byte offset into the sparse state data, and
+ /// the state is decoded from the bytes starting at that offset. No
+ /// validation is done here; see `try_state` for the checked variant.
+ ///
+ /// This is marked as inline to help dramatically boost sparse searching,
+ /// which decodes each state it enters to follow the next transition. Other
+ /// functions involved are also inlined, which should hopefully eliminate
+ /// a lot of the extraneous decoding that is never needed just to follow
+ /// the next transition.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn state(&self, id: StateID) -> State<'_> {
+ let mut state = &self.sparse()[id.as_usize()..];
+ // The leading u16 holds the transition count in its low 15 bits and
+ // the "is a match state" flag in its most significant bit.
+ let mut ntrans = wire::read_u16(&state).as_usize();
+ let is_match = (1 << 15) & ntrans != 0;
+ ntrans &= !(1 << 15);
+ state = &state[2..];
+
+ // Two bytes of input range per transition, followed by one state ID
+ // per transition.
+ let (input_ranges, state) = state.split_at(ntrans * 2);
+ let (next, state) = state.split_at(ntrans * StateID::SIZE);
+ // Match states additionally carry a u32 count followed by that many
+ // 4-byte pattern IDs.
+ let (pattern_ids, state) = if is_match {
+ let npats = wire::read_u32(&state).as_usize();
+ state[4..].split_at(npats * 4)
+ } else {
+ (&[][..], state)
+ };
+
+ // Last comes a one byte accelerator length and the accelerator bytes.
+ let accel_len = usize::from(state[0]);
+ let accel = &state[1..accel_len + 1];
+ State { id, is_match, ntrans, input_ranges, next, pattern_ids, accel }
+ }
+
+ /// Like `state`, but will return an error if the state encoding is
+ /// invalid. This is useful for verifying states after deserialization,
+ /// which is required for a safe deserialization API.
+ ///
+ /// `sp` is used to cross-check the state's encoded match flag and
+ /// accelerator against the special state ID ranges, and `id` is the
+ /// state to decode (interpreted as a byte offset into the sparse data).
+ ///
+ /// Note that this only verifies that this state is decodable and that
+ /// all of its data is consistent. It does not verify that its state ID
+ /// transitions point to valid states themselves (only that they are
+ /// within the bounds of the sparse data), nor does it verify that every
+ /// pattern ID is valid.
+ ///
+ /// # Errors
+ ///
+ /// Returns an error if any part of the encoding is truncated, if a length
+ /// or byte range is malformed, if the match/accelerator encoding disagrees
+ /// with `sp`, or if the last (EOI) transition leads to a quit state.
+ fn try_state(
+     &self,
+     sp: &Special,
+     id: StateID,
+ ) -> Result<State<'_>, DeserializeError> {
+     if id.as_usize() > self.sparse().len() {
+         return Err(DeserializeError::generic(
+             "invalid caller provided sparse state ID",
+         ));
+     }
+     let mut state = &self.sparse()[id.as_usize()..];
+     // Encoding format starts with a u16 that stores the total number of
+     // transitions in this state. Its most significant bit is a flag that
+     // indicates whether this is a match state.
+     let (mut ntrans, _) =
+         wire::try_read_u16_as_usize(state, "state transition length")?;
+     let is_match = ((1 << 15) & ntrans) != 0;
+     ntrans &= !(1 << 15);
+     state = &state[2..];
+     // A zero count must be rejected here since the EOI check at the end
+     // of this routine looks at transition `ntrans - 1`.
+     if ntrans > 257 || ntrans == 0 {
+         return Err(DeserializeError::generic(
+             "invalid transition length",
+         ));
+     }
+     if is_match && !sp.is_match_state(id) {
+         return Err(DeserializeError::generic(
+             "state marked as match but not in match ID range",
+         ));
+     } else if !is_match && sp.is_match_state(id) {
+         return Err(DeserializeError::generic(
+             "state in match ID range but not marked as match state",
+         ));
+     }
+
+     // Each transition has two pieces: an inclusive range of bytes on which
+     // it is defined, and the state ID that those bytes transition to. The
+     // pairs come first, followed by a corresponding sequence of state IDs.
+     let input_ranges_len = ntrans.checked_mul(2).unwrap();
+     wire::check_slice_len(state, input_ranges_len, "sparse byte pairs")?;
+     let (input_ranges, state) = state.split_at(input_ranges_len);
+     // Every range should be of the form A-B, where A<=B.
+     for pair in input_ranges.chunks(2) {
+         let (start, end) = (pair[0], pair[1]);
+         if start > end {
+             return Err(DeserializeError::generic("invalid input range"));
+         }
+     }
+
+     // And now extract the corresponding sequence of state IDs. We leave
+     // this sequence as a &[u8] instead of a &[S] because sparse DFAs do
+     // not have any alignment requirements.
+     let next_len = ntrans
+         .checked_mul(self.id_len())
+         .expect("state size * #trans should always fit in a usize");
+     wire::check_slice_len(state, next_len, "sparse trans state IDs")?;
+     let (next, state) = state.split_at(next_len);
+     // We can at least verify that every state ID is in bounds.
+     for idbytes in next.chunks(self.id_len()) {
+         let (to, _) =
+             wire::read_state_id(idbytes, "sparse state ID in try_state")?;
+         wire::check_slice_len(
+             self.sparse(),
+             to.as_usize(),
+             "invalid sparse state ID",
+         )?;
+     }
+
+     // If this is a match state, then read the pattern IDs for this state.
+     // Pattern IDs is a u32-length prefixed sequence of native endian
+     // encoded 32-bit integers.
+     let (pattern_ids, state) = if is_match {
+         let (npats, nr) =
+             wire::try_read_u32_as_usize(state, "pattern ID length")?;
+         let state = &state[nr..];
+         if npats == 0 {
+             return Err(DeserializeError::generic(
+                 "state marked as a match, but has no pattern IDs",
+             ));
+         }
+
+         let pattern_ids_len =
+             wire::mul(npats, 4, "sparse pattern ID byte length")?;
+         wire::check_slice_len(
+             state,
+             pattern_ids_len,
+             "sparse pattern IDs",
+         )?;
+         let (pattern_ids, state) = state.split_at(pattern_ids_len);
+         for patbytes in pattern_ids.chunks(PatternID::SIZE) {
+             wire::read_pattern_id(
+                 patbytes,
+                 "sparse pattern ID in try_state",
+             )?;
+         }
+         (pattern_ids, state)
+     } else {
+         (&[][..], state)
+     };
+
+     // Now read this state's accelerator info. The first byte is the length
+     // of the accelerator, which is typically 0 (for no acceleration) but
+     // is no bigger than 3. The length indicates the number of bytes that
+     // follow, where each byte corresponds to a transition out of this
+     // state.
+     if state.is_empty() {
+         return Err(DeserializeError::generic("no accelerator length"));
+     }
+     let (accel_len, state) = (usize::from(state[0]), &state[1..]);
+
+     if accel_len > 3 {
+         return Err(DeserializeError::generic(
+             "sparse invalid accelerator length",
+         ));
+     } else if accel_len == 0 && sp.is_accel_state(id) {
+         return Err(DeserializeError::generic(
+             "got no accelerators in state, but in accelerator ID range",
+         ));
+     } else if accel_len > 0 && !sp.is_accel_state(id) {
+         // The state carries accelerator bytes even though its ID is
+         // outside of the accelerator range, so say exactly that.
+         return Err(DeserializeError::generic(
+             "state has accelerators, but is not in accelerator ID range",
+         ));
+     }
+
+     wire::check_slice_len(
+         state,
+         accel_len,
+         "sparse corrupt accelerator length",
+     )?;
+     let accel = &state[..accel_len];
+
+     let state = State {
+         id,
+         is_match,
+         ntrans,
+         input_ranges,
+         next,
+         pattern_ids,
+         accel,
+     };
+     // The final transition of every state is its EOI transition.
+     if sp.is_quit_state(state.next_at(state.ntrans - 1)) {
+         return Err(DeserializeError::generic(
+             "state with EOI transition to quit state is illegal",
+         ));
+     }
+     Ok(state)
+ }
+
+ /// Returns an iterator over the states of this DFA, beginning with the
+ /// dead state.
+ fn states(&self) -> StateIter<'_, T> {
+     let first = DEAD.as_usize();
+     StateIter { trans: self, id: first }
+ }
+
+ /// Borrows the raw bytes that encode all of the sparse states.
+ fn sparse(&self) -> &[u8] {
+     AsRef::<[u8]>::as_ref(&self.sparse)
+ }
+
+ /// Returns how many bytes one encoded state ID occupies.
+ fn id_len(&self) -> usize {
+     let size = StateID::SIZE;
+     size
+ }
+
+ /// Returns the number of bytes used by the sparse state data.
+ ///
+ /// The size of the `Transitions` value itself is not counted.
+ fn memory_usage(&self) -> usize {
+     let sparse = self.sparse();
+     sparse.len()
+ }
+}
+
+ #[cfg(feature = "dfa-build")]
+ impl<T: AsMut<[u8]>> Transitions<T> {
+ /// Return a convenient mutable representation of the given state.
+ /// This panics if the state is invalid.
+ ///
+ /// The decoding steps are the same as in `Transitions::state`, except
+ /// that every piece of the state is handed out as a mutable slice.
+ fn state_mut(&mut self, id: StateID) -> StateMut<'_> {
+ let mut state = &mut self.sparse_mut()[id.as_usize()..];
+ // The leading u16 holds the transition count in its low 15 bits and
+ // the "is a match state" flag in its most significant bit.
+ let mut ntrans = wire::read_u16(&state).as_usize();
+ let is_match = (1 << 15) & ntrans != 0;
+ ntrans &= !(1 << 15);
+ state = &mut state[2..];
+
+ // Two bytes of input range per transition, followed by one state ID
+ // per transition.
+ let (input_ranges, state) = state.split_at_mut(ntrans * 2);
+ let (next, state) = state.split_at_mut(ntrans * StateID::SIZE);
+ // Match states additionally carry a u32 count followed by that many
+ // 4-byte pattern IDs.
+ let (pattern_ids, state) = if is_match {
+ let npats = wire::read_u32(&state).as_usize();
+ state[4..].split_at_mut(npats * 4)
+ } else {
+ (&mut [][..], state)
+ };
+
+ // Last comes a one byte accelerator length and the accelerator bytes.
+ let accel_len = usize::from(state[0]);
+ let accel = &mut state[1..accel_len + 1];
+ StateMut {
+ id,
+ is_match,
+ ntrans,
+ input_ranges,
+ next,
+ pattern_ids,
+ accel,
+ }
+ }
+
+ /// Returns the sparse transitions as raw mutable bytes.
+ fn sparse_mut(&mut self) -> &mut [u8] {
+ self.sparse.as_mut()
+ }
+ }
+
+ /// The set of all possible starting states in a DFA.
+ ///
+ /// See the eponymous type in the `dense` module for more details. This type
+ /// is very similar to `dense::StartTable`, except that its underlying
+ /// representation is `&[u8]` instead of `&[S]`. (The latter would require
+ /// sparse DFAs to be aligned, which is explicitly something we do not require
+ /// because we don't really need it.)
+ #[derive(Clone)]
+ struct StartTable<T> {
+ /// The initial start state IDs as a contiguous table of native endian
+ /// encoded state IDs, each occupying `StateID::SIZE` bytes.
+ ///
+ /// In practice, T is either Vec<u8> or &[u8] and has no alignment
+ /// requirements.
+ ///
+ /// The first `2 * stride` entries always correspond to the start states
+ /// for the entire DFA, with the first `stride` entries being for
+ /// unanchored searches and the second `stride` entries being for anchored
+ /// searches. To keep things simple, both groups are always present even
+ /// if the `StartKind` is not both.
+ ///
+ /// After that, there are `stride * patterns` state IDs, where `patterns`
+ /// may be zero in the case of a DFA with no patterns or in the case where
+ /// the DFA was built without enabling starting states for each pattern.
+ table: T,
+ /// The starting state configuration supported. When 'both', both
+ /// unanchored and anchored searches work. When 'unanchored', looking up
+ /// an anchored start state returns an error. When 'anchored', looking up
+ /// an unanchored start state returns an error.
+ kind: StartKind,
+ /// The start state configuration for every possible byte.
+ start_map: StartByteMap,
+ /// The number of starting state IDs per pattern.
+ stride: usize,
+ /// The total number of patterns for which starting states are encoded.
+ /// This is `None` for DFAs that were built without start states for each
+ /// pattern. Thus, one cannot use this field to say how many patterns
+ /// are in the DFA in all cases. It is specific to how many patterns are
+ /// represented in this start table.
+ pattern_len: Option<usize>,
+ /// The universal starting state for unanchored searches. This is only
+ /// present when the DFA supports unanchored searches and when all starting
+ /// state IDs for an unanchored search are equivalent.
+ universal_start_unanchored: Option<StateID>,
+ /// The universal starting state for anchored searches. This is only
+ /// present when the DFA supports anchored searches and when all starting
+ /// state IDs for an anchored search are equivalent.
+ universal_start_anchored: Option<StateID>,
+ }
+
+ #[cfg(feature = "dfa-build")]
+ impl StartTable<Vec<u8>> {
+     /// Creates a zero-filled start table shaped after the given dense DFA.
+     ///
+     /// `pattern_len` is the number of patterns that get their own start
+     /// states, or `None` when only DFA-wide start states are stored.
+     fn new<T: AsRef<[u32]>>(
+         dfa: &dense::DFA<T>,
+         pattern_len: Option<usize>,
+     ) -> StartTable<Vec<u8>> {
+         let stride = Start::len();
+         // The unwraps are OK since the only way we're here is if a dense
+         // DFA could be constructed successfully, which uses the same space.
+         let per_pattern_ids =
+             stride.checked_mul(pattern_len.unwrap_or(0)).unwrap();
+         // One stride for unanchored starts and one for anchored starts.
+         let dfa_wide_ids = stride.checked_mul(2).unwrap();
+         let total_ids = per_pattern_ids.checked_add(dfa_wide_ids).unwrap();
+         let byte_len = total_ids.checked_mul(StateID::SIZE).unwrap();
+
+         let universal_start_unanchored =
+             dfa.universal_start_state(Anchored::No);
+         let universal_start_anchored =
+             dfa.universal_start_state(Anchored::Yes);
+         StartTable {
+             table: vec![0; byte_len],
+             kind: dfa.start_kind(),
+             start_map: dfa.start_map().clone(),
+             stride,
+             pattern_len,
+             universal_start_unanchored,
+             universal_start_anchored,
+         }
+     }
+
+     /// Builds a sparse start table from the start states of a dense DFA.
+     ///
+     /// `remap` translates a dense state index into the corresponding
+     /// sparse state ID.
+     fn from_dense_dfa<T: AsRef<[u32]>>(
+         dfa: &dense::DFA<T>,
+         remap: &[StateID],
+     ) -> Result<StartTable<Vec<u8>>, BuildError> {
+         // Per-pattern start states only exist when the dense DFA was
+         // compiled with them. Otherwise, as far as this table is
+         // concerned, there are no patterns to account for and only the
+         // DFA-wide start states are stored.
+         let start_pattern_len =
+             dfa.starts_for_each_pattern().then(|| dfa.pattern_len());
+         let mut table = StartTable::new(dfa, start_pattern_len);
+         for (dense_id, anchored, start) in dfa.starts() {
+             let sparse_id = remap[dfa.to_index(dense_id)];
+             table.set_start(anchored, start, sparse_id);
+         }
+         Ok(table)
+     }
+ }
+
+ impl<'a> StartTable<&'a [u8]> {
+ /// Reads a borrowed start table from the beginning of `slice`.
+ ///
+ /// The fields are read in this order: start kind, start byte map, stride
+ /// (u32), pattern length (u32, where `u32::MAX` means "none"), universal
+ /// unanchored start state (u32, `u32::MAX` means "none"), universal
+ /// anchored start state (same encoding) and finally the table of start
+ /// state IDs.
+ ///
+ /// On success, returns the table together with the number of bytes of
+ /// `slice` that were consumed.
+ ///
+ /// This routine checks the header fields and that enough bytes are
+ /// present for the table, but it does not inspect the start state IDs
+ /// stored in the table itself.
+ ///
+ /// # Safety
+ ///
+ /// NOTE(review): presumably the caller must either trust the bytes or run
+ /// `validate` on the result before searching with it --- confirm against
+ /// the callers of this function.
+ unsafe fn from_bytes_unchecked(
+ mut slice: &'a [u8],
+ ) -> Result<(StartTable<&'a [u8]>, usize), DeserializeError> {
+ // Remember where we started so that the number of bytes consumed can
+ // be computed from the final position of `slice`.
+ let slice_start = slice.as_ptr().as_usize();
+
+ let (kind, nr) = StartKind::from_bytes(slice)?;
+ slice = &slice[nr..];
+
+ let (start_map, nr) = StartByteMap::from_bytes(slice)?;
+ slice = &slice[nr..];
+
+ let (stride, nr) =
+ wire::try_read_u32_as_usize(slice, "sparse start table stride")?;
+ slice = &slice[nr..];
+ // The stride is not configurable: it must equal the number of start
+ // configurations.
+ if stride != Start::len() {
+ return Err(DeserializeError::generic(
+ "invalid sparse starting table stride",
+ ));
+ }
+
+ let (maybe_pattern_len, nr) =
+ wire::try_read_u32_as_usize(slice, "sparse start table patterns")?;
+ slice = &slice[nr..];
+ // u32::MAX is the sentinel for "no per-pattern start states".
+ let pattern_len = if maybe_pattern_len.as_u32() == u32::MAX {
+ None
+ } else {
+ Some(maybe_pattern_len)
+ };
+ if pattern_len.map_or(false, |len| len > PatternID::LIMIT) {
+ return Err(DeserializeError::generic(
+ "sparse invalid number of patterns",
+ ));
+ }
+
+ let (universal_unanchored, nr) =
+ wire::try_read_u32(slice, "universal unanchored start")?;
+ slice = &slice[nr..];
+ // u32::MAX is the sentinel for "no universal start state".
+ let universal_start_unanchored = if universal_unanchored == u32::MAX {
+ None
+ } else {
+ Some(StateID::try_from(universal_unanchored).map_err(|e| {
+ DeserializeError::state_id_error(
+ e,
+ "universal unanchored start",
+ )
+ })?)
+ };
+
+ let (universal_anchored, nr) =
+ wire::try_read_u32(slice, "universal anchored start")?;
+ slice = &slice[nr..];
+ let universal_start_anchored = if universal_anchored == u32::MAX {
+ None
+ } else {
+ Some(StateID::try_from(universal_anchored).map_err(|e| {
+ DeserializeError::state_id_error(e, "universal anchored start")
+ })?)
+ };
+
+ let pattern_table_size = wire::mul(
+ stride,
+ pattern_len.unwrap_or(0),
+ "sparse invalid pattern length",
+ )?;
+ // Our start states always begin with two strides of start states for
+ // the entire automaton (one for unanchored and one for anchored
+ // searches) which permit it to match any pattern. What follows them
+ // is an optional set of start states for each pattern.
+ let start_state_len = wire::add(
+ wire::mul(2, stride, "start state stride too big")?,
+ pattern_table_size,
+ "sparse invalid 'any' pattern starts size",
+ )?;
+ let table_bytes_len = wire::mul(
+ start_state_len,
+ StateID::SIZE,
+ "sparse pattern table bytes length",
+ )?;
+ // Make sure the whole table is present before slicing it out.
+ wire::check_slice_len(
+ slice,
+ table_bytes_len,
+ "sparse start ID table",
+ )?;
+ let table = &slice[..table_bytes_len];
+ slice = &slice[table_bytes_len..];
+
+ let sl = StartTable {
+ table,
+ kind,
+ start_map,
+ stride,
+ pattern_len,
+ universal_start_unanchored,
+ universal_start_anchored,
+ };
+ Ok((sl, slice.as_ptr().as_usize() - slice_start))
+ }
+ }
+
+impl<T: AsRef<[u8]>> StartTable<T> {
+ /// Serializes this start table into `dst` using the endianness `E`.
+ ///
+ /// On success, the number of bytes written is returned, which is always
+ /// equal to `write_to_len()`. If `dst` is shorter than that, an error is
+ /// returned.
+ fn write_to<E: Endian>(
+     &self,
+     mut dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+     let total = self.write_to_len();
+     if total > dst.len() {
+         return Err(SerializeError::buffer_too_small(
+             "sparse starting table ids",
+         ));
+     }
+     dst = &mut dst[..total];
+     let u32_len = size_of::<u32>();
+
+     // Start kind, then the start byte map.
+     let kind_len = self.kind.write_to::<E>(dst)?;
+     dst = &mut dst[kind_len..];
+     let map_len = self.start_map.write_to(dst)?;
+     dst = &mut dst[map_len..];
+
+     // Stride.
+     let stride = u32::try_from(self.stride).unwrap();
+     E::write_u32(stride, dst);
+     dst = &mut dst[u32_len..];
+
+     // Pattern length, with 0xFFFF_FFFF standing in for "absent".
+     let pattern_len =
+         u32::try_from(self.pattern_len.unwrap_or(0xFFFF_FFFF)).unwrap();
+     E::write_u32(pattern_len, dst);
+     dst = &mut dst[u32_len..];
+
+     // Universal start states (unanchored first, then anchored), with
+     // u32::MAX standing in for "absent".
+     let unanchored = self
+         .universal_start_unanchored
+         .map_or(u32::MAX, |sid| sid.as_u32());
+     E::write_u32(unanchored, dst);
+     dst = &mut dst[u32_len..];
+     let anchored =
+         self.universal_start_anchored.map_or(u32::MAX, |sid| sid.as_u32());
+     E::write_u32(anchored, dst);
+     dst = &mut dst[u32_len..];
+
+     // Finally, every start state ID in table order.
+     for (sid, _, _) in self.iter() {
+         E::write_u32(sid.as_u32(), dst);
+         dst = &mut dst[StateID::SIZE..];
+     }
+     Ok(total)
+ }
+
+ /// Returns how many bytes `write_to` needs to serialize this start table.
+ fn write_to_len(&self) -> usize {
+     // Four u32 header fields: stride, number of patterns, universal
+     // unanchored start and universal anchored start.
+     let fixed_header = 4 * size_of::<u32>();
+     let kind = self.kind.write_to_len();
+     let start_map = self.start_map.write_to_len();
+     let ids = self.table().len();
+     kind + start_map + fixed_header + ids
+ }
+
+ /// Checks every start state ID in this table against `trans`.
+ ///
+ /// A start state must not be a match state, must decode successfully, and
+ /// every state it transitions to must decode successfully as well. The
+ /// first violation found is returned as an error.
+ fn validate(
+     &self,
+     sp: &Special,
+     trans: &Transitions<T>,
+ ) -> Result<(), DeserializeError> {
+     for (sid, _, _) in self.iter() {
+         if sp.is_match_state(sid) {
+             return Err(DeserializeError::generic(
+                 "start states cannot be match states",
+             ));
+         }
+         // The start state itself has to point at a decodable state.
+         let start = trans.try_state(sp, sid)?;
+         // And so does every one of its transitions.
+         //
+         // It'd probably be better to integrate this validation with the
+         // transition table, or otherwise store a sorted sequence of all
+         // valid state IDs in the sparse DFA itself. That way, every
+         // pointer to a state could be checked to correspond precisely to
+         // a correct and valid state.
+         (0..start.ntrans).try_for_each(|i| {
+             trans.try_state(sp, start.next_at(i)).map(|_| ())
+         })?;
+     }
+     Ok(())
+ }
+
+ /// Returns a copy of this start table whose ID table borrows from `self`.
+ /// The start byte map is cloned.
+ fn as_ref(&self) -> StartTable<&'_ [u8]> {
+     let table = self.table();
+     let start_map = self.start_map.clone();
+     StartTable {
+         table,
+         kind: self.kind,
+         start_map,
+         stride: self.stride,
+         pattern_len: self.pattern_len,
+         universal_start_unanchored: self.universal_start_unanchored,
+         universal_start_anchored: self.universal_start_anchored,
+     }
+ }
+
+ /// Returns a copy of this start table that owns its ID table in a freshly
+ /// allocated `Vec<u8>`.
+ #[cfg(feature = "alloc")]
+ fn to_owned(&self) -> StartTable<alloc::vec::Vec<u8>> {
+     let table = self.table().to_vec();
+     let start_map = self.start_map.clone();
+     StartTable {
+         table,
+         kind: self.kind,
+         start_map,
+         stride: self.stride,
+         pattern_len: self.pattern_len,
+         universal_start_unanchored: self.universal_start_unanchored,
+         universal_start_anchored: self.universal_start_anchored,
+     }
+ }
+
+ /// Looks up the start state for the start configuration `start` under
+ /// the anchored mode requested by `input`.
+ ///
+ /// * Unanchored and anchored modes use the DFA-wide start states, and
+ ///   fail with an "unsupported anchored" error when this table's
+ ///   `StartKind` does not support the requested mode.
+ /// * A pattern-specific mode fails with the same error when this table
+ ///   has no per-pattern start states, and yields the dead state when the
+ ///   pattern ID is not less than the number of patterns in this table.
+ fn start(
+     &self,
+     input: &Input<'_>,
+     start: Start,
+ ) -> Result<StateID, MatchError> {
+     let offset = start.as_usize();
+     let anchored = input.get_anchored();
+     // Table layout: one stride of unanchored starts, one stride of
+     // anchored starts, then one stride per pattern.
+     let slot = match anchored {
+         Anchored::No if self.kind.has_unanchored() => offset,
+         Anchored::Yes if self.kind.has_anchored() => self.stride + offset,
+         Anchored::No | Anchored::Yes => {
+             return Err(MatchError::unsupported_anchored(anchored));
+         }
+         Anchored::Pattern(pid) => match self.pattern_len {
+             None => {
+                 return Err(MatchError::unsupported_anchored(anchored));
+             }
+             Some(len) if pid.as_usize() >= len => return Ok(DEAD),
+             Some(_) => {
+                 (2 * self.stride)
+                     + (self.stride * pid.as_usize())
+                     + offset
+             }
+         },
+     };
+     let at = slot * StateID::SIZE;
+     // This is OK since we're allowed to assume that the start table
+     // contains valid StateIDs.
+     let sid = wire::read_state_id_unchecked(&self.table()[at..]).0;
+     Ok(sid)
+ }
+
+ /// Returns an iterator over every start ID in this table, beginning at
+ /// the first entry.
+ fn iter(&self) -> StartStateIter<'_, T> {
+     let first = 0;
+     StartStateIter { st: self, i: first }
+ }
+
+ /// Returns how many start state IDs this table holds.
+ fn len(&self) -> usize {
+     let byte_len = self.table().len();
+     byte_len / StateID::SIZE
+ }
+
+ /// Borrows the raw bytes of the start ID table.
+ fn table(&self) -> &[u8] {
+     AsRef::<[u8]>::as_ref(&self.table)
+ }
+
+ /// Returns the number of bytes used by the start ID table.
+ ///
+ /// The size of the `StartTable` value itself is not counted.
+ fn memory_usage(&self) -> usize {
+     let table = self.table();
+     table.len()
+ }
+}
+
+#[cfg(feature = "dfa-build")]
+impl<T: AsMut<[u8]>> StartTable<T> {
+    /// Records `id` as the start state for the given anchored mode and start
+    /// configuration.
+    ///
+    /// # Panics
+    ///
+    /// For `Anchored::Pattern`, this panics if per-pattern start states are
+    /// not enabled, if the pattern ID is out of range, or if computing the
+    /// table index overflows. Slicing the table also panics when the
+    /// computed entry lies outside of it.
+    fn set_start(&mut self, anchored: Anchored, start: Start, id: StateID) {
+        let offset = start.as_usize();
+        let slot = match anchored {
+            Anchored::No => offset,
+            Anchored::Yes => self.stride + offset,
+            Anchored::Pattern(pid) => {
+                let pid = pid.as_usize();
+                let pattern_len = self
+                    .pattern_len
+                    .expect("start states for each pattern enabled");
+                assert!(pid < pattern_len, "invalid pattern ID {:?}", pid);
+                // Skip `pid` per-pattern groups plus the two leading groups
+                // (unanchored and anchored), checking every step for
+                // overflow.
+                let per_pattern = self.stride.checked_mul(pid).unwrap();
+                let leading = self.stride.checked_mul(2).unwrap();
+                let group = per_pattern.checked_add(leading).unwrap();
+                group.checked_add(offset).unwrap()
+            }
+        };
+        let at = slot * StateID::SIZE;
+        let entry = &mut self.table.as_mut()[at..at + StateID::SIZE];
+        wire::write_state_id::<wire::NE>(id, entry);
+    }
+}
+
+/// An iterator over all start state IDs in a sparse DFA.
+///
+/// `st` is the start table being walked and `i` is the index of the next
+/// entry to yield.
+struct StartStateIter<'a, T> {
+ st: &'a StartTable<T>,
+ i: usize,
+}
+
+impl<'a, T: AsRef<[u8]>> Iterator for StartStateIter<'a, T> {
+    type Item = (StateID, Anchored, Start);
+
+    /// Yields the next `(state ID, anchored mode, start type)` triple, or
+    /// `None` once every entry of the table has been visited.
+    fn next(&mut self) -> Option<(StateID, Anchored, Start)> {
+        let index = self.i;
+        if index >= self.st.len() {
+            return None;
+        }
+        self.i += 1;
+
+        let stride = self.st.stride;
+        // The stride of any DFA must always match the number of start state
+        // types, so this unwrap is okay.
+        let start_type = Start::from_usize(index % stride).unwrap();
+        // Entries come in groups of `stride`: group 0 is unanchored, group 1
+        // is anchored and group `2 + pid` belongs to pattern `pid`.
+        let anchored = match index / stride {
+            0 => Anchored::No,
+            1 => Anchored::Yes,
+            group => Anchored::Pattern(PatternID::new(group - 2).unwrap()),
+        };
+        let at = index * StateID::SIZE;
+        let raw = &self.st.table()[at..at + StateID::SIZE];
+        let bytes = raw.try_into().unwrap();
+        // We're allowed to assume that any IDs in this start table are
+        // correct and valid for this DFA.
+        let id = StateID::from_ne_bytes_unchecked(bytes);
+        Some((id, anchored, start_type))
+    }
+}
+
+impl<'a, T> fmt::Debug for StartStateIter<'a, T> {
+    /// Formats only the current position; the borrowed table is omitted.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let mut out = f.debug_struct("StartStateIter");
+        out.field("i", &self.i);
+        out.finish()
+    }
+}
+
+/// An iterator over all states in a sparse DFA.
+///
+/// This iterator yields each state in turn. The identifier of a yielded
+/// state is available from the state itself.
+struct StateIter<'a, T> {
+ /// The transitions whose states are being iterated.
+ trans: &'a Transitions<T>,
+ /// The identifier (as a `usize`) of the next state to yield.
+ id: usize,
+}
+
+impl<'a, T: AsRef<[u8]>> Iterator for StateIter<'a, T> {
+    type Item = State<'a>;
+
+    /// Decodes the state at the current position and then advances past its
+    /// encoded bytes. Returns `None` once the position reaches the end of
+    /// the sparse transition data.
+    fn next(&mut self) -> Option<State<'a>> {
+        let total = self.trans.sparse().len();
+        if self.id >= total {
+            return None;
+        }
+        let current = StateID::new_unchecked(self.id);
+        let state = self.trans.state(current);
+        self.id += state.write_to_len();
+        Some(state)
+    }
+}
+
+impl<'a, T> fmt::Debug for StateIter<'a, T> {
+    /// Formats only the current position; the borrowed transitions are
+    /// omitted.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let mut out = f.debug_struct("StateIter");
+        out.field("id", &self.id);
+        out.finish()
+    }
+}
+
+/// A representation of a sparse DFA state that can be cheaply materialized
+/// from a state identifier.
+#[derive(Clone)]
+struct State<'a> {
+ /// The identifier of this state.
+ id: StateID,
+ /// Whether this is a match state or not.
+ is_match: bool,
+ /// The number of transitions in this state. The last transition is the
+ /// special EOI transition.
+ ntrans: usize,
+ /// Pairs of input ranges, where there is one pair for each transition.
+ /// Each pair specifies an inclusive start and end byte range for the
+ /// corresponding transition.
+ input_ranges: &'a [u8],
+ /// Transitions to the next state. This slice contains native endian
+ /// encoded state identifiers. Thus, there are `ntrans * StateID::SIZE`
+ /// bytes in this slice.
+ next: &'a [u8],
+ /// If this is a match state, then this contains the pattern IDs that match
+ /// when the DFA is in this state.
+ ///
+ /// This is a contiguous sequence of 32-bit native endian encoded integers.
+ pattern_ids: &'a [u8],
+ /// An accelerator for this state, if present. If this state has no
+ /// accelerator, then this is an empty slice. When non-empty, this slice
+ /// has length at most 3 and corresponds to the exhaustive set of bytes
+ /// that must be seen in order to transition out of this state.
+ accel: &'a [u8],
+}
+
+impl<'a> State<'a> {
+ /// Searches for the next transition given an input byte. If no such
+ /// transition could be found, then a dead state is returned.
+ ///
+ /// This is marked as inline to help dramatically boost sparse searching,
+ /// which decodes each state it enters to follow the next transition.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn next(&self, input: u8) -> StateID {
+ // This straight linear search was observed to be much better than
+ // binary search on ASCII haystacks, likely because a binary search
+ // visits the ASCII case last but a linear search sees it first. A
+ // binary search does do a little better on non-ASCII haystacks, but
+ // not by much. There might be a better trade off lurking here.
+ //
+ // The final transition is excluded from the scan: it is the special
+ // EOI transition, which is looked up by `next_eoi` rather than by an
+ // input byte.
+ for i in 0..(self.ntrans - 1) {
+ let (start, end) = self.range(i);
+ if start <= input && input <= end {
+ return self.next_at(i);
+ }
+ // We could bail early with an extra branch: if input < start, then
+ // we know we'll never find a matching transition. Interestingly,
+ // this extra branch seems to not help performance, or will even
+ // hurt it. It's likely very dependent on the DFA itself and what
+ // is being searched.
+ }
+ DEAD
+ }
+
+ /// Returns the next state ID for the special EOI transition.
+ ///
+ /// The EOI transition is the last of the `ntrans` transitions.
+ // NOTE(review): `ntrans - 1` assumes every state has at least one
+ // transition (the EOI one) -- confirm against the decoder.
+ fn next_eoi(&self) -> StateID {
+ self.next_at(self.ntrans - 1)
+ }
+
+ /// Returns the identifier for this state.
+ fn id(&self) -> StateID {
+ self.id
+ }
+
+ /// Returns the inclusive input byte range for the ith transition in this
+ /// state.
+ ///
+ /// Panics if `i` is out of bounds for `input_ranges`.
+ fn range(&self, i: usize) -> (u8, u8) {
+ (self.input_ranges[i * 2], self.input_ranges[i * 2 + 1])
+ }
+
+ /// Returns the next state for the ith transition in this state.
+ ///
+ /// The ID is decoded from native endian bytes without validation.
+ fn next_at(&self, i: usize) -> StateID {
+ let start = i * StateID::SIZE;
+ let end = start + StateID::SIZE;
+ let bytes = self.next[start..end].try_into().unwrap();
+ StateID::from_ne_bytes_unchecked(bytes)
+ }
+
+ /// Returns the pattern ID for the given match index. If the match index
+ /// is invalid, then this panics.
+ fn pattern_id(&self, match_index: usize) -> PatternID {
+ let start = match_index * PatternID::SIZE;
+ wire::read_pattern_id_unchecked(&self.pattern_ids[start..]).0
+ }
+
+ /// Returns the total number of pattern IDs for this state. This is always
+ /// zero when `is_match` is false.
+ fn pattern_len(&self) -> usize {
+ // Pattern IDs are stored as 32-bit integers, i.e., 4 bytes each.
+ assert_eq!(0, self.pattern_ids.len() % 4);
+ self.pattern_ids.len() / 4
+ }
+
+ /// Return an accelerator for this state.
+ fn accelerator(&self) -> &'a [u8] {
+ self.accel
+ }
+
+ /// Write the raw representation of this state to the given buffer using
+ /// the given endianness.
+ ///
+ /// On success, the number of bytes written is returned, which is always
+ /// equal to `write_to_len()`. If the buffer is too small, then an error
+ /// is returned.
+ fn write_to<E: Endian>(
+ &self,
+ mut dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ let nwrite = self.write_to_len();
+ if dst.len() < nwrite {
+ return Err(SerializeError::buffer_too_small(
+ "sparse state transitions",
+ ));
+ }
+
+ // The transition count is written as a u16 whose bit 15 is set when
+ // this is a match state.
+ let ntrans =
+ if self.is_match { self.ntrans | (1 << 15) } else { self.ntrans };
+ E::write_u16(u16::try_from(ntrans).unwrap(), dst);
+ dst = &mut dst[size_of::<u16>()..];
+
+ // Input ranges are single bytes, so they are copied verbatim.
+ dst[..self.input_ranges.len()].copy_from_slice(self.input_ranges);
+ dst = &mut dst[self.input_ranges.len()..];
+
+ // Next state IDs are re-encoded using the requested endianness.
+ for i in 0..self.ntrans {
+ E::write_u32(self.next_at(i).as_u32(), dst);
+ dst = &mut dst[StateID::SIZE..];
+ }
+
+ // Match states additionally carry a pattern count followed by the
+ // pattern IDs themselves.
+ if self.is_match {
+ E::write_u32(u32::try_from(self.pattern_len()).unwrap(), dst);
+ dst = &mut dst[size_of::<u32>()..];
+ for i in 0..self.pattern_len() {
+ let pid = self.pattern_id(i);
+ E::write_u32(pid.as_u32(), dst);
+ dst = &mut dst[PatternID::SIZE..];
+ }
+ }
+
+ // The accelerator is written last: a one byte length followed by the
+ // accelerator bytes.
+ dst[0] = u8::try_from(self.accel.len()).unwrap();
+ dst[1..][..self.accel.len()].copy_from_slice(self.accel);
+
+ Ok(nwrite)
+ }
+
+ /// Return the total number of bytes that this state consumes in its
+ /// encoded form.
+ fn write_to_len(&self) -> usize {
+ // In order: the u16 transition count, two range bytes per transition,
+ // one state ID per transition and the accelerator (length byte plus
+ // its bytes).
+ let mut len = 2
+ + (self.ntrans * 2)
+ + (self.ntrans * StateID::SIZE)
+ + (1 + self.accel.len());
+ if self.is_match {
+ // The u32 pattern count plus the encoded pattern IDs.
+ len += size_of::<u32>() + self.pattern_ids.len();
+ }
+ len
+ }
+}
+
+impl<'a> fmt::Debug for State<'a> {
+    /// Writes the transitions of this state as a comma separated list of
+    /// `byte => next` or `start-end => next` entries, followed by the EOI
+    /// transition. Transitions leading to the dead state are left out.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let eoi_index = self.ntrans - 1;
+        let mut need_sep = false;
+        for i in 0..eoi_index {
+            let target = self.next_at(i);
+            if target == DEAD {
+                continue;
+            }
+
+            if need_sep {
+                write!(f, ", ")?;
+            }
+            need_sep = true;
+
+            let (lo, hi) = self.range(i);
+            if lo != hi {
+                write!(
+                    f,
+                    "{:?}-{:?} => {:?}",
+                    DebugByte(lo),
+                    DebugByte(hi),
+                    target.as_usize(),
+                )?;
+            } else {
+                write!(f, "{:?} => {:?}", DebugByte(lo), target.as_usize())?;
+            }
+        }
+        let eoi = self.next_at(eoi_index);
+        if eoi == DEAD {
+            return Ok(());
+        }
+        if need_sep {
+            write!(f, ", ")?;
+        }
+        write!(f, "EOI => {:?}", eoi.as_usize())?;
+        Ok(())
+    }
+}
+
+/// A representation of a mutable sparse DFA state that can be cheaply
+/// materialized from a state identifier.
+#[cfg(feature = "dfa-build")]
+struct StateMut<'a> {
+ /// The identifier of this state.
+ id: StateID,
+ /// Whether this is a match state or not.
+ is_match: bool,
+ /// The number of transitions in this state.
+ ntrans: usize,
+ /// Pairs of input ranges, where there is one pair for each transition.
+ /// Each pair specifies an inclusive start and end byte range for the
+ /// corresponding transition.
+ input_ranges: &'a mut [u8],
+ /// Transitions to the next state. This slice contains native endian
+ /// encoded state identifiers. Thus, there are `ntrans * StateID::SIZE`
+ /// bytes in this slice.
+ next: &'a mut [u8],
+ /// If this is a match state, then this contains the pattern IDs that match
+ /// when the DFA is in this state.
+ ///
+ /// This is a contiguous sequence of 32-bit native endian encoded integers.
+ pattern_ids: &'a [u8],
+ /// An accelerator for this state, if present. If this state has no
+ /// accelerator, then this is an empty slice. When non-empty, this slice
+ /// has length at most 3 and corresponds to the exhaustive set of bytes
+ /// that must be seen in order to transition out of this state.
+ accel: &'a mut [u8],
+}
+
+#[cfg(feature = "dfa-build")]
+impl<'a> StateMut<'a> {
+    /// Overwrites the target of the `i`th transition with `next`, encoded
+    /// in native endian.
+    fn set_next_at(&mut self, i: usize, next: StateID) {
+        let at = i * StateID::SIZE;
+        let slot = &mut self.next[at..at + StateID::SIZE];
+        wire::write_state_id::<wire::NE>(next, slot);
+    }
+}
+
+#[cfg(feature = "dfa-build")]
+impl<'a> fmt::Debug for StateMut<'a> {
+    /// Formats this state exactly like a read-only `State` with the same
+    /// contents.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // Build an immutable view over the same data and reuse its
+        // `Debug` implementation.
+        let view = State {
+            accel: self.accel,
+            pattern_ids: self.pattern_ids,
+            next: self.next,
+            input_ranges: self.input_ranges,
+            ntrans: self.ntrans,
+            is_match: self.is_match,
+            id: self.id,
+        };
+        fmt::Debug::fmt(&view, f)
+    }
+}
+
+/*
+/// A binary search routine specialized specifically to a sparse DFA state's
+/// transitions. Specifically, the transitions are defined as a set of pairs
+/// of input bytes that delineate an inclusive range of bytes. If the input
+/// byte is in the range, then the corresponding transition is a match.
+///
+/// This binary search accepts a slice of these pairs and returns the position
+/// of the matching pair (the ith transition), or None if no matching pair
+/// could be found.
+///
+/// Note that this routine is not currently used since it was observed to
+/// either decrease performance when searching ASCII, or did not provide enough
+/// of a boost on non-ASCII haystacks to be worth it. However, we leave it here
+/// for posterity in case we can find a way to use it.
+///
+/// In theory, we could use the standard library's search routine if we could
+/// cast a `&[u8]` to a `&[(u8, u8)]`, but I don't believe this is currently
+/// guaranteed to be safe and is thus UB (since I don't think the in-memory
+/// representation of `(u8, u8)` has been nailed down). One could define a
+/// repr(C) type, but the casting doesn't seem justified.
+#[cfg_attr(feature = "perf-inline", inline(always))]
+fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option<usize> {
+ debug_assert!(ranges.len() % 2 == 0, "ranges must have even length");
+ debug_assert!(ranges.len() <= 512, "ranges should be short");
+
+ let (mut left, mut right) = (0, ranges.len() / 2);
+ while left < right {
+ let mid = (left + right) / 2;
+ let (b1, b2) = (ranges[mid * 2], ranges[mid * 2 + 1]);
+ if needle < b1 {
+ right = mid;
+ } else if needle > b2 {
+ left = mid + 1;
+ } else {
+ return Some(mid);
+ }
+ }
+ None
+}
+*/
+
+#[cfg(all(test, feature = "syntax", feature = "dfa-build"))]
+mod tests {
+ use crate::{
+ dfa::{dense::DFA, Automaton},
+ nfa::thompson,
+ Input, MatchError,
+ };
+
+ // See the analogous test in src/hybrid/dfa.rs and src/dfa/dense.rs.
+ //
+ // Checks that a forward search over a sparse DFA with the Unicode word
+ // boundary heuristic enabled reports a quit error for a non-ASCII byte
+ // that lies just outside of the search range.
+ //
+ // NOTE(review): this forward test configures `reverse(true)`, exactly
+ // like the reverse test below -- confirm that this is intentional.
+ #[test]
+ fn heuristic_unicode_forward() {
+ let dfa = DFA::builder()
+ .configure(DFA::config().unicode_word_boundary(true))
+ .thompson(thompson::Config::new().reverse(true))
+ .build(r"\b[0-9]+\b")
+ .unwrap()
+ .to_sparse()
+ .unwrap();
+
+ // 'β' is encoded in UTF-8 as the two bytes 0xCE 0xB2. Here the range
+ // starts right after it, and the expected error names its last byte.
+ let input = Input::new("β123").range(2..);
+ let expected = MatchError::quit(0xB2, 1);
+ let got = dfa.try_search_fwd(&input);
+ assert_eq!(Err(expected), got);
+
+ // Here the range ends right before 'β', and the expected error names
+ // its first byte.
+ let input = Input::new("123β").range(..3);
+ let expected = MatchError::quit(0xCE, 3);
+ let got = dfa.try_search_fwd(&input);
+ assert_eq!(Err(expected), got);
+ }
+
+ // See the analogous test in src/hybrid/dfa.rs and src/dfa/dense.rs.
+ //
+ // Same scenario as the forward test above, but using a reverse search.
+ #[test]
+ fn heuristic_unicode_reverse() {
+ let dfa = DFA::builder()
+ .configure(DFA::config().unicode_word_boundary(true))
+ .thompson(thompson::Config::new().reverse(true))
+ .build(r"\b[0-9]+\b")
+ .unwrap()
+ .to_sparse()
+ .unwrap();
+
+ let input = Input::new("β123").range(2..);
+ let expected = MatchError::quit(0xB2, 1);
+ let got = dfa.try_search_rev(&input);
+ assert_eq!(Err(expected), got);
+
+ let input = Input::new("123β").range(..3);
+ let expected = MatchError::quit(0xCE, 3);
+ let got = dfa.try_search_rev(&input);
+ assert_eq!(Err(expected), got);
+ }
+}
diff --git a/third_party/rust/regex-automata/src/dfa/special.rs b/third_party/rust/regex-automata/src/dfa/special.rs
new file mode 100644
index 0000000000..a831df5c55
--- /dev/null
+++ b/third_party/rust/regex-automata/src/dfa/special.rs
@@ -0,0 +1,494 @@
+use crate::{
+ dfa::DEAD,
+ util::{
+ primitives::StateID,
+ wire::{self, DeserializeError, Endian, SerializeError},
+ },
+};
+
+// Returns early from the enclosing function with a generic
+// `DeserializeError` carrying the given message.
+macro_rules! err {
+ ($msg:expr) => {
+ return Err(DeserializeError::generic($msg));
+ };
+}
+
+// Special represents the identifiers in a DFA that correspond to "special"
+// states. If a state is one or more of the following, then it is considered
+// special:
+//
+// * dead - A non-matching state where all outgoing transitions lead back to
+// itself. There is only one of these, regardless of whether minimization
+// has run. The dead state always has an ID of 0. i.e., It is always the
+// first state in a DFA.
+// * quit - A state that is entered whenever a byte is seen that should cause
+// a DFA to give up and stop searching. This results in a MatchError::quit
+// error being returned at search time. The default configuration for a DFA
+// has no quit bytes, which means this state is unreachable by default,
+// although it is always present for reasons of implementation simplicity.
+// This state is only reachable when the caller configures the DFA to quit
+// on certain bytes. There is always exactly one of these states and it
+// is always the second state. (Its actual ID depends on the size of the
+// alphabet in dense DFAs, since state IDs are premultiplied in order to
+// allow them to be used directly as indices into the transition table.)
+// * match - An accepting state, i.e., indicative of a match. There may be
+// zero or more of these states.
+// * accelerated - A state where all of its outgoing transitions, except a
+// few, loop back to itself. These states are candidates for acceleration
+// via memchr during search. There may be zero or more of these states.
+// * start - A non-matching state that indicates where the automaton should
+// start during a search. There is always at least one starting state and
+// all are guaranteed to be non-match states. (A start state cannot be a
+// match state because the DFAs in this crate delay all matches by one byte.
+// So every search that finds a match must move through one transition to
+// some other match state, even when searching an empty string.)
+//
+// These are not mutually exclusive categories. Namely, the following
+// overlappings can occur:
+//
+// * {dead, start} - If a DFA can never lead to a match and it is minimized,
+// then it will typically compile to something where all starting IDs point
+// to the DFA's dead state.
+// * {match, accelerated} - It is possible for a match state to have the
+// majority of its transitions loop back to itself, which means it's
+// possible for a match state to be accelerated.
+// * {start, accelerated} - Similarly, it is possible for a start state to be
+// accelerated. Note that it is possible for an accelerated state to be
+// neither a match or a start state. Also note that just because both match
+// and start states overlap with accelerated states does not mean that
+// match and start states overlap with each other. In fact, they are
+// guaranteed not to overlap.
+//
+// As a special mention, every DFA always has a dead and a quit state, even
+// though from the perspective of the DFA, they are equivalent. (Indeed,
+// minimization special cases them to ensure they don't get merged.) The
+// purpose of keeping them distinct is to use the quit state as a sentinel to
+// distinguish between whether a search finished successfully without finding
+// anything or whether it gave up before finishing.
+//
+// So the main problem we want to solve here is the *fast* detection of whether
+// a state is special or not. And we also want to do this while storing as
+// little extra data as possible. AND we want to be able to quickly determine
+// which categories a state falls into above if it is special.
+//
+// We achieve this by essentially shuffling all special states to the beginning
+// of a DFA. That is, all special states appear before every other non-special
+// state. By representing special states this way, we can determine whether a
+// state is special or not by a single comparison, where special.max is the
+// identifier of the last special state in the DFA:
+//
+// if current_state <= special.max:
+// ... do something with special state
+//
+// The only thing left to do is to determine what kind of special state
+// it is. Because what we do next depends on that. Since special states
+// are typically rare, we can afford to do a bit more extra work, but we'd
+// still like this to be as fast as possible. The trick we employ here is to
+// continue shuffling states even within the special state range. Such that
+// one contiguous region corresponds to match states, another for start states
+// and then an overlapping range for accelerated states. At a high level, our
+// special state detection might look like this (for leftmost searching, where
+// we continue searching even after seeing a match):
+//
+// byte = input[offset]
+// current_state = next_state(current_state, byte)
+// offset += 1
+// if current_state <= special.max:
+// if current_state == 0:
+// # We can never leave a dead state, so this always marks the
+// # end of our search.
+// return last_match
+// if current_state == special.quit_id:
+// # A quit state means we give up. If the DFA has no quit state,
+// # then special.quit_id == 0 == dead, which is handled by the
+// # conditional above.
+// return Err(MatchError::quit { byte, offset: offset - 1 })
+// if special.min_match <= current_state <= special.max_match:
+// last_match = Some(offset)
+// if special.min_accel <= current_state <= special.max_accel:
+// offset = accelerate(input, offset)
+// last_match = Some(offset)
+// elif special.min_start <= current_state <= special.max_start:
+// offset = prefilter.find(input, offset)
+// if special.min_accel <= current_state <= special.max_accel:
+// offset = accelerate(input, offset)
+// elif special.min_accel <= current_state <= special.max_accel:
+// offset = accelerate(input, offset)
+//
+// There are some small details left out of the logic above. For example,
+// in order to accelerate a state, we need to know which bytes to search for.
+// This in turn implies some extra data we need to store in the DFA. To keep
+// things compact, we would ideally only store
+//
+// N = special.max_accel - special.min_accel + 1
+//
+// items. But state IDs are premultiplied, which means they are not contiguous.
+// So in order to take a state ID and index an array of accelerated structures,
+// we need to do:
+//
+// i = (state_id - special.min_accel) / stride
+//
+// (N.B. 'stride' is always a power of 2, so the above can be implemented via
+// '(state_id - special.min_accel) >> stride2', where 'stride2' is x in
+// 2^x=stride.)
+//
+// Moreover, some of these specialty categories may be empty. For example,
+// DFAs are not required to have any match states or any accelerated states.
+// In that case, the lower and upper bounds are both set to 0 (the dead state
+// ID) and the first `current_state == 0` check subsumes cases where the
+// ranges are empty.
+//
+// Loop unrolling, if applicable, has also been left out of the logic above.
+//
+// Graphically, the ranges look like this, where asterisks indicate ranges
+// that can be empty. Each 'x' is a state.
+//
+// quit
+// dead|
+// ||
+// xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
+// | | | | start | |
+// | |-------------| |-------| |
+// | match* | | | |
+// | | | | |
+// | |----------| | |
+// | accel* | |
+// | | |
+// | | |
+// |----------------------------|------------------------
+// special non-special*
+#[derive(Clone, Copy, Debug)]
+pub(crate) struct Special {
+ /// The identifier of the last special state in a DFA. A state is special
+ /// if and only if its identifier is less than or equal to `max`.
+ pub(crate) max: StateID,
+ /// The identifier of the quit state in a DFA. (There is no analogous field
+ /// for the dead state since the dead state's ID is always zero, regardless
+ /// of state ID size.)
+ pub(crate) quit_id: StateID,
+ /// The identifier of the first match state, or `DEAD` if there are no
+ /// match states.
+ pub(crate) min_match: StateID,
+ /// The identifier of the last match state, or `DEAD` if there are no
+ /// match states.
+ pub(crate) max_match: StateID,
+ /// The identifier of the first accelerated state, or `DEAD` if there are
+ /// no accelerated states.
+ pub(crate) min_accel: StateID,
+ /// The identifier of the last accelerated state, or `DEAD` if there are
+ /// no accelerated states.
+ pub(crate) max_accel: StateID,
+ /// The identifier of the first start state, or `DEAD` if start states are
+ /// not tracked as special.
+ pub(crate) min_start: StateID,
+ /// The identifier of the last start state, or `DEAD` if start states are
+ /// not tracked as special.
+ pub(crate) max_start: StateID,
+}
+
+impl Special {
+ /// Creates a new set of special ranges for a DFA. All ranges are initially
+ /// set to only contain the dead state. This is interpreted as an empty
+ /// range.
+ #[cfg(feature = "dfa-build")]
+ pub(crate) fn new() -> Special {
+ Special {
+ max: DEAD,
+ quit_id: DEAD,
+ min_match: DEAD,
+ max_match: DEAD,
+ min_accel: DEAD,
+ max_accel: DEAD,
+ min_start: DEAD,
+ max_start: DEAD,
+ }
+ }
+
+ /// Remaps all of the special state identifiers using the function given.
+ #[cfg(feature = "dfa-build")]
+ pub(crate) fn remap(&self, map: impl Fn(StateID) -> StateID) -> Special {
+ Special {
+ max: map(self.max),
+ quit_id: map(self.quit_id),
+ min_match: map(self.min_match),
+ max_match: map(self.max_match),
+ min_accel: map(self.min_accel),
+ max_accel: map(self.max_accel),
+ min_start: map(self.min_start),
+ max_start: map(self.max_start),
+ }
+ }
+
+ /// Deserialize the given bytes into special state ranges. If the slice
+ /// given is not big enough, then this returns an error. Similarly, if
+ /// any of the expected invariants around special state ranges aren't
+ /// upheld, an error is returned. Note that this does not guarantee that
+ /// the information returned is correct.
+ ///
+ /// Upon success, this returns the number of bytes read in addition to the
+ /// special state IDs themselves.
+ pub(crate) fn from_bytes(
+ mut slice: &[u8],
+ ) -> Result<(Special, usize), DeserializeError> {
+ wire::check_slice_len(slice, 8 * StateID::SIZE, "special states")?;
+
+ let mut nread = 0;
+ // Reads one state ID from the front of `slice`, then advances `slice`
+ // past it and adds the number of bytes read to `nread`.
+ let mut read_id = |what| -> Result<StateID, DeserializeError> {
+ let (id, nr) = wire::try_read_state_id(slice, what)?;
+ nread += nr;
+ slice = &slice[StateID::SIZE..];
+ Ok(id)
+ };
+
+ // The IDs are read in the same order in which `write_to` writes them.
+ let max = read_id("special max id")?;
+ let quit_id = read_id("special quit id")?;
+ let min_match = read_id("special min match id")?;
+ let max_match = read_id("special max match id")?;
+ let min_accel = read_id("special min accel id")?;
+ let max_accel = read_id("special max accel id")?;
+ let min_start = read_id("special min start id")?;
+ let max_start = read_id("special max start id")?;
+
+ let special = Special {
+ max,
+ quit_id,
+ min_match,
+ max_match,
+ min_accel,
+ max_accel,
+ min_start,
+ max_start,
+ };
+ special.validate()?;
+ assert_eq!(nread, special.write_to_len());
+ Ok((special, nread))
+ }
+
+ /// Validate that the information describing special states satisfies
+ /// all known invariants.
+ ///
+ /// The checks run in a fixed order and the first violated invariant
+ /// determines the error that is returned.
+ pub(crate) fn validate(&self) -> Result<(), DeserializeError> {
+ // Check that both ends of the range are DEAD or neither are.
+ if self.min_match == DEAD && self.max_match != DEAD {
+ err!("min_match is DEAD, but max_match is not");
+ }
+ if self.min_match != DEAD && self.max_match == DEAD {
+ err!("max_match is DEAD, but min_match is not");
+ }
+ if self.min_accel == DEAD && self.max_accel != DEAD {
+ err!("min_accel is DEAD, but max_accel is not");
+ }
+ if self.min_accel != DEAD && self.max_accel == DEAD {
+ err!("max_accel is DEAD, but min_accel is not");
+ }
+ if self.min_start == DEAD && self.max_start != DEAD {
+ err!("min_start is DEAD, but max_start is not");
+ }
+ if self.min_start != DEAD && self.max_start == DEAD {
+ err!("max_start is DEAD, but min_start is not");
+ }
+
+ // Check that ranges are well formed.
+ if self.min_match > self.max_match {
+ err!("min_match should not be greater than max_match");
+ }
+ if self.min_accel > self.max_accel {
+ err!("min_accel should not be greater than max_accel");
+ }
+ if self.min_start > self.max_start {
+ err!("min_start should not be greater than max_start");
+ }
+
+ // Check that ranges are ordered with respect to one another.
+ //
+ // Note that the quit state comparisons use `>=`: the quit state must
+ // come strictly before the first state of every non-empty range.
+ if self.matches() && self.quit_id >= self.min_match {
+ err!("quit_id should not be greater than min_match");
+ }
+ if self.accels() && self.quit_id >= self.min_accel {
+ err!("quit_id should not be greater than min_accel");
+ }
+ if self.starts() && self.quit_id >= self.min_start {
+ err!("quit_id should not be greater than min_start");
+ }
+ if self.matches() && self.accels() && self.min_accel < self.min_match {
+ err!("min_match should not be greater than min_accel");
+ }
+ if self.matches() && self.starts() && self.min_start < self.min_match {
+ err!("min_match should not be greater than min_start");
+ }
+ if self.accels() && self.starts() && self.min_start < self.min_accel {
+ err!("min_accel should not be greater than min_start");
+ }
+
+ // Check that max is at least as big as everything else.
+ if self.max < self.quit_id {
+ err!("quit_id should not be greater than max");
+ }
+ if self.max < self.max_match {
+ err!("max_match should not be greater than max");
+ }
+ if self.max < self.max_accel {
+ err!("max_accel should not be greater than max");
+ }
+ if self.max < self.max_start {
+ err!("max_start should not be greater than max");
+ }
+
+ Ok(())
+ }
+
+ /// Validate that the special state information is compatible with the
+ /// given state len.
+ pub(crate) fn validate_state_len(
+ &self,
+ len: usize,
+ stride2: usize,
+ ) -> Result<(), DeserializeError> {
+ // We assume that 'validate' has already passed, so we know that 'max'
+ // is truly the max. So all we need to check is that the max state ID
+ // is less than the state ID len. The max legal value here is len-1,
+ // which occurs when there are no non-special states.
+ if (self.max.as_usize() >> stride2) >= len {
+ err!("max should not be greater than or equal to state length");
+ }
+ Ok(())
+ }
+
+ /// Write the IDs and ranges for special states to the given byte buffer.
+ /// The buffer given must have enough room to store all data, otherwise
+ /// this will return an error. The number of bytes written is returned
+ /// on success. The number of bytes written is guaranteed to be a multiple
+ /// of 8.
+ pub(crate) fn write_to<E: Endian>(
+ &self,
+ dst: &mut [u8],
+ ) -> Result<usize, SerializeError> {
+ use crate::util::wire::write_state_id as write;
+
+ if dst.len() < self.write_to_len() {
+ return Err(SerializeError::buffer_too_small("special state ids"));
+ }
+
+ // The IDs are written in the same order in which `from_bytes` reads
+ // them.
+ let mut nwrite = 0;
+ nwrite += write::<E>(self.max, &mut dst[nwrite..]);
+ nwrite += write::<E>(self.quit_id, &mut dst[nwrite..]);
+ nwrite += write::<E>(self.min_match, &mut dst[nwrite..]);
+ nwrite += write::<E>(self.max_match, &mut dst[nwrite..]);
+ nwrite += write::<E>(self.min_accel, &mut dst[nwrite..]);
+ nwrite += write::<E>(self.max_accel, &mut dst[nwrite..]);
+ nwrite += write::<E>(self.min_start, &mut dst[nwrite..]);
+ nwrite += write::<E>(self.max_start, &mut dst[nwrite..]);
+
+ assert_eq!(
+ self.write_to_len(),
+ nwrite,
+ "expected to write certain number of bytes",
+ );
+ assert_eq!(
+ nwrite % 8,
+ 0,
+ "expected to write multiple of 8 bytes for special states",
+ );
+ Ok(nwrite)
+ }
+
+ /// Returns the total number of bytes written by `write_to`.
+ pub(crate) fn write_to_len(&self) -> usize {
+ 8 * StateID::SIZE
+ }
+
+ /// Sets the maximum special state ID based on the current values. This
+ /// should be used once all possible state IDs are set.
+ #[cfg(feature = "dfa-build")]
+ pub(crate) fn set_max(&mut self) {
+ use core::cmp::max;
+ self.max = max(
+ self.quit_id,
+ max(self.max_match, max(self.max_accel, self.max_start)),
+ );
+ }
+
+ /// Sets the maximum special state ID such that starting states are not
+ /// considered "special." This also marks the min/max starting states as
+ /// DEAD such that 'is_start_state' always returns false, even if the state
+ /// is actually a starting state.
+ ///
+ /// This is useful when there is no prefilter set. It will avoid
+ /// ping-ponging between the hot path in the DFA search code and the start
+ /// state handling code, which is typically only useful for executing a
+ /// prefilter.
+ #[cfg(feature = "dfa-build")]
+ pub(crate) fn set_no_special_start_states(&mut self) {
+ use core::cmp::max;
+ self.max = max(self.quit_id, max(self.max_match, self.max_accel));
+ self.min_start = DEAD;
+ self.max_start = DEAD;
+ }
+
+ /// Returns true if and only if the given state ID is a special state.
+ #[inline]
+ pub(crate) fn is_special_state(&self, id: StateID) -> bool {
+ id <= self.max
+ }
+
+ /// Returns true if and only if the given state ID is a dead state.
+ #[inline]
+ pub(crate) fn is_dead_state(&self, id: StateID) -> bool {
+ id == DEAD
+ }
+
+ /// Returns true if and only if the given state ID is a quit state.
+ #[inline]
+ pub(crate) fn is_quit_state(&self, id: StateID) -> bool {
+ // `quit_id` may itself be DEAD (see `new`), so the dead state has to
+ // be ruled out explicitly.
+ !self.is_dead_state(id) && self.quit_id == id
+ }
+
+ /// Returns true if and only if the given state ID is a match state.
+ #[inline]
+ pub(crate) fn is_match_state(&self, id: StateID) -> bool {
+ // An empty range is stored with both of its ends set to DEAD, so the
+ // dead state has to be ruled out explicitly.
+ !self.is_dead_state(id) && self.min_match <= id && id <= self.max_match
+ }
+
+ /// Returns true if and only if the given state ID is an accel state.
+ #[inline]
+ pub(crate) fn is_accel_state(&self, id: StateID) -> bool {
+ // An empty range is stored with both of its ends set to DEAD, so the
+ // dead state has to be ruled out explicitly.
+ !self.is_dead_state(id) && self.min_accel <= id && id <= self.max_accel
+ }
+
+ /// Returns true if and only if the given state ID is a start state.
+ #[inline]
+ pub(crate) fn is_start_state(&self, id: StateID) -> bool {
+ // An empty range is stored with both of its ends set to DEAD, so the
+ // dead state has to be ruled out explicitly.
+ !self.is_dead_state(id) && self.min_start <= id && id <= self.max_start
+ }
+
+ /// Returns the total number of match states for a dense table based DFA.
+ #[inline]
+ pub(crate) fn match_len(&self, stride: usize) -> usize {
+ if self.matches() {
+ // This is equal to `(max - min) / stride + 1`, i.e., both ends of
+ // the range are counted.
+ (self.max_match.as_usize() - self.min_match.as_usize() + stride)
+ / stride
+ } else {
+ 0
+ }
+ }
+
+ /// Returns true if and only if there is at least one match state.
+ #[inline]
+ pub(crate) fn matches(&self) -> bool {
+ self.min_match != DEAD
+ }
+
+ /// Returns the total number of accel states.
+ #[cfg(feature = "dfa-build")]
+ pub(crate) fn accel_len(&self, stride: usize) -> usize {
+ if self.accels() {
+ // This is equal to `(max - min) / stride + 1`, i.e., both ends of
+ // the range are counted.
+ (self.max_accel.as_usize() - self.min_accel.as_usize() + stride)
+ / stride
+ } else {
+ 0
+ }
+ }
+
+ /// Returns true if and only if there is at least one accel state.
+ #[inline]
+ pub(crate) fn accels(&self) -> bool {
+ self.min_accel != DEAD
+ }
+
+ /// Returns true if and only if there is at least one start state.
+ #[inline]
+ pub(crate) fn starts(&self) -> bool {
+ self.min_start != DEAD
+ }
+}
diff --git a/third_party/rust/regex-automata/src/dfa/start.rs b/third_party/rust/regex-automata/src/dfa/start.rs
new file mode 100644
index 0000000000..fddc702df5
--- /dev/null
+++ b/third_party/rust/regex-automata/src/dfa/start.rs
@@ -0,0 +1,74 @@
+use core::mem::size_of;
+
+use crate::util::wire::{self, DeserializeError, Endian, SerializeError};
+
+/// The kind of anchored starting configurations to support in a DFA.
+///
+/// Fully compiled DFAs need to be explicitly configured as to which anchored
+/// starting configurations to support. The reason for not just supporting
+/// everything unconditionally is that it can use more resources (such as
+/// memory and build time). The downside of this is that if you try to execute
+/// a search using an [`Anchored`](crate::Anchored) mode that is not supported
+/// by the DFA, then the search will return an error.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum StartKind {
+ /// Support both anchored and unanchored searches.
+ Both,
+ /// Support only unanchored searches. Requesting an anchored search is not
+ /// supported and, as described above, makes the search return an error.
+ ///
+ /// Note that even if an unanchored search is requested, the pattern itself
+ /// may still be anchored. For example, `^abc` will only match `abc` at the
+ /// start of a haystack. This will remain true, even if the regex engine
+ /// only supported unanchored searches.
+ Unanchored,
+ /// Support only anchored searches. Requesting an unanchored search is not
+ /// supported and, as described above, makes the search return an error.
+ Anchored,
+}
+
+impl StartKind {
+    /// Deserializes a start kind from the beginning of `slice`.
+    ///
+    /// The encoding is a single `u32` tag: `0` is `Both`, `1` is `Unanchored`
+    /// and `2` is `Anchored`. On success, the kind is returned together with
+    /// the `usize` reported by `wire::try_read_u32` (presumably the number of
+    /// bytes read). Any other tag yields a generic `DeserializeError`, and
+    /// failures from the `wire` helpers are propagated with `?`.
+    pub(crate) fn from_bytes(
+        slice: &[u8],
+    ) -> Result<(StartKind, usize), DeserializeError> {
+        wire::check_slice_len(slice, size_of::<u32>(), "start kind bytes")?;
+        let (tag, nread) = wire::try_read_u32(slice, "start kind integer")?;
+        let kind = match tag {
+            0 => StartKind::Both,
+            1 => StartKind::Unanchored,
+            2 => StartKind::Anchored,
+            _ => {
+                return Err(DeserializeError::generic(
+                    "unrecognized start kind",
+                ));
+            }
+        };
+        Ok((kind, nread))
+    }
+
+    /// Serializes this start kind into `dst` as a `u32` tag (`0` for `Both`,
+    /// `1` for `Unanchored`, `2` for `Anchored`) using `E::write_u32`.
+    ///
+    /// Returns `write_to_len()` on success. If `dst` is shorter than that, a
+    /// "buffer too small" `SerializeError` is returned and nothing is written.
+    pub(crate) fn write_to<E: Endian>(
+        &self,
+        dst: &mut [u8],
+    ) -> Result<usize, SerializeError> {
+        let needed = self.write_to_len();
+        if needed > dst.len() {
+            return Err(SerializeError::buffer_too_small("start kind"));
+        }
+        let tag = match *self {
+            StartKind::Both => 0,
+            StartKind::Unanchored => 1,
+            StartKind::Anchored => 2,
+        };
+        E::write_u32(tag, dst);
+        Ok(needed)
+    }
+
+    /// Returns the number of bytes `write_to` requires: the size of a `u32`.
+    pub(crate) fn write_to_len(&self) -> usize {
+        size_of::<u32>()
+    }
+
+    /// Returns true for the kinds that include unanchored searches, namely
+    /// `Both` and `Unanchored`.
+    #[cfg_attr(feature = "perf-inline", inline(always))]
+    pub(crate) fn has_unanchored(&self) -> bool {
+        match *self {
+            StartKind::Both | StartKind::Unanchored => true,
+            StartKind::Anchored => false,
+        }
+    }
+
+    /// Returns true for the kinds that include anchored searches, namely
+    /// `Both` and `Anchored`.
+    #[cfg_attr(feature = "perf-inline", inline(always))]
+    pub(crate) fn has_anchored(&self) -> bool {
+        match *self {
+            StartKind::Both | StartKind::Anchored => true,
+            StartKind::Unanchored => false,
+        }
+    }
+}