diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-30 18:31:44 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-30 18:31:44 +0000 |
commit | c23a457e72abe608715ac76f076f47dc42af07a5 (patch) | |
tree | 2772049aaf84b5c9d0ed12ec8d86812f7a7904b6 /vendor/regex-automata/src/dfa | |
parent | Releasing progress-linux version 1.73.0+dfsg1-1~progress7.99u1. (diff) | |
download | rustc-c23a457e72abe608715ac76f076f47dc42af07a5.tar.xz rustc-c23a457e72abe608715ac76f076f47dc42af07a5.zip |
Merging upstream version 1.74.1+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/regex-automata/src/dfa')
-rw-r--r-- | vendor/regex-automata/src/dfa/accel.rs | 89 | ||||
-rw-r--r-- | vendor/regex-automata/src/dfa/automaton.rs | 1821 | ||||
-rw-r--r-- | vendor/regex-automata/src/dfa/dense.rs | 2639 | ||||
-rw-r--r-- | vendor/regex-automata/src/dfa/determinize.rs | 182 | ||||
-rw-r--r-- | vendor/regex-automata/src/dfa/error.rs | 162 | ||||
-rw-r--r-- | vendor/regex-automata/src/dfa/minimize.rs | 24 | ||||
-rw-r--r-- | vendor/regex-automata/src/dfa/mod.rs | 135 | ||||
-rw-r--r-- | vendor/regex-automata/src/dfa/onepass.rs | 3188 | ||||
-rw-r--r-- | vendor/regex-automata/src/dfa/regex.rs | 1825 | ||||
-rw-r--r-- | vendor/regex-automata/src/dfa/remapper.rs | 242 | ||||
-rw-r--r-- | vendor/regex-automata/src/dfa/search.rs | 891 | ||||
-rw-r--r-- | vendor/regex-automata/src/dfa/search_unsafe.rs | 321 | ||||
-rw-r--r-- | vendor/regex-automata/src/dfa/sparse.rs | 1279 | ||||
-rw-r--r-- | vendor/regex-automata/src/dfa/special.rs | 109 | ||||
-rw-r--r-- | vendor/regex-automata/src/dfa/start.rs | 74 | ||||
-rw-r--r-- | vendor/regex-automata/src/dfa/transducer.rs | 207 |
16 files changed, 8112 insertions, 5076 deletions
diff --git a/vendor/regex-automata/src/dfa/accel.rs b/vendor/regex-automata/src/dfa/accel.rs index dbfeb7932..5ea2423dd 100644 --- a/vendor/regex-automata/src/dfa/accel.rs +++ b/vendor/regex-automata/src/dfa/accel.rs @@ -49,12 +49,14 @@ // // accels.get((id - min_accel_id) / dfa_stride) -use core::convert::{TryFrom, TryInto}; - -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] use alloc::{vec, vec::Vec}; -use crate::util::bytes::{self, DeserializeError, Endian, SerializeError}; +use crate::util::{ + int::Pointer, + memchr, + wire::{self, DeserializeError, Endian, SerializeError}, +}; /// The base type used to represent a collection of accelerators. /// @@ -87,7 +89,7 @@ const ACCEL_CAP: usize = 8; /// Search for between 1 and 3 needle bytes in the given haystack, starting the /// search at the given position. If `needles` has a length other than 1-3, /// then this panics. -#[inline(always)] +#[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn find_fwd( needles: &[u8], haystack: &[u8], @@ -107,7 +109,7 @@ pub(crate) fn find_fwd( /// Search for between 1 and 3 needle bytes in the given haystack in reverse, /// starting the search at the given position. If `needles` has a length other /// than 1-3, then this panics. -#[inline(always)] +#[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn find_rev( needles: &[u8], haystack: &[u8], @@ -138,7 +140,7 @@ pub(crate) struct Accels<A> { accels: A, } -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] impl Accels<Vec<AccelTy>> { /// Create an empty sequence of accelerators for a DFA. pub fn empty() -> Accels<Vec<AccelTy>> { @@ -180,48 +182,48 @@ impl<'a> Accels<&'a [AccelTy]> { /// /// Callers may check the validity of every accelerator with the `validate` /// method. 
- pub unsafe fn from_bytes_unchecked( + pub fn from_bytes_unchecked( mut slice: &'a [u8], ) -> Result<(Accels<&'a [AccelTy]>, usize), DeserializeError> { - let slice_start = slice.as_ptr() as usize; + let slice_start = slice.as_ptr().as_usize(); - let (count, _) = - bytes::try_read_u32_as_usize(slice, "accelerators count")?; - // The accelerator count is part of the accel_tys slice that + let (accel_len, _) = + wire::try_read_u32_as_usize(slice, "accelerators length")?; + // The accelerator length is part of the accel_tys slice that // we deserialize. This is perhaps a bit idiosyncratic. It would - // probably be better to split out the count into a real field. + // probably be better to split out the length into a real field. - let accel_tys_count = bytes::add( - bytes::mul(count, 2, "total number of accelerator accel_tys")?, + let accel_tys_len = wire::add( + wire::mul(accel_len, 2, "total number of accelerator accel_tys")?, 1, "total number of accel_tys", )?; - let accel_tys_len = bytes::mul( + let accel_tys_bytes_len = wire::mul( ACCEL_TY_SIZE, - accel_tys_count, + accel_tys_len, "total number of bytes in accelerators", )?; - bytes::check_slice_len(slice, accel_tys_len, "accelerators")?; - bytes::check_alignment::<AccelTy>(slice)?; - let accel_tys = &slice[..accel_tys_len]; - slice = &slice[accel_tys_len..]; + wire::check_slice_len(slice, accel_tys_bytes_len, "accelerators")?; + wire::check_alignment::<AccelTy>(slice)?; + let accel_tys = &slice[..accel_tys_bytes_len]; + slice = &slice[accel_tys_bytes_len..]; // SAFETY: We've checked the length and alignment above, and since - // slice is just bytes, we can safely cast to a slice of &[AccelTy]. - #[allow(unused_unsafe)] + // slice is just bytes and AccelTy is just a u32, we can safely cast to + // a slice of &[AccelTy]. 
let accels = unsafe { core::slice::from_raw_parts( - accel_tys.as_ptr() as *const AccelTy, - accel_tys_count, + accel_tys.as_ptr().cast::<AccelTy>(), + accel_tys_len, ) }; - Ok((Accels { accels }, slice.as_ptr() as usize - slice_start)) + Ok((Accels { accels }, slice.as_ptr().as_usize() - slice_start)) } } impl<A: AsRef<[AccelTy]>> Accels<A> { /// Return an owned version of the accelerators. #[cfg(feature = "alloc")] - pub fn to_owned(&self) -> Accels<Vec<AccelTy>> { + pub fn to_owned(&self) -> Accels<alloc::vec::Vec<AccelTy>> { Accels { accels: self.accels.as_ref().to_vec() } } @@ -237,7 +239,7 @@ impl<A: AsRef<[AccelTy]>> Accels<A> { // and u8 always has a smaller alignment. unsafe { core::slice::from_raw_parts( - accels.as_ptr() as *const u8, + accels.as_ptr().cast::<u8>(), accels.len() * ACCEL_TY_SIZE, ) } @@ -261,14 +263,14 @@ impl<A: AsRef<[AccelTy]>> Accels<A> { /// states are stored contiguously in the DFA and have an ordering implied /// by their respective state IDs. The state's index in that sequence /// corresponds to the index of its corresponding accelerator. - #[inline(always)] + #[cfg_attr(feature = "perf-inline", inline(always))] pub fn needles(&self, i: usize) -> &[u8] { if i >= self.len() { panic!("invalid accelerator index {}", i); } let bytes = self.as_bytes(); let offset = ACCEL_TY_SIZE + i * ACCEL_CAP; - let len = bytes[offset] as usize; + let len = usize::from(bytes[offset]); &bytes[offset + 1..offset + 1 + len] } @@ -398,7 +400,7 @@ pub(crate) struct Accel { impl Accel { /// Returns an empty accel, where no bytes are accelerated. - #[cfg(feature = "alloc")] + #[cfg(feature = "dfa-build")] pub fn new() -> Accel { Accel { bytes: [0; ACCEL_CAP] } } @@ -420,7 +422,7 @@ impl Accel { /// /// If the given bytes are invalid, then this returns an error. 
fn from_bytes(bytes: [u8; 4]) -> Result<Accel, DeserializeError> { - if bytes[0] as usize >= ACCEL_LEN { + if usize::from(bytes[0]) >= ACCEL_LEN { return Err(DeserializeError::generic( "accelerator bytes cannot have length more than 3", )); @@ -438,18 +440,25 @@ impl Accel { } /// Attempts to add the given byte to this accelerator. If the accelerator - /// is already full then this returns false. Otherwise, returns true. + /// is already full or thinks the byte is a poor accelerator, then this + /// returns false. Otherwise, returns true. /// /// If the given byte is already in this accelerator, then it panics. - #[cfg(feature = "alloc")] + #[cfg(feature = "dfa-build")] pub fn add(&mut self, byte: u8) -> bool { if self.len() >= 3 { return false; } + // As a special case, we totally reject trying to accelerate a state + // with an ASCII space. In most cases, it occurs very frequently, and + // tends to result in worse overall performance. + if byte == b' ' { + return false; + } assert!( !self.contains(byte), "accelerator already contains {:?}", - crate::util::DebugByte(byte) + crate::util::escape::DebugByte(byte) ); self.bytes[self.len() + 1] = byte; self.bytes[0] += 1; @@ -458,11 +467,11 @@ impl Accel { /// Return the number of bytes in this accelerator. pub fn len(&self) -> usize { - self.bytes[0] as usize + usize::from(self.bytes[0]) } /// Returns true if and only if there are no bytes in this accelerator. - #[cfg(feature = "alloc")] + #[cfg(feature = "dfa-build")] pub fn is_empty(&self) -> bool { self.len() == 0 } @@ -476,13 +485,13 @@ impl Accel { /// Returns true if and only if this accelerator will accelerate the given /// byte. - #[cfg(feature = "alloc")] + #[cfg(feature = "dfa-build")] fn contains(&self, byte: u8) -> bool { self.needles().iter().position(|&b| b == byte).is_some() } /// Returns the accelerator bytes as an array of AccelTys. 
- #[cfg(feature = "alloc")] + #[cfg(feature = "dfa-build")] fn as_accel_tys(&self) -> [AccelTy; 2] { assert_eq!(ACCEL_CAP, 8); // These unwraps are OK since ACCEL_CAP is set to 8. @@ -499,7 +508,7 @@ impl core::fmt::Debug for Accel { write!(f, "Accel(")?; let mut set = f.debug_set(); for &b in self.needles() { - set.entry(&crate::util::DebugByte(b)); + set.entry(&crate::util::escape::DebugByte(b)); } set.finish()?; write!(f, ")") diff --git a/vendor/regex-automata/src/dfa/automaton.rs b/vendor/regex-automata/src/dfa/automaton.rs index 08bd6722a..7e2be9a15 100644 --- a/vendor/regex-automata/src/dfa/automaton.rs +++ b/vendor/regex-automata/src/dfa/automaton.rs @@ -1,9 +1,12 @@ +#[cfg(feature = "alloc")] +use crate::util::search::PatternSet; use crate::{ dfa::search, util::{ - id::{PatternID, StateID}, - matchtypes::{HalfMatch, MatchError}, - prefilter, + empty, + prefilter::Prefilter, + primitives::{PatternID, StateID}, + search::{Anchored, HalfMatch, Input, MatchError}, }, }; @@ -27,8 +30,8 @@ use crate::{ /// * A DFA can search for multiple patterns simultaneously. This /// means extra information is returned when a match occurs. Namely, /// a match is not just an offset, but an offset plus a pattern ID. -/// [`Automaton::pattern_count`] returns the number of patterns compiled into -/// the DFA, [`Automaton::match_count`] returns the total number of patterns +/// [`Automaton::pattern_len`] returns the number of patterns compiled into +/// the DFA, [`Automaton::match_len`] returns the total number of patterns /// that match in a particular state and [`Automaton::match_pattern`] permits /// iterating over the patterns that match in a particular state. /// * A DFA can have multiple start states, and the choice of which start @@ -76,12 +79,10 @@ use crate::{ /// the state can be queried via the [`Automaton::accelerator`] method. 
/// /// There are a number of provided methods on this trait that implement -/// efficient searching (for forwards and backwards) with a DFA using all of -/// the above features of this trait. In particular, given the complexity of -/// all these features, implementing a search routine in this trait is not -/// straight forward. If you need to do this for specialized reasons, then -/// it's recommended to look at the source of this crate. It is intentionally -/// well commented to help with this. With that said, it is possible to +/// efficient searching (for forwards and backwards) with a DFA using +/// all of the above features of this trait. In particular, given the +/// complexity of all these features, implementing a search routine in +/// this trait can be a little subtle. With that said, it is possible to /// somewhat simplify the search routine. For example, handling accelerated /// states is strictly optional, since it is always correct to assume that /// `Automaton::is_accel_state` returns false. However, one complex part of @@ -90,13 +91,19 @@ use crate::{ /// /// # Safety /// -/// This trait is unsafe to implement because DFA searching may rely on the -/// correctness of the implementation for memory safety. For example, DFA -/// searching may use explicit bounds check elision, which will in turn rely -/// on the correctness of every function that returns a state ID. +/// This trait is not safe to implement so that code may rely on the +/// correctness of implementations of this trait to avoid undefined behavior. +/// The primary correctness guarantees are: /// -/// When implementing this trait, one must uphold the documented correctness -/// guarantees. Otherwise, undefined behavior may occur. +/// * `Automaton::start_state` always returns a valid state ID or an error or +/// panics. +/// * `Automaton::next_state`, when given a valid state ID, always returns +/// a valid state ID for all values of `anchored` and `byte`, or otherwise +/// panics. 
+/// +/// In general, the rest of the methods on `Automaton` need to uphold their +/// contracts as well. For example, `Automaton::is_dead` should only returns +/// true if the given state ID is actually a dead state. pub unsafe trait Automaton { /// Transitions from the current state to the next state, given the next /// byte of input. @@ -118,16 +125,14 @@ pub unsafe trait Automaton { /// by using the `next_state` method. /// /// ``` - /// use regex_automata::dfa::{Automaton, dense}; + /// use regex_automata::{dfa::{Automaton, dense}, Input}; /// /// let dfa = dense::DFA::new(r"[a-z]+r")?; /// let haystack = "bar".as_bytes(); /// /// // The start state is determined by inspecting the position and the /// // initial bytes of the haystack. - /// let mut state = dfa.start_state_forward( - /// None, haystack, 0, haystack.len(), - /// ); + /// let mut state = dfa.start_state_forward(&Input::new(haystack))?; /// // Walk all the bytes in the haystack. /// for &b in haystack { /// state = dfa.next_state(state, b); @@ -195,16 +200,17 @@ pub unsafe trait Automaton { /// and then finishing the search with the final EOI transition. /// /// ``` - /// use regex_automata::dfa::{Automaton, dense}; + /// use regex_automata::{dfa::{Automaton, dense}, Input}; /// /// let dfa = dense::DFA::new(r"[a-z]+r")?; /// let haystack = "bar".as_bytes(); /// /// // The start state is determined by inspecting the position and the /// // initial bytes of the haystack. - /// let mut state = dfa.start_state_forward( - /// None, haystack, 0, haystack.len(), - /// ); + /// // + /// // The unwrap is OK because we aren't requesting a start state for a + /// // specific pattern. + /// let mut state = dfa.start_state_forward(&Input::new(haystack))?; /// // Walk all the bytes in the haystack. 
/// for &b in haystack { /// state = dfa.next_state(state, b); @@ -220,78 +226,118 @@ pub unsafe trait Automaton { /// ``` fn next_eoi_state(&self, current: StateID) -> StateID; - /// Return the ID of the start state for this DFA when executing a forward - /// search. + /// Return the ID of the start state for this lazy DFA when executing a + /// forward search. /// /// Unlike typical DFA implementations, the start state for DFAs in this /// crate is dependent on a few different factors: /// - /// * The pattern ID, if present. When the underlying DFA has been compiled - /// with multiple patterns _and_ the DFA has been configured to compile - /// an anchored start state for each pattern, then a pattern ID may be - /// specified to execute an anchored search for that specific pattern. - /// If `pattern_id` is invalid or if the DFA doesn't have start states - /// compiled for each pattern, then implementations must panic. DFAs in - /// this crate can be configured to compile start states for each pattern - /// via - /// [`dense::Config::starts_for_each_pattern`](crate::dfa::dense::Config::starts_for_each_pattern). - /// * When `start > 0`, the byte at index `start - 1` may influence the - /// start state if the regex uses `^` or `\b`. - /// * Similarly, when `start == 0`, it may influence the start state when - /// the regex uses `^` or `\A`. - /// * Currently, `end` is unused. + /// * The [`Anchored`] mode of the search. Unanchored, anchored and + /// anchored searches for a specific [`PatternID`] all use different start + /// states. + /// * The position at which the search begins, via [`Input::start`]. This + /// and the byte immediately preceding the start of the search (if one + /// exists) influence which look-behind assertions are true at the start + /// of the search. This in turn influences which start state is selected. /// * Whether the search is a forward or reverse search. This routine can /// only be used for forward searches. 
/// - /// # Panics + /// # Errors /// - /// Implementations must panic if `start..end` is not a valid sub-slice of - /// `bytes`. Implementations must also panic if `pattern_id` is non-None - /// and does not refer to a valid pattern, or if the DFA was not compiled - /// with anchored start states for each pattern. + /// This may return a [`MatchError`] if the search needs to give up + /// when determining the start state (for example, if it sees a "quit" + /// byte). This can also return an error if the given `Input` contains an + /// unsupported [`Anchored`] configuration. fn start_state_forward( &self, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, - ) -> StateID; + input: &Input<'_>, + ) -> Result<StateID, MatchError>; - /// Return the ID of the start state for this DFA when executing a reverse - /// search. + /// Return the ID of the start state for this lazy DFA when executing a + /// reverse search. /// /// Unlike typical DFA implementations, the start state for DFAs in this /// crate is dependent on a few different factors: /// - /// * The pattern ID, if present. When the underlying DFA has been compiled - /// with multiple patterns _and_ the DFA has been configured to compile an - /// anchored start state for each pattern, then a pattern ID may be - /// specified to execute an anchored search for that specific pattern. If - /// `pattern_id` is invalid or if the DFA doesn't have start states compiled - /// for each pattern, then implementations must panic. DFAs in this crate - /// can be configured to compile start states for each pattern via - /// [`dense::Config::starts_for_each_pattern`](crate::dfa::dense::Config::starts_for_each_pattern). - /// * When `end < bytes.len()`, the byte at index `end` may influence the - /// start state if the regex uses `$` or `\b`. - /// * Similarly, when `end == bytes.len()`, it may influence the start - /// state when the regex uses `$` or `\z`. - /// * Currently, `start` is unused. 
+ /// * The [`Anchored`] mode of the search. Unanchored, anchored and + /// anchored searches for a specific [`PatternID`] all use different start + /// states. + /// * The position at which the search begins, via [`Input::start`]. This + /// and the byte immediately preceding the start of the search (if one + /// exists) influence which look-behind assertions are true at the start + /// of the search. This in turn influences which start state is selected. /// * Whether the search is a forward or reverse search. This routine can /// only be used for reverse searches. /// - /// # Panics + /// # Errors /// - /// Implementations must panic if `start..end` is not a valid sub-slice of - /// `bytes`. Implementations must also panic if `pattern_id` is non-None - /// and does not refer to a valid pattern, or if the DFA was not compiled - /// with anchored start states for each pattern. + /// This may return a [`MatchError`] if the search needs to give up + /// when determining the start state (for example, if it sees a "quit" + /// byte). This can also return an error if the given `Input` contains an + /// unsupported [`Anchored`] configuration. fn start_state_reverse( &self, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, - ) -> StateID; + input: &Input<'_>, + ) -> Result<StateID, MatchError>; + + /// If this DFA has a universal starting state for the given anchor mode + /// and the DFA supports universal starting states, then this returns that + /// state's identifier. + /// + /// A DFA is said to have a universal starting state when the starting + /// state is invariant with respect to the haystack. Usually, the starting + /// state is chosen depending on the bytes immediately surrounding the + /// starting position of a search. However, the starting state only differs + /// when one or more of the patterns in the DFA have look-around assertions + /// in its prefix. 
+ /// + /// Stated differently, if none of the patterns in a DFA have look-around + /// assertions in their prefix, then the DFA has a universal starting state + /// and _may_ be returned by this method. + /// + /// It is always correct for implementations to return `None`, and indeed, + /// this is what the default implementation does. When this returns `None`, + /// callers must use either `start_state_forward` or `start_state_reverse` + /// to get the starting state. + /// + /// # Use case + /// + /// There are a few reasons why one might want to use this: + /// + /// * If you know your regex patterns have no look-around assertions in + /// their prefix, then calling this routine is likely cheaper and perhaps + /// more semantically meaningful. + /// * When implementing prefilter support in a DFA regex implementation, + /// it is necessary to re-compute the start state after a candidate + /// is returned from the prefilter. However, this is only needed when + /// there isn't a universal start state. When one exists, one can avoid + /// re-computing the start state. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// dfa::{Automaton, dense::DFA}, + /// Anchored, + /// }; + /// + /// // There are no look-around assertions in the prefixes of any of the + /// // patterns, so we get a universal start state. + /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+$", "[A-Z]+"])?; + /// assert!(dfa.universal_start_state(Anchored::No).is_some()); + /// assert!(dfa.universal_start_state(Anchored::Yes).is_some()); + /// + /// // One of the patterns has a look-around assertion in its prefix, + /// // so this means there is no longer a universal start state. 
+ /// let dfa = DFA::new_many(&["[0-9]+", "^[a-z]+$", "[A-Z]+"])?; + /// assert!(!dfa.universal_start_state(Anchored::No).is_some()); + /// assert!(!dfa.universal_start_state(Anchored::Yes).is_some()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + fn universal_start_state(&self, _mode: Anchored) -> Option<StateID> { + None + } /// Returns true if and only if the given identifier corresponds to a /// "special" state. A special state is one or more of the following: @@ -322,10 +368,10 @@ pub unsafe trait Automaton { /// ``` /// use regex_automata::{ /// dfa::{Automaton, dense}, - /// HalfMatch, MatchError, PatternID, + /// HalfMatch, MatchError, Input, /// }; /// - /// fn find_leftmost_first<A: Automaton>( + /// fn find<A: Automaton>( /// dfa: &A, /// haystack: &[u8], /// ) -> Result<Option<HalfMatch>, MatchError> { @@ -333,9 +379,7 @@ pub unsafe trait Automaton { /// // initial bytes of the haystack. Note that start states can never /// // be match states (since DFAs in this crate delay matches by 1 /// // byte), so we don't need to check if the start state is a match. - /// let mut state = dfa.start_state_forward( - /// None, haystack, 0, haystack.len(), - /// ); + /// let mut state = dfa.start_state_forward(&Input::new(haystack))?; /// let mut last_match = None; /// // Walk all the bytes in the haystack. We can quit early if we see /// // a dead or a quit state. The former means the automaton will @@ -358,7 +402,7 @@ pub unsafe trait Automaton { /// if last_match.is_some() { /// return Ok(last_match); /// } - /// return Err(MatchError::Quit { byte: b, offset: i }); + /// return Err(MatchError::quit(b, i)); /// } /// // Implementors may also want to check for start or accel /// // states and handle them differently for performance @@ -383,7 +427,7 @@ pub unsafe trait Automaton { /// // early. Greediness is built into the automaton. 
/// let dfa = dense::DFA::new(r"[a-z]+")?; /// let haystack = "123 foobar 4567".as_bytes(); - /// let mat = find_leftmost_first(&dfa, haystack)?.unwrap(); + /// let mat = find(&dfa, haystack)?.unwrap(); /// assert_eq!(mat.pattern().as_usize(), 0); /// assert_eq!(mat.offset(), 10); /// @@ -393,7 +437,7 @@ pub unsafe trait Automaton { /// // found until the final byte in the haystack. /// let dfa = dense::DFA::new(r"[0-9]{4}")?; /// let haystack = "123 foobar 4567".as_bytes(); - /// let mat = find_leftmost_first(&dfa, haystack)?.unwrap(); + /// let mat = find(&dfa, haystack)?.unwrap(); /// assert_eq!(mat.pattern().as_usize(), 0); /// assert_eq!(mat.offset(), 15); /// @@ -402,13 +446,13 @@ pub unsafe trait Automaton { /// // the appropriate pattern ID for us. /// let dfa = dense::DFA::new_many(&[r"[a-z]+", r"[0-9]+"])?; /// let haystack = "123 foobar 4567".as_bytes(); - /// let mat = find_leftmost_first(&dfa, haystack)?.unwrap(); + /// let mat = find(&dfa, haystack)?.unwrap(); /// assert_eq!(mat.pattern().as_usize(), 1); /// assert_eq!(mat.offset(), 3); - /// let mat = find_leftmost_first(&dfa, &haystack[3..])?.unwrap(); + /// let mat = find(&dfa, &haystack[3..])?.unwrap(); /// assert_eq!(mat.pattern().as_usize(), 0); /// assert_eq!(mat.offset(), 7); - /// let mat = find_leftmost_first(&dfa, &haystack[10..])?.unwrap(); + /// let mat = find(&dfa, &haystack[10..])?.unwrap(); /// assert_eq!(mat.pattern().as_usize(), 1); /// assert_eq!(mat.offset(), 5); /// @@ -458,13 +502,6 @@ pub unsafe trait Automaton { /// since state identifiers are pre-multiplied by the state machine's /// alphabet stride, and the alphabet stride varies between DFAs.) /// - /// By default, state machines created by this crate will never enter a - /// quit state. Since entering a quit state is the only way for a DFA - /// in this crate to fail at search time, it follows that the default - /// configuration can never produce a match error. 
Nevertheless, handling - /// quit states is necessary to correctly support all configurations in - /// this crate. - /// /// The typical way in which a quit state can occur is when heuristic /// support for Unicode word boundaries is enabled via the /// [`dense::Config::unicode_word_boundary`](crate::dfa::dense::Config::unicode_word_boundary) @@ -474,9 +511,8 @@ pub unsafe trait Automaton { /// purpose of the quit state is to provide a way to execute a fast DFA /// in common cases while delegating to slower routines when the DFA quits. /// - /// The default search implementations provided by this crate will return - /// a [`MatchError::Quit`](crate::MatchError::Quit) error when a quit state - /// is entered. + /// The default search implementations provided by this crate will return a + /// [`MatchError::quit`] error when a quit state is entered. /// /// # Example /// @@ -513,8 +549,10 @@ pub unsafe trait Automaton { /// method correctly. fn is_match_state(&self, id: StateID) -> bool; - /// Returns true if and only if the given identifier corresponds to a - /// start state. A start state is a state in which a DFA begins a search. + /// Returns true only if the given identifier corresponds to a start + /// state + /// + /// A start state is a state in which a DFA begins a search. /// All searches begin in a start state. Moreover, since all matches are /// delayed by one byte, a start state can never be a match state. /// @@ -531,25 +569,38 @@ pub unsafe trait Automaton { /// begin with that prefix, then skipping ahead to occurrences of that /// prefix may be much faster than executing the DFA. /// + /// As mentioned in the documentation for + /// [`is_special_state`](Automaton::is_special_state) implementations + /// _may_ always return false, even if the given identifier is a start + /// state. This is because knowing whether a state is a start state or not + /// is not necessary for correctness and is only treated as a potential + /// performance optimization. 
(For example, the implementations of this + /// trait in this crate will only return true when the given identifier + /// corresponds to a start state and when [specialization of start + /// states](crate::dfa::dense::Config::specialize_start_states) was enabled + /// during DFA construction. If start state specialization is disabled + /// (which is the default), then this method will always return false.) + /// /// # Example /// /// This example shows how to implement your own search routine that does /// a prefix search whenever the search enters a start state. /// - /// Note that you do not need to implement your own search routine to - /// make use of prefilters like this. The search routines provided - /// by this crate already implement prefilter support via the - /// [`Prefilter`](crate::util::prefilter::Prefilter) trait. The various - /// `find_*_at` routines on this trait support the `Prefilter` trait - /// through [`Scanner`](crate::util::prefilter::Scanner)s. This example is - /// meant to show how you might deal with prefilters in a simplified case - /// if you are implementing your own search routine. + /// Note that you do not need to implement your own search routine + /// to make use of prefilters like this. The search routines + /// provided by this crate already implement prefilter support via + /// the [`Prefilter`](crate::util::prefilter::Prefilter) trait. + /// A prefilter can be added to your search configuration with + /// [`dense::Config::prefilter`](crate::dfa::dense::Config::prefilter) for + /// dense and sparse DFAs in this crate. + /// + /// This example is meant to show how you might deal with prefilters in a + /// simplified case if you are implementing your own search routine. 
/// /// ``` /// use regex_automata::{ - /// MatchError, PatternID, /// dfa::{Automaton, dense}, - /// HalfMatch, + /// HalfMatch, MatchError, Input, /// }; /// /// fn find_byte(slice: &[u8], at: usize, byte: u8) -> Option<usize> { @@ -558,7 +609,7 @@ pub unsafe trait Automaton { /// slice[at..].iter().position(|&b| b == byte).map(|i| at + i) /// } /// - /// fn find_leftmost_first<A: Automaton>( + /// fn find<A: Automaton>( /// dfa: &A, /// haystack: &[u8], /// prefix_byte: Option<u8>, @@ -566,9 +617,7 @@ pub unsafe trait Automaton { /// // See the Automaton::is_special_state example for similar code /// // with more comments. /// - /// let mut state = dfa.start_state_forward( - /// None, haystack, 0, haystack.len(), - /// ); + /// let mut state = dfa.start_state_forward(&Input::new(haystack))?; /// let mut last_match = None; /// let mut pos = 0; /// while pos < haystack.len() { @@ -590,9 +639,7 @@ pub unsafe trait Automaton { /// if last_match.is_some() { /// return Ok(last_match); /// } - /// return Err(MatchError::Quit { - /// byte: b, offset: pos - 1, - /// }); + /// return Err(MatchError::quit(b, pos - 1)); /// } else if dfa.is_start_state(state) { /// // If we're in a start state and know all matches begin /// // with a particular byte, then we can quickly skip to @@ -620,22 +667,27 @@ pub unsafe trait Automaton { /// } /// /// // In this example, it's obvious that all occurrences of our pattern - /// // begin with 'Z', so we pass in 'Z'. - /// let dfa = dense::DFA::new(r"Z[a-z]+")?; + /// // begin with 'Z', so we pass in 'Z'. Note also that we need to + /// // enable start state specialization, or else it won't be possible to + /// // detect start states during a search. ('is_start_state' would always + /// // return false.) 
+ /// let dfa = dense::DFA::builder() + /// .configure(dense::DFA::config().specialize_start_states(true)) + /// .build(r"Z[a-z]+")?; /// let haystack = "123 foobar Zbaz quux".as_bytes(); - /// let mat = find_leftmost_first(&dfa, haystack, Some(b'Z'))?.unwrap(); + /// let mat = find(&dfa, haystack, Some(b'Z'))?.unwrap(); /// assert_eq!(mat.pattern().as_usize(), 0); /// assert_eq!(mat.offset(), 15); /// /// // But note that we don't need to pass in a prefix byte. If we don't, /// // then the search routine does no acceleration. - /// let mat = find_leftmost_first(&dfa, haystack, None)?.unwrap(); + /// let mat = find(&dfa, haystack, None)?.unwrap(); /// assert_eq!(mat.pattern().as_usize(), 0); /// assert_eq!(mat.offset(), 15); /// /// // However, if we pass an incorrect byte, then the prefix search will /// // result in incorrect results. - /// assert_eq!(find_leftmost_first(&dfa, haystack, Some(b'X'))?, None); + /// assert_eq!(find(&dfa, haystack, Some(b'X'))?, None); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` @@ -695,13 +747,13 @@ pub unsafe trait Automaton { /// /// # Example /// - /// This example shows the pattern count for a DFA that never matches: + /// This example shows the pattern length for a DFA that never matches: /// /// ``` /// use regex_automata::dfa::{Automaton, dense::DFA}; /// /// let dfa: DFA<Vec<u32>> = DFA::never_match()?; - /// assert_eq!(dfa.pattern_count(), 0); + /// assert_eq!(dfa.pattern_len(), 0); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` /// @@ -711,7 +763,7 @@ pub unsafe trait Automaton { /// use regex_automata::dfa::{Automaton, dense::DFA}; /// /// let dfa: DFA<Vec<u32>> = DFA::always_match()?; - /// assert_eq!(dfa.pattern_count(), 1); + /// assert_eq!(dfa.pattern_len(), 1); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` /// @@ -721,10 +773,10 @@ pub unsafe trait Automaton { /// use regex_automata::dfa::{Automaton, dense::DFA}; /// /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?; - /// 
assert_eq!(dfa.pattern_count(), 3); + /// assert_eq!(dfa.pattern_len(), 3); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` - fn pattern_count(&self) -> usize; + fn pattern_len(&self) -> usize; /// Returns the total number of patterns that match in this state. /// @@ -734,8 +786,8 @@ pub unsafe trait Automaton { /// If the DFA was compiled with one pattern, then this must necessarily /// always return `1` for all match states. /// - /// Implementations must guarantee that [`Automaton::match_pattern`] can - /// be called with indices up to (but not including) the count returned by + /// Implementations must guarantee that [`Automaton::match_pattern`] can be + /// called with indices up to (but not including) the length returned by /// this routine without panicking. /// /// # Panics @@ -750,12 +802,13 @@ pub unsafe trait Automaton { /// patterns have matched in a particular state, but also how to access /// which specific patterns have matched. /// - /// Notice that we must use [`MatchKind::All`](crate::MatchKind::All) + /// Notice that we must use + /// [`MatchKind::All`](crate::MatchKind::All) /// when building the DFA. If we used /// [`MatchKind::LeftmostFirst`](crate::MatchKind::LeftmostFirst) - /// instead, then the DFA would not be constructed in a way that supports - /// overlapping matches. (It would only report a single pattern that - /// matches at any particular point in time.) + /// instead, then the DFA would not be constructed in a way that + /// supports overlapping matches. (It would only report a single pattern + /// that matches at any particular point in time.) /// /// Another thing to take note of is the patterns used and the order in /// which the pattern IDs are reported. In the example below, pattern `3` @@ -766,23 +819,19 @@ pub unsafe trait Automaton { /// other. 
/// /// ``` - /// use regex_automata::{ - /// dfa::{Automaton, dense}, - /// MatchKind, - /// }; + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{dfa::{Automaton, dense}, Input, MatchKind}; /// /// let dfa = dense::Builder::new() /// .configure(dense::Config::new().match_kind(MatchKind::All)) /// .build_many(&[ - /// r"\w+", r"[a-z]+", r"[A-Z]+", r"\S+", + /// r"[[:word:]]+", r"[a-z]+", r"[A-Z]+", r"[[:^space:]]+", /// ])?; /// let haystack = "@bar".as_bytes(); /// /// // The start state is determined by inspecting the position and the /// // initial bytes of the haystack. - /// let mut state = dfa.start_state_forward( - /// None, haystack, 0, haystack.len(), - /// ); + /// let mut state = dfa.start_state_forward(&Input::new(haystack))?; /// // Walk all the bytes in the haystack. /// for &b in haystack { /// state = dfa.next_state(state, b); @@ -790,8 +839,8 @@ pub unsafe trait Automaton { /// state = dfa.next_eoi_state(state); /// /// assert!(dfa.is_match_state(state)); - /// assert_eq!(dfa.match_count(state), 3); - /// // The following calls are guaranteed to not panic since `match_count` + /// assert_eq!(dfa.match_len(state), 3); + /// // The following calls are guaranteed to not panic since `match_len` /// // returned `3` above. /// assert_eq!(dfa.match_pattern(state, 0).as_usize(), 3); /// assert_eq!(dfa.match_pattern(state, 1).as_usize(), 0); @@ -799,19 +848,19 @@ pub unsafe trait Automaton { /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` - fn match_count(&self, id: StateID) -> usize; + fn match_len(&self, id: StateID) -> usize; /// Returns the pattern ID corresponding to the given match index in the /// given state. /// - /// See [`Automaton::match_count`] for an example of how to use this + /// See [`Automaton::match_len`] for an example of how to use this /// method correctly. 
Note that if you know your DFA is compiled with a /// single pattern, then this routine is never necessary since it will /// always return a pattern ID of `0` for an index of `0` when `id` /// corresponds to a match state. /// /// Typically, this routine is used when implementing an overlapping - /// search, as the example for `Automaton::match_count` does. + /// search, as the example for `Automaton::match_len` does. /// /// # Panics /// @@ -822,12 +871,182 @@ pub unsafe trait Automaton { /// `PatternID`. fn match_pattern(&self, id: StateID, index: usize) -> PatternID; + /// Returns true if and only if this automaton can match the empty string. + /// When it returns false, all possible matches are guaranteed to have a + /// non-zero length. + /// + /// This is useful as a cheap way to know whether code needs to handle the + /// case of a zero length match. This is particularly important when UTF-8 + /// modes are enabled, as when UTF-8 mode is enabled, empty matches that + /// split a codepoint must never be reported. This extra handling can + /// sometimes be costly, and since regexes matching an empty string are + /// somewhat rare, it can be beneficial to treat such regexes specially. + /// + /// # Example + /// + /// This example shows a few different DFAs and whether they match the + /// empty string or not. Notice the empty string isn't merely a matter + /// of a string of length literally `0`, but rather, whether a match can + /// occur between specific pairs of bytes. + /// + /// ``` + /// use regex_automata::{dfa::{dense::DFA, Automaton}, util::syntax}; + /// + /// // The empty regex matches the empty string. + /// let dfa = DFA::new("")?; + /// assert!(dfa.has_empty(), "empty matches empty"); + /// // The '+' repetition operator requires at least one match, and so + /// // does not match the empty string. + /// let dfa = DFA::new("a+")?; + /// assert!(!dfa.has_empty(), "+ does not match empty"); + /// // But the '*' repetition operator does. 
+ /// let dfa = DFA::new("a*")?; + /// assert!(dfa.has_empty(), "* does match empty"); + /// // And wrapping '+' in an operator that can match an empty string also + /// // causes it to match the empty string too. + /// let dfa = DFA::new("(a+)*")?; + /// assert!(dfa.has_empty(), "+ inside of * matches empty"); + /// + /// // If a regex is just made of a look-around assertion, even if the + /// // assertion requires some kind of non-empty string around it (such as + /// // \b), then it is still treated as if it matches the empty string. + /// // Namely, if a match occurs of just a look-around assertion, then the + /// // match returned is empty. + /// let dfa = DFA::builder() + /// .configure(DFA::config().unicode_word_boundary(true)) + /// .syntax(syntax::Config::new().utf8(false)) + /// .build(r"^$\A\z\b\B(?-u:\b\B)")?; + /// assert!(dfa.has_empty(), "assertions match empty"); + /// // Even when an assertion is wrapped in a '+', it still matches the + /// // empty string. + /// let dfa = DFA::new(r"^+")?; + /// assert!(dfa.has_empty(), "+ of an assertion matches empty"); + /// + /// // An alternation with even one branch that can match the empty string + /// // is also said to match the empty string overall. + /// let dfa = DFA::new("foo|(bar)?|quux")?; + /// assert!(dfa.has_empty(), "alternations can match empty"); + /// + /// // An NFA that matches nothing does not match the empty string. + /// let dfa = DFA::new("[a&&b]")?; + /// assert!(!dfa.has_empty(), "never matching means not matching empty"); + /// // But if it's wrapped in something that doesn't require a match at + /// // all, then it can match the empty string! + /// let dfa = DFA::new("[a&&b]*")?; + /// assert!(dfa.has_empty(), "* on never-match still matches empty"); + /// // Since a '+' requires a match, using it on something that can never + /// // match will itself produce a regex that can never match anything, + /// // and thus does not match the empty string. 
+ /// let dfa = DFA::new("[a&&b]+")?; + /// assert!(!dfa.has_empty(), "+ on never-match still matches nothing"); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + fn has_empty(&self) -> bool; + + /// Whether UTF-8 mode is enabled for this DFA or not. + /// + /// When UTF-8 mode is enabled, all matches reported by a DFA are + /// guaranteed to correspond to spans of valid UTF-8. This includes + /// zero-width matches. For example, the DFA must guarantee that the empty + /// regex will not match at the positions between code units in the UTF-8 + /// encoding of a single codepoint. + /// + /// See [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) for + /// more information. + /// + /// # Example + /// + /// This example shows how UTF-8 mode can impact the match spans that may + /// be reported in certain cases. + /// + /// ``` + /// use regex_automata::{ + /// dfa::{dense::DFA, Automaton}, + /// nfa::thompson, + /// HalfMatch, Input, + /// }; + /// + /// // UTF-8 mode is enabled by default. + /// let re = DFA::new("")?; + /// assert!(re.is_utf8()); + /// let mut input = Input::new("☃"); + /// let got = re.try_search_fwd(&input)?; + /// assert_eq!(Some(HalfMatch::must(0, 0)), got); + /// + /// // Even though an empty regex matches at 1..1, our next match is + /// // 3..3 because 1..1 and 2..2 split the snowman codepoint (which is + /// // three bytes long). 
+ /// input.set_start(1); + /// let got = re.try_search_fwd(&input)?; + /// assert_eq!(Some(HalfMatch::must(0, 3)), got); + /// + /// // But if we disable UTF-8, then we'll get matches at 1..1 and 2..2: + /// let re = DFA::builder() + /// .thompson(thompson::Config::new().utf8(false)) + /// .build("")?; + /// assert!(!re.is_utf8()); + /// let got = re.try_search_fwd(&input)?; + /// assert_eq!(Some(HalfMatch::must(0, 1)), got); + /// + /// input.set_start(2); + /// let got = re.try_search_fwd(&input)?; + /// assert_eq!(Some(HalfMatch::must(0, 2)), got); + /// + /// input.set_start(3); + /// let got = re.try_search_fwd(&input)?; + /// assert_eq!(Some(HalfMatch::must(0, 3)), got); + /// + /// input.set_start(4); + /// let got = re.try_search_fwd(&input)?; + /// assert_eq!(None, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + fn is_utf8(&self) -> bool; + + /// Returns true if and only if this DFA is limited to returning matches + /// whose start position is `0`. + /// + /// Note that if you're using DFAs provided by + /// this crate, then this is _orthogonal_ to + /// [`Config::start_kind`](crate::dfa::dense::Config::start_kind). + /// + /// This is useful in some cases because if a DFA is limited to producing + /// matches that start at offset `0`, then a reverse search is never + /// required for finding the start of a match. + /// + /// # Example + /// + /// ``` + /// use regex_automata::dfa::{dense::DFA, Automaton}; + /// + /// // The empty regex matches anywhere + /// let dfa = DFA::new("")?; + /// assert!(!dfa.is_always_start_anchored(), "empty matches anywhere"); + /// // 'a' matches anywhere. + /// let dfa = DFA::new("a")?; + /// assert!(!dfa.is_always_start_anchored(), "'a' matches anywhere"); + /// // '^' only matches at offset 0! + /// let dfa = DFA::new("^a")?; + /// assert!(dfa.is_always_start_anchored(), "'^a' matches only at 0"); + /// // But '(?m:^)' matches at 0 but at other offsets too. 
+ /// let dfa = DFA::new("(?m:^)a")?; + /// assert!(!dfa.is_always_start_anchored(), "'(?m:^)a' matches anywhere"); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + fn is_always_start_anchored(&self) -> bool; + /// Return a slice of bytes to accelerate for the given state, if possible. /// /// If the given state has no accelerator, then an empty slice must be - /// returned. If `Automaton::is_accel_state` returns true for the given - /// ID, then this routine _must_ return a non-empty slice, but it is not - /// required to do so. + /// returned. If `Automaton::is_accel_state` returns true for the given ID, + /// then this routine _must_ return a non-empty slice. But note that it is + /// not required for an implementation of this trait to ever return `true` + /// for `is_accel_state`, even if the state _could_ be accelerated. That + /// is, acceleration is an optional optimization. But the return values of + /// `is_accel_state` and `accelerator` must be in sync. /// /// If the given ID is not a valid state ID for this automaton, then /// implementations may panic or produce incorrect results. @@ -844,22 +1063,19 @@ pub unsafe trait Automaton { /// /// ``` /// use regex_automata::{ - /// nfa::thompson, /// dfa::{Automaton, dense}, - /// util::id::StateID, - /// SyntaxConfig, + /// util::{primitives::StateID, syntax}, /// }; /// /// let dfa = dense::Builder::new() /// // We disable Unicode everywhere and permit the regex to match - /// // invalid UTF-8. e.g., `[^abc]` matches `\xFF`, which is not valid - /// // UTF-8. - /// .syntax(SyntaxConfig::new().unicode(false).utf8(false)) - /// // This makes the implicit `(?s:.)*?` prefix added to the regex - /// // match through arbitrary bytes instead of being UTF-8 aware. This - /// // isn't necessary to get acceleration to work in this case, but - /// // it does make the DFA substantially simpler. - /// .thompson(thompson::Config::new().utf8(false)) + /// // invalid UTF-8. 
e.g., [^abc] matches \xFF, which is not valid + /// // UTF-8. If we left Unicode enabled, [^abc] would match any UTF-8 + /// // encoding of any Unicode scalar value except for 'a', 'b' or 'c'. + /// // That translates to a much more complicated DFA, and also + /// // inhibits the 'accelerator' optimization that we are trying to + /// // demonstrate in this example. + /// .syntax(syntax::Config::new().unicode(false).utf8(false)) /// .build("[^abc]+a")?; /// /// // Here we just pluck out the state that we know is accelerated. @@ -875,154 +1091,58 @@ pub unsafe trait Automaton { /// assert_eq!(accelerator, &[b'a', b'b', b'c']); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` + #[inline] fn accelerator(&self, _id: StateID) -> &[u8] { &[] } - /// Executes a forward search and returns the end position of the first - /// match that is found as early as possible. If no match exists, then - /// `None` is returned. - /// - /// This routine stops scanning input as soon as the search observes a - /// match state. This is useful for implementing boolean `is_match`-like - /// routines, where as little work is done as possible. - /// - /// See [`Automaton::find_earliest_fwd_at`] for additional functionality, - /// such as providing a prefilter, a specific pattern to match and the - /// bounds of the search within the haystack. This routine is meant as - /// a convenience for common cases where the additional functionality is - /// not needed. - /// - /// # Errors - /// - /// This routine only errors if the search could not complete. For - /// DFAs generated by this crate, this only occurs in a non-default - /// configuration where quit bytes are used or Unicode word boundaries are - /// heuristically enabled. - /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// # Example - /// - /// This example shows how to use this method with a - /// [`dense::DFA`](crate::dfa::dense::DFA). 
In particular, it demonstrates - /// how the position returned might differ from what one might expect when - /// executing a traditional leftmost search. - /// - /// ``` - /// use regex_automata::{ - /// dfa::{Automaton, dense}, - /// HalfMatch, - /// }; - /// - /// let dfa = dense::DFA::new("foo[0-9]+")?; - /// // Normally, the end of the leftmost first match here would be 8, - /// // corresponding to the end of the input. But the "earliest" semantics - /// // this routine cause it to stop as soon as a match is known, which - /// // occurs once 'foo[0-9]' has matched. - /// let expected = HalfMatch::must(0, 4); - /// assert_eq!(Some(expected), dfa.find_earliest_fwd(b"foo12345")?); - /// - /// let dfa = dense::DFA::new("abc|a")?; - /// // Normally, the end of the leftmost first match here would be 3, - /// // but the shortest match semantics detect a match earlier. - /// let expected = HalfMatch::must(0, 1); - /// assert_eq!(Some(expected), dfa.find_earliest_fwd(b"abc")?); - /// # Ok::<(), Box<dyn std::error::Error>>(()) - /// ``` - #[inline] - fn find_earliest_fwd( - &self, - bytes: &[u8], - ) -> Result<Option<HalfMatch>, MatchError> { - self.find_earliest_fwd_at(None, None, bytes, 0, bytes.len()) - } - - /// Executes a reverse search and returns the start position of the first - /// match that is found as early as possible. If no match exists, then - /// `None` is returned. - /// - /// This routine stops scanning input as soon as the search observes a - /// match state. - /// - /// Note that while it is not technically necessary to build a reverse - /// automaton to use a reverse search, it is likely that you'll want to do - /// so. Namely, the typical use of a reverse search is to find the starting - /// location of a match once its end is discovered from a forward search. A - /// reverse DFA automaton can be built by configuring the intermediate NFA - /// to be reversed via - /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse). 
+ /// Returns the prefilter associated with a DFA, if one exists. /// - /// # Errors - /// - /// This routine only errors if the search could not complete. For - /// DFAs generated by this crate, this only occurs in a non-default - /// configuration where quit bytes are used or Unicode word boundaries are - /// heuristically enabled. - /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// # Example - /// - /// This example shows how to use this method with a - /// [`dense::DFA`](crate::dfa::dense::DFA). In particular, it demonstrates - /// how the position returned might differ from what one might expect when - /// executing a traditional leftmost reverse search. - /// - /// ``` - /// use regex_automata::{ - /// nfa::thompson, - /// dfa::{Automaton, dense}, - /// HalfMatch, - /// }; + /// The default implementation of this trait always returns `None`. And + /// indeed, it is always correct to return `None`. /// - /// let dfa = dense::Builder::new() - /// .thompson(thompson::Config::new().reverse(true)) - /// .build("[a-z]+[0-9]+")?; - /// // Normally, the end of the leftmost first match here would be 0, - /// // corresponding to the beginning of the input. But the "earliest" - /// // semantics of this routine cause it to stop as soon as a match is - /// // known, which occurs once '[a-z][0-9]+' has matched. - /// let expected = HalfMatch::must(0, 2); - /// assert_eq!(Some(expected), dfa.find_earliest_rev(b"foo12345")?); + /// For DFAs in this crate, a prefilter can be attached to a DFA via + /// [`dense::Config::prefilter`](crate::dfa::dense::Config::prefilter). /// - /// let dfa = dense::Builder::new() - /// .thompson(thompson::Config::new().reverse(true)) - /// .build("abc|c")?; - /// // Normally, the end of the leftmost first match here would be 0, - /// // but the shortest match semantics detect a match earlier. 
- /// let expected = HalfMatch::must(0, 2); - /// assert_eq!(Some(expected), dfa.find_earliest_rev(b"abc")?); - /// # Ok::<(), Box<dyn std::error::Error>>(()) - /// ``` + /// Do note that prefilters are not serialized by DFAs in this crate. + /// So if you deserialize a DFA that had a prefilter attached to it + /// at serialization time, then it will not have a prefilter after + /// deserialization. #[inline] - fn find_earliest_rev( - &self, - bytes: &[u8], - ) -> Result<Option<HalfMatch>, MatchError> { - self.find_earliest_rev_at(None, bytes, 0, bytes.len()) + fn get_prefilter(&self) -> Option<&Prefilter> { + None } /// Executes a forward search and returns the end position of the leftmost /// match that is found. If no match exists, then `None` is returned. /// + /// In particular, this method continues searching even after it enters + /// a match state. The search only terminates once it has reached the + /// end of the input or when it has entered a dead or quit state. Upon + /// termination, the position of the last byte seen while still in a match + /// state is returned. + /// /// # Errors /// - /// This routine only errors if the search could not complete. For - /// DFAs generated by this crate, this only occurs in a non-default - /// configuration where quit bytes are used or Unicode word boundaries are - /// heuristically enabled. + /// This routine errors if the search could not complete. This can occur + /// in a number of circumstances: /// - /// When a search cannot complete, callers cannot know whether a match + /// * The configuration of the DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the DFA quitting. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. 
+ /// + /// When a search returns an error, callers cannot know whether a match /// exists or not. /// /// # Notes for implementors /// /// Implementors of this trait are not required to implement any particular /// match semantics (such as leftmost-first), which are instead manifest in - /// the DFA's transitions. + /// the DFA's transitions. But this search routine should behave as a + /// general "leftmost" search. /// /// In particular, this method must continue searching even after it enters /// a match state. The search should only terminate once it has reached @@ -1036,47 +1156,124 @@ pub unsafe trait Automaton { /// # Example /// /// This example shows how to use this method with a - /// [`dense::DFA`](crate::dfa::dense::DFA). By default, a dense DFA uses - /// "leftmost first" match semantics. - /// - /// Leftmost first match semantics corresponds to the match with the - /// smallest starting offset, but where the end offset is determined by - /// preferring earlier branches in the original regular expression. For - /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` - /// will match `Samwise` in `Samwise`. - /// - /// Generally speaking, the "leftmost first" match is how most backtracking - /// regular expressions tend to work. This is in contrast to POSIX-style - /// regular expressions that yield "leftmost longest" matches. Namely, - /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using - /// leftmost longest semantics. (This crate does not currently support - /// leftmost longest semantics.) + /// [`dense::DFA`](crate::dfa::dense::DFA). 
/// /// ``` - /// use regex_automata::{ - /// dfa::{Automaton, dense}, - /// HalfMatch, - /// }; + /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input}; /// /// let dfa = dense::DFA::new("foo[0-9]+")?; - /// let expected = HalfMatch::must(0, 8); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new(b"foo12345"))?); /// /// // Even though a match is found after reading the first byte (`a`), /// // the leftmost first match semantics demand that we find the earliest /// // match that prefers earlier parts of the pattern over latter parts. /// let dfa = dense::DFA::new("abc|a")?; - /// let expected = HalfMatch::must(0, 3); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"abc")?); + /// let expected = Some(HalfMatch::must(0, 3)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new(b"abc"))?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: specific pattern search + /// + /// This example shows how to build a multi-DFA that permits searching for + /// specific patterns. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{ + /// dfa::{Automaton, dense}, + /// Anchored, HalfMatch, PatternID, Input, + /// }; + /// + /// let dfa = dense::Builder::new() + /// .configure(dense::Config::new().starts_for_each_pattern(true)) + /// .build_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?; + /// let haystack = "foo123".as_bytes(); + /// + /// // Since we are using the default leftmost-first match and both + /// // patterns match at the same starting position, only the first pattern + /// // will be returned in this case when doing a search for any of the + /// // patterns. 
+ /// let expected = Some(HalfMatch::must(0, 6)); + /// let got = dfa.try_search_fwd(&Input::new(haystack))?; + /// assert_eq!(expected, got); + /// + /// // But if we want to check whether some other pattern matches, then we + /// // can provide its pattern ID. + /// let input = Input::new(haystack) + /// .anchored(Anchored::Pattern(PatternID::must(1))); + /// let expected = Some(HalfMatch::must(1, 6)); + /// let got = dfa.try_search_fwd(&input)?; + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: specifying the bounds of a search + /// + /// This example shows how providing the bounds of a search can produce + /// different results than simply sub-slicing the haystack. + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input}; + /// + /// // N.B. We disable Unicode here so that we use a simple ASCII word + /// // boundary. Alternatively, we could enable heuristic support for + /// // Unicode word boundaries. + /// let dfa = dense::DFA::new(r"(?-u)\b[0-9]{3}\b")?; + /// let haystack = "foo123bar".as_bytes(); + /// + /// // Since we sub-slice the haystack, the search doesn't know about the + /// // larger context and assumes that `123` is surrounded by word + /// // boundaries. And of course, the match position is reported relative + /// // to the sub-slice as well, which means we get `3` instead of `6`. + /// let input = Input::new(&haystack[3..6]); + /// let expected = Some(HalfMatch::must(0, 3)); + /// let got = dfa.try_search_fwd(&input)?; + /// assert_eq!(expected, got); + /// + /// // But if we provide the bounds of the search within the context of the + /// // entire haystack, then the search can take the surrounding context + /// // into account. (And if we did find a match, it would be reported + /// // as a valid offset into `haystack` instead of its sub-slice.) 
+ /// let input = Input::new(haystack).range(3..6); + /// let expected = None; + /// let got = dfa.try_search_fwd(&input)?; + /// assert_eq!(expected, got); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` #[inline] - fn find_leftmost_fwd( + fn try_search_fwd( &self, - bytes: &[u8], + input: &Input<'_>, ) -> Result<Option<HalfMatch>, MatchError> { - self.find_leftmost_fwd_at(None, None, bytes, 0, bytes.len()) + let utf8empty = self.has_empty() && self.is_utf8(); + let hm = match search::find_fwd(&self, input)? { + None => return Ok(None), + Some(hm) if !utf8empty => return Ok(Some(hm)), + Some(hm) => hm, + }; + // We get to this point when we know our DFA can match the empty string + // AND when UTF-8 mode is enabled. In this case, we skip any matches + // whose offset splits a codepoint. Such a match is necessarily a + // zero-width match, because UTF-8 mode requires the underlying NFA + // to be built such that all non-empty matches span valid UTF-8. + // Therefore, any match that ends in the middle of a codepoint cannot + // be part of a span of valid UTF-8 and thus must be an empty match. + // In such cases, we skip it, so as not to report matches that split a + // codepoint. + // + // Note that this is not a checked assumption. Callers *can* provide an + // NFA with UTF-8 mode enabled but produces non-empty matches that span + // invalid UTF-8. But doing so is documented to result in unspecified + // behavior. + empty::skip_splits_fwd(input, hm, hm.offset(), |input| { + let got = search::find_fwd(&self, input)?; + Ok(got.map(|hm| (hm, hm.offset()))) + }) } /// Executes a reverse search and returns the start of the position of the @@ -1085,52 +1282,42 @@ pub unsafe trait Automaton { /// /// # Errors /// - /// This routine only errors if the search could not complete. For - /// DFAs generated by this crate, this only occurs in a non-default - /// configuration where quit bytes are used or Unicode word boundaries are - /// heuristically enabled. 
- /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// # Notes for implementors + /// This routine errors if the search could not complete. This can occur + /// in a number of circumstances: /// - /// Implementors of this trait are not required to implement any particular - /// match semantics (such as leftmost-first), which are instead manifest in - /// the DFA's transitions. - /// - /// In particular, this method must continue searching even after it enters - /// a match state. The search should only terminate once it has reached - /// the end of the input or when it has entered a dead or quit state. Upon - /// termination, the position of the last byte seen while still in a match - /// state is returned. + /// * The configuration of the DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the DFA quitting. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. /// - /// Since this trait provides an implementation for this method by default, - /// it's unlikely that one will need to implement this. + /// When a search returns an error, callers cannot know whether a match + /// exists or not. /// /// # Example /// /// This example shows how to use this method with a - /// [`dense::DFA`](crate::dfa::dense::DFA). In particular, this routine - /// is principally useful when used in conjunction with the + /// [`dense::DFA`](crate::dfa::dense::DFA). In particular, this + /// routine is principally useful when used in conjunction with the /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse) - /// configuration. 
In general, it's unlikely to be correct to use both - /// `find_leftmost_fwd` and `find_leftmost_rev` with the same DFA since any - /// particular DFA will only support searching in one direction with + /// configuration. In general, it's unlikely to be correct to use + /// both `try_search_fwd` and `try_search_rev` with the same DFA since + /// any particular DFA will only support searching in one direction with /// respect to the pattern. /// /// ``` /// use regex_automata::{ /// nfa::thompson, /// dfa::{Automaton, dense}, - /// HalfMatch, + /// HalfMatch, Input, /// }; /// /// let dfa = dense::Builder::new() /// .thompson(thompson::Config::new().reverse(true)) /// .build("foo[0-9]+")?; - /// let expected = HalfMatch::must(0, 0); - /// assert_eq!(Some(expected), dfa.find_leftmost_rev(b"foo12345")?); + /// let expected = Some(HalfMatch::must(0, 0)); + /// assert_eq!(expected, dfa.try_search_rev(&Input::new(b"foo12345"))?); /// /// // Even though a match is found after reading the last byte (`c`), /// // the leftmost first match semantics demand that we find the earliest @@ -1138,21 +1325,134 @@ pub unsafe trait Automaton { /// let dfa = dense::Builder::new() /// .thompson(thompson::Config::new().reverse(true)) /// .build("abc|c")?; - /// let expected = HalfMatch::must(0, 0); - /// assert_eq!(Some(expected), dfa.find_leftmost_rev(b"abc")?); + /// let expected = Some(HalfMatch::must(0, 0)); + /// assert_eq!(expected, dfa.try_search_rev(&Input::new(b"abc"))?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: UTF-8 mode + /// + /// This example demonstrates that UTF-8 mode applies to reverse + /// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all + /// matches reported must correspond to valid UTF-8 spans. This includes + /// prohibiting zero-width matches that split a codepoint. + /// + /// UTF-8 mode is enabled by default. 
Notice below how the only zero-width + /// matches reported are those at UTF-8 boundaries: + /// + /// ``` + /// use regex_automata::{ + /// dfa::{dense::DFA, Automaton}, + /// nfa::thompson, + /// HalfMatch, Input, MatchKind, + /// }; + /// + /// let dfa = DFA::builder() + /// .thompson(thompson::Config::new().reverse(true)) + /// .build(r"")?; + /// + /// // Run the reverse DFA to collect all matches. + /// let mut input = Input::new("☃"); + /// let mut matches = vec![]; + /// loop { + /// match dfa.try_search_rev(&input)? { + /// None => break, + /// Some(hm) => { + /// matches.push(hm); + /// if hm.offset() == 0 || input.end() == 0 { + /// break; + /// } else if hm.offset() < input.end() { + /// input.set_end(hm.offset()); + /// } else { + /// // This is only necessary to handle zero-width + /// // matches, which of course occur in this example. + /// // Without this, the search would never advance + /// // backwards beyond the initial match. + /// input.set_end(input.end() - 1); + /// } + /// } + /// } + /// } + /// + /// // No matches split a codepoint. + /// let expected = vec![ + /// HalfMatch::must(0, 3), + /// HalfMatch::must(0, 0), + /// ]; + /// assert_eq!(expected, matches); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Now let's look at the same example, but with UTF-8 mode on the + /// original NFA disabled (which results in disabling UTF-8 mode on the + /// DFA): + /// + /// ``` + /// use regex_automata::{ + /// dfa::{dense::DFA, Automaton}, + /// nfa::thompson, + /// HalfMatch, Input, MatchKind, + /// }; + /// + /// let dfa = DFA::builder() + /// .thompson(thompson::Config::new().reverse(true).utf8(false)) + /// .build(r"")?; + /// + /// // Run the reverse DFA to collect all matches. + /// let mut input = Input::new("☃"); + /// let mut matches = vec![]; + /// loop { + /// match dfa.try_search_rev(&input)? 
{ + /// None => break, + /// Some(hm) => { + /// matches.push(hm); + /// if hm.offset() == 0 || input.end() == 0 { + /// break; + /// } else if hm.offset() < input.end() { + /// input.set_end(hm.offset()); + /// } else { + /// // This is only necessary to handle zero-width + /// // matches, which of course occur in this example. + /// // Without this, the search would never advance + /// // backwards beyond the initial match. + /// input.set_end(input.end() - 1); + /// } + /// } + /// } + /// } + /// + /// // With UTF-8 mode disabled, matches that split a codepoint are reported. + /// let expected = vec![ + /// HalfMatch::must(0, 3), + /// HalfMatch::must(0, 2), + /// HalfMatch::must(0, 1), + /// HalfMatch::must(0, 0), + /// ]; + /// assert_eq!(expected, matches); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` #[inline] - fn find_leftmost_rev( + fn try_search_rev( &self, - bytes: &[u8], + input: &Input<'_>, ) -> Result<Option<HalfMatch>, MatchError> { - self.find_leftmost_rev_at(None, bytes, 0, bytes.len()) + let utf8empty = self.has_empty() && self.is_utf8(); + let hm = match search::find_rev(self, input)? { + None => return Ok(None), + Some(hm) if !utf8empty => return Ok(Some(hm)), + Some(hm) => hm, + }; + empty::skip_splits_rev(input, hm, hm.offset(), |input| { + let got = search::find_rev(self, input)?; + Ok(got.map(|hm| (hm, hm.offset()))) + }) } - /// Executes an overlapping forward search and returns the end position of - /// matches as they are found. If no match exists, then `None` is returned. + /// Executes an overlapping forward search. A match, if one exists, can be + /// obtained via the [`OverlappingState::get_match`] method. /// /// This routine is principally only useful when searching for multiple /// patterns on inputs where multiple patterns may match the same regions @@ -1160,14 +1460,30 @@ pub unsafe trait Automaton { /// state from prior calls so that the implementation knows where the last /// match occurred. 
/// + /// When using this routine to implement an iterator of overlapping + /// matches, the `start` of the search should always be set to the end + /// of the last match. If more patterns match at the previous location, + /// then they will be immediately returned. (This is tracked by the given + /// overlapping state.) Otherwise, the search continues at the starting + /// position given. + /// + /// If for some reason you want the search to forget about its previous + /// state and restart the search at a particular position, then setting the + /// state to [`OverlappingState::start`] will accomplish that. + /// /// # Errors /// - /// This routine only errors if the search could not complete. For - /// DFAs generated by this crate, this only occurs in a non-default - /// configuration where quit bytes are used or Unicode word boundaries are - /// heuristically enabled. + /// This routine errors if the search could not complete. This can occur + /// in a number of circumstances: + /// + /// * The configuration of the DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the DFA quitting. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. /// - /// When a search cannot complete, callers cannot know whether a match + /// When a search returns an error, callers cannot know whether a match /// exists or not. /// /// # Example @@ -1187,21 +1503,21 @@ pub unsafe trait Automaton { /// to find totally new matches (potentially of other patterns). 
/// /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{ /// dfa::{Automaton, OverlappingState, dense}, - /// HalfMatch, - /// MatchKind, + /// HalfMatch, Input, MatchKind, /// }; /// /// let dfa = dense::Builder::new() /// .configure(dense::Config::new().match_kind(MatchKind::All)) - /// .build_many(&[r"\w+$", r"\S+$"])?; - /// let haystack = "@foo".as_bytes(); + /// .build_many(&[r"[[:word:]]+$", r"[[:^space:]]+$"])?; + /// let haystack = "@foo"; /// let mut state = OverlappingState::start(); /// /// let expected = Some(HalfMatch::must(1, 4)); - /// let got = dfa.find_overlapping_fwd(haystack, &mut state)?; - /// assert_eq!(expected, got); + /// dfa.try_search_overlapping_fwd(&Input::new(haystack), &mut state)?; + /// assert_eq!(expected, state.get_match()); /// /// // The first pattern also matches at the same position, so re-running /// // the search will yield another match. Notice also that the first @@ -1209,394 +1525,260 @@ pub unsafe trait Automaton { /// // pattern begins its match before the first, is therefore an earlier /// // match and is thus reported first. 
/// let expected = Some(HalfMatch::must(0, 4)); - /// let got = dfa.find_overlapping_fwd(haystack, &mut state)?; - /// assert_eq!(expected, got); + /// dfa.try_search_overlapping_fwd(&Input::new(haystack), &mut state)?; + /// assert_eq!(expected, state.get_match()); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` #[inline] - fn find_overlapping_fwd( + fn try_search_overlapping_fwd( &self, - bytes: &[u8], + input: &Input<'_>, state: &mut OverlappingState, - ) -> Result<Option<HalfMatch>, MatchError> { - self.find_overlapping_fwd_at(None, None, bytes, 0, bytes.len(), state) + ) -> Result<(), MatchError> { + let utf8empty = self.has_empty() && self.is_utf8(); + search::find_overlapping_fwd(self, input, state)?; + match state.get_match() { + None => Ok(()), + Some(_) if !utf8empty => Ok(()), + Some(_) => skip_empty_utf8_splits_overlapping( + input, + state, + |input, state| { + search::find_overlapping_fwd(self, input, state) + }, + ), + } } - /// Executes a forward search and returns the end position of the first - /// match that is found as early as possible. If no match exists, then - /// `None` is returned. - /// - /// This routine stops scanning input as soon as the search observes a - /// match state. This is useful for implementing boolean `is_match`-like - /// routines, where as little work is done as possible. - /// - /// This is like [`Automaton::find_earliest_fwd`], except it provides some - /// additional control over how the search is executed: - /// - /// * `pre` is a prefilter scanner that, when given, is used whenever the - /// DFA enters its starting state. This is meant to speed up searches where - /// one or a small number of literal prefixes are known. - /// * `pattern_id` specifies a specific pattern in the DFA to run an - /// anchored search for. If not given, then a search for any pattern is - /// performed. 
For DFAs built by this crate, - /// [`dense::Config::starts_for_each_pattern`](crate::dfa::dense::Config::starts_for_each_pattern) - /// must be enabled to use this functionality. - /// * `start` and `end` permit searching a specific region of the haystack - /// `bytes`. This is useful when implementing an iterator over matches - /// within the same haystack, which cannot be done correctly by simply - /// providing a subslice of `bytes`. (Because the existence of look-around - /// operations such as `\b`, `^` and `$` need to take the surrounding - /// context into account. This cannot be done if the haystack doesn't - /// contain it.) - /// - /// The examples below demonstrate each of these additional parameters. + /// Executes a reverse overlapping forward search. Matches, if one exists, + /// can be obtained via the [`OverlappingState::get_match`] method. /// - /// # Errors + /// When using this routine to implement an iterator of overlapping + /// matches, the `start` of the search should remain invariant throughout + /// iteration. The `OverlappingState` given to the search will keep track + /// of the current position of the search. (This is because multiple + /// matches may be reported at the same position, so only the search + /// implementation itself knows when to advance the position.) /// - /// This routine only errors if the search could not complete. For - /// DFAs generated by this crate, this only occurs in a non-default - /// configuration where quit bytes are used or Unicode word boundaries are - /// heuristically enabled. + /// If for some reason you want the search to forget about its previous + /// state and restart the search at a particular position, then setting the + /// state to [`OverlappingState::start`] will accomplish that. /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. + /// # Errors /// - /// # Panics + /// This routine errors if the search could not complete. 
This can occur + /// in a number of circumstances: /// - /// This routine must panic if a `pattern_id` is given and the underlying - /// DFA does not support specific pattern searches. + /// * The configuration of the DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the DFA quitting. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. /// - /// It must also panic if the given haystack range is not valid. + /// When a search returns an error, callers cannot know whether a match + /// exists or not. + /// + /// # Example: UTF-8 mode /// - /// # Example: prefilter + /// This examples demonstrates that UTF-8 mode applies to reverse + /// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all + /// matches reported must correspond to valid UTF-8 spans. This includes + /// prohibiting zero-width matches that split a codepoint. /// - /// This example shows how to provide a prefilter for a pattern where all - /// matches start with a `z` byte. + /// UTF-8 mode is enabled by default. Notice below how the only zero-width + /// matches reported are those at UTF-8 boundaries: /// /// ``` /// use regex_automata::{ - /// dfa::{Automaton, dense}, - /// util::prefilter::{Candidate, Prefilter, Scanner, State}, - /// HalfMatch, + /// dfa::{dense::DFA, Automaton, OverlappingState}, + /// nfa::thompson, + /// HalfMatch, Input, MatchKind, /// }; /// - /// #[derive(Debug)] - /// pub struct ZPrefilter; - /// - /// impl Prefilter for ZPrefilter { - /// fn next_candidate( - /// &self, - /// _: &mut State, - /// haystack: &[u8], - /// at: usize, - /// ) -> Candidate { - /// // Try changing b'z' to b'q' and observe this test fail since - /// // the prefilter will skip right over the match. 
- /// match haystack.iter().position(|&b| b == b'z') { - /// None => Candidate::None, - /// Some(i) => Candidate::PossibleStartOfMatch(at + i), - /// } - /// } + /// let dfa = DFA::builder() + /// .configure(DFA::config().match_kind(MatchKind::All)) + /// .thompson(thompson::Config::new().reverse(true)) + /// .build_many(&[r"", r"☃"])?; /// - /// fn heap_bytes(&self) -> usize { - /// 0 + /// // Run the reverse DFA to collect all matches. + /// let input = Input::new("☃"); + /// let mut state = OverlappingState::start(); + /// let mut matches = vec![]; + /// loop { + /// dfa.try_search_overlapping_rev(&input, &mut state)?; + /// match state.get_match() { + /// None => break, + /// Some(hm) => matches.push(hm), /// } /// } /// - /// let dfa = dense::DFA::new("z[0-9]{3}")?; - /// let haystack = "foobar z123 q123".as_bytes(); - /// // A scanner executes a prefilter while tracking some state that helps - /// // determine whether a prefilter is still "effective" or not. - /// let mut scanner = Scanner::new(&ZPrefilter); - /// - /// let expected = Some(HalfMatch::must(0, 11)); - /// let got = dfa.find_earliest_fwd_at( - /// Some(&mut scanner), - /// None, - /// haystack, - /// 0, - /// haystack.len(), - /// )?; - /// assert_eq!(expected, got); + /// // No matches split a codepoint. + /// let expected = vec![ + /// HalfMatch::must(0, 3), + /// HalfMatch::must(1, 0), + /// HalfMatch::must(0, 0), + /// ]; + /// assert_eq!(expected, matches); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` /// - /// # Example: specific pattern search - /// - /// This example shows how to build a multi-DFA that permits searching for - /// specific patterns. 
+ /// Now let's look at the same example, but with UTF-8 mode on the + /// original NFA disabled (which results in disabling UTF-8 mode on the + /// DFA): /// /// ``` /// use regex_automata::{ - /// dfa::{Automaton, dense}, - /// HalfMatch, - /// PatternID, - /// }; - /// - /// let dfa = dense::Builder::new() - /// .configure(dense::Config::new().starts_for_each_pattern(true)) - /// .build_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?; - /// let haystack = "foo123".as_bytes(); - /// - /// // Since we are using the default leftmost-first match and both - /// // patterns match at the same starting position, only the first pattern - /// // will be returned in this case when doing a search for any of the - /// // patterns. - /// let expected = Some(HalfMatch::must(0, 6)); - /// let got = dfa.find_earliest_fwd_at( - /// None, - /// None, - /// haystack, - /// 0, - /// haystack.len(), - /// )?; - /// assert_eq!(expected, got); - /// - /// // But if we want to check whether some other pattern matches, then we - /// // can provide its pattern ID. - /// let expected = Some(HalfMatch::must(1, 6)); - /// let got = dfa.find_earliest_fwd_at( - /// None, - /// Some(PatternID::must(1)), - /// haystack, - /// 0, - /// haystack.len(), - /// )?; - /// assert_eq!(expected, got); - /// - /// # Ok::<(), Box<dyn std::error::Error>>(()) - /// ``` - /// - /// # Example: specifying the bounds of a search - /// - /// This example shows how providing the bounds of a search can produce - /// different results than simply sub-slicing the haystack. - /// - /// ``` - /// use regex_automata::{ - /// dfa::{Automaton, dense}, - /// HalfMatch, + /// dfa::{dense::DFA, Automaton, OverlappingState}, + /// nfa::thompson, + /// HalfMatch, Input, MatchKind, /// }; /// - /// // N.B. We disable Unicode here so that we use a simple ASCII word - /// // boundary. Alternatively, we could enable heuristic support for - /// // Unicode word boundaries. 
- /// let dfa = dense::DFA::new(r"(?-u)\b[0-9]{3}\b")?; - /// let haystack = "foo123bar".as_bytes(); + /// let dfa = DFA::builder() + /// .configure(DFA::config().match_kind(MatchKind::All)) + /// .thompson(thompson::Config::new().reverse(true).utf8(false)) + /// .build_many(&[r"", r"☃"])?; /// - /// // Since we sub-slice the haystack, the search doesn't know about the - /// // larger context and assumes that `123` is surrounded by word - /// // boundaries. And of course, the match position is reported relative - /// // to the sub-slice as well, which means we get `3` instead of `6`. - /// let expected = Some(HalfMatch::must(0, 3)); - /// let got = dfa.find_earliest_fwd_at( - /// None, - /// None, - /// &haystack[3..6], - /// 0, - /// haystack[3..6].len(), - /// )?; - /// assert_eq!(expected, got); + /// // Run the reverse DFA to collect all matches. + /// let input = Input::new("☃"); + /// let mut state = OverlappingState::start(); + /// let mut matches = vec![]; + /// loop { + /// dfa.try_search_overlapping_rev(&input, &mut state)?; + /// match state.get_match() { + /// None => break, + /// Some(hm) => matches.push(hm), + /// } + /// } /// - /// // But if we provide the bounds of the search within the context of the - /// // entire haystack, then the search can take the surrounding context - /// // into account. (And if we did find a match, it would be reported - /// // as a valid offset into `haystack` instead of its sub-slice.) - /// let expected = None; - /// let got = dfa.find_earliest_fwd_at( - /// None, - /// None, - /// haystack, - /// 3, - /// 6, - /// )?; - /// assert_eq!(expected, got); + /// // Now *all* positions match, even within a codepoint, + /// // because we lifted the requirement that matches + /// // correspond to valid UTF-8 spans. 
+ /// let expected = vec![ + /// HalfMatch::must(0, 3), + /// HalfMatch::must(0, 2), + /// HalfMatch::must(0, 1), + /// HalfMatch::must(1, 0), + /// HalfMatch::must(0, 0), + /// ]; + /// assert_eq!(expected, matches); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` #[inline] - fn find_earliest_fwd_at( + fn try_search_overlapping_rev( &self, - pre: Option<&mut prefilter::Scanner>, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, - ) -> Result<Option<HalfMatch>, MatchError> { - search::find_earliest_fwd(pre, self, pattern_id, bytes, start, end) + input: &Input<'_>, + state: &mut OverlappingState, + ) -> Result<(), MatchError> { + let utf8empty = self.has_empty() && self.is_utf8(); + search::find_overlapping_rev(self, input, state)?; + match state.get_match() { + None => Ok(()), + Some(_) if !utf8empty => Ok(()), + Some(_) => skip_empty_utf8_splits_overlapping( + input, + state, + |input, state| { + search::find_overlapping_rev(self, input, state) + }, + ), + } } - /// Executes a reverse search and returns the start position of the first - /// match that is found as early as possible. If no match exists, then - /// `None` is returned. - /// - /// This routine stops scanning input as soon as the search observes a - /// match state. - /// - /// This is like [`Automaton::find_earliest_rev`], except it provides some - /// additional control over how the search is executed. See the - /// documentation of [`Automaton::find_earliest_fwd_at`] for more details - /// on the additional parameters along with examples of their usage. + /// Writes the set of patterns that match anywhere in the given search + /// configuration to `patset`. If multiple patterns match at the same + /// position and the underlying DFA supports overlapping matches, then all + /// matching patterns are written to the given set. /// - /// # Errors - /// - /// This routine only errors if the search could not complete. 
For - /// DFAs generated by this crate, this only occurs in a non-default - /// configuration where quit bytes are used or Unicode word boundaries are - /// heuristically enabled. + /// Unless all of the patterns in this DFA are anchored, then generally + /// speaking, this will visit every byte in the haystack. /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. + /// This search routine *does not* clear the pattern set. This gives some + /// flexibility to the caller (e.g., running multiple searches with the + /// same pattern set), but does make the API bug-prone if you're reusing + /// the same pattern set for multiple searches but intended them to be + /// independent. /// - /// # Panics - /// - /// This routine must panic if a `pattern_id` is given and the underlying - /// DFA does not support specific pattern searches. - /// - /// It must also panic if the given haystack range is not valid. - #[inline] - fn find_earliest_rev_at( - &self, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, - ) -> Result<Option<HalfMatch>, MatchError> { - search::find_earliest_rev(self, pattern_id, bytes, start, end) - } - - /// Executes a forward search and returns the end position of the leftmost - /// match that is found. If no match exists, then `None` is returned. - /// - /// This is like [`Automaton::find_leftmost_fwd`], except it provides some - /// additional control over how the search is executed. See the - /// documentation of [`Automaton::find_earliest_fwd_at`] for more details - /// on the additional parameters along with examples of their usage. + /// If a pattern ID matched but the given `PatternSet` does not have + /// sufficient capacity to store it, then it is not inserted and silently + /// dropped. /// /// # Errors /// - /// This routine only errors if the search could not complete. 
For - /// DFAs generated by this crate, this only occurs in a non-default - /// configuration where quit bytes are used or Unicode word boundaries are - /// heuristically enabled. - /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// # Panics - /// - /// This routine must panic if a `pattern_id` is given and the underlying - /// DFA does not support specific pattern searches. - /// - /// It must also panic if the given haystack range is not valid. - #[inline] - fn find_leftmost_fwd_at( - &self, - pre: Option<&mut prefilter::Scanner>, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, - ) -> Result<Option<HalfMatch>, MatchError> { - search::find_leftmost_fwd(pre, self, pattern_id, bytes, start, end) - } - - /// Executes a reverse search and returns the start of the position of the - /// leftmost match that is found. If no match exists, then `None` is - /// returned. - /// - /// This is like [`Automaton::find_leftmost_rev`], except it provides some - /// additional control over how the search is executed. See the - /// documentation of [`Automaton::find_earliest_fwd_at`] for more details - /// on the additional parameters along with examples of their usage. - /// - /// # Errors + /// This routine errors if the search could not complete. This can occur + /// in a number of circumstances: /// - /// This routine only errors if the search could not complete. For - /// DFAs generated by this crate, this only occurs in a non-default - /// configuration where quit bytes are used or Unicode word boundaries are - /// heuristically enabled. + /// * The configuration of the DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the DFA quitting. + /// * When the provided `Input` configuration is not supported. 
For + /// example, by providing an unsupported anchor mode. /// - /// When a search cannot complete, callers cannot know whether a match + /// When a search returns an error, callers cannot know whether a match /// exists or not. /// - /// # Panics - /// - /// This routine must panic if a `pattern_id` is given and the underlying - /// DFA does not support specific pattern searches. - /// - /// It must also panic if the given haystack range is not valid. - #[inline] - fn find_leftmost_rev_at( - &self, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, - ) -> Result<Option<HalfMatch>, MatchError> { - search::find_leftmost_rev(self, pattern_id, bytes, start, end) - } - - /// Executes an overlapping forward search and returns the end position of - /// matches as they are found. If no match exists, then `None` is returned. - /// - /// This routine is principally only useful when searching for multiple - /// patterns on inputs where multiple patterns may match the same regions - /// of text. In particular, callers must preserve the automaton's search - /// state from prior calls so that the implementation knows where the last - /// match occurred. - /// - /// This is like [`Automaton::find_overlapping_fwd`], except it provides - /// some additional control over how the search is executed. See the - /// documentation of [`Automaton::find_earliest_fwd_at`] for more details - /// on the additional parameters along with examples of their usage. - /// - /// When using this routine to implement an iterator of overlapping - /// matches, the `start` of the search should always be set to the end - /// of the last match. If more patterns match at the previous location, - /// then they will be immediately returned. (This is tracked by the given - /// overlapping state.) Otherwise, the search continues at the starting - /// position given. 
- /// - /// If for some reason you want the search to forget about its previous - /// state and restart the search at a particular position, then setting the - /// state to [`OverlappingState::start`] will accomplish that. - /// - /// # Errors - /// - /// This routine only errors if the search could not complete. For - /// DFAs generated by this crate, this only occurs in a non-default - /// configuration where quit bytes are used or Unicode word boundaries are - /// heuristically enabled. + /// # Example /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. + /// This example shows how to find all matching patterns in a haystack, + /// even when some patterns match at the same position as other patterns. /// - /// # Panics + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{ + /// dfa::{Automaton, dense::DFA}, + /// Input, MatchKind, PatternSet, + /// }; /// - /// This routine must panic if a `pattern_id` is given and the underlying - /// DFA does not support specific pattern searches. + /// let patterns = &[ + /// r"[[:word:]]+", + /// r"[0-9]+", + /// r"[[:alpha:]]+", + /// r"foo", + /// r"bar", + /// r"barfoo", + /// r"foobar", + /// ]; + /// let dfa = DFA::builder() + /// .configure(DFA::config().match_kind(MatchKind::All)) + /// .build_many(patterns)?; + /// + /// let input = Input::new("foobar"); + /// let mut patset = PatternSet::new(dfa.pattern_len()); + /// dfa.try_which_overlapping_matches(&input, &mut patset)?; + /// let expected = vec![0, 2, 3, 4, 6]; + /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect(); + /// assert_eq!(expected, got); /// - /// It must also panic if the given haystack range is not valid. 
+ /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "alloc")] #[inline] - fn find_overlapping_fwd_at( + fn try_which_overlapping_matches( &self, - pre: Option<&mut prefilter::Scanner>, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, - state: &mut OverlappingState, - ) -> Result<Option<HalfMatch>, MatchError> { - search::find_overlapping_fwd( - pre, self, pattern_id, bytes, start, end, state, - ) + input: &Input<'_>, + patset: &mut PatternSet, + ) -> Result<(), MatchError> { + let mut state = OverlappingState::start(); + while let Some(m) = { + self.try_search_overlapping_fwd(input, &mut state)?; + state.get_match() + } { + let _ = patset.insert(m.pattern()); + // There's nothing left to find, so we can stop. Or the caller + // asked us to. + if patset.is_full() || input.get_earliest() { + break; + } + } + Ok(()) } } -unsafe impl<'a, T: Automaton> Automaton for &'a T { +unsafe impl<'a, A: Automaton + ?Sized> Automaton for &'a A { #[inline] fn next_state(&self, current: StateID, input: u8) -> StateID { (**self).next_state(current, input) @@ -1619,23 +1801,22 @@ unsafe impl<'a, T: Automaton> Automaton for &'a T { #[inline] fn start_state_forward( &self, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, - ) -> StateID { - (**self).start_state_forward(pattern_id, bytes, start, end) + input: &Input<'_>, + ) -> Result<StateID, MatchError> { + (**self).start_state_forward(input) } #[inline] fn start_state_reverse( &self, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, - ) -> StateID { - (**self).start_state_reverse(pattern_id, bytes, start, end) + input: &Input<'_>, + ) -> Result<StateID, MatchError> { + (**self).start_state_reverse(input) + } + + #[inline] + fn universal_start_state(&self, mode: Anchored) -> Option<StateID> { + (**self).universal_start_state(mode) } #[inline] @@ -1669,13 +1850,13 @@ unsafe impl<'a, T: Automaton> Automaton for &'a T { } 
#[inline] - fn pattern_count(&self) -> usize { - (**self).pattern_count() + fn pattern_len(&self) -> usize { + (**self).pattern_len() } #[inline] - fn match_count(&self, id: StateID) -> usize { - (**self).match_count(id) + fn match_len(&self, id: StateID) -> usize { + (**self).match_len(id) } #[inline] @@ -1684,109 +1865,72 @@ unsafe impl<'a, T: Automaton> Automaton for &'a T { } #[inline] - fn accelerator(&self, id: StateID) -> &[u8] { - (**self).accelerator(id) - } - - #[inline] - fn find_earliest_fwd( - &self, - bytes: &[u8], - ) -> Result<Option<HalfMatch>, MatchError> { - (**self).find_earliest_fwd(bytes) + fn has_empty(&self) -> bool { + (**self).has_empty() } #[inline] - fn find_earliest_rev( - &self, - bytes: &[u8], - ) -> Result<Option<HalfMatch>, MatchError> { - (**self).find_earliest_rev(bytes) + fn is_utf8(&self) -> bool { + (**self).is_utf8() } #[inline] - fn find_leftmost_fwd( - &self, - bytes: &[u8], - ) -> Result<Option<HalfMatch>, MatchError> { - (**self).find_leftmost_fwd(bytes) + fn is_always_start_anchored(&self) -> bool { + (**self).is_always_start_anchored() } #[inline] - fn find_leftmost_rev( - &self, - bytes: &[u8], - ) -> Result<Option<HalfMatch>, MatchError> { - (**self).find_leftmost_rev(bytes) + fn accelerator(&self, id: StateID) -> &[u8] { + (**self).accelerator(id) } #[inline] - fn find_overlapping_fwd( - &self, - bytes: &[u8], - state: &mut OverlappingState, - ) -> Result<Option<HalfMatch>, MatchError> { - (**self).find_overlapping_fwd(bytes, state) + fn get_prefilter(&self) -> Option<&Prefilter> { + (**self).get_prefilter() } #[inline] - fn find_earliest_fwd_at( + fn try_search_fwd( &self, - pre: Option<&mut prefilter::Scanner>, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, + input: &Input<'_>, ) -> Result<Option<HalfMatch>, MatchError> { - (**self).find_earliest_fwd_at(pre, pattern_id, bytes, start, end) + (**self).try_search_fwd(input) } #[inline] - fn find_earliest_rev_at( + fn try_search_rev( 
&self, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, + input: &Input<'_>, ) -> Result<Option<HalfMatch>, MatchError> { - (**self).find_earliest_rev_at(pattern_id, bytes, start, end) + (**self).try_search_rev(input) } #[inline] - fn find_leftmost_fwd_at( + fn try_search_overlapping_fwd( &self, - pre: Option<&mut prefilter::Scanner>, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, - ) -> Result<Option<HalfMatch>, MatchError> { - (**self).find_leftmost_fwd_at(pre, pattern_id, bytes, start, end) + input: &Input<'_>, + state: &mut OverlappingState, + ) -> Result<(), MatchError> { + (**self).try_search_overlapping_fwd(input, state) } #[inline] - fn find_leftmost_rev_at( + fn try_search_overlapping_rev( &self, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, - ) -> Result<Option<HalfMatch>, MatchError> { - (**self).find_leftmost_rev_at(pattern_id, bytes, start, end) + input: &Input<'_>, + state: &mut OverlappingState, + ) -> Result<(), MatchError> { + (**self).try_search_overlapping_rev(input, state) } + #[cfg(feature = "alloc")] #[inline] - fn find_overlapping_fwd_at( + fn try_which_overlapping_matches( &self, - pre: Option<&mut prefilter::Scanner>, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, - state: &mut OverlappingState, - ) -> Result<Option<HalfMatch>, MatchError> { - (**self) - .find_overlapping_fwd_at(pre, pattern_id, bytes, start, end, state) + input: &Input<'_>, + patset: &mut PatternSet, + ) -> Result<(), MatchError> { + (**self).try_which_overlapping_matches(input, patset) } } @@ -1799,15 +1943,21 @@ unsafe impl<'a, T: Automaton> Automaton for &'a T { /// the search at the next position. Additionally, it also tracks which state /// the last search call terminated in. /// -/// This type provides no introspection capabilities. 
The only thing a caller -/// can do is construct it and pass it around to permit search routines to use -/// it to track state. +/// This type provides little introspection capabilities. The only thing a +/// caller can do is construct it and pass it around to permit search routines +/// to use it to track state, and also ask whether a match has been found. /// /// Callers should always provide a fresh state constructed via /// [`OverlappingState::start`] when starting a new search. Reusing state from /// a previous search may result in incorrect results. #[derive(Clone, Debug, Eq, PartialEq)] pub struct OverlappingState { + /// The match reported by the most recent overlapping search to use this + /// state. + /// + /// If a search does not find any matches, then it is expected to clear + /// this value. + pub(crate) mat: Option<HalfMatch>, /// The state ID of the state at which the search was in when the call /// terminated. When this is a match state, `last_match` must be set to a /// non-None value. @@ -1816,50 +1966,96 @@ pub struct OverlappingState { /// automaton. We cannot use the actual ID, since any one automaton may /// have many start states, and which one is in use depends on several /// search-time factors. - id: Option<StateID>, - /// Information associated with a match when `id` corresponds to a match - /// state. - last_match: Option<StateMatch>, -} - -/// Internal state about the last match that occurred. This records both the -/// offset of the match and the match index. -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub(crate) struct StateMatch { - /// The index into the matching patterns for the current match state. - pub(crate) match_index: usize, - /// The offset in the haystack at which the match occurred. This is used - /// when reporting multiple matches at the same offset. 
That is, when - /// an overlapping search runs, the first thing it checks is whether it's - /// already in a match state, and if so, whether there are more patterns - /// to report as matches in that state. If so, it increments `match_index` - /// and returns the pattern and this offset. Once `match_index` exceeds the - /// number of matching patterns in the current state, the search continues. - pub(crate) offset: usize, + pub(crate) id: Option<StateID>, + /// The position of the search. + /// + /// When `id` is None (i.e., we are starting a search), this is set to + /// the beginning of the search as given by the caller regardless of its + /// current value. Subsequent calls to an overlapping search pick up at + /// this offset. + pub(crate) at: usize, + /// The index into the matching patterns of the next match to report if the + /// current state is a match state. Note that this may be 1 greater than + /// the total number of matches to report for the current match state. (In + /// which case, no more matches should be reported at the current position + /// and the search should advance to the next position.) + pub(crate) next_match_index: Option<usize>, + /// This is set to true when a reverse overlapping search has entered its + /// EOI transitions. + /// + /// This isn't used in a forward search because it knows to stop once the + /// position exceeds the end of the search range. In a reverse search, + /// since we use unsigned offsets, we don't "know" once we've gone past + /// `0`. So the only way to detect it is with this extra flag. The reverse + /// overlapping search knows to terminate specifically after it has + /// reported all matches after following the EOI transition. + pub(crate) rev_eoi: bool, } impl OverlappingState { /// Create a new overlapping state that begins at the start state of any /// automaton. 
pub fn start() -> OverlappingState { - OverlappingState { id: None, last_match: None } + OverlappingState { + mat: None, + id: None, + at: 0, + next_match_index: None, + rev_eoi: false, + } } - pub(crate) fn id(&self) -> Option<StateID> { - self.id + /// Return the match result of the most recent search to execute with this + /// state. + /// + /// A search will clear this result automatically, such that if no + /// match is found, this will correctly report `None`. + pub fn get_match(&self) -> Option<HalfMatch> { + self.mat } +} - pub(crate) fn set_id(&mut self, id: StateID) { - self.id = Some(id); - } +/// Runs the given overlapping `search` function (forwards or backwards) until +/// a match is found whose offset does not split a codepoint. +/// +/// This is *not* always correct to call. It should only be called when the DFA +/// has UTF-8 mode enabled *and* it can produce zero-width matches. Calling +/// this when both of those things aren't true might result in legitimate +/// matches getting skipped. +#[cold] +#[inline(never)] +fn skip_empty_utf8_splits_overlapping<F>( + input: &Input<'_>, + state: &mut OverlappingState, + mut search: F, +) -> Result<(), MatchError> +where + F: FnMut(&Input<'_>, &mut OverlappingState) -> Result<(), MatchError>, +{ + // Note that this routine works for forwards and reverse searches + // even though there's no code here to handle those cases. That's + // because overlapping searches drive themselves to completion via + // `OverlappingState`. So all we have to do is push it until no matches are + // found.
- pub(crate) fn last_match(&mut self) -> Option<&mut StateMatch> { - self.last_match.as_mut() + let mut hm = match state.get_match() { + None => return Ok(()), + Some(hm) => hm, + }; + if input.get_anchored().is_anchored() { + if !input.is_char_boundary(hm.offset()) { + state.mat = None; + } + return Ok(()); } - - pub(crate) fn set_last_match(&mut self, last_match: StateMatch) { - self.last_match = Some(last_match); + while !input.is_char_boundary(hm.offset()) { + search(input, state)?; + hm = match state.get_match() { + None => return Ok(()), + Some(hm) => hm, + }; } + Ok(()) } /// Write a prefix "state" indicator for fmt::Debug impls. @@ -1901,3 +2097,24 @@ pub(crate) fn fmt_state_indicator<A: Automaton>( } Ok(()) } + +#[cfg(all(test, feature = "syntax", feature = "dfa-build"))] +mod tests { + // A basic test ensuring that our Automaton trait is object safe. (This is + // the main reason why we don't define the search routines as generic over + // Into<Input>.) + #[test] + fn object_safe() { + use crate::{ + dfa::{dense, Automaton}, + HalfMatch, Input, + }; + + let dfa = dense::DFA::new("abc").unwrap(); + let dfa: &dyn Automaton = &dfa; + assert_eq!( + Ok(Some(HalfMatch::must(0, 6))), + dfa.try_search_fwd(&Input::new(b"xyzabcxyz")), + ); + } +} diff --git a/vendor/regex-automata/src/dfa/dense.rs b/vendor/regex-automata/src/dfa/dense.rs index 07c135098..6da865f97 100644 --- a/vendor/regex-automata/src/dfa/dense.rs +++ b/vendor/regex-automata/src/dfa/dense.rs @@ -4,41 +4,45 @@ Types and routines specific to dense DFAs. This module is the home of [`dense::DFA`](DFA). This module also contains a [`dense::Builder`](Builder) and a -[`dense::Config`](Config) for configuring and building a dense DFA. +[`dense::Config`](Config) for building and configuring a dense DFA. 
*/ -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] use core::cmp; use core::{convert::TryFrom, fmt, iter, mem::size_of, slice}; -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] use alloc::{ collections::{BTreeMap, BTreeSet}, vec, vec::Vec, }; -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] use crate::{ dfa::{ - accel::Accel, determinize, error::Error, minimize::Minimizer, sparse, + accel::Accel, determinize, minimize::Minimizer, remapper::Remapper, + sparse, }, nfa::thompson, - util::alphabet::ByteSet, - MatchKind, + util::{look::LookMatcher, search::MatchKind}, }; use crate::{ dfa::{ accel::Accels, automaton::{fmt_state_indicator, Automaton}, special::Special, + start::StartKind, DEAD, }, util::{ - alphabet::{self, ByteClasses}, - bytes::{self, DeserializeError, Endian, SerializeError}, - id::{PatternID, StateID}, - start::Start, + alphabet::{self, ByteClasses, ByteSet}, + int::{Pointer, Usize}, + prefilter::Prefilter, + primitives::{PatternID, StateID}, + search::{Anchored, Input, MatchError}, + start::{Start, StartByteMap}, + wire::{self, DeserializeError, Endian, SerializeError}, }, }; @@ -53,17 +57,19 @@ const VERSION: u32 = 2; /// The configuration used for compiling a dense DFA. /// +/// As a convenience, [`DFA::config`] is an alias for [`Config::new`]. The +/// advantage of the former is that it often lets you avoid importing the +/// `Config` type directly. +/// /// A dense DFA configuration is a simple data object that is typically used /// with [`dense::Builder::configure`](self::Builder::configure). /// -/// The default configuration guarantees that a search will _never_ return a -/// [`MatchError`](crate::MatchError) for any haystack or pattern. Setting a -/// quit byte with [`Config::quit`] or enabling heuristic support for Unicode -/// word boundaries with [`Config::unicode_word_boundary`] can in turn cause a -/// search to return an error. 
See the corresponding configuration options for -/// more details on when those error conditions arise. -#[cfg(feature = "alloc")] -#[derive(Clone, Copy, Debug, Default)] +/// The default configuration guarantees that a search will never return +/// a "quit" error, although it is possible for a search to fail if +/// [`Config::starts_for_each_pattern`] wasn't enabled (which it is not by +/// default) and an [`Anchored::Pattern`] mode is requested via [`Input`]. +#[cfg(feature = "dfa-build")] +#[derive(Clone, Debug, Default)] pub struct Config { // As with other configuration types in this crate, we put all our knobs // in options so that we can distinguish between "default" and "not set." @@ -72,123 +78,27 @@ pub struct Config { // 'overwrite' method. // // For docs on the fields below, see the corresponding method setters. - anchored: Option<bool>, accelerate: Option<bool>, + pre: Option<Option<Prefilter>>, minimize: Option<bool>, match_kind: Option<MatchKind>, + start_kind: Option<StartKind>, starts_for_each_pattern: Option<bool>, byte_classes: Option<bool>, unicode_word_boundary: Option<bool>, - quit: Option<ByteSet>, + quitset: Option<ByteSet>, + specialize_start_states: Option<bool>, dfa_size_limit: Option<Option<usize>>, determinize_size_limit: Option<Option<usize>>, } -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] impl Config { /// Return a new default dense DFA compiler configuration. pub fn new() -> Config { Config::default() } - /// Set whether matching must be anchored at the beginning of the input. - /// - /// When enabled, a match must begin at the start of a search. When - /// disabled, the DFA will act as if the pattern started with a `(?s:.)*?`, - /// which enables a match to appear anywhere. - /// - /// Note that if you want to run both anchored and unanchored - /// searches without building multiple automatons, you can enable the - /// [`Config::starts_for_each_pattern`] configuration instead. 
This will - /// permit unanchored any-pattern searches and pattern-specific anchored - /// searches. See the documentation for that configuration for an example. - /// - /// By default this is disabled. - /// - /// **WARNING:** this is subtly different than using a `^` at the start of - /// your regex. A `^` forces a regex to match exclusively at the start of - /// input, regardless of where you begin your search. In contrast, enabling - /// this option will allow your regex to match anywhere in your input, - /// but the match must start at the beginning of a search. (Most of the - /// higher level convenience search routines make "start of input" and - /// "start of search" equivalent, but some routines allow treating these as - /// orthogonal.) - /// - /// For example, consider the haystack `aba` and the following searches: - /// - /// 1. The regex `^a` is compiled with `anchored=false` and searches - /// `aba` starting at position `2`. Since `^` requires the match to - /// start at the beginning of the input and `2 > 0`, no match is found. - /// 2. The regex `a` is compiled with `anchored=true` and searches `aba` - /// starting at position `2`. This reports a match at `[2, 3]` since - /// the match starts where the search started. Since there is no `^`, - /// there is no requirement for the match to start at the beginning of - /// the input. - /// 3. The regex `a` is compiled with `anchored=true` and searches `aba` - /// starting at position `1`. Since `b` corresponds to position `1` and - /// since the regex is anchored, it finds no match. - /// 4. The regex `a` is compiled with `anchored=false` and searches `aba` - /// startting at position `1`. Since the regex is neither anchored nor - /// starts with `^`, the regex is compiled with an implicit `(?s:.)*?` - /// prefix that permits it to match anywhere. Thus, it reports a match - /// at `[2, 3]`. 
- /// - /// # Example - /// - /// This demonstrates the differences between an anchored search and - /// a pattern that begins with `^` (as described in the above warning - /// message). - /// - /// ``` - /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch}; - /// - /// let haystack = "aba".as_bytes(); - /// - /// let dfa = dense::Builder::new() - /// .configure(dense::Config::new().anchored(false)) // default - /// .build(r"^a")?; - /// let got = dfa.find_leftmost_fwd_at(None, None, haystack, 2, 3)?; - /// // No match is found because 2 is not the beginning of the haystack, - /// // which is what ^ requires. - /// let expected = None; - /// assert_eq!(expected, got); - /// - /// let dfa = dense::Builder::new() - /// .configure(dense::Config::new().anchored(true)) - /// .build(r"a")?; - /// let got = dfa.find_leftmost_fwd_at(None, None, haystack, 2, 3)?; - /// // An anchored search can still match anywhere in the haystack, it just - /// // must begin at the start of the search which is '2' in this case. - /// let expected = Some(HalfMatch::must(0, 3)); - /// assert_eq!(expected, got); - /// - /// let dfa = dense::Builder::new() - /// .configure(dense::Config::new().anchored(true)) - /// .build(r"a")?; - /// let got = dfa.find_leftmost_fwd_at(None, None, haystack, 1, 3)?; - /// // No match is found since we start searching at offset 1 which - /// // corresponds to 'b'. Since there is no '(?s:.)*?' prefix, no match - /// // is found. - /// let expected = None; - /// assert_eq!(expected, got); - /// - /// let dfa = dense::Builder::new() - /// .configure(dense::Config::new().anchored(false)) // default - /// .build(r"a")?; - /// let got = dfa.find_leftmost_fwd_at(None, None, haystack, 1, 3)?; - /// // Since anchored=false, an implicit '(?s:.)*?' prefix was added to the - /// // pattern. Even though the search starts at 'b', the 'match anything' - /// // prefix allows the search to match 'a'. 
- /// let expected = Some(HalfMatch::must(0, 3)); - /// assert_eq!(expected, got); - /// - /// # Ok::<(), Box<dyn std::error::Error>>(()) - /// ``` - pub fn anchored(mut self, yes: bool) -> Config { - self.anchored = Some(yes); - self - } - /// Enable state acceleration. /// /// When enabled, DFA construction will analyze each state to determine @@ -212,6 +122,87 @@ impl Config { self } + /// Set a prefilter to be used whenever a start state is entered. + /// + /// A [`Prefilter`] in this context is meant to accelerate searches by + /// looking for literal prefixes that every match for the corresponding + /// pattern (or patterns) must start with. Once a prefilter produces a + /// match, the underlying search routine continues on to try and confirm + /// the match. + /// + /// Be warned that setting a prefilter does not guarantee that the search + /// will be faster. While it's usually a good bet, if the prefilter + /// produces a lot of false positive candidates (i.e., positions matched + /// by the prefilter but not by the regex), then the overall result can + /// be slower than if you had just executed the regex engine without any + /// prefilters. + /// + /// Note that unless [`Config::specialize_start_states`] has been + /// explicitly set, then setting this will also enable (when `pre` is + /// `Some`) or disable (when `pre` is `None`) start state specialization. + /// This occurs because without start state specialization, a prefilter + /// is likely to be less effective. And without a prefilter, start state + /// specialization is usually pointless. + /// + /// **WARNING:** Note that prefilters are not preserved as part of + /// serialization. Serializing a DFA will drop its prefilter. + /// + /// By default no prefilter is set. 
+ /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// dfa::{dense::DFA, Automaton}, + /// util::prefilter::Prefilter, + /// Input, HalfMatch, MatchKind, + /// }; + /// + /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]); + /// let re = DFA::builder() + /// .configure(DFA::config().prefilter(pre)) + /// .build(r"(foo|bar)[a-z]+")?; + /// let input = Input::new("foo1 barfox bar"); + /// assert_eq!( + /// Some(HalfMatch::must(0, 11)), + /// re.try_search_fwd(&input)?, + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Be warned though that an incorrect prefilter can lead to incorrect + /// results! + /// + /// ``` + /// use regex_automata::{ + /// dfa::{dense::DFA, Automaton}, + /// util::prefilter::Prefilter, + /// Input, HalfMatch, MatchKind, + /// }; + /// + /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]); + /// let re = DFA::builder() + /// .configure(DFA::config().prefilter(pre)) + /// .build(r"(foo|bar)[a-z]+")?; + /// let input = Input::new("foo1 barfox bar"); + /// assert_eq!( + /// // No match reported even though there clearly is one! + /// None, + /// re.try_search_fwd(&input)?, + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config { + self.pre = Some(pre); + if self.specialize_start_states.is_none() { + self.specialize_start_states = + Some(self.get_prefilter().is_some()); + } + self + } + /// Minimize the DFA. /// /// When enabled, the DFA built will be minimized such that it is as small @@ -283,20 +274,21 @@ impl Config { /// report overlapping matches. 
/// /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{ /// dfa::{Automaton, OverlappingState, dense}, - /// HalfMatch, MatchKind, + /// HalfMatch, Input, MatchKind, /// }; /// /// let dfa = dense::Builder::new() /// .configure(dense::Config::new().match_kind(MatchKind::All)) /// .build_many(&[r"\w+$", r"\S+$"])?; - /// let haystack = "@foo".as_bytes(); + /// let input = Input::new("@foo"); /// let mut state = OverlappingState::start(); /// /// let expected = Some(HalfMatch::must(1, 4)); - /// let got = dfa.find_overlapping_fwd(haystack, &mut state)?; - /// assert_eq!(expected, got); + /// dfa.try_search_overlapping_fwd(&input, &mut state)?; + /// assert_eq!(expected, state.get_match()); /// /// // The first pattern also matches at the same position, so re-running /// // the search will yield another match. Notice also that the first @@ -304,8 +296,8 @@ impl Config { /// // pattern begins its match before the first, is therefore an earlier /// // match and is thus reported first. /// let expected = Some(HalfMatch::must(0, 4)); - /// let got = dfa.find_overlapping_fwd(haystack, &mut state)?; - /// assert_eq!(expected, got); + /// dfa.try_search_overlapping_fwd(&input, &mut state)?; + /// assert_eq!(expected, state.get_match()); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` @@ -322,21 +314,31 @@ impl Config { /// you, so it's usually not necessary to do this yourself. 
/// /// ``` - /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, MatchKind}; + /// use regex_automata::{ + /// dfa::{dense, Automaton, StartKind}, + /// nfa::thompson::NFA, + /// Anchored, HalfMatch, Input, MatchKind, + /// }; /// /// let haystack = "123foobar456".as_bytes(); - /// let pattern = r"[a-z]+"; + /// let pattern = r"[a-z]+r"; /// /// let dfa_fwd = dense::DFA::new(pattern)?; /// let dfa_rev = dense::Builder::new() + /// .thompson(NFA::config().reverse(true)) /// .configure(dense::Config::new() - /// .anchored(true) + /// // This isn't strictly necessary since both anchored and + /// // unanchored searches are supported by default. But since + /// // finding the start-of-match only requires anchored searches, + /// // we can get rid of the unanchored configuration and possibly + /// // slim down our DFA considerably. + /// .start_kind(StartKind::Anchored) /// .match_kind(MatchKind::All) /// ) /// .build(pattern)?; /// let expected_fwd = HalfMatch::must(0, 9); /// let expected_rev = HalfMatch::must(0, 3); - /// let got_fwd = dfa_fwd.find_leftmost_fwd(haystack)?.unwrap(); + /// let got_fwd = dfa_fwd.try_search_fwd(&Input::new(haystack))?.unwrap(); /// // Here we don't specify the pattern to search for since there's only /// // one pattern and we're doing a leftmost search. But if this were an /// // overlapping search, you'd need to specify the pattern that matched @@ -344,9 +346,10 @@ impl Config { /// // starting position of a match of some other pattern.) That in turn /// // requires building the reverse automaton with starts_for_each_pattern /// // enabled. Indeed, this is what Regex does internally. 
- /// let got_rev = dfa_rev.find_leftmost_rev_at( - /// None, haystack, 0, got_fwd.offset(), - /// )?.unwrap(); + /// let input = Input::new(haystack) + /// .range(..got_fwd.offset()) + /// .anchored(Anchored::Yes); + /// let got_rev = dfa_rev.try_search_rev(&input)?.unwrap(); /// assert_eq!(expected_fwd, got_fwd); /// assert_eq!(expected_rev, got_rev); /// @@ -357,6 +360,45 @@ impl Config { self } + /// The type of starting state configuration to use for a DFA. + /// + /// By default, the starting state configuration is [`StartKind::Both`]. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// dfa::{dense::DFA, Automaton, StartKind}, + /// Anchored, HalfMatch, Input, + /// }; + /// + /// let haystack = "quux foo123"; + /// let expected = HalfMatch::must(0, 11); + /// + /// // By default, DFAs support both anchored and unanchored searches. + /// let dfa = DFA::new(r"[0-9]+")?; + /// let input = Input::new(haystack); + /// assert_eq!(Some(expected), dfa.try_search_fwd(&input)?); + /// + /// // But if we only need anchored searches, then we can build a DFA + /// // that only supports anchored searches. This leads to a smaller DFA + /// // (potentially significantly smaller in some cases), but a DFA that + /// // will panic if you try to use it with an unanchored search. + /// let dfa = DFA::builder() + /// .configure(DFA::config().start_kind(StartKind::Anchored)) + /// .build(r"[0-9]+")?; + /// let input = Input::new(haystack) + /// .range(8..) + /// .anchored(Anchored::Yes); + /// assert_eq!(Some(expected), dfa.try_search_fwd(&input)?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn start_kind(mut self, kind: StartKind) -> Config { + self.start_kind = Some(kind); + self + } + /// Whether to compile a separate start state for each pattern in the /// automaton. 
/// @@ -397,36 +439,36 @@ impl Config { /// /// ``` /// use regex_automata::{ - /// dfa::{Automaton, dense}, - /// HalfMatch, PatternID, + /// dfa::{dense, Automaton}, + /// Anchored, HalfMatch, PatternID, Input, /// }; /// /// let dfa = dense::Builder::new() /// .configure(dense::Config::new().starts_for_each_pattern(true)) /// .build(r"foo[0-9]+")?; - /// let haystack = b"quux foo123"; + /// let haystack = "quux foo123"; /// /// // Here's a normal unanchored search. Notice that we use 'None' for the /// // pattern ID. Since the DFA was built as an unanchored machine, it /// // use its default unanchored starting state. /// let expected = HalfMatch::must(0, 11); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd_at( - /// None, None, haystack, 0, haystack.len(), - /// )?); + /// let input = Input::new(haystack); + /// assert_eq!(Some(expected), dfa.try_search_fwd(&input)?); /// // But now if we explicitly specify the pattern to search ('0' being /// // the only pattern in the DFA), then it will use the starting state /// // for that specific pattern which is always anchored. Since the /// // pattern doesn't have a match at the beginning of the haystack, we /// // find nothing. - /// assert_eq!(None, dfa.find_leftmost_fwd_at( - /// None, Some(PatternID::must(0)), haystack, 0, haystack.len(), - /// )?); + /// let input = Input::new(haystack) + /// .anchored(Anchored::Pattern(PatternID::must(0))); + /// assert_eq!(None, dfa.try_search_fwd(&input)?); /// // And finally, an anchored search is not the same as putting a '^' at /// // beginning of the pattern. 
An anchored search can only match at the /// // beginning of the *search*, which we can change: - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd_at( - /// None, Some(PatternID::must(0)), haystack, 5, haystack.len(), - /// )?); + /// let input = Input::new(haystack) + /// .anchored(Anchored::Pattern(PatternID::must(0))) + /// .range(5..); + /// assert_eq!(Some(expected), dfa.try_search_fwd(&input)?); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` @@ -446,8 +488,8 @@ impl Config { /// in the DFA. For example, the pattern `[ab]+` has at least two /// equivalence classes: a set containing `a` and `b` and a set containing /// every byte except for `a` and `b`. `a` and `b` are in the same - /// equivalence classes because they never discriminate between a match - /// and a non-match. + /// equivalence class because they never discriminate between a match and a + /// non-match. /// /// The advantage of this map is that the size of the transition table /// can be reduced drastically from `#states * 256 * sizeof(StateID)` to @@ -473,7 +515,7 @@ impl Config { /// When set, this will attempt to implement Unicode word boundaries as if /// they were ASCII word boundaries. This only works when the search input /// is ASCII only. If a non-ASCII byte is observed while searching, then a - /// [`MatchError::Quit`](crate::MatchError::Quit) error is returned. + /// [`MatchError::quit`](crate::MatchError::quit) error is returned. /// /// A possible alternative to enabling this option is to simply use an /// ASCII word boundary, e.g., via `(?-u:\b)`. The main reason to use this @@ -497,7 +539,7 @@ impl Config { /// When using a [`Regex`](crate::dfa::regex::Regex), this corresponds /// to using the `try_` suite of methods. 
Alternatively, if /// callers can guarantee that their input is ASCII only, then a - /// [`MatchError::Quit`](crate::MatchError::Quit) error will never be + /// [`MatchError::quit`](crate::MatchError::quit) error will never be /// returned while searching. /// /// This is disabled by default. @@ -511,7 +553,7 @@ impl Config { /// ``` /// use regex_automata::{ /// dfa::{Automaton, dense}, - /// HalfMatch, MatchError, MatchKind, + /// HalfMatch, Input, MatchError, /// }; /// /// let dfa = dense::Builder::new() @@ -520,9 +562,9 @@ impl Config { /// /// // The match occurs before the search ever observes the snowman /// // character, so no error occurs. - /// let haystack = "foo 123 ☃".as_bytes(); + /// let haystack = "foo 123 ☃".as_bytes(); /// let expected = Some(HalfMatch::must(0, 7)); - /// let got = dfa.find_leftmost_fwd(haystack)?; + /// let got = dfa.try_search_fwd(&Input::new(haystack))?; /// assert_eq!(expected, got); /// /// // Notice that this search fails, even though the snowman character @@ -530,9 +572,23 @@ impl Config { /// // routines read one byte past the end of the search to account for /// // look-around, and indeed, this is required here to determine whether /// // the trailing \b matches. - /// let haystack = "foo 123☃".as_bytes(); - /// let expected = MatchError::Quit { byte: 0xE2, offset: 7 }; - /// let got = dfa.find_leftmost_fwd(haystack); + /// let haystack = "foo 123 ☃".as_bytes(); + /// let expected = MatchError::quit(0xE2, 8); + /// let got = dfa.try_search_fwd(&Input::new(haystack)); + /// assert_eq!(Err(expected), got); + /// + /// // Another example is executing a search where the span of the haystack + /// // we specify is all ASCII, but there is non-ASCII just before it. This + /// // correctly also reports an error.
+ /// let input = Input::new("β123").range(2..); + /// let expected = MatchError::quit(0xB2, 1); + /// let got = dfa.try_search_fwd(&input); + /// assert_eq!(Err(expected), got); + /// + /// // And similarly for the trailing word boundary. + /// let input = Input::new("123β").range(..3); + /// let expected = MatchError::quit(0xCE, 3); + /// let got = dfa.try_search_fwd(&input); /// assert_eq!(Err(expected), got); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) @@ -549,7 +605,7 @@ impl Config { /// Add a "quit" byte to the DFA. /// /// When a quit byte is seen during search time, then search will return - /// a [`MatchError::Quit`](crate::MatchError::Quit) error indicating the + /// a [`MatchError::quit`](crate::MatchError::quit) error indicating the /// offset at which the search stopped. /// /// A quit byte will always overrule any other aspects of a regex. For @@ -591,10 +647,8 @@ impl Config { /// a user supplied pattern from matching across a line boundary. /// /// ``` - /// use regex_automata::{ - /// dfa::{Automaton, dense}, - /// HalfMatch, MatchError, - /// }; + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{dfa::{Automaton, dense}, Input, MatchError}; /// /// let dfa = dense::Builder::new() /// .configure(dense::Config::new().quit(b'\n', true)) @@ -604,8 +658,8 @@ impl Config { /// // Normally this would produce a match, since \p{any} contains '\n'. /// // But since we instructed the automaton to enter a quit state if a /// // '\n' is observed, this produces a match error instead.
- /// let expected = MatchError::Quit { byte: 0x0A, offset: 3 }; - /// let got = dfa.find_leftmost_fwd(haystack).unwrap_err(); + /// let expected = MatchError::quit(b'\n', 3); + /// let got = dfa.try_search_fwd(&Input::new(haystack)).unwrap_err(); /// assert_eq!(expected, got); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) @@ -617,17 +671,98 @@ impl Config { Unicode word boundaries are enabled" ); } - if self.quit.is_none() { - self.quit = Some(ByteSet::empty()); + if self.quitset.is_none() { + self.quitset = Some(ByteSet::empty()); } if yes { - self.quit.as_mut().unwrap().add(byte); + self.quitset.as_mut().unwrap().add(byte); } else { - self.quit.as_mut().unwrap().remove(byte); + self.quitset.as_mut().unwrap().remove(byte); } self } + /// Enable specializing start states in the DFA. + /// + /// When start states are specialized, an implementor of a search routine + /// using a lazy DFA can tell when the search has entered a starting state. + /// When start states aren't specialized, then it is impossible to know + /// whether the search has entered a start state. + /// + /// Ideally, this option wouldn't need to exist and we could always + /// specialize start states. The problem is that start states can be quite + /// active. This in turn means that an efficient search routine is likely + /// to ping-pong between a heavily optimized hot loop that handles most + /// states and to a less optimized specialized handling of start states. + /// This causes branches to get heavily mispredicted and overall can + /// materially decrease throughput. Therefore, specializing start states + /// should only be enabled when it is needed. + /// + /// Knowing whether a search is in a start state is typically useful when a + /// prefilter is active for the search. A prefilter is typically only run + /// when in a start state and a prefilter can greatly accelerate a search. + /// Therefore, the possible cost of specializing start states is worth it + /// in this case. 
Otherwise, if you have no prefilter, there is likely no + /// reason to specialize start states. + /// + /// This is disabled by default, but note that it is automatically + /// enabled (or disabled) if [`Config::prefilter`] is set. Namely, unless + /// `specialize_start_states` has already been set, [`Config::prefilter`] + /// will automatically enable or disable it based on whether a prefilter + /// is present or not, respectively. This is done because a prefilter's + /// effectiveness is rooted in being executed whenever the DFA is in a + /// start state, and that's only possible to do when they are specialized. + /// + /// Note that it is plausibly reasonable to _disable_ this option + /// explicitly while _enabling_ a prefilter. In that case, a prefilter + /// will still be run at the beginning of a search, but never again. This + /// in theory could strike a good balance if you're in a situation where a + /// prefilter is likely to produce many false positive candidates. + /// + /// # Example + /// + /// This example shows how to enable start state specialization and then + /// shows how to check whether a state is a start state or not. + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense::DFA}, Input}; + /// + /// let dfa = DFA::builder() + /// .configure(DFA::config().specialize_start_states(true)) + /// .build(r"[a-z]+")?; + /// + /// let haystack = "123 foobar 4567".as_bytes(); + /// let sid = dfa.start_state_forward(&Input::new(haystack))?; + /// // The ID returned by 'start_state_forward' will always be tagged as + /// // a start state when start state specialization is enabled. + /// assert!(dfa.is_special_state(sid)); + /// assert!(dfa.is_start_state(sid)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Compare the above with the default DFA configuration where start states + /// are _not_ specialized. 
In this case, the start state is not tagged at + /// all: + /// + /// ``` + /// use regex_automata::{dfa::{Automaton, dense::DFA}, Input}; + /// + /// let dfa = DFA::new(r"[a-z]+")?; + /// + /// let haystack = "123 foobar 4567"; + /// let sid = dfa.start_state_forward(&Input::new(haystack))?; + /// // Start states are not special in the default configuration! + /// assert!(!dfa.is_special_state(sid)); + /// assert!(!dfa.is_start_state(sid)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn specialize_start_states(mut self, yes: bool) -> Config { + self.specialize_start_states = Some(yes); + self + } + /// Set a size limit on the total heap used by a DFA. /// /// This size limit is expressed in bytes and is applied during @@ -655,28 +790,63 @@ impl Config { /// can get. /// /// ``` - /// use regex_automata::dfa::{dense, Automaton}; + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{dfa::{dense, Automaton}, Input}; /// - /// // 3MB isn't enough! + /// // 6MB isn't enough! /// dense::Builder::new() - /// .configure(dense::Config::new().dfa_size_limit(Some(3_000_000))) + /// .configure(dense::Config::new().dfa_size_limit(Some(6_000_000))) /// .build(r"\w{20}") /// .unwrap_err(); /// - /// // ... but 4MB probably is! + /// // ... but 7MB probably is! /// // (Note that DFA sizes aren't necessarily stable between releases.) 
/// let dfa = dense::Builder::new() - /// .configure(dense::Config::new().dfa_size_limit(Some(4_000_000))) + /// .configure(dense::Config::new().dfa_size_limit(Some(7_000_000))) /// .build(r"\w{20}")?; /// let haystack = "A".repeat(20).into_bytes(); - /// assert!(dfa.find_leftmost_fwd(&haystack)?.is_some()); + /// assert!(dfa.try_search_fwd(&Input::new(&haystack))?.is_some()); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` /// - /// While one needs a little more than 3MB to represent `\w{20}`, it - /// turns out that you only need a little more than 4KB to represent + /// While one needs a little more than 6MB to represent `\w{20}`, it + /// turns out that you only need a little more than 6KB to represent /// `(?-u:\w{20})`. So only use Unicode if you need it! + /// + /// As with [`Config::determinize_size_limit`], the size of a DFA is + /// influenced by other factors, such as what start state configurations + /// to support. For example, if you only need unanchored searches and not + /// anchored searches, then configuring the DFA to only support unanchored + /// searches can reduce its size. By default, DFAs support both unanchored + /// and anchored searches. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{dfa::{dense, Automaton, StartKind}, Input}; + /// + /// // 3MB isn't enough! + /// dense::Builder::new() + /// .configure(dense::Config::new() + /// .dfa_size_limit(Some(3_000_000)) + /// .start_kind(StartKind::Unanchored) + /// ) + /// .build(r"\w{20}") + /// .unwrap_err(); + /// + /// // ... but 4MB probably is! + /// // (Note that DFA sizes aren't necessarily stable between releases.) 
+ /// let dfa = dense::Builder::new() + /// .configure(dense::Config::new() + /// .dfa_size_limit(Some(4_000_000)) + /// .start_kind(StartKind::Unanchored) + /// ) + /// .build(r"\w{20}")?; + /// let haystack = "A".repeat(20).into_bytes(); + /// assert!(dfa.try_search_fwd(&Input::new(&haystack))?.is_some()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` pub fn dfa_size_limit(mut self, bytes: Option<usize>) -> Config { self.dfa_size_limit = Some(bytes); self @@ -708,26 +878,68 @@ impl Config { /// is still not as much as the DFA itself.) /// /// ``` - /// use regex_automata::dfa::{dense, Automaton}; + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039 + /// use regex_automata::{dfa::{dense, Automaton}, Input}; /// - /// // 300KB isn't enough! + /// // 600KB isn't enough! /// dense::Builder::new() /// .configure(dense::Config::new() - /// .determinize_size_limit(Some(300_000)) + /// .determinize_size_limit(Some(600_000)) + /// ) + /// .build(r"\w{20}") + /// .unwrap_err(); + /// + /// // ... but 700KB probably is! + /// // (Note that auxiliary storage sizes aren't necessarily stable between + /// // releases.) + /// let dfa = dense::Builder::new() + /// .configure(dense::Config::new() + /// .determinize_size_limit(Some(700_000)) + /// ) + /// .build(r"\w{20}")?; + /// let haystack = "A".repeat(20).into_bytes(); + /// assert!(dfa.try_search_fwd(&Input::new(&haystack))?.is_some()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Note that some parts of the configuration on a DFA can have a + /// big impact on how big the DFA is, and thus, how much memory is + /// used. For example, the default setting for [`Config::start_kind`] is + /// [`StartKind::Both`]. But if you only need an anchored search, for + /// example, then it can be much cheaper to build a DFA that only supports + /// anchored searches. 
(Running an unanchored search with it would panic.) + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039 + /// use regex_automata::{ + /// dfa::{dense, Automaton, StartKind}, + /// Anchored, Input, + /// }; + /// + /// // 200KB isn't enough! + /// dense::Builder::new() + /// .configure(dense::Config::new() + /// .determinize_size_limit(Some(200_000)) + /// .start_kind(StartKind::Anchored) /// ) /// .build(r"\w{20}") /// .unwrap_err(); /// - /// // ... but 400KB probably is! + /// // ... but 300KB probably is! /// // (Note that auxiliary storage sizes aren't necessarily stable between /// // releases.) /// let dfa = dense::Builder::new() /// .configure(dense::Config::new() - /// .determinize_size_limit(Some(400_000)) + /// .determinize_size_limit(Some(300_000)) + /// .start_kind(StartKind::Anchored) /// ) /// .build(r"\w{20}")?; /// let haystack = "A".repeat(20).into_bytes(); - /// assert!(dfa.find_leftmost_fwd(&haystack)?.is_some()); + /// let input = Input::new(&haystack).anchored(Anchored::Yes); + /// assert!(dfa.try_search_fwd(&input)?.is_some()); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` @@ -736,17 +948,17 @@ impl Config { self } - /// Returns whether this configuration has enabled anchored searches. - pub fn get_anchored(&self) -> bool { - self.anchored.unwrap_or(false) - } - /// Returns whether this configuration has enabled simple state /// acceleration. pub fn get_accelerate(&self) -> bool { self.accelerate.unwrap_or(true) } + /// Returns the prefilter attached to this configuration, if any. + pub fn get_prefilter(&self) -> Option<&Prefilter> { + self.pre.as_ref().unwrap_or(&None).as_ref() + } + /// Returns whether this configuration has enabled the expensive process /// of minimizing a DFA. 
pub fn get_minimize(&self) -> bool { @@ -758,6 +970,11 @@ impl Config { self.match_kind.unwrap_or(MatchKind::LeftmostFirst) } + /// Returns the starting state configuration for a DFA. + pub fn get_starts(&self) -> StartKind { + self.start_kind.unwrap_or(StartKind::Both) + } + /// Returns whether this configuration has enabled anchored starting states /// for every pattern in the DFA. pub fn get_starts_for_each_pattern(&self) -> bool { @@ -783,7 +1000,16 @@ impl Config { /// least one byte has this enabled, it is possible for a search to return /// an error. pub fn get_quit(&self, byte: u8) -> bool { - self.quit.map_or(false, |q| q.contains(byte)) + self.quitset.map_or(false, |q| q.contains(byte)) + } + + /// Returns whether this configuration will instruct the DFA to + /// "specialize" start states. When enabled, the DFA will mark start states + /// as "special" so that search routines using the DFA can detect when + /// it's in a start state and do some kind of optimization (like run a + /// prefilter). + pub fn get_specialize_start_states(&self) -> bool { + self.specialize_start_states.unwrap_or(false) } /// Returns the DFA size limit of this configuration if one was set. @@ -814,12 +1040,13 @@ impl Config { /// always used. If an option in `o` is not set, then the corresponding /// option in `self` is used. If it's not set in `self` either, then it /// remains not set. 
- pub(crate) fn overwrite(self, o: Config) -> Config { + pub(crate) fn overwrite(&self, o: Config) -> Config { Config { - anchored: o.anchored.or(self.anchored), accelerate: o.accelerate.or(self.accelerate), + pre: o.pre.or_else(|| self.pre.clone()), minimize: o.minimize.or(self.minimize), match_kind: o.match_kind.or(self.match_kind), + start_kind: o.start_kind.or(self.start_kind), starts_for_each_pattern: o .starts_for_each_pattern .or(self.starts_for_each_pattern), @@ -827,7 +1054,10 @@ impl Config { unicode_word_boundary: o .unicode_word_boundary .or(self.unicode_word_boundary), - quit: o.quit.or(self.quit), + quitset: o.quitset.or(self.quitset), + specialize_start_states: o + .specialize_start_states + .or(self.specialize_start_states), dfa_size_limit: o.dfa_size_limit.or(self.dfa_size_limit), determinize_size_limit: o .determinize_size_limit @@ -878,44 +1108,42 @@ impl Config { /// `\n`). Things that are Unicode only, such as `\pL`, are not allowed. /// * The pattern itself is permitted to match invalid UTF-8. For example, /// things like `[^a]` that match any byte except for `a` are permitted. -/// * Unanchored patterns can search through invalid UTF-8. That is, for -/// unanchored patterns, the implicit prefix is `(?s-u:.)*?` instead of -/// `(?s:.)*?`. 
/// /// ``` /// use regex_automata::{ /// dfa::{Automaton, dense}, -/// nfa::thompson, -/// HalfMatch, SyntaxConfig, +/// util::syntax, +/// HalfMatch, Input, /// }; /// /// let dfa = dense::Builder::new() /// .configure(dense::Config::new().minimize(false)) -/// .syntax(SyntaxConfig::new().unicode(false).utf8(false)) -/// .thompson(thompson::Config::new().utf8(false)) +/// .syntax(syntax::Config::new().unicode(false).utf8(false)) /// .build(r"foo[^b]ar.*")?; /// /// let haystack = b"\xFEfoo\xFFar\xE2\x98\xFF\n"; /// let expected = Some(HalfMatch::must(0, 10)); -/// let got = dfa.find_leftmost_fwd(haystack)?; +/// let got = dfa.try_search_fwd(&Input::new(haystack))?; /// assert_eq!(expected, got); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] #[derive(Clone, Debug)] pub struct Builder { config: Config, - thompson: thompson::Builder, + #[cfg(feature = "syntax")] + thompson: thompson::Compiler, } -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] impl Builder { /// Create a new dense DFA builder with the default configuration. pub fn new() -> Builder { Builder { config: Config::default(), - thompson: thompson::Builder::new(), + #[cfg(feature = "syntax")] + thompson: thompson::Compiler::new(), } } @@ -923,7 +1151,8 @@ impl Builder { /// /// If there was a problem parsing or compiling the pattern, then an error /// is returned. - pub fn build(&self, pattern: &str) -> Result<OwnedDFA, Error> { + #[cfg(feature = "syntax")] + pub fn build(&self, pattern: &str) -> Result<OwnedDFA, BuildError> { self.build_many(&[pattern]) } @@ -931,11 +1160,22 @@ impl Builder { /// /// When matches are returned, the pattern ID corresponds to the index of /// the pattern in the slice given. 
+ #[cfg(feature = "syntax")] pub fn build_many<P: AsRef<str>>( &self, patterns: &[P], - ) -> Result<OwnedDFA, Error> { - let nfa = self.thompson.build_many(patterns).map_err(Error::nfa)?; + ) -> Result<OwnedDFA, BuildError> { + let nfa = self + .thompson + .clone() + // We can always forcefully disable captures because DFAs do not + // support them. + .configure( + thompson::Config::new() + .which_captures(thompson::WhichCaptures::None), + ) + .build_many(patterns) + .map_err(BuildError::nfa)?; self.build_from_nfa(&nfa) } @@ -949,19 +1189,19 @@ impl Builder { /// ``` /// use regex_automata::{ /// dfa::{Automaton, dense}, - /// nfa::thompson, - /// HalfMatch, + /// nfa::thompson::NFA, + /// HalfMatch, Input, /// }; /// /// let haystack = "foo123bar".as_bytes(); /// /// // This shows how to set non-default options for building an NFA. - /// let nfa = thompson::Builder::new() - /// .configure(thompson::Config::new().shrink(false)) + /// let nfa = NFA::compiler() + /// .configure(NFA::config().shrink(true)) /// .build(r"[0-9]+")?; /// let dfa = dense::Builder::new().build_from_nfa(&nfa)?; /// let expected = Some(HalfMatch::must(0, 6)); - /// let got = dfa.find_leftmost_fwd(haystack)?; + /// let got = dfa.try_search_fwd(&Input::new(haystack))?; /// assert_eq!(expected, got); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) @@ -969,13 +1209,13 @@ impl Builder { pub fn build_from_nfa( &self, nfa: &thompson::NFA, - ) -> Result<OwnedDFA, Error> { - let mut quit = self.config.quit.unwrap_or(ByteSet::empty()); + ) -> Result<OwnedDFA, BuildError> { + let mut quitset = self.config.quitset.unwrap_or(ByteSet::empty()); if self.config.get_unicode_word_boundary() - && nfa.has_word_boundary_unicode() + && nfa.look_set_any().contains_word_unicode() { for b in 0x80..=0xFF { - quit.add(b); + quitset.add(b); } } let classes = if !self.config.get_byte_classes() { @@ -990,8 +1230,13 @@ impl Builder { // It is important to distinguish any "quit" bytes from all other // bytes. 
Otherwise, a non-quit byte may end up in the same class // as a quit byte, and thus cause the DFA stop when it shouldn't. - if !quit.is_empty() { - set.add_set(&quit); + // + // Test case: + // + // regex-cli find hybrid regex -w @conn.json.1000x.log \ + // '^#' '\b10\.55\.182\.100\b' + if !quitset.is_empty() { + set.add_set(&quitset); } set.byte_classes() }; @@ -999,12 +1244,16 @@ impl Builder { let mut dfa = DFA::initial( classes, nfa.pattern_len(), + self.config.get_starts(), + nfa.look_matcher(), self.config.get_starts_for_each_pattern(), + self.config.get_prefilter().map(|p| p.clone()), + quitset, + Flags::from_nfa(&nfa), )?; determinize::Config::new() - .anchored(self.config.get_anchored()) .match_kind(self.config.get_match_kind()) - .quit(quit) + .quit(quitset) .dfa_size_limit(self.config.get_dfa_size_limit()) .determinize_size_limit(self.config.get_determinize_size_limit()) .run(nfa, &mut dfa)?; @@ -1014,6 +1263,16 @@ impl Builder { if self.config.get_accelerate() { dfa.accelerate(); } + // The state shuffling done before this point always assumes that start + // states should be marked as "special," even though it isn't the + // default configuration. State shuffling is complex enough as it is, + // so it's simpler to just "fix" our special state ID ranges to not + // include starting states after-the-fact. + if !self.config.get_specialize_start_states() { + dfa.special.set_no_special_start_states(); + } + // Look for and set the universal starting states. + dfa.set_universal_starts(); Ok(dfa) } @@ -1024,16 +1283,17 @@ impl Builder { } /// Set the syntax configuration for this builder using - /// [`SyntaxConfig`](crate::SyntaxConfig). + /// [`syntax::Config`](crate::util::syntax::Config). /// /// This permits setting things like case insensitivity, Unicode and multi /// line mode. /// /// These settings only apply when constructing a DFA directly from a /// pattern. 
+ #[cfg(feature = "syntax")] pub fn syntax( &mut self, - config: crate::util::syntax::SyntaxConfig, + config: crate::util::syntax::Config, ) -> &mut Builder { self.thompson.syntax(config); self @@ -1048,13 +1308,14 @@ impl Builder { /// /// These settings only apply when constructing a DFA directly from a /// pattern. + #[cfg(feature = "syntax")] pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { self.thompson.configure(config); self } } -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] impl Default for Builder { fn default() -> Builder { Builder::new() @@ -1067,7 +1328,7 @@ impl Default for Builder { /// reason for making DFAs generic is no_std support, and more generally, /// making it possible to load a DFA from an arbitrary slice of bytes. #[cfg(feature = "alloc")] -pub(crate) type OwnedDFA = DFA<Vec<u32>>; +pub(crate) type OwnedDFA = DFA<alloc::vec::Vec<u32>>; /// A dense table-based deterministic finite automaton (DFA). /// @@ -1117,11 +1378,11 @@ pub(crate) type OwnedDFA = DFA<Vec<u32>>; /// for searching. For example: /// /// ``` -/// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch}; +/// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input}; /// /// let dfa = DFA::new("foo[0-9]+")?; /// let expected = HalfMatch::must(0, 8); -/// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); +/// assert_eq!(Some(expected), dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` #[derive(Clone)] @@ -1143,7 +1404,7 @@ pub struct DFA<T> { /// a match exists, but _which_ patterns match. So we need to store the /// matching pattern IDs for each match state. We do this even when there /// is only one pattern for the sake of simplicity. In practice, this uses - /// up very little space for the case of on pattern. + /// up very little space for the case of one pattern. ms: MatchStates<T>, /// Information about which states are "special." 
Special states are states /// that are dead, quit, matching, starting or accelerated. For more info, @@ -1160,9 +1421,25 @@ pub struct DFA<T> { /// transition table. See dfa/special.rs for more details on how states are /// arranged. accels: Accels<T>, + /// Any prefilter attached to this DFA. + /// + /// Note that currently prefilters are not serialized. When deserializing + /// a DFA from bytes, this is always set to `None`. + pre: Option<Prefilter>, + /// The set of "quit" bytes for this DFA. + /// + /// This is only used when computing the start state for a particular + /// position in a haystack. Namely, in the case where there is a quit + /// byte immediately before the start of the search, this set needs to be + /// explicitly consulted. In all other cases, quit bytes are detected by + /// the DFA itself, by transitioning all quit bytes to a special "quit + /// state." + quitset: ByteSet, + /// Various flags describing the behavior of this DFA. + flags: Flags, } -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] impl OwnedDFA { /// Parse the given regular expression using a default configuration and /// return the corresponding DFA. 
@@ -1173,14 +1450,15 @@ impl OwnedDFA { /// # Example /// /// ``` - /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch}; + /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input}; /// /// let dfa = dense::DFA::new("foo[0-9]+bar")?; - /// let expected = HalfMatch::must(0, 11); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?); + /// let expected = Some(HalfMatch::must(0, 11)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` - pub fn new(pattern: &str) -> Result<OwnedDFA, Error> { + #[cfg(feature = "syntax")] + pub fn new(pattern: &str) -> Result<OwnedDFA, BuildError> { Builder::new().build(pattern) } @@ -1193,35 +1471,38 @@ impl OwnedDFA { /// # Example /// /// ``` - /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch}; + /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input}; /// /// let dfa = dense::DFA::new_many(&["[0-9]+", "[a-z]+"])?; - /// let expected = HalfMatch::must(1, 3); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?); + /// let expected = Some(HalfMatch::must(1, 3)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` - pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<OwnedDFA, Error> { + #[cfg(feature = "syntax")] + pub fn new_many<P: AsRef<str>>( + patterns: &[P], + ) -> Result<OwnedDFA, BuildError> { Builder::new().build_many(patterns) } } -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] impl OwnedDFA { /// Create a new DFA that matches every input. 
/// /// # Example /// /// ``` - /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch}; + /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input}; /// /// let dfa = dense::DFA::always_match()?; /// - /// let expected = HalfMatch::must(0, 0); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"")?); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo")?); + /// let expected = Some(HalfMatch::must(0, 0)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new(""))?); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo"))?); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` - pub fn always_match() -> Result<OwnedDFA, Error> { + pub fn always_match() -> Result<OwnedDFA, BuildError> { let nfa = thompson::NFA::always_match(); Builder::new().build_from_nfa(&nfa) } @@ -1231,38 +1512,65 @@ impl OwnedDFA { /// # Example /// /// ``` - /// use regex_automata::dfa::{Automaton, dense}; + /// use regex_automata::{dfa::{Automaton, dense}, Input}; /// /// let dfa = dense::DFA::never_match()?; - /// assert_eq!(None, dfa.find_leftmost_fwd(b"")?); - /// assert_eq!(None, dfa.find_leftmost_fwd(b"foo")?); + /// assert_eq!(None, dfa.try_search_fwd(&Input::new(""))?); + /// assert_eq!(None, dfa.try_search_fwd(&Input::new("foo"))?); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` - pub fn never_match() -> Result<OwnedDFA, Error> { + pub fn never_match() -> Result<OwnedDFA, BuildError> { let nfa = thompson::NFA::never_match(); Builder::new().build_from_nfa(&nfa) } - /// Create an initial DFA with the given equivalence classes, pattern count - /// and whether anchored starting states are enabled for each pattern. An - /// initial DFA can be further mutated via determinization. + /// Create an initial DFA with the given equivalence classes, pattern + /// length and whether anchored starting states are enabled for each + /// pattern. An initial DFA can be further mutated via determinization. 
fn initial( classes: ByteClasses, - pattern_count: usize, + pattern_len: usize, + starts: StartKind, + lookm: &LookMatcher, starts_for_each_pattern: bool, - ) -> Result<OwnedDFA, Error> { - let start_pattern_count = - if starts_for_each_pattern { pattern_count } else { 0 }; + pre: Option<Prefilter>, + quitset: ByteSet, + flags: Flags, + ) -> Result<OwnedDFA, BuildError> { + let start_pattern_len = + if starts_for_each_pattern { Some(pattern_len) } else { None }; Ok(DFA { tt: TransitionTable::minimal(classes), - st: StartTable::dead(start_pattern_count)?, - ms: MatchStates::empty(pattern_count), + st: StartTable::dead(starts, lookm, start_pattern_len)?, + ms: MatchStates::empty(pattern_len), special: Special::new(), accels: Accels::empty(), + pre, + quitset, + flags, }) } } +#[cfg(feature = "dfa-build")] +impl DFA<&[u32]> { + /// Return a new default dense DFA compiler configuration. + /// + /// This is a convenience routine to avoid needing to import the [`Config`] + /// type when customizing the construction of a dense DFA. + pub fn config() -> Config { + Config::new() + } + + /// Create a new dense DFA builder with the default configuration. + /// + /// This is a convenience routine to avoid needing to import the + /// [`Builder`] type in common cases. + pub fn builder() -> Builder { + Builder::new() + } +} + impl<T: AsRef<[u32]>> DFA<T> { /// Cheaply return a borrowed version of this dense DFA. Specifically, /// the DFA returned always uses `&[u32]` for its transition table. @@ -1273,6 +1581,9 @@ impl<T: AsRef<[u32]>> DFA<T> { ms: self.ms.as_ref(), special: self.special, accels: self.accels(), + pre: self.pre.clone(), + quitset: self.quitset, + flags: self.flags, } } @@ -1289,20 +1600,57 @@ impl<T: AsRef<[u32]>> DFA<T> { ms: self.ms.to_owned(), special: self.special, accels: self.accels().to_owned(), + pre: self.pre.clone(), + quitset: self.quitset, + flags: self.flags, } } + /// Returns the starting state configuration for this DFA. 
+ /// + /// The default is [`StartKind::Both`], which means the DFA supports both + /// unanchored and anchored searches. However, this can generally lead to + /// bigger DFAs. Therefore, a DFA might be compiled with support for just + /// unanchored or anchored searches. In that case, running a search with + /// an unsupported configuration will panic. + pub fn start_kind(&self) -> StartKind { + self.st.kind + } + + /// Returns the start byte map used for computing the `Start` configuration + /// at the beginning of a search. + pub(crate) fn start_map(&self) -> &StartByteMap { + &self.st.start_map + } + /// Returns true only if this DFA has starting states for each pattern. /// /// When a DFA has starting states for each pattern, then a search with the /// DFA can be configured to only look for anchored matches of a specific - /// pattern. Specifically, APIs like [`Automaton::find_earliest_fwd_at`] - /// can accept a non-None `pattern_id` if and only if this method returns - /// true. Otherwise, calling `find_earliest_fwd_at` will panic. + /// pattern. Specifically, APIs like [`Automaton::try_search_fwd`] can + /// accept a non-None `pattern_id` if and only if this method returns true. + /// Otherwise, calling `try_search_fwd` will panic. /// /// Note that if the DFA has no patterns, this always returns false. - pub fn has_starts_for_each_pattern(&self) -> bool { - self.st.patterns > 0 + pub fn starts_for_each_pattern(&self) -> bool { + self.st.pattern_len.is_some() + } + + /// Returns the equivalence classes that make up the alphabet for this DFA. + /// + /// Unless [`Config::byte_classes`] was disabled, it is possible that + /// multiple distinct bytes are grouped into the same equivalence class + /// if it is impossible for them to discriminate between a match and a + /// non-match. This has the effect of reducing the overall alphabet size + /// and in turn potentially substantially reducing the size of the DFA's + /// transition table. 
+ /// + /// The downside of using equivalence classes like this is that every state + /// transition will automatically use this map to convert an arbitrary + /// byte to its corresponding equivalence class. In practice this has a + /// negligible impact on performance. + pub fn byte_classes(&self) -> &ByteClasses { + &self.tt.classes } /// Returns the total number of elements in the alphabet for this DFA. @@ -1368,27 +1716,6 @@ impl<T: AsRef<[u32]>> DFA<T> { self.tt.stride() } - /// Returns the "universal" start state for this DFA. - /// - /// A universal start state occurs only when all of the starting states - /// for this DFA are precisely the same. This occurs when there are no - /// look-around assertions at the beginning (or end for a reverse DFA) of - /// the pattern. - /// - /// Using this as a starting state for a DFA without a universal starting - /// state has unspecified behavior. This condition is not checked, so the - /// caller must guarantee it themselves. - pub(crate) fn universal_start_state(&self) -> StateID { - // We choose 'NonWordByte' for no particular reason, other than - // the fact that this is the 'main' starting configuration used in - // determinization. But in essence, it doesn't really matter. - // - // Also, we might consider exposing this routine, but it seems - // a little tricky to use correctly. Maybe if we also expose a - // 'has_universal_start_state' method? - self.st.start(Start::NonWordByte, None) - } - /// Returns the memory usage, in bytes, of this DFA. 
/// /// The memory usage is computed based on the number of bytes used to @@ -1417,17 +1744,17 @@ impl<T: AsRef<[u32]>> DFA<T> { /// # Example /// /// ``` - /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch}; + /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input}; /// /// let dense = dense::DFA::new("foo[0-9]+")?; /// let sparse = dense.to_sparse()?; /// - /// let expected = HalfMatch::must(0, 8); - /// assert_eq!(Some(expected), sparse.find_leftmost_fwd(b"foo12345")?); + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, sparse.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` - #[cfg(feature = "alloc")] - pub fn to_sparse(&self) -> Result<sparse::DFA<Vec<u8>>, Error> { + #[cfg(feature = "dfa-build")] + pub fn to_sparse(&self) -> Result<sparse::DFA<Vec<u8>>, BuildError> { sparse::DFA::from_dense(self) } @@ -1453,7 +1780,7 @@ impl<T: AsRef<[u32]>> DFA<T> { /// This example shows how to serialize and deserialize a DFA: /// /// ``` - /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch}; + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input}; /// /// // Compile our original DFA. /// let original_dfa = DFA::new("foo[0-9]+")?; @@ -1465,13 +1792,13 @@ impl<T: AsRef<[u32]>> DFA<T> { /// // ignore it. 
/// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf)?.0; /// - /// let expected = HalfMatch::must(0, 8); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` - #[cfg(feature = "alloc")] + #[cfg(feature = "dfa-build")] pub fn to_bytes_little_endian(&self) -> (Vec<u8>, usize) { - self.to_bytes::<bytes::LE>() + self.to_bytes::<wire::LE>() } /// Serialize this DFA as raw bytes to a `Vec<u8>` in big endian @@ -1496,7 +1823,7 @@ impl<T: AsRef<[u32]>> DFA<T> { /// This example shows how to serialize and deserialize a DFA: /// /// ``` - /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch}; + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input}; /// /// // Compile our original DFA. /// let original_dfa = DFA::new("foo[0-9]+")?; @@ -1508,13 +1835,13 @@ impl<T: AsRef<[u32]>> DFA<T> { /// // ignore it. /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf)?.0; /// - /// let expected = HalfMatch::must(0, 8); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` - #[cfg(feature = "alloc")] + #[cfg(feature = "dfa-build")] pub fn to_bytes_big_endian(&self) -> (Vec<u8>, usize) { - self.to_bytes::<bytes::BE>() + self.to_bytes::<wire::BE>() } /// Serialize this DFA as raw bytes to a `Vec<u8>` in native endian @@ -1548,7 +1875,7 @@ impl<T: AsRef<[u32]>> DFA<T> { /// This example shows how to serialize and deserialize a DFA: /// /// ``` - /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch}; + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input}; /// /// // Compile our original DFA. 
/// let original_dfa = DFA::new("foo[0-9]+")?; @@ -1558,21 +1885,21 @@ impl<T: AsRef<[u32]>> DFA<T> { /// // ignore it. /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf)?.0; /// - /// let expected = HalfMatch::must(0, 8); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` - #[cfg(feature = "alloc")] + #[cfg(feature = "dfa-build")] pub fn to_bytes_native_endian(&self) -> (Vec<u8>, usize) { - self.to_bytes::<bytes::NE>() + self.to_bytes::<wire::NE>() } /// The implementation of the public `to_bytes` serialization methods, /// which is generic over endianness. - #[cfg(feature = "alloc")] + #[cfg(feature = "dfa-build")] fn to_bytes<E: Endian>(&self) -> (Vec<u8>, usize) { let len = self.write_to_len(); - let (mut buf, padding) = bytes::alloc_aligned_buffer::<u32>(len); + let (mut buf, padding) = wire::alloc_aligned_buffer::<u32>(len); // This should always succeed since the only possible serialization // error is providing a buffer that's too small, but we've ensured that // `buf` is big enough here. @@ -1607,27 +1934,35 @@ impl<T: AsRef<[u32]>> DFA<T> { /// dynamic memory allocation. /// /// ``` - /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch}; + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input}; /// /// // Compile our original DFA. /// let original_dfa = DFA::new("foo[0-9]+")?; /// - /// // Create a 4KB buffer on the stack to store our serialized DFA. - /// let mut buf = [0u8; 4 * (1<<10)]; + /// // Create a 4KB buffer on the stack to store our serialized DFA. We + /// // need to use a special type to force the alignment of our [u8; N] + /// // array to be aligned to a 4 byte boundary. Otherwise, deserializing + /// // the DFA may fail because of an alignment mismatch. 
+ /// #[repr(C)] + /// struct Aligned<B: ?Sized> { + /// _align: [u32; 0], + /// bytes: B, + /// } + /// let mut buf = Aligned { _align: [], bytes: [0u8; 4 * (1<<10)] }; /// // N.B. We use native endianness here to make the example work, but /// // using write_to_little_endian would work on a little endian target. - /// let written = original_dfa.write_to_native_endian(&mut buf)?; - /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf[..written])?.0; + /// let written = original_dfa.write_to_native_endian(&mut buf.bytes)?; + /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf.bytes[..written])?.0; /// - /// let expected = HalfMatch::must(0, 8); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` pub fn write_to_little_endian( &self, dst: &mut [u8], ) -> Result<usize, SerializeError> { - self.as_ref().write_to::<bytes::LE>(dst) + self.as_ref().write_to::<wire::LE>(dst) } /// Serialize this DFA as raw bytes to the given slice, in big endian @@ -1657,27 +1992,35 @@ impl<T: AsRef<[u32]>> DFA<T> { /// dynamic memory allocation. /// /// ``` - /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch}; + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input}; /// /// // Compile our original DFA. /// let original_dfa = DFA::new("foo[0-9]+")?; /// - /// // Create a 4KB buffer on the stack to store our serialized DFA. - /// let mut buf = [0u8; 4 * (1<<10)]; + /// // Create a 4KB buffer on the stack to store our serialized DFA. We + /// // need to use a special type to force the alignment of our [u8; N] + /// // array to be aligned to a 4 byte boundary. Otherwise, deserializing + /// // the DFA may fail because of an alignment mismatch. 
+ /// #[repr(C)] + /// struct Aligned<B: ?Sized> { + /// _align: [u32; 0], + /// bytes: B, + /// } + /// let mut buf = Aligned { _align: [], bytes: [0u8; 4 * (1<<10)] }; /// // N.B. We use native endianness here to make the example work, but /// // using write_to_big_endian would work on a big endian target. - /// let written = original_dfa.write_to_native_endian(&mut buf)?; - /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf[..written])?.0; + /// let written = original_dfa.write_to_native_endian(&mut buf.bytes)?; + /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf.bytes[..written])?.0; /// - /// let expected = HalfMatch::must(0, 8); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` pub fn write_to_big_endian( &self, dst: &mut [u8], ) -> Result<usize, SerializeError> { - self.as_ref().write_to::<bytes::BE>(dst) + self.as_ref().write_to::<wire::BE>(dst) } /// Serialize this DFA as raw bytes to the given slice, in native endian @@ -1716,25 +2059,33 @@ impl<T: AsRef<[u32]>> DFA<T> { /// dynamic memory allocation. /// /// ``` - /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch}; + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input}; /// /// // Compile our original DFA. /// let original_dfa = DFA::new("foo[0-9]+")?; /// - /// // Create a 4KB buffer on the stack to store our serialized DFA. - /// let mut buf = [0u8; 4 * (1<<10)]; - /// let written = original_dfa.write_to_native_endian(&mut buf)?; - /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf[..written])?.0; + /// // Create a 4KB buffer on the stack to store our serialized DFA. We + /// // need to use a special type to force the alignment of our [u8; N] + /// // array to be aligned to a 4 byte boundary. Otherwise, deserializing + /// // the DFA may fail because of an alignment mismatch. 
+ /// #[repr(C)] + /// struct Aligned<B: ?Sized> { + /// _align: [u32; 0], + /// bytes: B, + /// } + /// let mut buf = Aligned { _align: [], bytes: [0u8; 4 * (1<<10)] }; + /// let written = original_dfa.write_to_native_endian(&mut buf.bytes)?; + /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf.bytes[..written])?.0; /// - /// let expected = HalfMatch::must(0, 8); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` pub fn write_to_native_endian( &self, dst: &mut [u8], ) -> Result<usize, SerializeError> { - self.as_ref().write_to::<bytes::NE>(dst) + self.as_ref().write_to::<wire::NE>(dst) } /// Return the total number of bytes required to serialize this DFA. @@ -1756,17 +2107,33 @@ impl<T: AsRef<[u32]>> DFA<T> { /// a DFA. /// /// ``` - /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch}; + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input}; /// - /// // Compile our original DFA. /// let original_dfa = DFA::new("foo[0-9]+")?; /// /// let mut buf = vec![0; original_dfa.write_to_len()]; - /// let written = original_dfa.write_to_native_endian(&mut buf)?; - /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf[..written])?.0; + /// // This is guaranteed to succeed, because the only serialization error + /// // that can occur is when the provided buffer is too small. But + /// // write_to_len guarantees a correct size. + /// let written = original_dfa.write_to_native_endian(&mut buf).unwrap(); + /// // But this is not guaranteed to succeed! In particular, + /// // deserialization requires proper alignment for &[u32], but our buffer + /// // was allocated as a &[u8] whose required alignment is smaller than + /// // &[u32]. However, it's likely to work in practice because of how most + /// // allocators work. 
So if you write code like this, make sure to either + /// // handle the error correctly and/or run it under Miri since Miri will + /// // likely provoke the error by returning Vec<u8> buffers with alignment + /// // less than &[u32]. + /// let dfa: DFA<&[u32]> = match DFA::from_bytes(&buf[..written]) { + /// // As mentioned above, it is legal for an error to be returned + /// // here. It is quite difficult to get a Vec<u8> with a guaranteed + /// // alignment equivalent to Vec<u32>. + /// Err(_) => return Ok(()), + /// Ok((dfa, _)) => dfa, + /// }; /// - /// let expected = HalfMatch::must(0, 8); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` /// @@ -1776,15 +2143,17 @@ impl<T: AsRef<[u32]>> DFA<T> { /// either need to deal with adding some initial padding yourself, or use /// one of the `to_bytes` methods, which will do it for you. pub fn write_to_len(&self) -> usize { - bytes::write_label_len(LABEL) - + bytes::write_endianness_check_len() - + bytes::write_version_len() + wire::write_label_len(LABEL) + + wire::write_endianness_check_len() + + wire::write_version_len() + size_of::<u32>() // unused, intended for future flexibility + + self.flags.write_to_len() + self.tt.write_to_len() + self.st.write_to_len() + self.ms.write_to_len() + self.special.write_to_len() + self.accels.write_to_len() + + self.quitset.write_to_len() } } @@ -1843,14 +2212,14 @@ impl<'a> DFA<&'a [u32]> { /// and then use it for searching. 
/// /// ``` - /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch}; + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input}; /// /// let initial = DFA::new("foo[0-9]+")?; /// let (bytes, _) = initial.to_bytes_native_endian(); /// let dfa: DFA<&[u32]> = DFA::from_bytes(&bytes)?.0; /// - /// let expected = HalfMatch::must(0, 8); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` /// @@ -1865,7 +2234,7 @@ impl<'a> DFA<&'a [u32]> { /// alternative way to write the above example: /// /// ``` - /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch}; + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input}; /// /// let initial = DFA::new("foo[0-9]+")?; /// // Serialization returns the number of leading padding bytes added to @@ -1873,8 +2242,8 @@ impl<'a> DFA<&'a [u32]> { /// let (bytes, pad) = initial.to_bytes_native_endian(); /// let dfa: DFA<&[u32]> = DFA::from_bytes(&bytes[pad..])?.0; /// - /// let expected = HalfMatch::must(0, 8); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` /// @@ -1893,7 +2262,7 @@ impl<'a> DFA<&'a [u32]> { /// part is serializing the DFA to a file: /// /// ```no_run - /// use regex_automata::dfa::{Automaton, dense::DFA}; + /// use regex_automata::dfa::dense::DFA; /// /// let dfa = DFA::new("foo[0-9]+")?; /// @@ -1912,30 +2281,24 @@ impl<'a> DFA<&'a [u32]> { /// compilation to choose the correct endianness. 
/// /// ```no_run - /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch}; - /// - /// type S = u32; - /// type DFA = dense::DFA<&'static [S]>; - /// - /// fn get_foo() -> &'static DFA { - /// use std::cell::Cell; - /// use std::mem::MaybeUninit; - /// use std::sync::Once; - /// - /// // This struct with a generic B is used to permit unsizing - /// // coercions, specifically, where B winds up being a [u8]. We also - /// // need repr(C) to guarantee that _align comes first, which forces - /// // a correct alignment. - /// #[repr(C)] - /// struct Aligned<B: ?Sized> { - /// _align: [S; 0], - /// bytes: B, - /// } + /// use regex_automata::{ + /// dfa::{Automaton, dense::DFA}, + /// util::{lazy::Lazy, wire::AlignAs}, + /// HalfMatch, Input, + /// }; /// + /// // This crate provides its own "lazy" type, kind of like + /// // lazy_static! or once_cell::sync::Lazy. But it works in no-alloc + /// // no-std environments and let's us write this using completely + /// // safe code. + /// static RE: Lazy<DFA<&'static [u32]>> = Lazy::new(|| { /// # const _: &str = stringify! { /// // This assignment is made possible (implicitly) via the - /// // CoerceUnsized trait. - /// static ALIGNED: &Aligned<[u8]> = &Aligned { + /// // CoerceUnsized trait. This is what guarantees that our + /// // bytes are stored in memory on a 4 byte boundary. You + /// // *must* do this or something equivalent for correct + /// // deserialization. + /// static ALIGNED: &AlignAs<[u8], u32> = &AlignAs { /// _align: [], /// #[cfg(target_endian = "big")] /// bytes: *include_bytes!("foo.bigendian.dfa"), @@ -1943,55 +2306,40 @@ impl<'a> DFA<&'a [u32]> { /// bytes: *include_bytes!("foo.littleendian.dfa"), /// }; /// # }; - /// # static ALIGNED: &Aligned<[u8]> = &Aligned { + /// # static ALIGNED: &AlignAs<[u8], u32> = &AlignAs { /// # _align: [], /// # bytes: [], /// # }; /// - /// struct Lazy(Cell<MaybeUninit<DFA>>); - /// // SAFETY: This is safe because DFA impls Sync. 
- /// unsafe impl Sync for Lazy {} - /// - /// static INIT: Once = Once::new(); - /// static DFA: Lazy = Lazy(Cell::new(MaybeUninit::uninit())); - /// - /// INIT.call_once(|| { - /// let (dfa, _) = DFA::from_bytes(&ALIGNED.bytes) - /// .expect("serialized DFA should be valid"); - /// // SAFETY: This is guaranteed to only execute once, and all - /// // we do with the pointer is write the DFA to it. - /// unsafe { - /// (*DFA.0.as_ptr()).as_mut_ptr().write(dfa); - /// } - /// }); - /// // SAFETY: DFA is guaranteed to by initialized via INIT and is - /// // stored in static memory. - /// unsafe { - /// let dfa = (*DFA.0.as_ptr()).as_ptr(); - /// std::mem::transmute::<*const DFA, &'static DFA>(dfa) - /// } - /// } + /// let (dfa, _) = DFA::from_bytes(&ALIGNED.bytes) + /// .expect("serialized DFA should be valid"); + /// dfa + /// }); /// - /// let dfa = get_foo(); - /// let expected = HalfMatch::must(0, 8); - /// assert_eq!(Ok(Some(expected)), dfa.find_leftmost_fwd(b"foo12345")); + /// let expected = Ok(Some(HalfMatch::must(0, 8))); + /// assert_eq!(expected, RE.try_search_fwd(&Input::new("foo12345"))); /// ``` /// - /// Alternatively, consider using - /// [`lazy_static`](https://crates.io/crates/lazy_static) - /// or - /// [`once_cell`](https://crates.io/crates/once_cell), - /// which will guarantee safety for you. You will still need to use the - /// `Aligned` trick above to force correct alignment, but this is safe to - /// do and `from_bytes` will return an error if you get it wrong. + /// An alternative to [`util::lazy::Lazy`](crate::util::lazy::Lazy) + /// is [`lazy_static`](https://crates.io/crates/lazy_static) or + /// [`once_cell`](https://crates.io/crates/once_cell), which provide + /// stronger guarantees (like the initialization function only being + /// executed once). And `once_cell` in particular provides a more + /// expressive API. But a `Lazy` value from this crate is likely just fine + /// in most circumstances. 
+ /// + /// Note that regardless of which initialization method you use, you + /// will still need to use the [`AlignAs`](crate::util::wire::AlignAs) + /// trick above to force correct alignment, but this is safe to do and + /// `from_bytes` will return an error if you get it wrong. pub fn from_bytes( slice: &'a [u8], ) -> Result<(DFA<&'a [u32]>, usize), DeserializeError> { - // SAFETY: This is safe because we validate both the transition table, - // start state ID list and the match states below. If either validation - // fails, then we return an error. + // SAFETY: This is safe because we validate the transition table, start + // table, match states and accelerators below. If any validation fails, + // then we return an error. let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? }; - dfa.tt.validate()?; + dfa.tt.validate(&dfa.special)?; dfa.st.validate(&dfa.tt)?; dfa.ms.validate(&dfa)?; dfa.accels.validate()?; @@ -2015,7 +2363,7 @@ impl<'a> DFA<&'a [u32]> { /// # Example /// /// ``` - /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch}; + /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input}; /// /// let initial = DFA::new("foo[0-9]+")?; /// let (bytes, _) = initial.to_bytes_native_endian(); @@ -2023,8 +2371,8 @@ impl<'a> DFA<&'a [u32]> { /// // directly from a compatible serialization routine. 
/// let dfa: DFA<&[u32]> = unsafe { DFA::from_bytes_unchecked(&bytes)?.0 }; /// - /// let expected = HalfMatch::must(0, 8); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` pub unsafe fn from_bytes_unchecked( @@ -2032,15 +2380,18 @@ impl<'a> DFA<&'a [u32]> { ) -> Result<(DFA<&'a [u32]>, usize), DeserializeError> { let mut nr = 0; - nr += bytes::skip_initial_padding(slice); - bytes::check_alignment::<StateID>(&slice[nr..])?; - nr += bytes::read_label(&slice[nr..], LABEL)?; - nr += bytes::read_endianness_check(&slice[nr..])?; - nr += bytes::read_version(&slice[nr..], VERSION)?; + nr += wire::skip_initial_padding(slice); + wire::check_alignment::<StateID>(&slice[nr..])?; + nr += wire::read_label(&slice[nr..], LABEL)?; + nr += wire::read_endianness_check(&slice[nr..])?; + nr += wire::read_version(&slice[nr..], VERSION)?; - let _unused = bytes::try_read_u32(&slice[nr..], "unused space")?; + let _unused = wire::try_read_u32(&slice[nr..], "unused space")?; nr += size_of::<u32>(); + let (flags, nread) = Flags::from_bytes(&slice[nr..])?; + nr += nread; + let (tt, nread) = TransitionTable::from_bytes_unchecked(&slice[nr..])?; nr += nread; @@ -2052,12 +2403,17 @@ impl<'a> DFA<&'a [u32]> { let (special, nread) = Special::from_bytes(&slice[nr..])?; nr += nread; - special.validate_state_count(tt.count(), tt.stride2)?; + special.validate_state_len(tt.len(), tt.stride2)?; let (accels, nread) = Accels::from_bytes_unchecked(&slice[nr..])?; nr += nread; - Ok((DFA { tt, st, ms, special, accels }, nr)) + let (quitset, nread) = ByteSet::from_bytes(&slice[nr..])?; + nr += nread; + + // Prefilters don't support serialization, so they're always absent. 
+ let pre = None; + Ok((DFA { tt, st, ms, special, accels, pre, quitset, flags }, nr)) } /// The implementation of the public `write_to` serialization methods, @@ -2075,39 +2431,41 @@ impl<'a> DFA<&'a [u32]> { dst = &mut dst[..nwrite]; let mut nw = 0; - nw += bytes::write_label(LABEL, &mut dst[nw..])?; - nw += bytes::write_endianness_check::<E>(&mut dst[nw..])?; - nw += bytes::write_version::<E>(VERSION, &mut dst[nw..])?; + nw += wire::write_label(LABEL, &mut dst[nw..])?; + nw += wire::write_endianness_check::<E>(&mut dst[nw..])?; + nw += wire::write_version::<E>(VERSION, &mut dst[nw..])?; nw += { // Currently unused, intended for future flexibility E::write_u32(0, &mut dst[nw..]); size_of::<u32>() }; + nw += self.flags.write_to::<E>(&mut dst[nw..])?; nw += self.tt.write_to::<E>(&mut dst[nw..])?; nw += self.st.write_to::<E>(&mut dst[nw..])?; nw += self.ms.write_to::<E>(&mut dst[nw..])?; nw += self.special.write_to::<E>(&mut dst[nw..])?; nw += self.accels.write_to::<E>(&mut dst[nw..])?; + nw += self.quitset.write_to::<E>(&mut dst[nw..])?; Ok(nw) } } -/// The following methods implement mutable routines on the internal -/// representation of a DFA. As such, we must fix the first type parameter to a -/// `Vec<u32>` since a generic `T: AsRef<[u32]>` does not permit mutation. We -/// can get away with this because these methods are internal to the crate and -/// are exclusively used during construction of the DFA. -#[cfg(feature = "alloc")] +// The following methods implement mutable routines on the internal +// representation of a DFA. As such, we must fix the first type parameter to a +// `Vec<u32>` since a generic `T: AsRef<[u32]>` does not permit mutation. We +// can get away with this because these methods are internal to the crate and +// are exclusively used during construction of the DFA. +#[cfg(feature = "dfa-build")] impl OwnedDFA { /// Add a start state of this DFA. 
pub(crate) fn set_start_state( &mut self, - index: Start, - pattern_id: Option<PatternID>, + anchored: Anchored, + start: Start, id: StateID, ) { assert!(self.tt.is_valid(id), "invalid start state"); - self.st.set_start(index, pattern_id, id); + self.st.set_start(anchored, start, id); } /// Set the given transition to this DFA. Both the `from` and `to` states @@ -2127,7 +2485,7 @@ impl OwnedDFA { /// /// If adding a state would exceed `StateID::LIMIT`, then this returns an /// error. - pub(crate) fn add_empty_state(&mut self) -> Result<StateID, Error> { + pub(crate) fn add_empty_state(&mut self) -> Result<StateID, BuildError> { self.tt.add_empty_state() } @@ -2140,20 +2498,42 @@ impl OwnedDFA { self.tt.swap(id1, id2); } - /// Truncate the states in this DFA to the given count. + /// Remap all of the state identifiers in this DFA according to the map + /// function given. This includes all transitions and all starting state + /// identifiers. + pub(crate) fn remap(&mut self, map: impl Fn(StateID) -> StateID) { + // We could loop over each state ID and call 'remap_state' here, but + // this is more direct: just map every transition directly. This + // technically might do a little extra work since the alphabet length + // is likely less than the stride, but if that is indeed an issue we + // should benchmark it and fix it. + for sid in self.tt.table_mut().iter_mut() { + *sid = map(*sid); + } + for sid in self.st.table_mut().iter_mut() { + *sid = map(*sid); + } + } + + /// Remap the transitions for the state given according to the function + /// given. This applies the given map function to every transition in the + /// given state and changes the transition in place to the result of the + /// map function for that transition. + pub(crate) fn remap_state( + &mut self, + id: StateID, + map: impl Fn(StateID) -> StateID, + ) { + self.tt.remap(id, map); + } + + /// Truncate the states in this DFA to the given length. 
/// /// This routine does not do anything to check the correctness of this /// truncation. Callers must ensure that other states pointing to truncated /// states are updated appropriately. - pub(crate) fn truncate_states(&mut self, count: usize) { - self.tt.truncate(count); - } - - /// Return a mutable representation of the state corresponding to the given - /// id. This is useful for implementing routines that manipulate DFA states - /// (e.g., swapping states). - pub(crate) fn state_mut(&mut self, id: StateID) -> StateMut<'_> { - self.tt.state_mut(id) + pub(crate) fn truncate_states(&mut self, len: usize) { + self.tt.truncate(len); } /// Minimize this DFA in place using Hopcroft's algorithm. @@ -2171,7 +2551,7 @@ impl OwnedDFA { pub(crate) fn set_pattern_map( &mut self, map: &BTreeMap<StateID, Vec<PatternID>>, - ) -> Result<(), Error> { + ) -> Result<(), BuildError> { self.ms = self.ms.new_with_map(map)?; Ok(()) } @@ -2180,7 +2560,7 @@ impl OwnedDFA { /// them as candidates for acceleration during search. pub(crate) fn accelerate(&mut self) { // dead and quit states can never be accelerated. - if self.state_count() <= 2 { + if self.state_len() <= 2 { return; } @@ -2191,6 +2571,11 @@ impl OwnedDFA { let (mut cmatch, mut cstart, mut cnormal) = (0, 0, 0); for state in self.states() { if let Some(accel) = state.accelerate(self.byte_classes()) { + debug!( + "accelerating full DFA state {}: {:?}", + state.id().as_usize(), + accel, + ); accels.insert(state.id(), accel); if self.is_match_state(state.id()) { cmatch += 1; @@ -2212,7 +2597,7 @@ impl OwnedDFA { // A remapper keeps track of state ID changes. Once we're done // shuffling, the remapper is used to rewrite all transitions in the // DFA based on the new positions of states. - let mut remapper = Remapper::from_dfa(self); + let mut remapper = Remapper::new(self); // As we swap states, if they are match states, we need to swap their // pattern ID lists too (for multi-regexes). 
We do this by converting @@ -2295,7 +2680,7 @@ impl OwnedDFA { if cnormal > 0 { // our next available starting and normal states for swapping. let mut next_start_id = self.special.min_start; - let mut cur_id = self.from_index(self.state_count() - 1); + let mut cur_id = self.to_state_id(self.state_len() - 1); // This is guaranteed to exist since cnormal > 0. let mut next_norm_id = self.tt.next_state_id(self.special.max_start); @@ -2361,9 +2746,9 @@ impl OwnedDFA { self.special.set_max(); self.special.validate().expect("special state ranges should validate"); self.special - .validate_state_count(self.state_count(), self.stride2()) + .validate_state_len(self.state_len(), self.stride2()) .expect( - "special state ranges should be consistent with state count", + "special state ranges should be consistent with state length", ); assert_eq!( self.special.accel_len(self.stride()), @@ -2395,36 +2780,29 @@ impl OwnedDFA { pub(crate) fn shuffle( &mut self, mut matches: BTreeMap<StateID, Vec<PatternID>>, - ) -> Result<(), Error> { + ) -> Result<(), BuildError> { // The determinizer always adds a quit state and it is always second. - self.special.quit_id = self.from_index(1); + self.special.quit_id = self.to_state_id(1); // If all we have are the dead and quit states, then we're done and // the DFA will never produce a match. - if self.state_count() <= 2 { + if self.state_len() <= 2 { self.special.set_max(); return Ok(()); } - // Collect all our start states into a convenient set and confirm there - // is no overlap with match states. In the classicl DFA construction, - // start states can be match states. But because of look-around, we - // delay all matches by a byte, which prevents start states from being - // match states. + // Collect all our non-DEAD start states into a convenient set and + // confirm there is no overlap with match states. In the classicl DFA + // construction, start states can be match states. 
But because of + // look-around, we delay all matches by a byte, which prevents start + // states from being match states. let mut is_start: BTreeSet<StateID> = BTreeSet::new(); for (start_id, _, _) in self.starts() { - // While there's nothing theoretically wrong with setting a start - // state to a dead ID (indeed, it could be an optimization!), the - // shuffling code below assumes that start states aren't dead. If - // this assumption is violated, the dead state could be shuffled - // to a new location, which must never happen. So if we do want - // to allow start states to be dead, then this assert should be - // removed and the code below fixed. - // - // N.B. Minimization can cause start states to be dead, but that - // happens after states are shuffled, so it's OK. Also, start - // states are dead for the DFA that never matches anything, but - // in that case, there are no states to shuffle. - assert_ne!(start_id, DEAD, "start state cannot be dead"); + // If a starting configuration points to a DEAD state, then we + // don't want to shuffle it. The DEAD state is always the first + // state with ID=0. So we can just leave it be. + if start_id == DEAD { + continue; + } assert!( !matches.contains_key(&start_id), "{:?} is both a start and a match state, which is not allowed", @@ -2438,7 +2816,7 @@ impl OwnedDFA { // IDs and swapping them changes their IDs, we need to record every // swap we make so that we can remap IDs. The remapper handles this // book-keeping for us. - let mut remapper = Remapper::from_dfa(self); + let mut remapper = Remapper::new(self); // Shuffle matching states. if matches.is_empty() { @@ -2448,7 +2826,7 @@ impl OwnedDFA { // The determinizer guarantees that the first two states are the // dead and quit states, respectively. We want our match states to // come right after quit. 
- let mut next_id = self.from_index(2); + let mut next_id = self.to_state_id(2); let mut new_matches = BTreeMap::new(); self.special.min_match = next_id; for (id, pids) in matches { @@ -2470,7 +2848,7 @@ impl OwnedDFA { // Shuffle starting states. { - let mut next_id = self.from_index(2); + let mut next_id = self.to_state_id(2); if self.special.matches() { next_id = self.tt.next_state_id(self.special.max_match); } @@ -2491,32 +2869,77 @@ impl OwnedDFA { self.special.set_max(); self.special.validate().expect("special state ranges should validate"); self.special - .validate_state_count(self.state_count(), self.stride2()) + .validate_state_len(self.state_len(), self.stride2()) .expect( - "special state ranges should be consistent with state count", + "special state ranges should be consistent with state length", ); Ok(()) } -} -/// A variety of generic internal methods for accessing DFA internals. -impl<T: AsRef<[u32]>> DFA<T> { - /// Return the byte classes used by this DFA. - pub(crate) fn byte_classes(&self) -> &ByteClasses { - &self.tt.classes + /// Checks whether there are universal start states (both anchored and + /// unanchored), and if so, sets the relevant fields to the start state + /// IDs. + /// + /// Universal start states occur precisely when the all patterns in the + /// DFA have no look-around assertions in their prefix. + fn set_universal_starts(&mut self) { + assert_eq!(6, Start::len(), "expected 6 start configurations"); + + let start_id = |dfa: &mut OwnedDFA, inp: &Input<'_>, start: Start| { + // This OK because we only call 'start' under conditions + // in which we know it will succeed. 
+ dfa.st.start(inp, start).expect("valid Input configuration") + }; + if self.start_kind().has_unanchored() { + let inp = Input::new("").anchored(Anchored::No); + let sid = start_id(self, &inp, Start::NonWordByte); + if sid == start_id(self, &inp, Start::WordByte) + && sid == start_id(self, &inp, Start::Text) + && sid == start_id(self, &inp, Start::LineLF) + && sid == start_id(self, &inp, Start::LineCR) + && sid == start_id(self, &inp, Start::CustomLineTerminator) + { + self.st.universal_start_unanchored = Some(sid); + } + } + if self.start_kind().has_anchored() { + let inp = Input::new("").anchored(Anchored::Yes); + let sid = start_id(self, &inp, Start::NonWordByte); + if sid == start_id(self, &inp, Start::WordByte) + && sid == start_id(self, &inp, Start::Text) + && sid == start_id(self, &inp, Start::LineLF) + && sid == start_id(self, &inp, Start::LineCR) + && sid == start_id(self, &inp, Start::CustomLineTerminator) + { + self.st.universal_start_anchored = Some(sid); + } + } } +} +// A variety of generic internal methods for accessing DFA internals. +impl<T: AsRef<[u32]>> DFA<T> { /// Return the info about special states. pub(crate) fn special(&self) -> &Special { &self.special } /// Return the info about special states as a mutable borrow. - #[cfg(feature = "alloc")] + #[cfg(feature = "dfa-build")] pub(crate) fn special_mut(&mut self) -> &mut Special { &mut self.special } + /// Returns the quit set (may be empty) used by this DFA. + pub(crate) fn quitset(&self) -> &ByteSet { + &self.quitset + } + + /// Returns the flags for this DFA. + pub(crate) fn flags(&self) -> &Flags { + &self.flags + } + /// Returns an iterator over all states in this DFA. /// /// This iterator yields a tuple for each state. The first element of the @@ -2528,14 +2951,14 @@ impl<T: AsRef<[u32]>> DFA<T> { /// Return the total number of states in this DFA. Every DFA has at least /// 1 state, even the empty DFA. 
- pub(crate) fn state_count(&self) -> usize { - self.tt.count() + pub(crate) fn state_len(&self) -> usize { + self.tt.len() } /// Return an iterator over all pattern IDs for the given match state. /// /// If the given state is not a match state, then this panics. - #[cfg(feature = "alloc")] + #[cfg(feature = "dfa-build")] pub(crate) fn pattern_id_slice(&self, id: StateID) -> &[PatternID] { assert!(self.is_match_state(id)); self.ms.pattern_id_slice(self.match_state_index(id)) @@ -2550,21 +2973,21 @@ impl<T: AsRef<[u32]>> DFA<T> { } /// Returns the total number of patterns matched by this DFA. - pub(crate) fn pattern_count(&self) -> usize { - self.ms.patterns + pub(crate) fn pattern_len(&self) -> usize { + self.ms.pattern_len } /// Returns a map from match state ID to a list of pattern IDs that match /// in that state. - #[cfg(feature = "alloc")] + #[cfg(feature = "dfa-build")] pub(crate) fn pattern_map(&self) -> BTreeMap<StateID, Vec<PatternID>> { self.ms.to_map(self) } /// Returns the ID of the quit state for this DFA. - #[cfg(feature = "alloc")] + #[cfg(feature = "dfa-build")] pub(crate) fn quit_id(&self) -> StateID { - self.from_index(1) + self.to_state_id(1) } /// Convert the given state identifier to the state's index. The state's @@ -2576,14 +2999,14 @@ impl<T: AsRef<[u32]>> DFA<T> { self.tt.to_index(id) } - /// Convert an index to a state (in the range 0..self.state_count()) to an + /// Convert an index to a state (in the range 0..self.state_len()) to an /// actual state identifier. /// /// This is useful when using a `Vec<T>` as an efficient map keyed by state /// to some other information (such as a remapped state ID). - #[cfg(feature = "alloc")] - pub(crate) fn from_index(&self, index: usize) -> StateID { - self.tt.from_index(index) + #[cfg(feature = "dfa-build")] + pub(crate) fn to_state_id(&self, index: usize) -> StateID { + self.tt.to_state_id(index) } /// Return the table of state IDs for this DFA's start states. 
@@ -2594,11 +3017,12 @@ impl<T: AsRef<[u32]>> DFA<T> { /// Returns the index of the match state for the given ID. If the /// given ID does not correspond to a match state, then this may /// panic or produce an incorrect result. + #[cfg_attr(feature = "perf-inline", inline(always))] fn match_state_index(&self, id: StateID) -> usize { debug_assert!(self.is_match_state(id)); // This is one of the places where we rely on the fact that match // states are contiguous in the transition table. Namely, that the - // first match state ID always corresponds to dfa.special.min_start. + // first match state ID always corresponds to dfa.special.min_match. // From there, since we know the stride, we can compute the overall // index of any match state given the match state's ID. let min = self.special().min_match.as_usize(); @@ -2645,25 +3069,26 @@ impl<T: AsRef<[u32]>> fmt::Debug for DFA<T> { write!(f, "\n")?; } writeln!(f, "")?; - for (i, (start_id, sty, pid)) in self.starts().enumerate() { + for (i, (start_id, anchored, sty)) in self.starts().enumerate() { let id = if f.alternate() { start_id.as_usize() } else { self.to_index(start_id) }; if i % self.st.stride == 0 { - match pid { - None => writeln!(f, "START-GROUP(ALL)")?, - Some(pid) => { + match anchored { + Anchored::No => writeln!(f, "START-GROUP(unanchored)")?, + Anchored::Yes => writeln!(f, "START-GROUP(anchored)")?, + Anchored::Pattern(pid) => { writeln!(f, "START_GROUP(pattern: {:?})", pid)? 
} } } writeln!(f, " {:?} => {:06?}", sty, id)?; } - if self.pattern_count() > 1 { + if self.pattern_len() > 1 { writeln!(f, "")?; - for i in 0..self.ms.count() { + for i in 0..self.ms.len() { let id = self.ms.match_state_id(self, i); let id = if f.alternate() { id.as_usize() @@ -2681,124 +3106,168 @@ impl<T: AsRef<[u32]>> fmt::Debug for DFA<T> { writeln!(f, "")?; } } - writeln!(f, "state count: {:?}", self.state_count())?; - writeln!(f, "pattern count: {:?}", self.pattern_count())?; + writeln!(f, "state length: {:?}", self.state_len())?; + writeln!(f, "pattern length: {:?}", self.pattern_len())?; + writeln!(f, "flags: {:?}", self.flags)?; writeln!(f, ")")?; Ok(()) } } +// SAFETY: We assert that our implementation of each method is correct. unsafe impl<T: AsRef<[u32]>> Automaton for DFA<T> { - #[inline] + #[cfg_attr(feature = "perf-inline", inline(always))] fn is_special_state(&self, id: StateID) -> bool { self.special.is_special_state(id) } - #[inline] + #[cfg_attr(feature = "perf-inline", inline(always))] fn is_dead_state(&self, id: StateID) -> bool { self.special.is_dead_state(id) } - #[inline] + #[cfg_attr(feature = "perf-inline", inline(always))] fn is_quit_state(&self, id: StateID) -> bool { self.special.is_quit_state(id) } - #[inline] + #[cfg_attr(feature = "perf-inline", inline(always))] fn is_match_state(&self, id: StateID) -> bool { self.special.is_match_state(id) } - #[inline] + #[cfg_attr(feature = "perf-inline", inline(always))] fn is_start_state(&self, id: StateID) -> bool { self.special.is_start_state(id) } - #[inline] + #[cfg_attr(feature = "perf-inline", inline(always))] fn is_accel_state(&self, id: StateID) -> bool { self.special.is_accel_state(id) } - #[inline] + #[cfg_attr(feature = "perf-inline", inline(always))] fn next_state(&self, current: StateID, input: u8) -> StateID { let input = self.byte_classes().get(input); let o = current.as_usize() + usize::from(input); self.trans()[o] } - #[inline] + #[cfg_attr(feature = "perf-inline", 
inline(always))] unsafe fn next_state_unchecked( &self, current: StateID, - input: u8, + byte: u8, ) -> StateID { - let input = self.byte_classes().get_unchecked(input); - let o = current.as_usize() + usize::from(input); - *self.trans().get_unchecked(o) + // We don't (or shouldn't) need an unchecked variant for the byte + // class mapping, since bound checks should be omitted automatically + // by virtue of its representation. If this ends up not being true as + // confirmed by codegen, please file an issue. ---AG + let class = self.byte_classes().get(byte); + let o = current.as_usize() + usize::from(class); + let next = *self.trans().get_unchecked(o); + next } - #[inline] + #[cfg_attr(feature = "perf-inline", inline(always))] fn next_eoi_state(&self, current: StateID) -> StateID { let eoi = self.byte_classes().eoi().as_usize(); let o = current.as_usize() + eoi; self.trans()[o] } - #[inline] - fn pattern_count(&self) -> usize { - self.ms.patterns + #[cfg_attr(feature = "perf-inline", inline(always))] + fn pattern_len(&self) -> usize { + self.ms.pattern_len } - #[inline] - fn match_count(&self, id: StateID) -> usize { + #[cfg_attr(feature = "perf-inline", inline(always))] + fn match_len(&self, id: StateID) -> usize { self.match_pattern_len(id) } - #[inline] + #[cfg_attr(feature = "perf-inline", inline(always))] fn match_pattern(&self, id: StateID, match_index: usize) -> PatternID { // This is an optimization for the very common case of a DFA with a // single pattern. This conditional avoids a somewhat more costly path // that finds the pattern ID from the state machine, which requires // a bit of slicing/pointer-chasing. This optimization tends to only // matter when matches are frequent. 
- if self.ms.patterns == 1 { + if self.ms.pattern_len == 1 { return PatternID::ZERO; } let state_index = self.match_state_index(id); self.ms.pattern_id(state_index, match_index) } - #[inline] + #[cfg_attr(feature = "perf-inline", inline(always))] + fn has_empty(&self) -> bool { + self.flags.has_empty + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_utf8(&self) -> bool { + self.flags.is_utf8 + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_always_start_anchored(&self) -> bool { + self.flags.is_always_start_anchored + } + + #[cfg_attr(feature = "perf-inline", inline(always))] fn start_state_forward( &self, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, - ) -> StateID { - let index = Start::from_position_fwd(bytes, start, end); - self.st.start(index, pattern_id) + input: &Input<'_>, + ) -> Result<StateID, MatchError> { + if !self.quitset.is_empty() && input.start() > 0 { + let offset = input.start() - 1; + let byte = input.haystack()[offset]; + if self.quitset.contains(byte) { + return Err(MatchError::quit(byte, offset)); + } + } + let start = self.st.start_map.fwd(&input); + self.st.start(input, start) } - #[inline] + #[cfg_attr(feature = "perf-inline", inline(always))] fn start_state_reverse( &self, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, - ) -> StateID { - let index = Start::from_position_rev(bytes, start, end); - self.st.start(index, pattern_id) + input: &Input<'_>, + ) -> Result<StateID, MatchError> { + if !self.quitset.is_empty() && input.end() < input.haystack().len() { + let offset = input.end(); + let byte = input.haystack()[offset]; + if self.quitset.contains(byte) { + return Err(MatchError::quit(byte, offset)); + } + } + let start = self.st.start_map.rev(&input); + self.st.start(input, start) } - #[inline(always)] + #[cfg_attr(feature = "perf-inline", inline(always))] + fn universal_start_state(&self, mode: Anchored) -> Option<StateID> { + match 
mode { + Anchored::No => self.st.universal_start_unanchored, + Anchored::Yes => self.st.universal_start_anchored, + Anchored::Pattern(_) => None, + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] fn accelerator(&self, id: StateID) -> &[u8] { if !self.is_accel_state(id) { return &[]; } self.accels.needles(self.accelerator_index(id)) } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn get_prefilter(&self) -> Option<&Prefilter> { + self.pre.as_ref() + } } /// The transition table portion of a dense DFA. @@ -2873,7 +3342,7 @@ impl<'a> TransitionTable<&'a [u32]> { /// /// # Safety /// - /// This routine is not safe because it does not check the valdity of the + /// This routine is not safe because it does not check the validity of the /// transition table itself. In particular, the transition table can be /// quite large, so checking its validity can be somewhat expensive. An /// invalid transition table is not safe because other code may rely on the @@ -2886,12 +3355,13 @@ impl<'a> TransitionTable<&'a [u32]> { unsafe fn from_bytes_unchecked( mut slice: &'a [u8], ) -> Result<(TransitionTable<&'a [u32]>, usize), DeserializeError> { - let slice_start = slice.as_ptr() as usize; + let slice_start = slice.as_ptr().as_usize(); - let (count, nr) = bytes::try_read_u32_as_usize(slice, "state count")?; + let (state_len, nr) = + wire::try_read_u32_as_usize(slice, "state length")?; slice = &slice[nr..]; - let (stride2, nr) = bytes::try_read_u32_as_usize(slice, "stride2")?; + let (stride2, nr) = wire::try_read_u32_as_usize(slice, "stride2")?; slice = &slice[nr..]; let (classes, nr) = ByteClasses::from_bytes(slice)?; @@ -2922,37 +3392,32 @@ impl<'a> TransitionTable<&'a [u32]> { )); } - let trans_count = - bytes::shl(count, stride2, "dense table transition count")?; - let table_bytes_len = bytes::mul( - trans_count, + let trans_len = + wire::shl(state_len, stride2, "dense table transition length")?; + let table_bytes_len = wire::mul( + trans_len, 
StateID::SIZE, - "dense table state byte count", + "dense table state byte length", )?; - bytes::check_slice_len(slice, table_bytes_len, "transition table")?; - bytes::check_alignment::<StateID>(slice)?; + wire::check_slice_len(slice, table_bytes_len, "transition table")?; + wire::check_alignment::<StateID>(slice)?; let table_bytes = &slice[..table_bytes_len]; slice = &slice[table_bytes_len..]; // SAFETY: Since StateID is always representable as a u32, all we need // to do is ensure that we have the proper length and alignment. We've // checked both above, so the cast below is safe. // - // N.B. This is the only not-safe code in this function, so we mark - // it explicitly to call it out, even though it is technically - // superfluous. - #[allow(unused_unsafe)] - let table = unsafe { - core::slice::from_raw_parts( - table_bytes.as_ptr() as *const u32, - trans_count, - ) - }; + // N.B. This is the only not-safe code in this function. + let table = core::slice::from_raw_parts( + table_bytes.as_ptr().cast::<u32>(), + trans_len, + ); let tt = TransitionTable { table, classes, stride2 }; - Ok((tt, slice.as_ptr() as usize - slice_start)) + Ok((tt, slice.as_ptr().as_usize() - slice_start)) } } -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] impl TransitionTable<Vec<u32>> { /// Create a minimal transition table with just two states: a dead state /// and a quit state. The alphabet length and stride of the transition @@ -2985,7 +3450,7 @@ impl TransitionTable<Vec<u32>> { /// /// If adding a state would exhaust the state identifier space, then this /// returns an error. - fn add_empty_state(&mut self) -> Result<StateID, Error> { + fn add_empty_state(&mut self) -> Result<StateID, BuildError> { // Normally, to get a fresh state identifier, we would just // take the index of the next state added to the transition // table. However, we actually perform an optimization here @@ -3026,7 +3491,8 @@ impl TransitionTable<Vec<u32>> { // itself. 
e.g., If the stride is 64, then the ID of the 3rd state // is 192, not 2. let next = self.table.len(); - let id = StateID::new(next).map_err(|_| Error::too_many_states())?; + let id = + StateID::new(next).map_err(|_| BuildError::too_many_states())?; self.table.extend(iter::repeat(0).take(self.stride())); Ok(id) } @@ -3049,26 +3515,25 @@ impl TransitionTable<Vec<u32>> { } } - /// Truncate the states in this transition table to the given count. + /// Remap the transitions for the state given according to the function + /// given. This applies the given map function to every transition in the + /// given state and changes the transition in place to the result of the + /// map function for that transition. + fn remap(&mut self, id: StateID, map: impl Fn(StateID) -> StateID) { + for byte in 0..self.alphabet_len() { + let i = id.as_usize() + byte; + let next = self.table()[i]; + self.table_mut()[id.as_usize() + byte] = map(next); + } + } + + /// Truncate the states in this transition table to the given length. /// /// This routine does not do anything to check the correctness of this /// truncation. Callers must ensure that other states pointing to truncated /// states are updated appropriately. - fn truncate(&mut self, count: usize) { - self.table.truncate(count << self.stride2); - } - - /// Return a mutable representation of the state corresponding to the given - /// id. This is useful for implementing routines that manipulate DFA states - /// (e.g., swapping states). 
- fn state_mut(&mut self, id: StateID) -> StateMut<'_> { - let alphabet_len = self.alphabet_len(); - let i = id.as_usize(); - StateMut { - id, - stride2: self.stride2, - transitions: &mut self.table_mut()[i..i + alphabet_len], - } + fn truncate(&mut self, len: usize) { + self.table.truncate(len << self.stride2); } } @@ -3086,9 +3551,9 @@ impl<T: AsRef<[u32]>> TransitionTable<T> { } dst = &mut dst[..nwrite]; - // write state count + // write state length // Unwrap is OK since number of states is guaranteed to fit in a u32. - E::write_u32(u32::try_from(self.count()).unwrap(), dst); + E::write_u32(u32::try_from(self.len()).unwrap(), dst); dst = &mut dst[size_of::<u32>()..]; // write state stride (as power of 2) @@ -3102,7 +3567,7 @@ impl<T: AsRef<[u32]>> TransitionTable<T> { // write actual transitions for &sid in self.table() { - let n = bytes::write_state_id::<E>(sid, &mut dst); + let n = wire::write_state_id::<E>(sid, &mut dst); dst = &mut dst[n..]; } Ok(nwrite) @@ -3111,7 +3576,7 @@ impl<T: AsRef<[u32]>> TransitionTable<T> { /// Returns the number of bytes the serialized form of this transition /// table will use. fn write_to_len(&self) -> usize { - size_of::<u32>() // state count + size_of::<u32>() // state length + size_of::<u32>() // stride2 + self.classes.write_to_len() + (self.table().len() * StateID::SIZE) @@ -3121,8 +3586,25 @@ impl<T: AsRef<[u32]>> TransitionTable<T> { /// /// That is, every state ID can be used to correctly index a state in this /// table. - fn validate(&self) -> Result<(), DeserializeError> { + fn validate(&self, sp: &Special) -> Result<(), DeserializeError> { for state in self.states() { + // We check that the ID itself is well formed. That is, if it's + // a special state then it must actually be a quit, dead, accel, + // match or start state. 
+ if sp.is_special_state(state.id()) { + let is_actually_special = sp.is_dead_state(state.id()) + || sp.is_quit_state(state.id()) + || sp.is_match_state(state.id()) + || sp.is_start_state(state.id()) + || sp.is_accel_state(state.id()); + if !is_actually_special { + // This is kind of a cryptic error message... + return Err(DeserializeError::generic( + "found dense state tagged as special but \ + wasn't actually special", + )); + } + } for (_, to) in state.transitions() { if !self.is_valid(to) { return Err(DeserializeError::generic( @@ -3145,7 +3627,7 @@ impl<T: AsRef<[u32]>> TransitionTable<T> { /// Converts this transition table to an owned value. #[cfg(feature = "alloc")] - fn to_owned(&self) -> TransitionTable<Vec<u32>> { + fn to_owned(&self) -> TransitionTable<alloc::vec::Vec<u32>> { TransitionTable { table: self.table.as_ref().to_vec(), classes: self.classes.clone(), @@ -3179,7 +3661,7 @@ impl<T: AsRef<[u32]>> TransitionTable<T> { } /// Convert a state identifier to an index to a state (in the range - /// 0..self.count()). + /// 0..self.len()). /// /// This is useful when using a `Vec<T>` as an efficient map keyed by state /// to some other information (such as a remapped state ID). @@ -3190,7 +3672,7 @@ impl<T: AsRef<[u32]>> TransitionTable<T> { id.as_usize() >> self.stride2 } - /// Convert an index to a state (in the range 0..self.count()) to an actual + /// Convert an index to a state (in the range 0..self.len()) to an actual /// state identifier. /// /// This is useful when using a `Vec<T>` as an efficient map keyed by state @@ -3198,7 +3680,7 @@ impl<T: AsRef<[u32]>> TransitionTable<T> { /// /// If the given index is not in the specified range, then this may panic /// or produce an incorrect state ID. - fn from_index(&self, index: usize) -> StateID { + fn to_state_id(&self, index: usize) -> StateID { // CORRECTNESS: If the given index is not valid, then it is not // required for this to panic or return a valid state ID. 
StateID::new_unchecked(index << self.stride2) @@ -3209,30 +3691,22 @@ impl<T: AsRef<[u32]>> TransitionTable<T> { /// This does not check whether the state ID returned is invalid. In fact, /// if the state ID given is the last state in this DFA, then the state ID /// returned is guaranteed to be invalid. - #[cfg(feature = "alloc")] + #[cfg(feature = "dfa-build")] fn next_state_id(&self, id: StateID) -> StateID { - self.from_index(self.to_index(id).checked_add(1).unwrap()) + self.to_state_id(self.to_index(id).checked_add(1).unwrap()) } /// Returns the state ID for the state immediately preceding the one given. /// /// If the dead ID given (which is zero), then this panics. - #[cfg(feature = "alloc")] + #[cfg(feature = "dfa-build")] fn prev_state_id(&self, id: StateID) -> StateID { - self.from_index(self.to_index(id).checked_sub(1).unwrap()) + self.to_state_id(self.to_index(id).checked_sub(1).unwrap()) } /// Returns the table as a slice of state IDs. fn table(&self) -> &[StateID] { - let integers = self.table.as_ref(); - // SAFETY: This is safe because StateID is guaranteed to be - // representable as a u32. - unsafe { - core::slice::from_raw_parts( - integers.as_ptr() as *const StateID, - integers.len(), - ) - } + wire::u32s_to_state_ids(self.table.as_ref()) } /// Returns the total number of states in this transition table. @@ -3241,7 +3715,7 @@ impl<T: AsRef<[u32]>> TransitionTable<T> { /// states. In particular, the dead state always has ID 0 and is /// correspondingly always the first state. The dead state is never a match /// state. - fn count(&self) -> usize { + fn len(&self) -> usize { self.table().len() >> self.stride2 } @@ -3277,19 +3751,11 @@ impl<T: AsRef<[u32]>> TransitionTable<T> { } } -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] impl<T: AsMut<[u32]>> TransitionTable<T> { /// Returns the table as a slice of state IDs. 
fn table_mut(&mut self) -> &mut [StateID] { - let integers = self.table.as_mut(); - // SAFETY: This is safe because StateID is guaranteed to be - // representable as a u32. - unsafe { - core::slice::from_raw_parts_mut( - integers.as_mut_ptr() as *mut StateID, - integers.len(), - ) - } + wire::u32s_to_state_ids_mut(self.table.as_mut()) } } @@ -3330,10 +3796,10 @@ impl<T: AsMut<[u32]>> TransitionTable<T> { /// /// 1. If the search starts at the beginning of `context`, then the `Text` /// start state is used. (Since `^` corresponds to -/// `hir::Anchor::StartText`.) +/// `hir::Anchor::Start`.) /// 2. If the search starts at a position immediately following a line /// terminator, then the `Line` start state is used. (Since `(?m:^)` -/// corresponds to `hir::Anchor::StartLine`.) +/// corresponds to `hir::Anchor::StartLF`.) /// 3. If the search starts at a position immediately following a byte /// classified as a "word" character (`[_0-9a-zA-Z]`), then the `WordByte` /// start state is used. (Since `(?-u:\b)` corresponds to a word boundary.) @@ -3372,23 +3838,41 @@ pub(crate) struct StartTable<T> { /// /// In practice, T is either `Vec<u32>` or `&[u32]`. /// - /// The first `stride` (currently always 4) entries always correspond to - /// the start states for the entire DFA. After that, there are - /// `stride * patterns` state IDs, where `patterns` may be zero in the - /// case of a DFA with no patterns or in the case where the DFA was built - /// without enabling starting states for each pattern. + /// The first `2 * stride` (currently always 8) entries always correspond + /// to the starts states for the entire DFA, with the first 4 entries being + /// for unanchored searches and the second 4 entries being for anchored + /// searches. To keep things simple, we always use 8 entries even if the + /// `StartKind` is not both. 
+ /// + /// After that, there are `stride * patterns` state IDs, where `patterns` + /// may be zero in the case of a DFA with no patterns or in the case where + /// the DFA was built without enabling starting states for each pattern. table: T, + /// The starting state configuration supported. When 'both', both + /// unanchored and anchored searches work. When 'unanchored', anchored + /// searches panic. When 'anchored', unanchored searches panic. + kind: StartKind, + /// The start state configuration for every possible byte. + start_map: StartByteMap, /// The number of starting state IDs per pattern. stride: usize, /// The total number of patterns for which starting states are encoded. - /// This may be zero for non-empty DFAs when the DFA was built without - /// start states for each pattern. Thus, one cannot use this field to - /// say how many patterns are in the DFA in all cases. It is specific to - /// how many patterns are represented in this start table. - patterns: usize, + /// This is `None` for DFAs that were built without start states for each + /// pattern. Thus, one cannot use this field to say how many patterns + /// are in the DFA in all cases. It is specific to how many patterns are + /// represented in this start table. + pattern_len: Option<usize>, + /// The universal starting state for unanchored searches. This is only + /// present when the DFA supports unanchored searches and when all starting + /// state IDs for an unanchored search are equivalent. + universal_start_unanchored: Option<StateID>, + /// The universal starting state for anchored searches. This is only + /// present when the DFA supports anchored searches and when all starting + /// state IDs for an anchored search are equivalent. + universal_start_anchored: Option<StateID>, } -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] impl StartTable<Vec<u32>> { /// Create a valid set of start states all pointing to the dead state. 
/// @@ -3400,22 +3884,40 @@ impl StartTable<Vec<u32>> { /// returns an error. In practice, this is unlikely to be able to occur, /// since it's likely that allocation would have failed long before it got /// to this point. - fn dead(patterns: usize) -> Result<StartTable<Vec<u32>>, Error> { - assert!(patterns <= PatternID::LIMIT); - let stride = Start::count(); - let pattern_starts_len = match stride.checked_mul(patterns) { - Some(x) => x, - None => return Err(Error::too_many_start_states()), - }; - let table_len = match stride.checked_add(pattern_starts_len) { + fn dead( + kind: StartKind, + lookm: &LookMatcher, + pattern_len: Option<usize>, + ) -> Result<StartTable<Vec<u32>>, BuildError> { + if let Some(len) = pattern_len { + assert!(len <= PatternID::LIMIT); + } + let stride = Start::len(); + // OK because 2*4 is never going to overflow anything. + let starts_len = stride.checked_mul(2).unwrap(); + let pattern_starts_len = + match stride.checked_mul(pattern_len.unwrap_or(0)) { + Some(x) => x, + None => return Err(BuildError::too_many_start_states()), + }; + let table_len = match starts_len.checked_add(pattern_starts_len) { Some(x) => x, - None => return Err(Error::too_many_start_states()), + None => return Err(BuildError::too_many_start_states()), }; - if table_len > core::isize::MAX as usize { - return Err(Error::too_many_start_states()); + if let Err(_) = isize::try_from(table_len) { + return Err(BuildError::too_many_start_states()); } let table = vec![DEAD.as_u32(); table_len]; - Ok(StartTable { table, stride, patterns }) + let start_map = StartByteMap::new(lookm); + Ok(StartTable { + table, + kind, + start_map, + stride, + pattern_len, + universal_start_unanchored: None, + universal_start_anchored: None, + }) } } @@ -3433,7 +3935,7 @@ impl<'a> StartTable<&'a [u32]> { /// /// # Safety /// - /// This routine is not safe because it does not check the valdity of the + /// This routine is not safe because it does not check the validity of the /// starting state 
IDs themselves. In particular, the number of starting /// IDs can be of variable length, so it's possible that checking their /// validity cannot be done in constant time. An invalid starting state @@ -3447,61 +3949,104 @@ impl<'a> StartTable<&'a [u32]> { unsafe fn from_bytes_unchecked( mut slice: &'a [u8], ) -> Result<(StartTable<&'a [u32]>, usize), DeserializeError> { - let slice_start = slice.as_ptr() as usize; + let slice_start = slice.as_ptr().as_usize(); - let (stride, nr) = - bytes::try_read_u32_as_usize(slice, "start table stride")?; + let (kind, nr) = StartKind::from_bytes(slice)?; slice = &slice[nr..]; - let (patterns, nr) = - bytes::try_read_u32_as_usize(slice, "start table patterns")?; + let (start_map, nr) = StartByteMap::from_bytes(slice)?; slice = &slice[nr..]; - if stride != Start::count() { + let (stride, nr) = + wire::try_read_u32_as_usize(slice, "start table stride")?; + slice = &slice[nr..]; + if stride != Start::len() { return Err(DeserializeError::generic( "invalid starting table stride", )); } - if patterns > PatternID::LIMIT { + + let (maybe_pattern_len, nr) = + wire::try_read_u32_as_usize(slice, "start table patterns")?; + slice = &slice[nr..]; + let pattern_len = if maybe_pattern_len.as_u32() == u32::MAX { + None + } else { + Some(maybe_pattern_len) + }; + if pattern_len.map_or(false, |len| len > PatternID::LIMIT) { return Err(DeserializeError::generic( "invalid number of patterns", )); } - let pattern_table_size = - bytes::mul(stride, patterns, "invalid pattern count")?; - // Our start states always start with a single stride of start states - // for the entire automaton which permit it to match any pattern. What - // follows it are an optional set of start states for each pattern. 
- let start_state_count = bytes::add( + + let (universal_unanchored, nr) = + wire::try_read_u32(slice, "universal unanchored start")?; + slice = &slice[nr..]; + let universal_start_unanchored = if universal_unanchored == u32::MAX { + None + } else { + Some(StateID::try_from(universal_unanchored).map_err(|e| { + DeserializeError::state_id_error( + e, + "universal unanchored start", + ) + })?) + }; + + let (universal_anchored, nr) = + wire::try_read_u32(slice, "universal anchored start")?; + slice = &slice[nr..]; + let universal_start_anchored = if universal_anchored == u32::MAX { + None + } else { + Some(StateID::try_from(universal_anchored).map_err(|e| { + DeserializeError::state_id_error(e, "universal anchored start") + })?) + }; + + let pattern_table_size = wire::mul( stride, + pattern_len.unwrap_or(0), + "invalid pattern length", + )?; + // Our start states always start with a two stride of start states for + // the entire automaton. The first stride is for unanchored starting + // states and the second stride is for anchored starting states. What + // follows it are an optional set of start states for each pattern. + let start_state_len = wire::add( + wire::mul(2, stride, "start state stride too big")?, pattern_table_size, "invalid 'any' pattern starts size", )?; - let table_bytes_len = bytes::mul( - start_state_count, + let table_bytes_len = wire::mul( + start_state_len, StateID::SIZE, "pattern table bytes length", )?; - bytes::check_slice_len(slice, table_bytes_len, "start ID table")?; - bytes::check_alignment::<StateID>(slice)?; + wire::check_slice_len(slice, table_bytes_len, "start ID table")?; + wire::check_alignment::<StateID>(slice)?; let table_bytes = &slice[..table_bytes_len]; slice = &slice[table_bytes_len..]; // SAFETY: Since StateID is always representable as a u32, all we need // to do is ensure that we have the proper length and alignment. We've // checked both above, so the cast below is safe. // - // N.B. 
This is the only not-safe code in this function, so we mark - // it explicitly to call it out, even though it is technically - // superfluous. - #[allow(unused_unsafe)] - let table = unsafe { - core::slice::from_raw_parts( - table_bytes.as_ptr() as *const u32, - start_state_count, - ) + // N.B. This is the only not-safe code in this function. + let table = core::slice::from_raw_parts( + table_bytes.as_ptr().cast::<u32>(), + start_state_len, + ); + let st = StartTable { + table, + kind, + start_map, + stride, + pattern_len, + universal_start_unanchored, + universal_start_anchored, }; - let st = StartTable { table, stride, patterns }; - Ok((st, slice.as_ptr() as usize - slice_start)) + Ok((st, slice.as_ptr().as_usize() - slice_start)) } } @@ -3521,17 +4066,39 @@ impl<T: AsRef<[u32]>> StartTable<T> { } dst = &mut dst[..nwrite]; + // write start kind + let nw = self.kind.write_to::<E>(dst)?; + dst = &mut dst[nw..]; + // write start byte map + let nw = self.start_map.write_to(dst)?; + dst = &mut dst[nw..]; // write stride // Unwrap is OK since the stride is always 4 (currently). E::write_u32(u32::try_from(self.stride).unwrap(), dst); dst = &mut dst[size_of::<u32>()..]; - // write pattern count + // write pattern length // Unwrap is OK since number of patterns is guaranteed to fit in a u32. 
- E::write_u32(u32::try_from(self.patterns).unwrap(), dst); + E::write_u32( + u32::try_from(self.pattern_len.unwrap_or(0xFFFF_FFFF)).unwrap(), + dst, + ); + dst = &mut dst[size_of::<u32>()..]; + // write universal start unanchored state id, u32::MAX if absent + E::write_u32( + self.universal_start_unanchored + .map_or(u32::MAX, |sid| sid.as_u32()), + dst, + ); + dst = &mut dst[size_of::<u32>()..]; + // write universal start anchored state id, u32::MAX if absent + E::write_u32( + self.universal_start_anchored.map_or(u32::MAX, |sid| sid.as_u32()), + dst, + ); dst = &mut dst[size_of::<u32>()..]; // write start IDs for &sid in self.table() { - let n = bytes::write_state_id::<E>(sid, &mut dst); + let n = wire::write_state_id::<E>(sid, &mut dst); dst = &mut dst[n..]; } Ok(nwrite) @@ -3540,8 +4107,12 @@ impl<T: AsRef<[u32]>> StartTable<T> { /// Returns the number of bytes the serialized form of this start ID table /// will use. fn write_to_len(&self) -> usize { - size_of::<u32>() // stride + self.kind.write_to_len() + + self.start_map.write_to_len() + + size_of::<u32>() // stride + size_of::<u32>() // # patterns + + size_of::<u32>() // universal unanchored start + + size_of::<u32>() // universal anchored start + (self.table().len() * StateID::SIZE) } @@ -3553,6 +4124,16 @@ impl<T: AsRef<[u32]>> StartTable<T> { &self, tt: &TransitionTable<T>, ) -> Result<(), DeserializeError> { + if !self.universal_start_unanchored.map_or(true, |s| tt.is_valid(s)) { + return Err(DeserializeError::generic( + "found invalid universal unanchored starting state ID", + )); + } + if !self.universal_start_anchored.map_or(true, |s| tt.is_valid(s)) { + return Err(DeserializeError::generic( + "found invalid universal anchored starting state ID", + )); + } for &id in self.table() { if !tt.is_valid(id) { return Err(DeserializeError::generic( @@ -3567,38 +4148,72 @@ impl<T: AsRef<[u32]>> StartTable<T> { fn as_ref(&self) -> StartTable<&'_ [u32]> { StartTable { table: self.table.as_ref(), + kind: 
self.kind, + start_map: self.start_map.clone(), stride: self.stride, - patterns: self.patterns, + pattern_len: self.pattern_len, + universal_start_unanchored: self.universal_start_unanchored, + universal_start_anchored: self.universal_start_anchored, } } /// Converts this start list to an owned value. #[cfg(feature = "alloc")] - fn to_owned(&self) -> StartTable<Vec<u32>> { + fn to_owned(&self) -> StartTable<alloc::vec::Vec<u32>> { StartTable { table: self.table.as_ref().to_vec(), + kind: self.kind, + start_map: self.start_map.clone(), stride: self.stride, - patterns: self.patterns, + pattern_len: self.pattern_len, + universal_start_unanchored: self.universal_start_unanchored, + universal_start_anchored: self.universal_start_anchored, } } - /// Return the start state for the given start index and pattern ID. If the - /// pattern ID is None, then the corresponding start state for the entire - /// DFA is returned. If the pattern ID is not None, then the corresponding - /// starting state for the given pattern is returned. If this start table - /// does not have individual starting states for each pattern, then this - /// panics. - fn start(&self, index: Start, pattern_id: Option<PatternID>) -> StateID { - let start_index = index.as_usize(); - let index = match pattern_id { - None => start_index, - Some(pid) => { - let pid = pid.as_usize(); - assert!(pid < self.patterns, "invalid pattern ID {:?}", pid); - self.stride + (self.stride * pid) + start_index + /// Return the start state for the given input and starting configuration. + /// This returns an error if the input configuration is not supported by + /// this DFA. For example, requesting an unanchored search when the DFA was + /// not built with unanchored starting states. Or asking for an anchored + /// pattern search with an invalid pattern ID or on a DFA that was not + /// built with start states for each pattern. 
+ #[cfg_attr(feature = "perf-inline", inline(always))] + fn start( + &self, + input: &Input<'_>, + start: Start, + ) -> Result<StateID, MatchError> { + let start_index = start.as_usize(); + let mode = input.get_anchored(); + let index = match mode { + Anchored::No => { + if !self.kind.has_unanchored() { + return Err(MatchError::unsupported_anchored(mode)); + } + start_index + } + Anchored::Yes => { + if !self.kind.has_anchored() { + return Err(MatchError::unsupported_anchored(mode)); + } + self.stride + start_index + } + Anchored::Pattern(pid) => { + let len = match self.pattern_len { + None => { + return Err(MatchError::unsupported_anchored(mode)) + } + Some(len) => len, + }; + if pid.as_usize() >= len { + return Ok(DEAD); + } + (2 * self.stride) + + (self.stride * pid.as_usize()) + + start_index } }; - self.table()[index] + Ok(self.table()[index]) } /// Returns an iterator over all start state IDs in this table. @@ -3611,15 +4226,7 @@ impl<T: AsRef<[u32]>> StartTable<T> { /// Returns the table as a slice of state IDs. fn table(&self) -> &[StateID] { - let integers = self.table.as_ref(); - // SAFETY: This is safe because StateID is guaranteed to be - // representable as a u32. - unsafe { - core::slice::from_raw_parts( - integers.as_ptr() as *const StateID, - integers.len(), - ) - } + wire::u32s_to_state_ids(self.table.as_ref()) } /// Return the memory usage, in bytes, of this start list. @@ -3630,62 +4237,56 @@ impl<T: AsRef<[u32]>> StartTable<T> { } } -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] impl<T: AsMut<[u32]>> StartTable<T> { /// Set the start state for the given index and pattern. /// /// If the pattern ID or state ID are not valid, then this will panic. 
- fn set_start( - &mut self, - index: Start, - pattern_id: Option<PatternID>, - id: StateID, - ) { - let start_index = index.as_usize(); - let index = match pattern_id { - None => start_index, - Some(pid) => self - .stride - .checked_mul(pid.as_usize()) - .unwrap() - .checked_add(self.stride) - .unwrap() - .checked_add(start_index) - .unwrap(), + fn set_start(&mut self, anchored: Anchored, start: Start, id: StateID) { + let start_index = start.as_usize(); + let index = match anchored { + Anchored::No => start_index, + Anchored::Yes => self.stride + start_index, + Anchored::Pattern(pid) => { + let pid = pid.as_usize(); + let len = self + .pattern_len + .expect("start states for each pattern enabled"); + assert!(pid < len, "invalid pattern ID {:?}", pid); + self.stride + .checked_mul(pid) + .unwrap() + .checked_add(self.stride.checked_mul(2).unwrap()) + .unwrap() + .checked_add(start_index) + .unwrap() + } }; self.table_mut()[index] = id; } /// Returns the table as a mutable slice of state IDs. fn table_mut(&mut self) -> &mut [StateID] { - let integers = self.table.as_mut(); - // SAFETY: This is safe because StateID is guaranteed to be - // representable as a u32. - unsafe { - core::slice::from_raw_parts_mut( - integers.as_mut_ptr() as *mut StateID, - integers.len(), - ) - } + wire::u32s_to_state_ids_mut(self.table.as_mut()) } } /// An iterator over start state IDs. /// -/// This iterator yields a triple of start state ID, the start state type -/// and the pattern ID (if any). The pattern ID is None for start states -/// corresponding to the entire DFA and non-None for start states corresponding -/// to a specific pattern. The latter only occurs when the DFA is compiled with -/// start states for each pattern. +/// This iterator yields a triple of start state ID, the anchored mode and the +/// start state type. If a pattern ID is relevant, then the anchored mode will +/// contain it. 
Start states with an anchored mode containing a pattern ID will +/// only occur when the DFA was compiled with start states for each pattern +/// (which is disabled by default). pub(crate) struct StartStateIter<'a> { st: StartTable<&'a [u32]>, i: usize, } impl<'a> Iterator for StartStateIter<'a> { - type Item = (StateID, Start, Option<PatternID>); + type Item = (StateID, Anchored, Start); - fn next(&mut self) -> Option<(StateID, Start, Option<PatternID>)> { + fn next(&mut self) -> Option<(StateID, Anchored, Start)> { let i = self.i; let table = self.st.table(); if i >= table.len() { @@ -3696,14 +4297,15 @@ impl<'a> Iterator for StartStateIter<'a> { // This unwrap is okay since the stride of the starting state table // must always match the number of start state types. let start_type = Start::from_usize(i % self.st.stride).unwrap(); - let pid = if i < self.st.stride { - None + let anchored = if i < self.st.stride { + Anchored::No + } else if i < (2 * self.st.stride) { + Anchored::Yes } else { - Some( - PatternID::new((i - self.st.stride) / self.st.stride).unwrap(), - ) + let pid = (i - (2 * self.st.stride)) / self.st.stride; + Anchored::Pattern(PatternID::new(pid).unwrap()) }; - Some((table[i], start_type, pid)) + Some((table[i], anchored, start_type)) } } @@ -3735,105 +4337,93 @@ struct MatchStates<T> { /// In practice, T is either Vec<u32> or &[u32]. pattern_ids: T, /// The total number of unique patterns represented by these match states. - patterns: usize, + pattern_len: usize, } impl<'a> MatchStates<&'a [u32]> { unsafe fn from_bytes_unchecked( mut slice: &'a [u8], ) -> Result<(MatchStates<&'a [u32]>, usize), DeserializeError> { - let slice_start = slice.as_ptr() as usize; + let slice_start = slice.as_ptr().as_usize(); // Read the total number of match states. 
- let (count, nr) = - bytes::try_read_u32_as_usize(slice, "match state count")?; + let (state_len, nr) = + wire::try_read_u32_as_usize(slice, "match state length")?; slice = &slice[nr..]; // Read the slice start/length pairs. - let pair_count = bytes::mul(2, count, "match state offset pairs")?; - let slices_bytes_len = bytes::mul( - pair_count, + let pair_len = wire::mul(2, state_len, "match state offset pairs")?; + let slices_bytes_len = wire::mul( + pair_len, PatternID::SIZE, "match state slice offset byte length", )?; - bytes::check_slice_len(slice, slices_bytes_len, "match state slices")?; - bytes::check_alignment::<PatternID>(slice)?; + wire::check_slice_len(slice, slices_bytes_len, "match state slices")?; + wire::check_alignment::<PatternID>(slice)?; let slices_bytes = &slice[..slices_bytes_len]; slice = &slice[slices_bytes_len..]; // SAFETY: Since PatternID is always representable as a u32, all we // need to do is ensure that we have the proper length and alignment. // We've checked both above, so the cast below is safe. // - // N.B. This is one of the few not-safe snippets in this function, so - // we mark it explicitly to call it out, even though it is technically - // superfluous. - #[allow(unused_unsafe)] - let slices = unsafe { - core::slice::from_raw_parts( - slices_bytes.as_ptr() as *const u32, - pair_count, - ) - }; + // N.B. This is one of the few not-safe snippets in this function, + // so we mark it explicitly to call it out. + let slices = core::slice::from_raw_parts( + slices_bytes.as_ptr().cast::<u32>(), + pair_len, + ); // Read the total number of unique pattern IDs (which is always 1 more // than the maximum pattern ID in this automaton, since pattern IDs are // handed out contiguously starting at 0). - let (patterns, nr) = - bytes::try_read_u32_as_usize(slice, "pattern count")?; + let (pattern_len, nr) = + wire::try_read_u32_as_usize(slice, "pattern length")?; slice = &slice[nr..]; - // Now read the pattern ID count. 
We don't need to store this + // Now read the pattern ID length. We don't need to store this // explicitly, but we need it to know how many pattern IDs to read. - let (idcount, nr) = - bytes::try_read_u32_as_usize(slice, "pattern ID count")?; + let (idlen, nr) = + wire::try_read_u32_as_usize(slice, "pattern ID length")?; slice = &slice[nr..]; // Read the actual pattern IDs. let pattern_ids_len = - bytes::mul(idcount, PatternID::SIZE, "pattern ID byte length")?; - bytes::check_slice_len(slice, pattern_ids_len, "match pattern IDs")?; - bytes::check_alignment::<PatternID>(slice)?; + wire::mul(idlen, PatternID::SIZE, "pattern ID byte length")?; + wire::check_slice_len(slice, pattern_ids_len, "match pattern IDs")?; + wire::check_alignment::<PatternID>(slice)?; let pattern_ids_bytes = &slice[..pattern_ids_len]; slice = &slice[pattern_ids_len..]; // SAFETY: Since PatternID is always representable as a u32, all we // need to do is ensure that we have the proper length and alignment. // We've checked both above, so the cast below is safe. // - // N.B. This is one of the few not-safe snippets in this function, so - // we mark it explicitly to call it out, even though it is technically - // superfluous. - #[allow(unused_unsafe)] - let pattern_ids = unsafe { - core::slice::from_raw_parts( - pattern_ids_bytes.as_ptr() as *const u32, - idcount, - ) - }; + // N.B. This is one of the few not-safe snippets in this function, + // so we mark it explicitly to call it out. 
+ let pattern_ids = core::slice::from_raw_parts( + pattern_ids_bytes.as_ptr().cast::<u32>(), + idlen, + ); - let ms = MatchStates { slices, pattern_ids, patterns }; - Ok((ms, slice.as_ptr() as usize - slice_start)) + let ms = MatchStates { slices, pattern_ids, pattern_len }; + Ok((ms, slice.as_ptr().as_usize() - slice_start)) } } -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] impl MatchStates<Vec<u32>> { - fn empty(pattern_count: usize) -> MatchStates<Vec<u32>> { - assert!(pattern_count <= PatternID::LIMIT); - MatchStates { - slices: vec![], - pattern_ids: vec![], - patterns: pattern_count, - } + fn empty(pattern_len: usize) -> MatchStates<Vec<u32>> { + assert!(pattern_len <= PatternID::LIMIT); + MatchStates { slices: vec![], pattern_ids: vec![], pattern_len } } fn new( matches: &BTreeMap<StateID, Vec<PatternID>>, - pattern_count: usize, - ) -> Result<MatchStates<Vec<u32>>, Error> { - let mut m = MatchStates::empty(pattern_count); + pattern_len: usize, + ) -> Result<MatchStates<Vec<u32>>, BuildError> { + let mut m = MatchStates::empty(pattern_len); for (_, pids) in matches.iter() { let start = PatternID::new(m.pattern_ids.len()) - .map_err(|_| Error::too_many_match_pattern_ids())?; + .map_err(|_| BuildError::too_many_match_pattern_ids())?; m.slices.push(start.as_u32()); // This is always correct since the number of patterns in a single // match state can never exceed maximum number of allowable @@ -3846,15 +4436,15 @@ impl MatchStates<Vec<u32>> { m.pattern_ids.push(pid.as_u32()); } } - m.patterns = pattern_count; + m.pattern_len = pattern_len; Ok(m) } fn new_with_map( &self, matches: &BTreeMap<StateID, Vec<PatternID>>, - ) -> Result<MatchStates<Vec<u32>>, Error> { - MatchStates::new(matches, self.patterns) + ) -> Result<MatchStates<Vec<u32>>, BuildError> { + MatchStates::new(matches, self.pattern_len) } } @@ -3872,23 +4462,23 @@ impl<T: AsRef<[u32]>> MatchStates<T> { } dst = &mut dst[..nwrite]; - // write state ID count + // write state ID length // 
Unwrap is OK since number of states is guaranteed to fit in a u32. - E::write_u32(u32::try_from(self.count()).unwrap(), dst); + E::write_u32(u32::try_from(self.len()).unwrap(), dst); dst = &mut dst[size_of::<u32>()..]; // write slice offset pairs for &pid in self.slices() { - let n = bytes::write_pattern_id::<E>(pid, &mut dst); + let n = wire::write_pattern_id::<E>(pid, &mut dst); dst = &mut dst[n..]; } - // write unique pattern ID count + // write unique pattern ID length // Unwrap is OK since number of patterns is guaranteed to fit in a u32. - E::write_u32(u32::try_from(self.patterns).unwrap(), dst); + E::write_u32(u32::try_from(self.pattern_len).unwrap(), dst); dst = &mut dst[size_of::<u32>()..]; - // write pattern ID count + // write pattern ID length // Unwrap is OK since we check at construction (and deserialization) // that the number of patterns is representable as a u32. E::write_u32(u32::try_from(self.pattern_ids().len()).unwrap(), dst); @@ -3896,32 +4486,32 @@ impl<T: AsRef<[u32]>> MatchStates<T> { // write pattern IDs for &pid in self.pattern_ids() { - let n = bytes::write_pattern_id::<E>(pid, &mut dst); + let n = wire::write_pattern_id::<E>(pid, &mut dst); dst = &mut dst[n..]; } Ok(nwrite) } - /// Returns the number of bytes the serialized form of this transition - /// table will use. + /// Returns the number of bytes the serialized form of these match states + /// will use. fn write_to_len(&self) -> usize { - size_of::<u32>() // match state count + size_of::<u32>() // match state length + (self.slices().len() * PatternID::SIZE) - + size_of::<u32>() // unique pattern ID count - + size_of::<u32>() // pattern ID count + + size_of::<u32>() // unique pattern ID length + + size_of::<u32>() // pattern ID length + (self.pattern_ids().len() * PatternID::SIZE) } /// Valides that the match state info is itself internally consistent and /// consistent with the recorded match state region in the given DFA. 
fn validate(&self, dfa: &DFA<T>) -> Result<(), DeserializeError> { - if self.count() != dfa.special.match_len(dfa.stride()) { + if self.len() != dfa.special.match_len(dfa.stride()) { return Err(DeserializeError::generic( - "match state count mismatch", + "match state length mismatch", )); } - for si in 0..self.count() { + for si in 0..self.len() { let start = self.slices()[si * 2].as_usize(); let len = self.slices()[si * 2 + 1].as_usize(); if start >= self.pattern_ids().len() { @@ -3936,7 +4526,7 @@ impl<T: AsRef<[u32]>> MatchStates<T> { } for mi in 0..len { let pid = self.pattern_id(si, mi); - if pid.as_usize() >= self.patterns { + if pid.as_usize() >= self.pattern_len { return Err(DeserializeError::generic( "invalid pattern ID", )); @@ -3956,10 +4546,10 @@ impl<T: AsRef<[u32]>> MatchStates<T> { /// } /// /// Once shuffling is done, use MatchStates::new to convert back. - #[cfg(feature = "alloc")] + #[cfg(feature = "dfa-build")] fn to_map(&self, dfa: &DFA<T>) -> BTreeMap<StateID, Vec<PatternID>> { let mut map = BTreeMap::new(); - for i in 0..self.count() { + for i in 0..self.len() { let mut pids = vec![]; for j in 0..self.pattern_len(i) { pids.push(self.pattern_id(i, j)); @@ -3974,17 +4564,17 @@ impl<T: AsRef<[u32]>> MatchStates<T> { MatchStates { slices: self.slices.as_ref(), pattern_ids: self.pattern_ids.as_ref(), - patterns: self.patterns, + pattern_len: self.pattern_len, } } /// Converts these match states to an owned value. #[cfg(feature = "alloc")] - fn to_owned(&self) -> MatchStates<Vec<u32>> { + fn to_owned(&self) -> MatchStates<alloc::vec::Vec<u32>> { MatchStates { slices: self.slices.as_ref().to_vec(), pattern_ids: self.pattern_ids.as_ref().to_vec(), - patterns: self.patterns, + pattern_len: self.pattern_len, } } @@ -4015,6 +4605,7 @@ impl<T: AsRef<[u32]>> MatchStates<T> { /// /// The match index is the index of the pattern ID for the given state. /// The index must be less than `self.pattern_len(state_index)`. 
+ #[cfg_attr(feature = "perf-inline", inline(always))] fn pattern_id(&self, state_index: usize, match_index: usize) -> PatternID { self.pattern_id_slice(state_index)[match_index] } @@ -4023,6 +4614,7 @@ impl<T: AsRef<[u32]>> MatchStates<T> { /// /// The match state index is the state index minus the state index of the /// first match state in the DFA. + #[cfg_attr(feature = "perf-inline", inline(always))] fn pattern_len(&self, state_index: usize) -> usize { self.slices()[state_index * 2 + 1].as_usize() } @@ -4031,6 +4623,7 @@ impl<T: AsRef<[u32]>> MatchStates<T> { /// /// The match state index is the state index minus the state index of the /// first match state in the DFA. + #[cfg_attr(feature = "perf-inline", inline(always))] fn pattern_id_slice(&self, state_index: usize) -> &[PatternID] { let start = self.slices()[state_index * 2].as_usize(); let len = self.pattern_len(state_index); @@ -4038,35 +4631,22 @@ impl<T: AsRef<[u32]>> MatchStates<T> { } /// Returns the pattern ID offset slice of u32 as a slice of PatternID. + #[cfg_attr(feature = "perf-inline", inline(always))] fn slices(&self) -> &[PatternID] { - let integers = self.slices.as_ref(); - // SAFETY: This is safe because PatternID is guaranteed to be - // representable as a u32. - unsafe { - core::slice::from_raw_parts( - integers.as_ptr() as *const PatternID, - integers.len(), - ) - } + wire::u32s_to_pattern_ids(self.slices.as_ref()) } /// Returns the total number of match states. - fn count(&self) -> usize { + #[cfg_attr(feature = "perf-inline", inline(always))] + fn len(&self) -> usize { assert_eq!(0, self.slices().len() % 2); self.slices().len() / 2 } /// Returns the pattern ID slice of u32 as a slice of PatternID. + #[cfg_attr(feature = "perf-inline", inline(always))] fn pattern_ids(&self) -> &[PatternID] { - let integers = self.pattern_ids.as_ref(); - // SAFETY: This is safe because PatternID is guaranteed to be - // representable as a u32. 
- unsafe { - core::slice::from_raw_parts( - integers.as_ptr() as *const PatternID, - integers.len(), - ) - } + wire::u32s_to_pattern_ids(self.pattern_ids.as_ref()) } /// Return the memory usage, in bytes, of these match pairs. @@ -4075,6 +4655,86 @@ impl<T: AsRef<[u32]>> MatchStates<T> { } } +/// A common set of flags for both dense and sparse DFAs. This primarily +/// centralizes the serialization format of these flags at a bitset. +#[derive(Clone, Copy, Debug)] +pub(crate) struct Flags { + /// Whether the DFA can match the empty string. When this is false, all + /// matches returned by this DFA are guaranteed to have non-zero length. + pub(crate) has_empty: bool, + /// Whether the DFA should only produce matches with spans that correspond + /// to valid UTF-8. This also includes omitting any zero-width matches that + /// split the UTF-8 encoding of a codepoint. + pub(crate) is_utf8: bool, + /// Whether the DFA is always anchored or not, regardless of `Input` + /// configuration. This is useful for avoiding a reverse scan even when + /// executing unanchored searches. + pub(crate) is_always_start_anchored: bool, +} + +impl Flags { + /// Creates a set of flags for a DFA from an NFA. + /// + /// N.B. This constructor was defined at the time of writing because all + /// of the flags are derived directly from the NFA. If this changes in the + /// future, we might be more thoughtful about how the `Flags` value is + /// itself built. + #[cfg(feature = "dfa-build")] + fn from_nfa(nfa: &thompson::NFA) -> Flags { + Flags { + has_empty: nfa.has_empty(), + is_utf8: nfa.is_utf8(), + is_always_start_anchored: nfa.is_always_start_anchored(), + } + } + + /// Deserializes the flags from the given slice. On success, this also + /// returns the number of bytes read from the slice. 
+ pub(crate) fn from_bytes( + slice: &[u8], + ) -> Result<(Flags, usize), DeserializeError> { + let (bits, nread) = wire::try_read_u32(slice, "flag bitset")?; + let flags = Flags { + has_empty: bits & (1 << 0) != 0, + is_utf8: bits & (1 << 1) != 0, + is_always_start_anchored: bits & (1 << 2) != 0, + }; + Ok((flags, nread)) + } + + /// Writes these flags to the given byte slice. If the buffer is too small, + /// then an error is returned. To determine how big the buffer must be, + /// use `write_to_len`. + pub(crate) fn write_to<E: Endian>( + &self, + dst: &mut [u8], + ) -> Result<usize, SerializeError> { + fn bool_to_int(b: bool) -> u32 { + if b { + 1 + } else { + 0 + } + } + + let nwrite = self.write_to_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small("flag bitset")); + } + let bits = (bool_to_int(self.has_empty) << 0) + | (bool_to_int(self.is_utf8) << 1) + | (bool_to_int(self.is_always_start_anchored) << 2); + E::write_u32(bits, dst); + Ok(nwrite) + } + + /// Returns the number of bytes the serialized form of these flags + /// will use. + pub(crate) fn write_to_len(&self) -> usize { + size_of::<u32>() + } +} + /// An iterator over all states in a DFA. /// /// This iterator yields a tuple for each state. The first element of the @@ -4093,7 +4753,7 @@ impl<'a, T: AsRef<[u32]>> Iterator for StateIter<'a, T> { fn next(&mut self) -> Option<State<'a>> { self.it.next().map(|(index, _)| { - let id = self.tt.from_index(index); + let id = self.tt.to_state_id(index); self.tt.state(id) }) } @@ -4146,7 +4806,7 @@ impl<'a> State<'a> { /// Analyzes this state to determine whether it can be accelerated. If so, /// it returns an accelerator that contains at least one byte. - #[cfg(feature = "alloc")] + #[cfg(feature = "dfa-build")] fn accelerate(&self, classes: &ByteClasses) -> Option<Accel> { // We just try to add bytes to our accelerator. Once adding fails // (because we've added too many bytes), then give up. 
@@ -4173,66 +4833,25 @@ impl<'a> State<'a> { impl<'a> fmt::Debug for State<'a> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - for (i, (start, end, id)) in self.sparse_transitions().enumerate() { - let index = if f.alternate() { - id.as_usize() + for (i, (start, end, sid)) in self.sparse_transitions().enumerate() { + let id = if f.alternate() { + sid.as_usize() } else { - id.as_usize() >> self.stride2 + sid.as_usize() >> self.stride2 }; if i > 0 { write!(f, ", ")?; } if start == end { - write!(f, "{:?} => {:?}", start, index)?; + write!(f, "{:?} => {:?}", start, id)?; } else { - write!(f, "{:?}-{:?} => {:?}", start, end, index)?; + write!(f, "{:?}-{:?} => {:?}", start, end, id)?; } } Ok(()) } } -/// A mutable representation of a single DFA state. -/// -/// `'a` correspondings to the lifetime of a DFA's transition table. -#[cfg(feature = "alloc")] -pub(crate) struct StateMut<'a> { - id: StateID, - stride2: usize, - transitions: &'a mut [StateID], -} - -#[cfg(feature = "alloc")] -impl<'a> StateMut<'a> { - /// Return an iterator over all transitions in this state. This yields - /// a number of transitions equivalent to the alphabet length of the - /// corresponding DFA. - /// - /// Each transition is represented by a tuple. The first element is the - /// input byte for that transition and the second element is a mutable - /// reference to the transition itself. - pub(crate) fn iter_mut(&mut self) -> StateTransitionIterMut<'_> { - StateTransitionIterMut { - len: self.transitions.len(), - it: self.transitions.iter_mut().enumerate(), - } - } -} - -#[cfg(feature = "alloc")] -impl<'a> fmt::Debug for StateMut<'a> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - fmt::Debug::fmt( - &State { - id: self.id, - stride2: self.stride2, - transitions: self.transitions, - }, - f, - ) - } -} - /// An iterator over all transitions in a single DFA state. This yields /// a number of transitions equivalent to the alphabet length of the /// corresponding DFA. 
@@ -4262,36 +4881,6 @@ impl<'a> Iterator for StateTransitionIter<'a> { } } -/// A mutable iterator over all transitions in a DFA state. -/// -/// Each transition is represented by a tuple. The first element is the -/// input byte for that transition and the second element is a mutable -/// reference to the transition itself. -#[cfg(feature = "alloc")] -#[derive(Debug)] -pub(crate) struct StateTransitionIterMut<'a> { - len: usize, - it: iter::Enumerate<slice::IterMut<'a, StateID>>, -} - -#[cfg(feature = "alloc")] -impl<'a> Iterator for StateTransitionIterMut<'a> { - type Item = (alphabet::Unit, &'a mut StateID); - - fn next(&mut self) -> Option<(alphabet::Unit, &'a mut StateID)> { - self.it.next().map(|(i, id)| { - let unit = if i + 1 == self.len { - alphabet::Unit::eoi(i) - } else { - let b = u8::try_from(i) - .expect("raw byte alphabet is never exceeded"); - alphabet::Unit::u8(b) - }; - (unit, id) - }) - } -} - /// An iterator over all non-DEAD transitions in a single DFA state using a /// sparse representation. /// @@ -4338,104 +4927,164 @@ impl<'a> Iterator for StateSparseTransitionIter<'a> { } } -/// An iterator over pattern IDs for a single match state. -#[derive(Debug)] -pub(crate) struct PatternIDIter<'a>(slice::Iter<'a, PatternID>); - -impl<'a> Iterator for PatternIDIter<'a> { - type Item = PatternID; - - fn next(&mut self) -> Option<PatternID> { - self.0.next().copied() - } +/// An error that occurred during the construction of a DFA. +/// +/// This error does not provide many introspection capabilities. There are +/// generally only two things you can do with it: +/// +/// * Obtain a human readable message via its `std::fmt::Display` impl. +/// * Access an underlying [`nfa::thompson::BuildError`](thompson::BuildError) +/// type from its `source` method via the `std::error::Error` trait. This error +/// only occurs when using convenience routines for building a DFA directly +/// from a pattern string. 
+/// +/// When the `std` feature is enabled, this implements the `std::error::Error` +/// trait. +#[cfg(feature = "dfa-build")] +#[derive(Clone, Debug)] +pub struct BuildError { + kind: BuildErrorKind, } -/// Remapper is an abstraction the manages the remapping of state IDs in a -/// dense DFA. This is useful when one wants to shuffle states into different -/// positions in the DFA. -/// -/// One of the key complexities this manages is the ability to correctly move -/// one state multiple times. +/// The kind of error that occurred during the construction of a DFA. /// -/// Once shuffling is complete, `remap` should be called, which will rewrite -/// all pertinent transitions to updated state IDs. -#[cfg(feature = "alloc")] -#[derive(Debug)] -struct Remapper { - /// A map from the index of a state to its pre-multiplied identifier. - /// - /// When a state is swapped with another, then their corresponding - /// locations in this map are also swapped. Thus, its new position will - /// still point to its old pre-multiplied StateID. - /// - /// While there is a bit more to it, this then allows us to rewrite the - /// state IDs in a DFA's transition table in a single pass. This is done - /// by iterating over every ID in this map, then iterating over each - /// transition for the state at that ID and re-mapping the transition from - /// `old_id` to `map[dfa.to_index(old_id)]`. That is, we find the position - /// in this map where `old_id` *started*, and set it to where it ended up - /// after all swaps have been completed. - map: Vec<StateID>, +/// Note that this error is non-exhaustive. Adding new variants is not +/// considered a breaking change. +#[cfg(feature = "dfa-build")] +#[derive(Clone, Debug)] +enum BuildErrorKind { + /// An error that occurred while constructing an NFA as a precursor step + /// before a DFA is compiled. + NFA(thompson::BuildError), + /// An error that occurred because an unsupported regex feature was used. 
+ /// The message string describes which unsupported feature was used. + /// + /// The primary regex feature that is unsupported by DFAs is the Unicode + /// word boundary look-around assertion (`\b`). This can be worked around + /// by either using an ASCII word boundary (`(?-u:\b)`) or by enabling + /// Unicode word boundaries when building a DFA. + Unsupported(&'static str), + /// An error that occurs if too many states are produced while building a + /// DFA. + TooManyStates, + /// An error that occurs if too many start states are needed while building + /// a DFA. + /// + /// This is a kind of oddball error that occurs when building a DFA with + /// start states enabled for each pattern and enough patterns to cause + /// the table of start states to overflow `usize`. + TooManyStartStates, + /// This is another oddball error that can occur if there are too many + /// patterns spread out across too many match states. + TooManyMatchPatternIDs, + /// An error that occurs if the DFA got too big during determinization. + DFAExceededSizeLimit { limit: usize }, + /// An error that occurs if auxiliary storage (not the DFA) used during + /// determinization got too big. + DeterminizeExceededSizeLimit { limit: usize }, } -#[cfg(feature = "alloc")] -impl Remapper { - fn from_dfa(dfa: &OwnedDFA) -> Remapper { - Remapper { - map: (0..dfa.state_count()).map(|i| dfa.from_index(i)).collect(), +#[cfg(feature = "dfa-build")] +impl BuildError { + /// Return the kind of this error. 
+ fn kind(&self) -> &BuildErrorKind { + &self.kind + } + + pub(crate) fn nfa(err: thompson::BuildError) -> BuildError { + BuildError { kind: BuildErrorKind::NFA(err) } + } + + pub(crate) fn unsupported_dfa_word_boundary_unicode() -> BuildError { + let msg = "cannot build DFAs for regexes with Unicode word \ + boundaries; switch to ASCII word boundaries, or \ + heuristically enable Unicode word boundaries or use a \ + different regex engine"; + BuildError { kind: BuildErrorKind::Unsupported(msg) } + } + + pub(crate) fn too_many_states() -> BuildError { + BuildError { kind: BuildErrorKind::TooManyStates } + } + + pub(crate) fn too_many_start_states() -> BuildError { + BuildError { kind: BuildErrorKind::TooManyStartStates } + } + + pub(crate) fn too_many_match_pattern_ids() -> BuildError { + BuildError { kind: BuildErrorKind::TooManyMatchPatternIDs } + } + + pub(crate) fn dfa_exceeded_size_limit(limit: usize) -> BuildError { + BuildError { kind: BuildErrorKind::DFAExceededSizeLimit { limit } } + } + + pub(crate) fn determinize_exceeded_size_limit(limit: usize) -> BuildError { + BuildError { + kind: BuildErrorKind::DeterminizeExceededSizeLimit { limit }, } } +} - fn swap(&mut self, dfa: &mut OwnedDFA, id1: StateID, id2: StateID) { - dfa.swap_states(id1, id2); - self.map.swap(dfa.to_index(id1), dfa.to_index(id2)); +#[cfg(all(feature = "std", feature = "dfa-build"))] +impl std::error::Error for BuildError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self.kind() { + BuildErrorKind::NFA(ref err) => Some(err), + _ => None, + } } +} - fn remap(mut self, dfa: &mut OwnedDFA) { - // Update the map to account for states that have been swapped - // multiple times. For example, if (A, C) and (C, G) are swapped, then - // transitions previously pointing to A should now point to G. But if - // we don't update our map, they will erroneously be set to C. All we - // do is follow the swaps in our map until we see our original state - // ID. 
- let oldmap = self.map.clone(); - for i in 0..dfa.state_count() { - let cur_id = dfa.from_index(i); - let mut new = oldmap[i]; - if cur_id == new { - continue; +#[cfg(feature = "dfa-build")] +impl core::fmt::Display for BuildError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match self.kind() { + BuildErrorKind::NFA(_) => write!(f, "error building NFA"), + BuildErrorKind::Unsupported(ref msg) => { + write!(f, "unsupported regex feature for DFAs: {}", msg) } - loop { - let id = oldmap[dfa.to_index(new)]; - if cur_id == id { - self.map[i] = new; - break; - } - new = id; + BuildErrorKind::TooManyStates => write!( + f, + "number of DFA states exceeds limit of {}", + StateID::LIMIT, + ), + BuildErrorKind::TooManyStartStates => { + let stride = Start::len(); + // The start table has `stride` entries for starting states for + // the entire DFA, and then `stride` entries for each pattern + // if start states for each pattern are enabled (which is the + // only way this error can occur). Thus, the total number of + // patterns that can fit in the table is `stride` less than + // what we can allocate. + let max = usize::try_from(core::isize::MAX).unwrap(); + let limit = (max - stride) / stride; + write!( + f, + "compiling DFA with start states exceeds pattern \ + pattern limit of {}", + limit, + ) } - } - - // To work around the borrow checker for converting state IDs to - // indices. We cannot borrow self while mutably iterating over a - // state's transitions. Otherwise, we'd just use dfa.to_index(..). - let stride2 = dfa.stride2(); - let to_index = |id: StateID| -> usize { id.as_usize() >> stride2 }; - - // Now that we've finished shuffling, we need to remap all of our - // transitions. We don't need to handle re-mapping accelerated states - // since `accels` is only populated after shuffling. 
- for &id in self.map.iter() { - for (_, next_id) in dfa.state_mut(id).iter_mut() { - *next_id = self.map[to_index(*next_id)]; + BuildErrorKind::TooManyMatchPatternIDs => write!( + f, + "compiling DFA with total patterns in all match states \ + exceeds limit of {}", + PatternID::LIMIT, + ), + BuildErrorKind::DFAExceededSizeLimit { limit } => write!( + f, + "DFA exceeded size limit of {:?} during determinization", + limit, + ), + BuildErrorKind::DeterminizeExceededSizeLimit { limit } => { + write!(f, "determinization exceeded size limit of {:?}", limit) } } - for start_id in dfa.st.table_mut().iter_mut() { - *start_id = self.map[to_index(*start_id)]; - } } } -#[cfg(all(test, feature = "alloc"))] +#[cfg(all(test, feature = "syntax", feature = "dfa-build"))] mod tests { use super::*; @@ -4451,7 +5100,7 @@ mod tests { let (buf, _) = dfa.to_bytes_native_endian(); let dfa: DFA<&[u32]> = DFA::from_bytes(&buf).unwrap().0; - assert_eq!(None, dfa.find_leftmost_fwd(b"foo12345").unwrap()); + assert_eq!(None, dfa.try_search_fwd(&Input::new("foo12345")).unwrap()); } #[test] @@ -4464,7 +5113,27 @@ mod tests { assert_eq!( Some(HalfMatch::must(0, 0)), - dfa.find_leftmost_fwd(b"foo12345").unwrap() + dfa.try_search_fwd(&Input::new("foo12345")).unwrap() ); } + + // See the analogous test in src/hybrid/dfa.rs. 
+ #[test] + fn heuristic_unicode_reverse() { + let dfa = DFA::builder() + .configure(DFA::config().unicode_word_boundary(true)) + .thompson(thompson::Config::new().reverse(true)) + .build(r"\b[0-9]+\b") + .unwrap(); + + let input = Input::new("β123").range(2..); + let expected = MatchError::quit(0xB2, 1); + let got = dfa.try_search_rev(&input); + assert_eq!(Err(expected), got); + + let input = Input::new("123β").range(..3); + let expected = MatchError::quit(0xCE, 3); + let got = dfa.try_search_rev(&input); + assert_eq!(Err(expected), got); + } } diff --git a/vendor/regex-automata/src/dfa/determinize.rs b/vendor/regex-automata/src/dfa/determinize.rs index 61603481b..19f99f5d6 100644 --- a/vendor/regex-automata/src/dfa/determinize.rs +++ b/vendor/regex-automata/src/dfa/determinize.rs @@ -1,18 +1,18 @@ -use alloc::{ - collections::BTreeMap, - vec::{self, Vec}, -}; +use alloc::{collections::BTreeMap, vec::Vec}; use crate::{ - dfa::{dense, Error, DEAD}, + dfa::{ + dense::{self, BuildError}, + DEAD, + }, nfa::thompson, util::{ self, alphabet::{self, ByteSet}, determinize::{State, StateBuilderEmpty, StateBuilderNFA}, - id::{PatternID, StateID}, - matchtypes::MatchKind, - sparse_set::{SparseSet, SparseSets}, + primitives::{PatternID, StateID}, + search::{Anchored, MatchKind}, + sparse_set::SparseSets, start::Start, }, }; @@ -20,7 +20,6 @@ use crate::{ /// A builder for configuring and running a DFA determinizer. #[derive(Clone, Debug)] pub(crate) struct Config { - anchored: bool, match_kind: MatchKind, quit: ByteSet, dfa_size_limit: Option<usize>, @@ -32,7 +31,6 @@ impl Config { /// configured before calling `run`. 
pub fn new() -> Config { Config { - anchored: false, match_kind: MatchKind::LeftmostFirst, quit: ByteSet::empty(), dfa_size_limit: None, @@ -48,7 +46,7 @@ impl Config { &self, nfa: &thompson::NFA, dfa: &mut dense::OwnedDFA, - ) -> Result<(), Error> { + ) -> Result<(), BuildError> { let dead = State::dead(); let quit = State::dead(); let mut cache = StateMap::default(); @@ -71,21 +69,13 @@ impl Config { builder_states: alloc::vec![dead, quit], cache, memory_usage_state: 0, - sparses: SparseSets::new(nfa.len()), + sparses: SparseSets::new(nfa.states().len()), stack: alloc::vec![], scratch_state_builder: StateBuilderEmpty::new(), }; runner.run() } - /// Whether to build an anchored DFA or not. When disabled (the default), - /// the unanchored prefix from the NFA is used to start the DFA. Otherwise, - /// the anchored start state of the NFA is used to start the DFA. - pub fn anchored(&mut self, yes: bool) -> &mut Config { - self.anchored = yes; - self - } - /// The match semantics to use for determinization. /// /// MatchKind::All corresponds to the standard textbook construction. @@ -222,20 +212,21 @@ impl<'a> Runner<'a> { /// Build the DFA. If there was a problem constructing the DFA (e.g., if /// the chosen state identifier representation is too small), then an error /// is returned. - fn run(mut self) -> Result<(), Error> { - if self.nfa.has_word_boundary_unicode() + fn run(mut self) -> Result<(), BuildError> { + if self.nfa.look_set_any().contains_word_unicode() && !self.config.quit.contains_range(0x80, 0xFF) { - return Err(Error::unsupported_dfa_word_boundary_unicode()); + return Err(BuildError::unsupported_dfa_word_boundary_unicode()); } // A sequence of "representative" bytes drawn from each equivalence // class. These representative bytes are fed to the NFA to compute // state transitions. This allows us to avoid re-computing state // transitions for bytes that are guaranteed to produce identical - // results. + // results. 
Since computing the representatives needs to do a little + // work, we do it once here because we'll be iterating over them a lot. let representatives: Vec<alphabet::Unit> = - self.dfa.byte_classes().representatives().collect(); + self.dfa.byte_classes().representatives(..).collect(); // The set of all DFA state IDs that still need to have their // transitions set. We start by seeding this with all starting states. let mut uncompiled = alloc::vec![]; @@ -259,10 +250,13 @@ impl<'a> Runner<'a> { } } } - trace!( - "determinization complete, memory usage: {}, dense DFA size: {}", + debug!( + "determinization complete, memory usage: {}, \ + dense DFA size: {}, \ + is reverse? {}", self.memory_usage(), self.dfa.memory_usage(), + self.nfa.is_reverse(), ); // A map from DFA state ID to one or more NFA match IDs. Each NFA match @@ -270,21 +264,23 @@ impl<'a> Runner<'a> { // corresponding to the key. let mut matches: BTreeMap<StateID, Vec<PatternID>> = BTreeMap::new(); self.cache.clear(); - #[allow(unused_variables)] - let mut total_pat_count = 0; + #[cfg(feature = "logging")] + let mut total_pat_len = 0; for (i, state) in self.builder_states.into_iter().enumerate() { if let Some(pat_ids) = state.match_pattern_ids() { - let id = self.dfa.from_index(i); - total_pat_count += pat_ids.len(); + let id = self.dfa.to_state_id(i); + log! { + total_pat_len += pat_ids.len(); + } matches.insert(id, pat_ids); } } log! { use core::mem::size_of; let per_elem = size_of::<StateID>() + size_of::<Vec<PatternID>>(); - let pats = total_pat_count * size_of::<PatternID>(); + let pats = total_pat_len * size_of::<PatternID>(); let mem = (matches.len() * per_elem) + pats; - log::trace!("matches map built, memory usage: {}", mem); + log::debug!("matches map built, memory usage: {}", mem); } // At this point, we shuffle the "special" states in the final DFA. 
// This permits a DFA's match loop to detect a match condition (among @@ -306,7 +302,7 @@ impl<'a> Runner<'a> { &mut self, dfa_id: StateID, unit: alphabet::Unit, - ) -> Result<(StateID, bool), Error> { + ) -> Result<(StateID, bool), BuildError> { // Compute the set of all reachable NFA states, including epsilons. let empty_builder = self.get_state_builder(); let builder = util::determinize::next( @@ -326,15 +322,32 @@ impl<'a> Runner<'a> { fn add_all_starts( &mut self, dfa_state_ids: &mut Vec<StateID>, - ) -> Result<(), Error> { - // Always add the (possibly unanchored) start states for matching any - // of the patterns in this DFA. - self.add_start_group(None, dfa_state_ids)?; + ) -> Result<(), BuildError> { + // These should be the first states added. + assert!(dfa_state_ids.is_empty()); + // We only want to add (un)anchored starting states that is consistent + // with our DFA's configuration. Unconditionally adding both (although + // it is the default) can make DFAs quite a bit bigger. + if self.dfa.start_kind().has_unanchored() { + self.add_start_group(Anchored::No, dfa_state_ids)?; + } + if self.dfa.start_kind().has_anchored() { + self.add_start_group(Anchored::Yes, dfa_state_ids)?; + } + // I previously has an 'assert' here checking that either + // 'dfa_state_ids' was non-empty, or the NFA had zero patterns. But it + // turns out this isn't always true. For example, the NFA might have + // one or more patterns but where all such patterns are just 'fail' + // states. These will ultimately just compile down to DFA dead states, + // and since the dead state was added earlier, no new DFA states are + // added. And thus, it is valid and okay for 'dfa_state_ids' to be + // empty even if there are a non-zero number of patterns in the NFA. + // We only need to compute anchored start states for each pattern if it // was requested to do so. 
- if self.dfa.has_starts_for_each_pattern() { - for pid in PatternID::iter(self.dfa.pattern_count()) { - self.add_start_group(Some(pid), dfa_state_ids)?; + if self.dfa.starts_for_each_pattern() { + for pid in self.nfa.patterns() { + self.add_start_group(Anchored::Pattern(pid), dfa_state_ids)?; } } Ok(()) @@ -348,15 +361,19 @@ impl<'a> Runner<'a> { /// start states (if the DFA is unanchored). When the pattern_id is /// present, then this will compile a group of anchored start states that /// only match the given pattern. + /// + /// This panics if `anchored` corresponds to an invalid pattern ID. fn add_start_group( &mut self, - pattern_id: Option<PatternID>, + anchored: Anchored, dfa_state_ids: &mut Vec<StateID>, - ) -> Result<(), Error> { - let nfa_start = match pattern_id { - Some(pid) => self.nfa.start_pattern(pid), - None if self.config.anchored => self.nfa.start_anchored(), - None => self.nfa.start_unanchored(), + ) -> Result<(), BuildError> { + let nfa_start = match anchored { + Anchored::No => self.nfa.start_unanchored(), + Anchored::Yes => self.nfa.start_anchored(), + Anchored::Pattern(pid) => { + self.nfa.start_pattern(pid).expect("valid pattern ID") + } }; // When compiling start states, we're careful not to build additional @@ -365,36 +382,68 @@ impl<'a> Runner<'a> { // states for 'NonWordByte' and 'WordByte' starting configurations. // Instead, the 'WordByte' starting configuration can just point // directly to the start state for the 'NonWordByte' config. + // + // Note though that we only need to care about assertions in the prefix + // of an NFA since this only concerns the starting states. (Actually, + // the most precisely thing we could do it is look at the prefix + // assertions of each pattern when 'anchored == Anchored::Pattern', + // and then only compile extra states if the prefix is non-empty.) But + // we settle for simplicity here instead of absolute minimalism. 
It is + // somewhat rare, after all, for multiple patterns in the same regex to + // have different prefix look-arounds. let (id, is_new) = self.add_one_start(nfa_start, Start::NonWordByte)?; - self.dfa.set_start_state(Start::NonWordByte, pattern_id, id); + self.dfa.set_start_state(anchored, Start::NonWordByte, id); if is_new { dfa_state_ids.push(id); } - if !self.nfa.has_word_boundary() { - self.dfa.set_start_state(Start::WordByte, pattern_id, id); + if !self.nfa.look_set_prefix_any().contains_word() { + self.dfa.set_start_state(anchored, Start::WordByte, id); } else { let (id, is_new) = self.add_one_start(nfa_start, Start::WordByte)?; - self.dfa.set_start_state(Start::WordByte, pattern_id, id); + self.dfa.set_start_state(anchored, Start::WordByte, id); if is_new { dfa_state_ids.push(id); } } - if !self.nfa.has_any_anchor() { - self.dfa.set_start_state(Start::Text, pattern_id, id); - self.dfa.set_start_state(Start::Line, pattern_id, id); + if !self.nfa.look_set_prefix_any().contains_anchor() { + self.dfa.set_start_state(anchored, Start::Text, id); + self.dfa.set_start_state(anchored, Start::LineLF, id); + self.dfa.set_start_state(anchored, Start::LineCR, id); + self.dfa.set_start_state( + anchored, + Start::CustomLineTerminator, + id, + ); } else { let (id, is_new) = self.add_one_start(nfa_start, Start::Text)?; - self.dfa.set_start_state(Start::Text, pattern_id, id); + self.dfa.set_start_state(anchored, Start::Text, id); + if is_new { + dfa_state_ids.push(id); + } + + let (id, is_new) = self.add_one_start(nfa_start, Start::LineLF)?; + self.dfa.set_start_state(anchored, Start::LineLF, id); if is_new { dfa_state_ids.push(id); } - let (id, is_new) = self.add_one_start(nfa_start, Start::Line)?; - self.dfa.set_start_state(Start::Line, pattern_id, id); + let (id, is_new) = self.add_one_start(nfa_start, Start::LineCR)?; + self.dfa.set_start_state(anchored, Start::LineCR, id); + if is_new { + dfa_state_ids.push(id); + } + + let (id, is_new) = + 
self.add_one_start(nfa_start, Start::CustomLineTerminator)?; + self.dfa.set_start_state( + anchored, + Start::CustomLineTerminator, + id, + ); if is_new { dfa_state_ids.push(id); } @@ -414,13 +463,14 @@ impl<'a> Runner<'a> { &mut self, nfa_start: StateID, start: Start, - ) -> Result<(StateID, bool), Error> { + ) -> Result<(StateID, bool), BuildError> { // Compute the look-behind assertions that are true in this starting // configuration, and the determine the epsilon closure. While // computing the epsilon closure, we only follow condiional epsilon - // transitions that satisfy the look-behind assertions in 'facts'. + // transitions that satisfy the look-behind assertions in 'look_have'. let mut builder_matches = self.get_state_builder().into_matches(); util::determinize::set_lookbehind_from_start( + self.nfa, &start, &mut builder_matches, ); @@ -428,7 +478,7 @@ impl<'a> Runner<'a> { util::determinize::epsilon_closure( self.nfa, nfa_start, - *builder_matches.look_have(), + builder_matches.look_have(), &mut self.stack, &mut self.sparses.set1, ); @@ -455,7 +505,7 @@ impl<'a> Runner<'a> { fn maybe_add_state( &mut self, builder: StateBuilderNFA, - ) -> Result<(StateID, bool), Error> { + ) -> Result<(StateID, bool), BuildError> { if let Some(&cached_id) = self.cache.get(builder.as_bytes()) { // Since we have a cached state, put the constructed state's // memory back into our scratch space, so that it can be reused. @@ -476,7 +526,7 @@ impl<'a> Runner<'a> { fn add_state( &mut self, builder: StateBuilderNFA, - ) -> Result<StateID, Error> { + ) -> Result<StateID, BuildError> { let id = self.dfa.add_empty_state()?; if !self.config.quit.is_empty() { for b in self.config.quit.iter() { @@ -489,19 +539,21 @@ impl<'a> Runner<'a> { } let state = builder.to_state(); // States use reference counting internally, so we only need to count - // their memroy usage once. + // their memory usage once. 
self.memory_usage_state += state.memory_usage(); self.builder_states.push(state.clone()); self.cache.insert(state, id); self.put_state_builder(builder); if let Some(limit) = self.config.dfa_size_limit { if self.dfa.memory_usage() > limit { - return Err(Error::dfa_exceeded_size_limit(limit)); + return Err(BuildError::dfa_exceeded_size_limit(limit)); } } if let Some(limit) = self.config.determinize_size_limit { if self.memory_usage() > limit { - return Err(Error::determinize_exceeded_size_limit(limit)); + return Err(BuildError::determinize_exceeded_size_limit( + limit, + )); } } Ok(id) diff --git a/vendor/regex-automata/src/dfa/error.rs b/vendor/regex-automata/src/dfa/error.rs deleted file mode 100644 index 6497a4cff..000000000 --- a/vendor/regex-automata/src/dfa/error.rs +++ /dev/null @@ -1,162 +0,0 @@ -use crate::{ - nfa, - util::{ - id::{PatternID, StateID}, - start::Start, - }, -}; - -/// An error that occurred during the construction of a DFA. -/// -/// This error does not provide many introspection capabilities. There are -/// generally only two things you can do with it: -/// -/// * Obtain a human readable message via its `std::fmt::Display` impl. -/// * Access an underlying [`nfa::thompson::Error`] type from its `source` -/// method via the `std::error::Error` trait. This error only occurs when using -/// convenience routines for building a DFA directly from a pattern string. -/// -/// When the `std` feature is enabled, this implements the `std::error::Error` -/// trait. -#[derive(Clone, Debug)] -pub struct Error { - kind: ErrorKind, -} - -/// The kind of error that occurred during the construction of a DFA. -/// -/// Note that this error is non-exhaustive. Adding new variants is not -/// considered a breaking change. -#[derive(Clone, Debug)] -enum ErrorKind { - /// An error that occurred while constructing an NFA as a precursor step - /// before a DFA is compiled. 
- NFA(nfa::thompson::Error), - /// An error that occurred because an unsupported regex feature was used. - /// The message string describes which unsupported feature was used. - /// - /// The primary regex feature that is unsupported by DFAs is the Unicode - /// word boundary look-around assertion (`\b`). This can be worked around - /// by either using an ASCII word boundary (`(?-u:\b)`) or by enabling the - /// [`dense::Builder::allow_unicode_word_boundary`](dense/struct.Builder.html#method.allow_unicode_word_boundary) - /// option when building a DFA. - Unsupported(&'static str), - /// An error that occurs if too many states are produced while building a - /// DFA. - TooManyStates, - /// An error that occurs if too many start states are needed while building - /// a DFA. - /// - /// This is a kind of oddball error that occurs when building a DFA with - /// start states enabled for each pattern and enough patterns to cause - /// the table of start states to overflow `usize`. - TooManyStartStates, - /// This is another oddball error that can occur if there are too many - /// patterns spread out across too many match states. - TooManyMatchPatternIDs, - /// An error that occurs if the DFA got too big during determinization. - DFAExceededSizeLimit { limit: usize }, - /// An error that occurs if auxiliary storage (not the DFA) used during - /// determinization got too big. - DeterminizeExceededSizeLimit { limit: usize }, -} - -impl Error { - /// Return the kind of this error. 
- fn kind(&self) -> &ErrorKind { - &self.kind - } - - pub(crate) fn nfa(err: nfa::thompson::Error) -> Error { - Error { kind: ErrorKind::NFA(err) } - } - - pub(crate) fn unsupported_dfa_word_boundary_unicode() -> Error { - let msg = "cannot build DFAs for regexes with Unicode word \ - boundaries; switch to ASCII word boundaries, or \ - heuristically enable Unicode word boundaries or use a \ - different regex engine"; - Error { kind: ErrorKind::Unsupported(msg) } - } - - pub(crate) fn too_many_states() -> Error { - Error { kind: ErrorKind::TooManyStates } - } - - pub(crate) fn too_many_start_states() -> Error { - Error { kind: ErrorKind::TooManyStartStates } - } - - pub(crate) fn too_many_match_pattern_ids() -> Error { - Error { kind: ErrorKind::TooManyMatchPatternIDs } - } - - pub(crate) fn dfa_exceeded_size_limit(limit: usize) -> Error { - Error { kind: ErrorKind::DFAExceededSizeLimit { limit } } - } - - pub(crate) fn determinize_exceeded_size_limit(limit: usize) -> Error { - Error { kind: ErrorKind::DeterminizeExceededSizeLimit { limit } } - } -} - -#[cfg(feature = "std")] -impl std::error::Error for Error { - fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { - match self.kind() { - ErrorKind::NFA(ref err) => Some(err), - ErrorKind::Unsupported(_) => None, - ErrorKind::TooManyStates => None, - ErrorKind::TooManyStartStates => None, - ErrorKind::TooManyMatchPatternIDs => None, - ErrorKind::DFAExceededSizeLimit { .. } => None, - ErrorKind::DeterminizeExceededSizeLimit { .. 
} => None, - } - } -} - -impl core::fmt::Display for Error { - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { - match self.kind() { - ErrorKind::NFA(_) => write!(f, "error building NFA"), - ErrorKind::Unsupported(ref msg) => { - write!(f, "unsupported regex feature for DFAs: {}", msg) - } - ErrorKind::TooManyStates => write!( - f, - "number of DFA states exceeds limit of {}", - StateID::LIMIT, - ), - ErrorKind::TooManyStartStates => { - let stride = Start::count(); - // The start table has `stride` entries for starting states for - // the entire DFA, and then `stride` entries for each pattern - // if start states for each pattern are enabled (which is the - // only way this error can occur). Thus, the total number of - // patterns that can fit in the table is `stride` less than - // what we can allocate. - let limit = ((core::isize::MAX as usize) - stride) / stride; - write!( - f, - "compiling DFA with start states exceeds pattern \ - pattern limit of {}", - limit, - ) - } - ErrorKind::TooManyMatchPatternIDs => write!( - f, - "compiling DFA with total patterns in all match states \ - exceeds limit of {}", - PatternID::LIMIT, - ), - ErrorKind::DFAExceededSizeLimit { limit } => write!( - f, - "DFA exceeded size limit of {:?} during determinization", - limit, - ), - ErrorKind::DeterminizeExceededSizeLimit { limit } => { - write!(f, "determinization exceeded size limit of {:?}", limit) - } - } - } -} diff --git a/vendor/regex-automata/src/dfa/minimize.rs b/vendor/regex-automata/src/dfa/minimize.rs index 80e2f4e73..fea925bdc 100644 --- a/vendor/regex-automata/src/dfa/minimize.rs +++ b/vendor/regex-automata/src/dfa/minimize.rs @@ -6,7 +6,7 @@ use crate::{ dfa::{automaton::Automaton, dense, DEAD}, util::{ alphabet, - id::{PatternID, StateID}, + primitives::{PatternID, StateID}, }, }; @@ -152,13 +152,13 @@ impl<'a> Minimizer<'a> { // At this point, we now have a minimal partitioning of states, where // each partition is an equivalence class of DFA 
states. Now we need to - // use this partioning to update the DFA to only contain one state for + // use this partitioning to update the DFA to only contain one state for // each partition. // Create a map from DFA state ID to the representative ID of the // equivalence class to which it belongs. The representative ID of an // equivalence class of states is the minimum ID in that class. - let mut state_to_part = vec![DEAD; self.dfa.state_count()]; + let mut state_to_part = vec![DEAD; self.dfa.state_len()]; for p in &self.partitions { p.iter(|id| state_to_part[as_index(id)] = p.min()); } @@ -167,7 +167,7 @@ impl<'a> Minimizer<'a> { // create a map from equivalence IDs to the new IDs. Thus, the new // minimal ID of *any* state in the unminimized DFA can be obtained // with minimals_ids[state_to_part[old_id]]. - let mut minimal_ids = vec![DEAD; self.dfa.state_count()]; + let mut minimal_ids = vec![DEAD; self.dfa.state_len()]; let mut new_index = 0; for state in self.dfa.states() { if state_to_part[as_index(state.id())] == state.id() { @@ -184,15 +184,13 @@ impl<'a> Minimizer<'a> { // Re-map this DFA in place such that the only states remaining // correspond to the representative states of every equivalence class. - for id in (0..self.dfa.state_count()).map(as_state_id) { + for id in (0..self.dfa.state_len()).map(as_state_id) { // If this state isn't a representative for an equivalence class, // then we skip it since it won't appear in the minimal DFA. if state_to_part[as_index(id)] != id { continue; } - for (_, next) in self.dfa.state_mut(id).iter_mut() { - *next = remap(*next); - } + self.dfa.remap_state(id, remap); self.dfa.swap_states(id, minimal_ids[as_index(id)]); } // Trim off all unused states from the pre-minimized DFA. This @@ -208,8 +206,12 @@ impl<'a> Minimizer<'a> { // We're already allocating so much that this is probably fine. If this // turns out to be costly, then I guess add a `starts_mut` iterator. 
let starts: Vec<_> = self.dfa.starts().collect(); - for (old_start_id, start_type, pid) in starts { - self.dfa.set_start_state(start_type, pid, remap(old_start_id)); + for (old_start_id, anchored, start_type) in starts { + self.dfa.set_start_state( + anchored, + start_type, + remap(old_start_id), + ); } // Update the match state pattern ID list for multi-regexes. All we @@ -305,7 +307,7 @@ impl<'a> Minimizer<'a> { for state in dfa.states() { if dfa.is_match_state(state.id()) { let mut pids = vec![]; - for i in 0..dfa.match_count(state.id()) { + for i in 0..dfa.match_len(state.id()) { pids.push(dfa.match_pattern(state.id(), i)); } matching diff --git a/vendor/regex-automata/src/dfa/mod.rs b/vendor/regex-automata/src/dfa/mod.rs index 6f9fe605e..4bb870435 100644 --- a/vendor/regex-automata/src/dfa/mod.rs +++ b/vendor/regex-automata/src/dfa/mod.rs @@ -1,5 +1,5 @@ /*! -A module for building and searching with determinstic finite automata (DFAs). +A module for building and searching with deterministic finite automata (DFAs). Like other modules in this crate, DFAs support a rich regex syntax with Unicode features. DFAs also have extensive options for configuring the best space vs @@ -26,20 +26,25 @@ DFAs implement. (A `regex::Regex` is generic over this trait.) [`dense::DFA::to_bytes_little_endian`]) and cheap deserialization (e.g., [`dense::DFA::from_bytes`]). +There is also a [`onepass`] module that provides a [one-pass +DFA](onepass::DFA). The unique advantage of this DFA is that, for the class +of regexes it can be built with, it supports reporting the spans of matching +capturing groups. It is the only DFA in this crate capable of such a thing. 
+ # Example: basic regex searching This example shows how to compile a regex using the default configuration and then use it to find matches in a byte string: ``` -use regex_automata::{MultiMatch, dfa::regex::Regex}; +use regex_automata::{Match, dfa::regex::Regex}; let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; let text = b"2018-12-24 2016-10-08"; -let matches: Vec<MultiMatch> = re.find_leftmost_iter(text).collect(); +let matches: Vec<Match> = re.find_iter(text).collect(); assert_eq!(matches, vec![ - MultiMatch::must(0, 0, 10), - MultiMatch::must(0, 11, 21), + Match::must(0, 0..10), + Match::must(0, 11..21), ]); # Ok::<(), Box<dyn std::error::Error>>(()) ``` @@ -51,36 +56,15 @@ simultaneously. You can use this support with standard leftmost-first style searching to find non-overlapping matches: ``` -use regex_automata::{MultiMatch, dfa::regex::Regex}; +# if cfg!(miri) { return Ok(()); } // miri takes too long +use regex_automata::{Match, dfa::regex::Regex}; let re = Regex::new_many(&[r"\w+", r"\S+"])?; let text = b"@foo bar"; -let matches: Vec<MultiMatch> = re.find_leftmost_iter(text).collect(); +let matches: Vec<Match> = re.find_iter(text).collect(); assert_eq!(matches, vec![ - MultiMatch::must(1, 0, 4), - MultiMatch::must(0, 5, 8), -]); -# Ok::<(), Box<dyn std::error::Error>>(()) -``` - -Or use overlapping style searches to find all possible occurrences: - -``` -use regex_automata::{MatchKind, MultiMatch, dfa::{dense, regex::Regex}}; - -// N.B. For overlapping searches, we need the underlying DFA to report all -// possible matches. 
-let re = Regex::builder() - .dense(dense::Config::new().match_kind(MatchKind::All)) - .build_many(&[r"\w{3}", r"\S{3}"])?; -let text = b"@foo bar"; -let matches: Vec<MultiMatch> = re.find_overlapping_iter(text).collect(); -assert_eq!(matches, vec![ - MultiMatch::must(1, 0, 3), - MultiMatch::must(0, 1, 4), - MultiMatch::must(1, 1, 4), - MultiMatch::must(0, 5, 8), - MultiMatch::must(1, 5, 8), + Match::must(1, 0..4), + Match::must(0, 5..8), ]); # Ok::<(), Box<dyn std::error::Error>>(()) ``` @@ -96,14 +80,14 @@ Using sparse DFAs is as easy as using `Regex::new_sparse` instead of `Regex::new`: ``` -use regex_automata::{MultiMatch, dfa::regex::Regex}; +use regex_automata::{Match, dfa::regex::Regex}; let re = Regex::new_sparse(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap(); let text = b"2018-12-24 2016-10-08"; -let matches: Vec<MultiMatch> = re.find_leftmost_iter(text).collect(); +let matches: Vec<Match> = re.find_iter(text).collect(); assert_eq!(matches, vec![ - MultiMatch::must(0, 0, 10), - MultiMatch::must(0, 11, 21), + Match::must(0, 0..10), + Match::must(0, 11..21), ]); # Ok::<(), Box<dyn std::error::Error>>(()) ``` @@ -112,7 +96,7 @@ If you already have dense DFAs for some reason, they can be converted to sparse DFAs and used to build a new `Regex`. 
For example: ``` -use regex_automata::{MultiMatch, dfa::regex::Regex}; +use regex_automata::{Match, dfa::regex::Regex}; let dense_re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap(); let sparse_re = Regex::builder().build_from_dfas( @@ -120,10 +104,10 @@ let sparse_re = Regex::builder().build_from_dfas( dense_re.reverse().to_sparse()?, ); let text = b"2018-12-24 2016-10-08"; -let matches: Vec<MultiMatch> = sparse_re.find_leftmost_iter(text).collect(); +let matches: Vec<Match> = sparse_re.find_iter(text).collect(); assert_eq!(matches, vec![ - MultiMatch::must(0, 0, 10), - MultiMatch::must(0, 11, 21), + Match::must(0, 0..10), + Match::must(0, 11..21), ]); # Ok::<(), Box<dyn std::error::Error>>(()) ``` @@ -136,7 +120,7 @@ bit contrived, this same technique can be used in your program to deserialize a DFA at start up time or by memory mapping a file. ``` -use regex_automata::{MultiMatch, dfa::{dense, regex::Regex}}; +use regex_automata::{Match, dfa::{dense, regex::Regex}}; let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap(); // serialize both the forward and reverse DFAs, see note below @@ -150,10 +134,10 @@ let re2 = Regex::builder().build_from_dfas(fwd, rev); // we can use it like normal let text = b"2018-12-24 2016-10-08"; -let matches: Vec<MultiMatch> = re2.find_leftmost_iter(text).collect(); +let matches: Vec<Match> = re2.find_iter(text).collect(); assert_eq!(matches, vec![ - MultiMatch::must(0, 0, 10), - MultiMatch::must(0, 11, 21), + Match::must(0, 0..10), + Match::must(0, 11..21), ]); # Ok::<(), Box<dyn std::error::Error>>(()) ``` @@ -183,7 +167,7 @@ valid DFA. 
The same process can be achieved with sparse DFAs as well: ``` -use regex_automata::{MultiMatch, dfa::{sparse, regex::Regex}}; +use regex_automata::{Match, dfa::{sparse, regex::Regex}}; let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap(); // serialize both @@ -197,17 +181,17 @@ let re2 = Regex::builder().build_from_dfas(fwd, rev); // we can use it like normal let text = b"2018-12-24 2016-10-08"; -let matches: Vec<MultiMatch> = re2.find_leftmost_iter(text).collect(); +let matches: Vec<Match> = re2.find_iter(text).collect(); assert_eq!(matches, vec![ - MultiMatch::must(0, 0, 10), - MultiMatch::must(0, 11, 21), + Match::must(0, 0..10), + Match::must(0, 11..21), ]); # Ok::<(), Box<dyn std::error::Error>>(()) ``` Note that unlike dense DFAs, sparse DFAs have no alignment requirements. Conversely, dense DFAs must be be aligned to the same alignment as a -[`StateID`](crate::util::id::StateID). +[`StateID`](crate::util::primitives::StateID). # Support for `no_std` and `alloc`-only @@ -232,8 +216,8 @@ you would any regex. Deserialization can happen anywhere. For example, with bytes embedded into a binary or with a file memory mapped at runtime. -TODO: Include link to `regex-cli` here pointing out how to generate Rust code -for deserializing DFAs. +The `regex-cli` command (found in the same repository as this crate) can be +used to serialize DFAs to files and generate Rust code to read them. # Syntax @@ -283,7 +267,7 @@ the regexes in this module are almost universally slow to compile, especially when they contain large Unicode character classes. For example, on my system, compiling `\w{50}` takes about 1 second and almost 15MB of memory! (Compiling a sparse regex takes about the same time but only uses about 1.2MB of -memory.) Conversly, compiling the same regex without Unicode support, e.g., +memory.) Conversely, compiling the same regex without Unicode support, e.g., `(?-u)\w{50}`, takes under 1 millisecond and about 15KB of memory. 
For this reason, you should only use Unicode character classes if you absolutely need them! (They are enabled by default though.) @@ -299,10 +283,10 @@ optimizations means that searches may run much slower than what you're accustomed to, although, it does provide more predictable and consistent performance. * There is no `&str` API like in the regex crate. In this module, all APIs -operate on `&[u8]`. By default, match indices are guaranteed to fall on UTF-8 -boundaries, unless any of [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8), -[`nfa::thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) or -[`regex::Config::utf8`] are disabled. +operate on `&[u8]`. By default, match indices are +guaranteed to fall on UTF-8 boundaries, unless either of +[`syntax::Config::utf8`](crate::util::syntax::Config::utf8) or +[`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) are disabled. With some of the downsides out of the way, here are some positive differences: @@ -334,9 +318,11 @@ via [`dense::Config::minimize`], but it can increase compilation times dramatically. */ -pub use crate::dfa::automaton::{Automaton, OverlappingState}; -#[cfg(feature = "alloc")] -pub use crate::dfa::error::Error; +#[cfg(feature = "dfa-search")] +pub use crate::dfa::{ + automaton::{Automaton, OverlappingState}, + start::StartKind, +}; /// This is an alias for a state ID of zero. It has special significance /// because it always corresponds to the first state in a DFA, and the first @@ -344,20 +330,31 @@ pub use crate::dfa::error::Error; /// of its transitions set to itself. Moreover, the dead state is used as a /// sentinel for various things. e.g., In search, reaching a dead state means /// that the search must stop. 
-const DEAD: crate::util::id::StateID = crate::util::id::StateID::ZERO; +const DEAD: crate::util::primitives::StateID = + crate::util::primitives::StateID::ZERO; -mod accel; -mod automaton; +#[cfg(feature = "dfa-search")] pub mod dense; -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-onepass")] +pub mod onepass; +#[cfg(feature = "dfa-search")] +pub mod regex; +#[cfg(feature = "dfa-search")] +pub mod sparse; + +#[cfg(feature = "dfa-search")] +pub(crate) mod accel; +#[cfg(feature = "dfa-search")] +mod automaton; +#[cfg(feature = "dfa-build")] mod determinize; -#[cfg(feature = "alloc")] -pub(crate) mod error; -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] mod minimize; -pub mod regex; +#[cfg(any(feature = "dfa-build", feature = "dfa-onepass"))] +mod remapper; +#[cfg(feature = "dfa-search")] mod search; -pub mod sparse; +#[cfg(feature = "dfa-search")] mod special; -#[cfg(feature = "transducer")] -mod transducer; +#[cfg(feature = "dfa-search")] +mod start; diff --git a/vendor/regex-automata/src/dfa/onepass.rs b/vendor/regex-automata/src/dfa/onepass.rs new file mode 100644 index 000000000..44691d0c8 --- /dev/null +++ b/vendor/regex-automata/src/dfa/onepass.rs @@ -0,0 +1,3188 @@ +/*! +A DFA that can return spans for matching capturing groups. + +This module is the home of a [one-pass DFA](DFA). + +This module also contains a [`Builder`] and a [`Config`] for building and +configuring a one-pass DFA. +*/ + +// A note on naming and credit: +// +// As far as I know, Russ Cox came up with the practical vision and +// implementation of a "one-pass regex engine." He mentions and describes it +// briefly in the third article of his regexp article series: +// https://swtch.com/~rsc/regexp/regexp3.html +// +// Cox's implementation is in RE2, and the implementation below is most +// heavily inspired by RE2's. The key thing they have in common is that +// their transitions are defined over an alphabet of bytes. 
In contrast, +// Go's regex engine also has a one-pass engine, but its transitions are +// more firmly rooted on Unicode codepoints. The ideas are the same, but the +// implementations are different. +// +// RE2 tends to call this a "one-pass NFA." Here, we call it a "one-pass DFA." +// They're both true in their own ways: +// +// * The "one-pass" criterion is generally a property of the NFA itself. In +// particular, it is said that an NFA is one-pass if, after each byte of input +// during a search, there is at most one "VM thread" remaining to take for the +// next byte of input. That is, there is never any ambiguity as to the path to +// take through the NFA during a search. +// +// * On the other hand, once a one-pass NFA has its representation converted +// to something where a constant number of instructions is used for each byte +// of input, the implementation looks a lot more like a DFA. It's technically +// more powerful than a DFA since it has side effects (storing offsets inside +// of slots activated by a transition), but it is far closer to a DFA than an +// NFA simulation. +// +// Thus, in this crate, we call it a one-pass DFA. + +use alloc::{vec, vec::Vec}; + +use crate::{ + dfa::{remapper::Remapper, DEAD}, + nfa::thompson::{self, NFA}, + util::{ + alphabet::ByteClasses, + captures::Captures, + escape::DebugByte, + int::{Usize, U32, U64, U8}, + look::{Look, LookSet, UnicodeWordBoundaryError}, + primitives::{NonMaxUsize, PatternID, StateID}, + search::{Anchored, Input, Match, MatchError, MatchKind, Span}, + sparse_set::SparseSet, + }, +}; + +/// The configuration used for building a [one-pass DFA](DFA). +/// +/// A one-pass DFA configuration is a simple data object that is typically used +/// with [`Builder::configure`]. It can be cheaply cloned. +/// +/// A default configuration can be created either with `Config::new`, or +/// perhaps more conveniently, with [`DFA::config`]. 
+#[derive(Clone, Debug, Default)] +pub struct Config { + match_kind: Option<MatchKind>, + starts_for_each_pattern: Option<bool>, + byte_classes: Option<bool>, + size_limit: Option<Option<usize>>, +} + +impl Config { + /// Return a new default one-pass DFA configuration. + pub fn new() -> Config { + Config::default() + } + + /// Set the desired match semantics. + /// + /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the + /// match semantics of Perl-like regex engines. That is, when multiple + /// patterns would match at the same leftmost position, the pattern that + /// appears first in the concrete syntax is chosen. + /// + /// Currently, the only other kind of match semantics supported is + /// [`MatchKind::All`]. This corresponds to "classical DFA" construction + /// where all possible matches are visited. + /// + /// When it comes to the one-pass DFA, it is rarer for preference order and + /// "longest match" to actually disagree. Since if they did disagree, then + /// the regex typically isn't one-pass. For example, searching `Samwise` + /// for `Sam|Samwise` will report `Sam` for leftmost-first matching and + /// `Samwise` for "longest match" or "all" matching. However, this regex is + /// not one-pass if taken literally. The equivalent regex, `Sam(?:|wise)` + /// is one-pass and `Sam|Samwise` may be optimized to it. + /// + /// The other main difference is that "all" match semantics don't support + /// non-greedy matches. "All" match semantics always try to match as much + /// as possible. + pub fn match_kind(mut self, kind: MatchKind) -> Config { + self.match_kind = Some(kind); + self + } + + /// Whether to compile a separate start state for each pattern in the + /// one-pass DFA. + /// + /// When enabled, a separate **anchored** start state is added for each + /// pattern in the DFA. 
When this start state is used, then the DFA will + /// only search for matches for the pattern specified, even if there are + /// other patterns in the DFA. + /// + /// The main downside of this option is that it can potentially increase + /// the size of the DFA and/or increase the time it takes to build the DFA. + /// + /// You might want to enable this option when you want to both search for + /// anchored matches of any pattern or to search for anchored matches of + /// one particular pattern while using the same DFA. (Otherwise, you would + /// need to compile a new DFA for each pattern.) + /// + /// By default this is disabled. + /// + /// # Example + /// + /// This example shows how to build a multi-regex and then search for + /// matches for any of the patterns or matches for a specific pattern. + /// + /// ``` + /// use regex_automata::{ + /// dfa::onepass::DFA, Anchored, Input, Match, PatternID, + /// }; + /// + /// let re = DFA::builder() + /// .configure(DFA::config().starts_for_each_pattern(true)) + /// .build_many(&["[a-z]+", "[0-9]+"])?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let haystack = "123abc"; + /// let input = Input::new(haystack).anchored(Anchored::Yes); + /// + /// // A normal multi-pattern search will show pattern 1 matches. + /// re.try_search(&mut cache, &input, &mut caps)?; + /// assert_eq!(Some(Match::must(1, 0..3)), caps.get_match()); + /// + /// // If we only want to report pattern 0 matches, then we'll get no + /// // match here. + /// let input = input.anchored(Anchored::Pattern(PatternID::must(0))); + /// re.try_search(&mut cache, &input, &mut caps)?; + /// assert_eq!(None, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn starts_for_each_pattern(mut self, yes: bool) -> Config { + self.starts_for_each_pattern = Some(yes); + self + } + + /// Whether to attempt to shrink the size of the DFA's alphabet or not.
+ /// + /// This option is enabled by default and should never be disabled unless + /// one is debugging a one-pass DFA. + /// + /// When enabled, the DFA will use a map from all possible bytes to their + /// corresponding equivalence class. Each equivalence class represents a + /// set of bytes that does not discriminate between a match and a non-match + /// in the DFA. For example, the pattern `[ab]+` has at least two + /// equivalence classes: a set containing `a` and `b` and a set containing + /// every byte except for `a` and `b`. `a` and `b` are in the same + /// equivalence class because they never discriminate between a match and a + /// non-match. + /// + /// The advantage of this map is that the size of the transition table + /// can be reduced drastically from (approximately) `#states * 256 * + /// sizeof(StateID)` to `#states * k * sizeof(StateID)` where `k` is the + /// number of equivalence classes (rounded up to the nearest power of 2). + /// As a result, total space usage can decrease substantially. Moreover, + /// since a smaller alphabet is used, DFA compilation becomes faster as + /// well. + /// + /// **WARNING:** This is only useful for debugging DFAs. Disabling this + /// does not yield any speed advantages. Namely, even when this is + /// disabled, a byte class map is still used while searching. The only + /// difference is that every byte will be forced into its own distinct + /// equivalence class. This is useful for debugging the actual generated + /// transitions because it lets one see the transitions defined on actual + /// bytes instead of the equivalence classes. + pub fn byte_classes(mut self, yes: bool) -> Config { + self.byte_classes = Some(yes); + self + } + + /// Set a size limit on the total heap used by a one-pass DFA. + /// + /// This size limit is expressed in bytes and is applied during + /// construction of a one-pass DFA. 
If the DFA's heap usage exceeds + /// this configured limit, then construction is stopped and an error is + /// returned. + /// + /// The default is no limit. + /// + /// # Example + /// + /// This example shows a one-pass DFA that fails to build because of + /// a configured size limit. This particular example also serves as a + /// cautionary tale demonstrating just how big DFAs with large Unicode + /// character classes can get. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{dfa::onepass::DFA, Match}; + /// + /// // 6MB isn't enough! + /// DFA::builder() + /// .configure(DFA::config().size_limit(Some(6_000_000))) + /// .build(r"\w{20}") + /// .unwrap_err(); + /// + /// // ... but 7MB probably is! + /// // (Note that DFA sizes aren't necessarily stable between releases.) + /// let re = DFA::builder() + /// .configure(DFA::config().size_limit(Some(7_000_000))) + /// .build(r"\w{20}")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let haystack = "A".repeat(20); + /// re.captures(&mut cache, &haystack, &mut caps); + /// assert_eq!(Some(Match::must(0, 0..20)), caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// While one needs a little more than 6MB to represent `\w{20}`, it + /// turns out that you only need a little more than 4KB to represent + /// `(?-u:\w{20})`. So only use Unicode if you need it! + pub fn size_limit(mut self, limit: Option<usize>) -> Config { + self.size_limit = Some(limit); + self + } + + /// Returns the match semantics set in this configuration. + pub fn get_match_kind(&self) -> MatchKind { + self.match_kind.unwrap_or(MatchKind::LeftmostFirst) + } + + /// Returns whether this configuration has enabled anchored starting states + /// for every pattern in the DFA.
+ pub fn get_starts_for_each_pattern(&self) -> bool { + self.starts_for_each_pattern.unwrap_or(false) + } + + /// Returns whether this configuration has enabled byte classes or not. + /// This is typically a debugging oriented option, as disabling it confers + /// no speed benefit. + pub fn get_byte_classes(&self) -> bool { + self.byte_classes.unwrap_or(true) + } + + /// Returns the DFA size limit of this configuration if one was set. + /// The size limit is total number of bytes on the heap that a DFA is + /// permitted to use. If the DFA exceeds this limit during construction, + /// then construction is stopped and an error is returned. + pub fn get_size_limit(&self) -> Option<usize> { + self.size_limit.unwrap_or(None) + } + + /// Overwrite the default configuration such that the options in `o` are + /// always used. If an option in `o` is not set, then the corresponding + /// option in `self` is used. If it's not set in `self` either, then it + /// remains not set. + pub(crate) fn overwrite(&self, o: Config) -> Config { + Config { + match_kind: o.match_kind.or(self.match_kind), + starts_for_each_pattern: o + .starts_for_each_pattern + .or(self.starts_for_each_pattern), + byte_classes: o.byte_classes.or(self.byte_classes), + size_limit: o.size_limit.or(self.size_limit), + } + } +} + +/// A builder for a [one-pass DFA](DFA). +/// +/// This builder permits configuring options for the syntax of a pattern, the +/// NFA construction and the DFA construction. This builder is different from a +/// general purpose regex builder in that it permits fine grain configuration +/// of the construction process. The trade off for this is complexity, and +/// the possibility of setting a configuration that might not make sense. For +/// example, there are two different UTF-8 modes: +/// +/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls +/// whether the pattern itself can contain sub-expressions that match invalid +/// UTF-8. 
+/// * [`thompson::Config::utf8`] controls whether empty matches that split a +/// Unicode codepoint are reported or not. +/// +/// Generally speaking, callers will want to either enable all of these or +/// disable all of these. +/// +/// # Example +/// +/// This example shows how to disable UTF-8 mode in the syntax and the NFA. +/// This is generally what you want for matching on arbitrary bytes. +/// +/// ``` +/// # if cfg!(miri) { return Ok(()); } // miri takes too long +/// use regex_automata::{ +/// dfa::onepass::DFA, +/// nfa::thompson, +/// util::syntax, +/// Match, +/// }; +/// +/// let re = DFA::builder() +/// .syntax(syntax::Config::new().utf8(false)) +/// .thompson(thompson::Config::new().utf8(false)) +/// .build(r"foo(?-u:[^b])ar.*")?; +/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); +/// +/// let haystack = b"foo\xFFarzz\xE2\x98\xFF\n"; +/// re.captures(&mut cache, haystack, &mut caps); +/// // Notice that `(?-u:[^b])` matches invalid UTF-8, +/// // but the subsequent `.*` does not! Disabling UTF-8 +/// // on the syntax permits this. +/// // +/// // N.B. This example does not show the impact of +/// // disabling UTF-8 mode on a one-pass DFA Config, +/// // since that only impacts regexes that can +/// // produce matches of length 0. +/// assert_eq!(Some(Match::must(0, 0..8)), caps.get_match()); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct Builder { + config: Config, + #[cfg(feature = "syntax")] + thompson: thompson::Compiler, +} + +impl Builder { + /// Create a new one-pass DFA builder with the default configuration. + pub fn new() -> Builder { + Builder { + config: Config::default(), + #[cfg(feature = "syntax")] + thompson: thompson::Compiler::new(), + } + } + + /// Build a one-pass DFA from the given pattern. + /// + /// If there was a problem parsing or compiling the pattern, then an error + /// is returned. 
+ #[cfg(feature = "syntax")] + pub fn build(&self, pattern: &str) -> Result<DFA, BuildError> { + self.build_many(&[pattern]) + } + + /// Build a one-pass DFA from the given patterns. + /// + /// When matches are returned, the pattern ID corresponds to the index of + /// the pattern in the slice given. + #[cfg(feature = "syntax")] + pub fn build_many<P: AsRef<str>>( + &self, + patterns: &[P], + ) -> Result<DFA, BuildError> { + let nfa = + self.thompson.build_many(patterns).map_err(BuildError::nfa)?; + self.build_from_nfa(nfa) + } + + /// Build a DFA from the given NFA. + /// + /// # Example + /// + /// This example shows how to build a DFA if you already have an NFA in + /// hand. + /// + /// ``` + /// use regex_automata::{dfa::onepass::DFA, nfa::thompson::NFA, Match}; + /// + /// // This shows how to set non-default options for building an NFA. + /// let nfa = NFA::compiler() + /// .configure(NFA::config().shrink(true)) + /// .build(r"[a-z0-9]+")?; + /// let re = DFA::builder().build_from_nfa(nfa)?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// re.captures(&mut cache, "foo123bar", &mut caps); + /// assert_eq!(Some(Match::must(0, 0..9)), caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn build_from_nfa(&self, nfa: NFA) -> Result<DFA, BuildError> { + // Why take ownership if we're just going to pass a reference to the + // NFA to our internal builder? Well, the first thing to note is that + // an NFA uses reference counting internally, so either choice is going + // to be cheap. So there isn't much cost either way. + // + // The real reason is that a one-pass DFA, semantically, shares + // ownership of an NFA. This is unlike other DFAs that don't share + // ownership of an NFA at all, primarily because they want to be + // self-contained in order to support cheap (de)serialization. + // + // But then why pass a '&nfa' below if we want to share ownership? 
+ // Well, it turns out that using a '&NFA' in our internal builder + // separates its lifetime from the DFA we're building, and this turns + // out to make code a bit more composable. e.g., We can iterate over + // things inside the NFA while borrowing the builder as mutable because + // we know the NFA cannot be mutated. So TL;DR --- this weirdness is + // "because borrow checker." + InternalBuilder::new(self.config.clone(), &nfa).build() + } + + /// Apply the given one-pass DFA configuration options to this builder. + pub fn configure(&mut self, config: Config) -> &mut Builder { + self.config = self.config.overwrite(config); + self + } + + /// Set the syntax configuration for this builder using + /// [`syntax::Config`](crate::util::syntax::Config). + /// + /// This permits setting things like case insensitivity, Unicode and multi + /// line mode. + /// + /// These settings only apply when constructing a one-pass DFA directly + /// from a pattern. + #[cfg(feature = "syntax")] + pub fn syntax( + &mut self, + config: crate::util::syntax::Config, + ) -> &mut Builder { + self.thompson.syntax(config); + self + } + + /// Set the Thompson NFA configuration for this builder using + /// [`nfa::thompson::Config`](crate::nfa::thompson::Config). + /// + /// This permits setting things like whether additional time should be + /// spent shrinking the size of the NFA. + /// + /// These settings only apply when constructing a DFA directly from a + /// pattern. + #[cfg(feature = "syntax")] + pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { + self.thompson.configure(config); + self + } +} + +/// An internal builder for encapsulating the state necessary to build a +/// one-pass DFA. Typical use is just `InternalBuilder::new(..).build()`. +/// +/// There is no separate pass for determining whether the NFA is one-pass or +/// not. We just try to build the DFA. If during construction we discover that +/// it is not one-pass, we bail out. 
This is likely to lead to some undesirable +/// expense in some cases, so it might make sense to try and identify common +/// patterns in the NFA that make it definitively not one-pass. That way, we +/// can avoid ever trying to build a one-pass DFA in the first place. For +/// example, '\w*\s' is not one-pass, and since '\w' is Unicode-aware by +/// default, it's probably not a trivial cost to try and build a one-pass DFA +/// for it and then fail. +/// +/// Note that some (immutable) fields are duplicated here. For example, the +/// 'nfa' and 'classes' fields are both in the 'DFA'. They are the same thing, +/// but we duplicate them because it makes composition easier below. Otherwise, +/// since the borrow checker can't see through method calls, the mutable borrow +/// we use to mutate the DFA winds up preventing borrowing from any other part +/// of the DFA, even though we aren't mutating those parts. We only do this +/// because the duplication is cheap. +#[derive(Debug)] +struct InternalBuilder<'a> { + /// The DFA we're building. + dfa: DFA, + /// An unordered collection of NFA state IDs that we haven't yet tried to + /// build into a DFA state. + /// + /// This collection does not ultimately wind up including every NFA state + /// ID. Instead, each ID represents a "start" state for a sub-graph of the + /// NFA. The set of NFA states we then use to build a DFA state consists + /// of that "start" state and all states reachable from it via epsilon + /// transitions. + uncompiled_nfa_ids: Vec<StateID>, + /// A map from NFA state ID to DFA state ID. This is useful for easily + /// determining whether an NFA state has been used as a "starting" point + /// to build a DFA state yet. If it hasn't, then it is mapped to DEAD, + /// and since DEAD is specially added and never corresponds to any NFA + /// state, it follows that a mapping to DEAD implies the NFA state has + /// no corresponding DFA state yet.
+ nfa_to_dfa_id: Vec<StateID>, + /// A stack used to traverse the NFA states that make up a single DFA + /// state. Traversal occurs until the stack is empty, and we only push to + /// the stack when the state ID isn't in 'seen'. Actually, even more than + /// that, if we try to push something on to this stack that is already in + /// 'seen', then we bail out on construction completely, since it implies + /// that the NFA is not one-pass. + stack: Vec<(StateID, Epsilons)>, + /// The set of NFA states that we've visited via 'stack'. + seen: SparseSet, + /// Whether a match NFA state has been observed while constructing a + /// one-pass DFA state. Once a match state is seen, assuming we are using + /// leftmost-first match semantics, then we don't add any more transitions + /// to the DFA state we're building. + matched: bool, + /// The config passed to the builder. + /// + /// This is duplicated in dfa.config. + config: Config, + /// The NFA we're building a one-pass DFA from. + /// + /// This is duplicated in dfa.nfa. + nfa: &'a NFA, + /// The equivalence classes that make up the alphabet for this DFA. + /// + /// This is duplicated in dfa.classes. + classes: ByteClasses, +} + +impl<'a> InternalBuilder<'a> { + /// Create a new builder with an initial empty DFA. + fn new(config: Config, nfa: &'a NFA) -> InternalBuilder { + let classes = if !config.get_byte_classes() { + // A one-pass DFA will always use the equivalence class map, but + // disabling this option is useful for debugging. Namely, this will + // cause all transitions to be defined over their actual bytes + // instead of an opaque equivalence class identifier. The former is + // much easier to grok as a human. + ByteClasses::singletons() + } else { + nfa.byte_classes().clone() + }; + // Normally a DFA alphabet includes the EOI symbol, but we don't need + // that in the one-pass DFA since we handle look-around explicitly + // without encoding it into the DFA.
Thus, we don't need to delay + // matches by 1 byte. However, we reuse the space that *would* be used + // by the EOI transition by putting match information there (like which + // pattern matches and which look-around assertions need to hold). So + // this means our real alphabet length is 1 fewer than what the byte + // classes report, since we don't use EOI. + let alphabet_len = classes.alphabet_len().checked_sub(1).unwrap(); + let stride2 = classes.stride2(); + let dfa = DFA { + config: config.clone(), + nfa: nfa.clone(), + table: vec![], + starts: vec![], + // Since one-pass DFAs have a smaller state ID max than + // StateID::MAX, it follows that StateID::MAX is a valid initial + // value for min_match_id since no state ID can ever be greater + // than it. In the case of a one-pass DFA with no match states, the + // min_match_id will keep this sentinel value. + min_match_id: StateID::MAX, + classes: classes.clone(), + alphabet_len, + stride2, + pateps_offset: alphabet_len, + // OK because PatternID::MAX*2 is guaranteed not to overflow. + explicit_slot_start: nfa.pattern_len().checked_mul(2).unwrap(), + }; + InternalBuilder { + dfa, + uncompiled_nfa_ids: vec![], + nfa_to_dfa_id: vec![DEAD; nfa.states().len()], + stack: vec![], + seen: SparseSet::new(nfa.states().len()), + matched: false, + config, + nfa, + classes, + } + } + + /// Build the DFA from the NFA given to this builder. If the NFA is not + /// one-pass, then return an error. An error may also be returned if a + /// particular limit is exceeded. (Some limits, like the total heap memory + /// used, are configurable. Others, like the total patterns or slots, are + /// hard-coded based on representational limitations.) 
+ fn build(mut self) -> Result<DFA, BuildError> { + self.nfa.look_set_any().available().map_err(BuildError::word)?; + for look in self.nfa.look_set_any().iter() { + // This is a future incompatibility check where if we add any + // more look-around assertions, then the one-pass DFA either + // needs to reject them (what we do here) or it needs to have its + // Transition representation modified to be capable of storing the + // new assertions. + if look.as_repr() > Look::WordUnicodeNegate.as_repr() { + return Err(BuildError::unsupported_look(look)); + } + } + if self.nfa.pattern_len().as_u64() > PatternEpsilons::PATTERN_ID_LIMIT + { + return Err(BuildError::too_many_patterns( + PatternEpsilons::PATTERN_ID_LIMIT, + )); + } + if self.nfa.group_info().explicit_slot_len() > Slots::LIMIT { + return Err(BuildError::not_one_pass( + "too many explicit capturing groups (max is 16)", + )); + } + assert_eq!(DEAD, self.add_empty_state()?); + + // This is where the explicit slots start. We care about this because + // we only need to track explicit slots. The implicit slots---two for + // each pattern---are tracked as part of the search routine itself. + let explicit_slot_start = self.nfa.pattern_len() * 2; + self.add_start_state(None, self.nfa.start_anchored())?; + if self.config.get_starts_for_each_pattern() { + for pid in self.nfa.patterns() { + self.add_start_state( + Some(pid), + self.nfa.start_pattern(pid).unwrap(), + )?; + } + } + // NOTE: One wonders what the effects of treating 'uncompiled_nfa_ids' + // as a stack are. It is really an unordered *set* of NFA state IDs. + // If it, for example, in practice led to discovering whether a regex + // was or wasn't one-pass later than if we processed NFA state IDs in + // ascending order, then that would make this routine more costly in + // the somewhat common case of a regex that isn't one-pass. 
+ while let Some(nfa_id) = self.uncompiled_nfa_ids.pop() { + let dfa_id = self.nfa_to_dfa_id[nfa_id]; + // Once we see a match, we keep going, but don't add any new + // transitions. Normally we'd just stop, but we have to keep + // going in order to verify that our regex is actually one-pass. + self.matched = false; + // The NFA states we've already explored for this DFA state. + self.seen.clear(); + // The NFA states to explore via epsilon transitions. If we ever + // try to push an NFA state that we've already seen, then the NFA + // is not one-pass because it implies there are multiple epsilon + // transition paths that lead to the same NFA state. In other + // words, there is ambiguity. + self.stack_push(nfa_id, Epsilons::empty())?; + while let Some((id, epsilons)) = self.stack.pop() { + match *self.nfa.state(id) { + thompson::State::ByteRange { ref trans } => { + self.compile_transition(dfa_id, trans, epsilons)?; + } + thompson::State::Sparse(ref sparse) => { + for trans in sparse.transitions.iter() { + self.compile_transition(dfa_id, trans, epsilons)?; + } + } + thompson::State::Dense(ref dense) => { + for trans in dense.iter() { + self.compile_transition(dfa_id, &trans, epsilons)?; + } + } + thompson::State::Look { look, next } => { + let looks = epsilons.looks().insert(look); + self.stack_push(next, epsilons.set_looks(looks))?; + } + thompson::State::Union { ref alternates } => { + for &sid in alternates.iter().rev() { + self.stack_push(sid, epsilons)?; + } + } + thompson::State::BinaryUnion { alt1, alt2 } => { + self.stack_push(alt2, epsilons)?; + self.stack_push(alt1, epsilons)?; + } + thompson::State::Capture { next, slot, .. } => { + let slot = slot.as_usize(); + let epsilons = if slot < explicit_slot_start { + // If this is an implicit slot, we don't care + // about it, since we handle implicit slots in + // the search routine. We can get away with that + // because there are 2 implicit slots for every + // pattern. 
+ epsilons + } else { + // Offset our explicit slots so that they start + // at index 0. + let offset = slot - explicit_slot_start; + epsilons.set_slots(epsilons.slots().insert(offset)) + }; + self.stack_push(next, epsilons)?; + } + thompson::State::Fail => { + continue; + } + thompson::State::Match { pattern_id } => { + // If we found two different paths to a match state + // for the same DFA state, then we have ambiguity. + // Thus, it's not one-pass. + if self.matched { + return Err(BuildError::not_one_pass( + "multiple epsilon transitions to match state", + )); + } + self.matched = true; + // Shove the matching pattern ID and the 'epsilons' + // into the current DFA state's pattern epsilons. The + // 'epsilons' includes the slots we need to capture + // before reporting the match and also the conditional + // epsilon transitions we need to check before we can + // report a match. + self.dfa.set_pattern_epsilons( + dfa_id, + PatternEpsilons::empty() + .set_pattern_id(pattern_id) + .set_epsilons(epsilons), + ); + // N.B. It is tempting to just bail out here when + // compiling a leftmost-first DFA, since we will never + // compile any more transitions in that case. But we + // actually need to keep going in order to verify that + // we actually have a one-pass regex. e.g., We might + // see more Match states (e.g., for other patterns) + // that imply that we don't have a one-pass regex. + // So instead, we mark that we've found a match and + // continue on. When we go to compile a new DFA state, + // we just skip that part. But otherwise check that the + // one-pass property is upheld. + } + } + } + } + self.shuffle_states(); + Ok(self.dfa) + } + + /// Shuffle all match states to the end of the transition table and set + /// 'min_match_id' to the ID of the first such match state. + /// + /// The point of this is to make it extremely cheap to determine whether + /// a state is a match state or not. 
We need to check on this on every + /// transition during a search, so it being cheap is important. This + /// permits us to check it by simply comparing two state identifiers, as + /// opposed to looking for the pattern ID in the state's `PatternEpsilons`. + /// (Which requires a memory load and some light arithmetic.) + fn shuffle_states(&mut self) { + let mut remapper = Remapper::new(&self.dfa); + let mut next_dest = self.dfa.last_state_id(); + for i in (0..self.dfa.state_len()).rev() { + let id = StateID::must(i); + let is_match = + self.dfa.pattern_epsilons(id).pattern_id().is_some(); + if !is_match { + continue; + } + remapper.swap(&mut self.dfa, next_dest, id); + self.dfa.min_match_id = next_dest; + next_dest = self.dfa.prev_state_id(next_dest).expect( + "match states should be a proper subset of all states", + ); + } + remapper.remap(&mut self.dfa); + } + + /// Compile the given NFA transition into the DFA state given. + /// + /// 'Epsilons' corresponds to any conditional epsilon transitions that need + /// to be satisfied to follow this transition, and any slots that need to + /// be saved if the transition is followed. + /// + /// If this transition indicates that the NFA is not one-pass, then + /// this returns an error. (This occurs, for example, if the DFA state + /// already has a transition defined for the same input symbols as the + /// given transition, *and* the result of the old and new transitions is + /// different.) 
+ fn compile_transition( + &mut self, + dfa_id: StateID, + trans: &thompson::Transition, + epsilons: Epsilons, + ) -> Result<(), BuildError> { + let next_dfa_id = self.add_dfa_state_for_nfa_state(trans.next)?; + for byte in self + .classes + .representatives(trans.start..=trans.end) + .filter_map(|r| r.as_u8()) + { + let oldtrans = self.dfa.transition(dfa_id, byte); + let newtrans = + Transition::new(self.matched, next_dfa_id, epsilons); + // If the old transition points to the DEAD state, then we know + // 'byte' has not been mapped to any transition for this DFA state + // yet. So set it unconditionally. Otherwise, we require that the + // old and new transitions are equivalent. Otherwise, there is + // ambiguity and thus the regex is not one-pass. + if oldtrans.state_id() == DEAD { + self.dfa.set_transition(dfa_id, byte, newtrans); + } else if oldtrans != newtrans { + return Err(BuildError::not_one_pass( + "conflicting transition", + )); + } + } + Ok(()) + } + + /// Add a start state to the DFA corresponding to the given NFA starting + /// state ID. + /// + /// If adding a state would blow any limits (configured or hard-coded), + /// then an error is returned. + /// + /// If the starting state is an anchored state for a particular pattern, + /// then callers must provide the pattern ID for that starting state. + /// Callers must also ensure that the first starting state added is the + /// start state for all patterns, and then each anchored starting state for + /// each pattern (if necessary) added in order. Otherwise, this panics. + fn add_start_state( + &mut self, + pid: Option<PatternID>, + nfa_id: StateID, + ) -> Result<StateID, BuildError> { + match pid { + // With no pid, this should be the start state for all patterns + // and thus be the first one. + None => assert!(self.dfa.starts.is_empty()), + // With a pid, we want it to be at self.dfa.starts[pid+1]. 
+ Some(pid) => assert!(self.dfa.starts.len() == pid.one_more()), + } + let dfa_id = self.add_dfa_state_for_nfa_state(nfa_id)?; + self.dfa.starts.push(dfa_id); + Ok(dfa_id) + } + + /// Add a new DFA state corresponding to the given NFA state. If adding a + /// state would blow any limits (configured or hard-coded), then an error + /// is returned. If a DFA state already exists for the given NFA state, + /// then that DFA state's ID is returned and no new states are added. + /// + /// It is not expected that this routine is called for every NFA state. + /// Instead, an NFA state ID will usually correspond to the "start" state + /// for a sub-graph of the NFA, where all states in the sub-graph are + /// reachable via epsilon transitions (conditional or unconditional). That + /// sub-graph of NFA states is ultimately what produces a single DFA state. + fn add_dfa_state_for_nfa_state( + &mut self, + nfa_id: StateID, + ) -> Result<StateID, BuildError> { + // If we've already built a DFA state for the given NFA state, then + // just return that. We definitely do not want to have more than one + // DFA state in existence for the same NFA state, since all but one of + // them will likely become unreachable. And at least some of them are + // likely to wind up being incomplete. + let existing_dfa_id = self.nfa_to_dfa_id[nfa_id]; + if existing_dfa_id != DEAD { + return Ok(existing_dfa_id); + } + // If we don't have any DFA state yet, add it and then add the given + // NFA state to the list of states to explore. + let dfa_id = self.add_empty_state()?; + self.nfa_to_dfa_id[nfa_id] = dfa_id; + self.uncompiled_nfa_ids.push(nfa_id); + Ok(dfa_id) + } + + /// Unconditionally add a new empty DFA state. If adding it would exceed + /// any limits (configured or hard-coded), then an error is returned. The + /// ID of the new state is returned on success. + /// + /// The added state is *not* a match state. 
+ fn add_empty_state(&mut self) -> Result<StateID, BuildError> { + let state_limit = Transition::STATE_ID_LIMIT; + // Note that unlike dense and lazy DFAs, we specifically do NOT + // premultiply our state IDs here. The reason is that we want to pack + // our state IDs into 64-bit transitions with other info, so the fewer + // the bits we use for state IDs the better. If we premultiply, then + // our state ID space shrinks. We justify this by the assumption that + // a one-pass DFA is just already doing a fair bit more work than a + // normal DFA anyway, so an extra multiplication to compute a state + // transition doesn't seem like a huge deal. + let next_id = self.dfa.table.len() >> self.dfa.stride2(); + let id = StateID::new(next_id) + .map_err(|_| BuildError::too_many_states(state_limit))?; + if id.as_u64() > Transition::STATE_ID_LIMIT { + return Err(BuildError::too_many_states(state_limit)); + } + self.dfa + .table + .extend(core::iter::repeat(Transition(0)).take(self.dfa.stride())); + // The default empty value for 'PatternEpsilons' is sadly not all + // zeroes. Instead, a special sentinel is used to indicate that there + // is no pattern. So we need to explicitly set the pattern epsilons to + // the correct "empty" PatternEpsilons. + self.dfa.set_pattern_epsilons(id, PatternEpsilons::empty()); + if let Some(size_limit) = self.config.get_size_limit() { + if self.dfa.memory_usage() > size_limit { + return Err(BuildError::exceeded_size_limit(size_limit)); + } + } + Ok(id) + } + + /// Push the given NFA state ID and its corresponding epsilons (slots and + /// conditional epsilon transitions) on to a stack for use in a depth first + /// traversal of a sub-graph of the NFA. + /// + /// If the given NFA state ID has already been pushed on to the stack, then + /// it indicates the regex is not one-pass and this correspondingly returns + /// an error. 
+ fn stack_push( + &mut self, + nfa_id: StateID, + epsilons: Epsilons, + ) -> Result<(), BuildError> { + // If we already have seen a match and we are compiling a leftmost + // first DFA, then we shouldn't add any more states to look at. This is + // effectively how preference order and non-greediness is implemented. + // if !self.config.get_match_kind().continue_past_first_match() + // && self.matched + // { + // return Ok(()); + // } + if !self.seen.insert(nfa_id) { + return Err(BuildError::not_one_pass( + "multiple epsilon transitions to same state", + )); + } + self.stack.push((nfa_id, epsilons)); + Ok(()) + } +} + +/// A one-pass DFA for executing a subset of anchored regex searches while +/// resolving capturing groups. +/// +/// A one-pass DFA can be built from an NFA that is one-pass. An NFA is +/// one-pass when there is never any ambiguity about how to continue a search. +/// For example, `a*a` is not one-pass because during a search, it's not +/// possible to know whether to continue matching the `a*` or to move on to +/// the single `a`. However, `a*b` is one-pass, because for every byte in the +/// input, it's always clear when to move on from `a*` to `b`. +/// +/// # Only anchored searches are supported +/// +/// In this crate, especially for DFAs, unanchored searches are implemented by +/// treating the pattern as if it had a `(?s-u:.)*?` prefix. While the prefix +/// is one-pass on its own, adding anything after it, e.g., `(?s-u:.)*?a` will +/// make the overall pattern not one-pass. Why? Because the `(?s-u:.)` matches +/// any byte, and there is therefore ambiguity as to when the prefix should +/// stop matching and something else should start matching. +/// +/// Therefore, one-pass DFAs do not support unanchored searches. In addition +/// to many regexes simply not being one-pass, it implies that one-pass DFAs +/// have limited utility. 
With that said, when a one-pass DFA can be used, it +/// can potentially provide a dramatic speed up over alternatives like the +/// [`BoundedBacktracker`](crate::nfa::thompson::backtrack::BoundedBacktracker) +/// and the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM). In particular, +/// a one-pass DFA is the only DFA capable of reporting the spans of matching +/// capturing groups. +/// +/// To clarify, when we say that unanchored searches are not supported, what +/// that actually means is: +/// +/// * The high level routines, [`DFA::is_match`] and [`DFA::captures`], always +/// do anchored searches. +/// * Since iterators are most useful in the context of unanchored searches, +/// there is no `DFA::captures_iter` method. +/// * For lower level routines like [`DFA::try_search`], an error will be +/// returned if the given [`Input`] is configured to do an unanchored search or +/// search for an invalid pattern ID. (Note that an [`Input`] is configured to +/// do an unanchored search by default, so just giving a `Input::new` is +/// guaranteed to return an error.) +/// +/// # Other limitations +/// +/// In addition to the [configurable heap limit](Config::size_limit) and +/// the requirement that a regex pattern be one-pass, there are some other +/// limitations: +/// +/// * There is an internal limit on the total number of explicit capturing +/// groups that appear across all patterns. It is somewhat small and there is +/// no way to configure it. If your pattern(s) exceed this limit, then building +/// a one-pass DFA will fail. +/// * If the number of patterns exceeds an internal unconfigurable limit, then +/// building a one-pass DFA will fail. This limit is quite large and you're +/// unlikely to hit it. +/// * If the total number of states exceeds an internal unconfigurable limit, +/// then building a one-pass DFA will fail. This limit is quite large and +/// you're unlikely to hit it. 
+/// +/// # Other examples of regexes that aren't one-pass +/// +/// One particularly unfortunate example is that enabling Unicode can cause +/// regexes that were one-pass to no longer be one-pass. Consider the regex +/// `(?-u)\w*\s` for example. It is one-pass because there is exactly no +/// overlap between the ASCII definitions of `\w` and `\s`. But `\w*\s` +/// (i.e., with Unicode enabled) is *not* one-pass because `\w` and `\s` get +/// translated to UTF-8 automatons. And while the *codepoints* in `\w` and `\s` +/// do not overlap, the underlying UTF-8 encodings do. Indeed, because of the +/// overlap between UTF-8 automata, the use of Unicode character classes will +/// tend to vastly increase the likelihood of a regex not being one-pass. +/// +/// # How does one know if a regex is one-pass or not? +/// +/// At the time of writing, the only way to know is to try and build a one-pass +/// DFA. The one-pass property is checked while constructing the DFA. +/// +/// This does mean that you might potentially waste some CPU cycles and memory +/// by optimistically trying to build a one-pass DFA. But this is currently the +/// only way. In the future, building a one-pass DFA might be able to use some +/// heuristics to detect common violations of the one-pass property and bail +/// more quickly. +/// +/// # Resource usage +/// +/// Unlike a general DFA, a one-pass DFA has stricter bounds on its resource +/// usage. Namely, construction of a one-pass DFA has a time and space +/// complexity of `O(n)`, where `n ~ nfa.states().len()`. (A general DFA's time +/// and space complexity is `O(2^n)`.) This smaller time bound is achieved +/// because there is at most one DFA state created for each NFA state. If +/// additional DFA states would be required, then the pattern is not one-pass +/// and construction will fail. +/// +/// Note though that currently, this DFA uses a fully dense representation. 
+/// This means that while its space complexity is no worse than an NFA, it may +/// in practice use more memory because of higher constant factors. The reason +/// for this trade off is two-fold. Firstly, a dense representation makes the +/// search faster. Secondly, the bigger an NFA, the more unlikely it is to be +/// one-pass. Therefore, most one-pass DFAs are usually pretty small. +/// +/// # Example +/// +/// This example shows that the one-pass DFA implements Unicode word boundaries +/// correctly while simultaneously reporting spans for capturing groups that +/// participate in a match. (This is the only DFA that implements full support +/// for Unicode word boundaries.) +/// +/// ``` +/// # if cfg!(miri) { return Ok(()); } // miri takes too long +/// use regex_automata::{dfa::onepass::DFA, Match, Span}; +/// +/// let re = DFA::new(r"\b(?P<first>\w+)[[:space:]]+(?P<last>\w+)\b")?; +/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); +/// +/// re.captures(&mut cache, "Шерлок Холмс", &mut caps); +/// assert_eq!(Some(Match::must(0, 0..23)), caps.get_match()); +/// assert_eq!(Some(Span::from(0..12)), caps.get_group_by_name("first")); +/// assert_eq!(Some(Span::from(13..23)), caps.get_group_by_name("last")); +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// # Example: iteration +/// +/// Unlike other regex engines in this crate, this one does not provide +/// iterator search functions. This is because a one-pass DFA only supports +/// anchored searches, and so iterator functions are generally not applicable. +/// +/// However, if you know that all of your matches are +/// directly adjacent, then an iterator can be used. 
The +/// [`util::iter::Searcher`](crate::util::iter::Searcher) type can be used for +/// this purpose: +/// +/// ``` +/// # if cfg!(miri) { return Ok(()); } // miri takes too long +/// use regex_automata::{ +/// dfa::onepass::DFA, +/// util::iter::Searcher, +/// Anchored, Input, Span, +/// }; +/// +/// let re = DFA::new(r"\w(\d)\w")?; +/// let (mut cache, caps) = (re.create_cache(), re.create_captures()); +/// let input = Input::new("a1zb2yc3x").anchored(Anchored::Yes); +/// +/// let mut it = Searcher::new(input).into_captures_iter(caps, |input, caps| { +/// Ok(re.try_search(&mut cache, input, caps)?) +/// }).infallible(); +/// let caps0 = it.next().unwrap(); +/// assert_eq!(Some(Span::from(1..2)), caps0.get_group(1)); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone)] +pub struct DFA { + /// The configuration provided by the caller. + config: Config, + /// The NFA used to build this DFA. + /// + /// NOTE: We probably don't need to store the NFA here, but we use enough + /// bits from it that it's convenient to do so. And there really isn't much + /// cost to doing so either, since an NFA is reference counted internally. + nfa: NFA, + /// The transition table. Given a state ID 's' and a byte of haystack 'b', + /// the next state is `table[(s << stride2) + classes[b]]`. (State IDs are + /// not premultiplied, so the row offset is computed at lookup time.) + /// + /// The stride of this table (i.e., the number of columns) is always + /// a power of 2, even if the alphabet length is smaller. This makes + /// converting between state IDs and state indices very cheap. + /// + /// Note that the stride always includes room for one extra "transition" + /// that isn't actually a transition. It is a 'PatternEpsilons' that is + /// used for match states only. Because of this, the maximum number of + /// active columns in the transition table is 257, which means the maximum + /// stride is 512 (the next power of 2 greater than or equal to 257). + table: Vec<Transition>, + /// The DFA state IDs of the starting states. 
+ /// + /// `starts[0]` is always present and corresponds to the starting state + /// when searching for matches of any pattern in the DFA. + /// + /// `starts[i]` where i>0 corresponds to the starting state for the pattern + /// ID 'i-1'. These starting states are optional. + starts: Vec<StateID>, + /// Every state ID >= this value corresponds to a match state. + /// + /// This is what a search uses to detect whether a state is a match state + /// or not. It requires only a simple comparison instead of bit-unpacking + /// the PatternEpsilons from every state. + min_match_id: StateID, + /// The alphabet of this DFA, split into equivalence classes. Bytes in the + /// same equivalence class can never discriminate between a match and a + /// non-match. + classes: ByteClasses, + /// The number of elements in each state in the transition table. This may + /// be less than the stride, since the stride is always a power of 2 and + /// the alphabet length can be anything up to and including 256. + alphabet_len: usize, + /// The number of columns in the transition table, expressed as a power of + /// 2. + stride2: usize, + /// The offset at which the PatternEpsilons for a match state is stored in + /// the transition table. + /// + /// PERF: One wonders whether it would be better to put this in a separate + /// allocation, since only match states have a non-empty PatternEpsilons + /// and the number of match states tends to be dwarfed by the number of + /// non-match states. So this would save '8*len(non_match_states)' for each + /// DFA. The question is whether moving this to a different allocation will + /// lead to a perf hit during searches. You might think dealing with match + /// states is rare, but some regexes spend a lot of time in match states + /// gobbling up input. But... match state handling is already somewhat + /// expensive, so maybe this wouldn't do much? Either way, it's worth + /// experimenting. + pateps_offset: usize, + /// The first explicit slot index. 
This refers to the first slot appearing + /// immediately after the last implicit slot. It is always 'patterns.len() + /// * 2'. + /// + /// We record this because we only store the explicit slots in our DFA + /// transition table that need to be saved. Implicit slots are handled + /// automatically as part of the search. + explicit_slot_start: usize, +} + +impl DFA { + /// Parse the given regular expression using the default configuration and + /// return the corresponding one-pass DFA. + /// + /// If you want a non-default configuration, then use the [`Builder`] to + /// set your own configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{dfa::onepass::DFA, Match}; + /// + /// let re = DFA::new("foo[0-9]+bar")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "foo12345barzzz", &mut caps); + /// assert_eq!(Some(Match::must(0, 0..11)), caps.get_match()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "syntax")] + #[inline] + pub fn new(pattern: &str) -> Result<DFA, BuildError> { + DFA::builder().build(pattern) + } + + /// Like `new`, but parses multiple patterns into a single "multi regex." + /// This similarly uses the default regex configuration. 
+ /// + /// # Example + /// + /// ``` + /// use regex_automata::{dfa::onepass::DFA, Match}; + /// + /// let re = DFA::new_many(&["[a-z]+", "[0-9]+"])?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "abc123", &mut caps); + /// assert_eq!(Some(Match::must(0, 0..3)), caps.get_match()); + /// + /// re.captures(&mut cache, "123abc", &mut caps); + /// assert_eq!(Some(Match::must(1, 0..3)), caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "syntax")] + #[inline] + pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<DFA, BuildError> { + DFA::builder().build_many(patterns) + } + + /// Like `new`, but builds a one-pass DFA directly from an NFA. This is + /// useful if you already have an NFA, or even if you hand-assembled the + /// NFA. + /// + /// # Example + /// + /// This shows how to hand assemble a regular expression via its HIR, + /// compile an NFA from it and build a one-pass DFA from the NFA. 
+ /// + /// ``` + /// use regex_automata::{ + /// dfa::onepass::DFA, + /// nfa::thompson::NFA, + /// Match, + /// }; + /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange}; + /// + /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![ + /// ClassBytesRange::new(b'0', b'9'), + /// ClassBytesRange::new(b'A', b'Z'), + /// ClassBytesRange::new(b'_', b'_'), + /// ClassBytesRange::new(b'a', b'z'), + /// ]))); + /// + /// let config = NFA::config().nfa_size_limit(Some(1_000)); + /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?; + /// + /// let re = DFA::new_from_nfa(nfa)?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let expected = Some(Match::must(0, 0..1)); + /// re.captures(&mut cache, "A", &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn new_from_nfa(nfa: NFA) -> Result<DFA, BuildError> { + DFA::builder().build_from_nfa(nfa) + } + + /// Create a new one-pass DFA that matches every input. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{dfa::onepass::DFA, Match}; + /// + /// let dfa = DFA::always_match()?; + /// let mut cache = dfa.create_cache(); + /// let mut caps = dfa.create_captures(); + /// + /// let expected = Match::must(0, 0..0); + /// dfa.captures(&mut cache, "", &mut caps); + /// assert_eq!(Some(expected), caps.get_match()); + /// dfa.captures(&mut cache, "foo", &mut caps); + /// assert_eq!(Some(expected), caps.get_match()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn always_match() -> Result<DFA, BuildError> { + let nfa = thompson::NFA::always_match(); + Builder::new().build_from_nfa(nfa) + } + + /// Create a new one-pass DFA that never matches any input. 
+ /// + /// # Example + /// + /// ``` + /// use regex_automata::dfa::onepass::DFA; + /// + /// let dfa = DFA::never_match()?; + /// let mut cache = dfa.create_cache(); + /// let mut caps = dfa.create_captures(); + /// + /// dfa.captures(&mut cache, "", &mut caps); + /// assert_eq!(None, caps.get_match()); + /// dfa.captures(&mut cache, "foo", &mut caps); + /// assert_eq!(None, caps.get_match()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn never_match() -> Result<DFA, BuildError> { + let nfa = thompson::NFA::never_match(); + Builder::new().build_from_nfa(nfa) + } + + /// Return a default configuration for a DFA. + /// + /// This is a convenience routine to avoid needing to import the `Config` + /// type when customizing the construction of a DFA. + /// + /// # Example + /// + /// This example shows how to change the match semantics of this DFA from + /// its default "leftmost first" to "all." When using "all," non-greediness + /// doesn't apply and neither does preference order matching. Instead, the + /// longest match possible is always returned. (Although, by construction, + /// it's impossible for a one-pass DFA to have a different answer for + /// "preference order" vs "longest match.") + /// + /// ``` + /// use regex_automata::{dfa::onepass::DFA, Match, MatchKind}; + /// + /// let re = DFA::builder() + /// .configure(DFA::config().match_kind(MatchKind::All)) + /// .build(r"(abc)+?")?; + /// let mut cache = re.create_cache(); + /// let mut caps = re.create_captures(); + /// + /// re.captures(&mut cache, "abcabc", &mut caps); + /// // Normally, the non-greedy repetition would give us a 0..3 match. + /// assert_eq!(Some(Match::must(0, 0..6)), caps.get_match()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn config() -> Config { + Config::new() + } + + /// Return a builder for configuring the construction of a DFA. 
+ /// + /// This is a convenience routine to avoid needing to import the + /// [`Builder`] type in common cases. + /// + /// # Example + /// + /// This example shows how to use the builder to disable UTF-8 mode. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{ + /// dfa::onepass::DFA, + /// nfa::thompson, + /// util::syntax, + /// Match, + /// }; + /// + /// let re = DFA::builder() + /// .syntax(syntax::Config::new().utf8(false)) + /// .thompson(thompson::Config::new().utf8(false)) + /// .build(r"foo(?-u:[^b])ar.*")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// let haystack = b"foo\xFFarzz\xE2\x98\xFF\n"; + /// let expected = Some(Match::must(0, 0..8)); + /// re.captures(&mut cache, haystack, &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn builder() -> Builder { + Builder::new() + } + + /// Create a new empty set of capturing groups that is guaranteed to be + /// valid for the search APIs on this DFA. + /// + /// A `Captures` value created for a specific DFA cannot be used with any + /// other DFA. + /// + /// This is a convenience function for [`Captures::all`]. See the + /// [`Captures`] documentation for an explanation of its alternative + /// constructors that permit the DFA to do less work during a search, and + /// thus might make it faster. + #[inline] + pub fn create_captures(&self) -> Captures { + Captures::all(self.nfa.group_info().clone()) + } + + /// Create a new cache for this DFA. + /// + /// The cache returned should only be used for searches for this + /// DFA. If you want to reuse the cache for another DFA, then you + /// must call [`Cache::reset`] with that DFA (or, equivalently, + /// [`DFA::reset_cache`]). 
+ #[inline] + pub fn create_cache(&self) -> Cache { + Cache::new(self) + } + + /// Reset the given cache such that it can be used for searching with + /// this DFA (and only this DFA). + /// + /// A cache reset permits reusing memory already allocated in this cache + /// with a different DFA. + /// + /// # Example + /// + /// This shows how to re-purpose a cache for use with a different DFA. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{dfa::onepass::DFA, Match}; + /// + /// let re1 = DFA::new(r"\w")?; + /// let re2 = DFA::new(r"\W")?; + /// let mut caps1 = re1.create_captures(); + /// let mut caps2 = re2.create_captures(); + /// + /// let mut cache = re1.create_cache(); + /// assert_eq!( + /// Some(Match::must(0, 0..2)), + /// { re1.captures(&mut cache, "Δ", &mut caps1); caps1.get_match() }, + /// ); + /// + /// // Using 'cache' with re2 is not allowed. It may result in panics or + /// // incorrect results. In order to re-purpose the cache, we must reset + /// // it with the one-pass DFA we'd like to use it with. + /// // + /// // Similarly, after this reset, using the cache with 're1' is also not + /// // allowed. + /// re2.reset_cache(&mut cache); + /// assert_eq!( + /// Some(Match::must(0, 0..3)), + /// { re2.captures(&mut cache, "☃", &mut caps2); caps2.get_match() }, + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn reset_cache(&self, cache: &mut Cache) { + cache.reset(self); + } + + /// Return the config for this one-pass DFA. + #[inline] + pub fn get_config(&self) -> &Config { + &self.config + } + + /// Returns a reference to the underlying NFA. + #[inline] + pub fn get_nfa(&self) -> &NFA { + &self.nfa + } + + /// Returns the total number of patterns compiled into this DFA. + /// + /// In the case of a DFA that contains no patterns, this returns `0`. 
+ #[inline] + pub fn pattern_len(&self) -> usize { + self.get_nfa().pattern_len() + } + + /// Returns the total number of states in this one-pass DFA. + /// + /// Note that unlike dense or sparse DFAs, a one-pass DFA does not expose + /// a low level DFA API. Therefore, this routine has little use other than + /// being informational. + #[inline] + pub fn state_len(&self) -> usize { + self.table.len() >> self.stride2() + } + + /// Returns the total number of elements in the alphabet for this DFA. + /// + /// That is, this returns the total number of transitions that each + /// state in this DFA must have. The maximum alphabet size is 256, which + /// corresponds to each possible byte value. + /// + /// The alphabet size may be less than 256 though, and unless + /// [`Config::byte_classes`] is disabled, it is typically much less than + /// 256. Namely, bytes are grouped into equivalence classes such that no + /// two bytes in the same class can distinguish a match from a non-match. + /// For example, in the regex `^[a-z]+$`, the ASCII bytes `a-z` could + /// all be in the same equivalence class. This leads to a massive space + /// savings. + /// + /// Note though that the alphabet length does _not_ necessarily equal the + /// total stride space taken up by a single DFA state in the transition + /// table. Namely, for performance reasons, the stride is always the + /// smallest power of two that is greater than or equal to the alphabet + /// length. For this reason, [`DFA::stride`] or [`DFA::stride2`] are + /// often more useful. The alphabet length is typically useful only for + /// informational purposes. + /// + /// Note also that unlike dense or sparse DFAs, a one-pass DFA does + /// not have a special end-of-input (EOI) transition. This is because + /// a one-pass DFA handles look-around assertions explicitly (like the + /// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM)) and does not build + /// them into the transitions of the DFA. 
+ #[inline] + pub fn alphabet_len(&self) -> usize { + self.alphabet_len + } + + /// Returns the total stride for every state in this DFA, expressed as the + /// exponent of a power of 2. The stride is the amount of space each state + /// takes up in the transition table, expressed as a number of transitions. + /// (Unused transitions map to dead states.) + /// + /// The stride of a DFA is always equivalent to the smallest power of + /// 2 that is greater than or equal to the DFA's alphabet length. This + /// definition uses extra space, but possibly permits faster translation + /// between state identifiers and their corresponding offsets in this DFA's + /// transition table. + /// + /// For example, if the DFA's stride is 16 transitions, then its `stride2` + /// is `4` since `2^4 = 16`. + /// + /// The minimum `stride2` value is `1` (corresponding to a stride of `2`) + /// while the maximum `stride2` value is `9` (corresponding to a stride + /// of `512`). The maximum in theory should be `8`, but because of some + /// implementation quirks that may be relaxed in the future, it is one more + /// than `8`. (Do note that a maximal stride is incredibly rare, as it + /// would imply that there is almost no redundancy in the regex pattern.) + /// + /// Note that unlike dense or sparse DFAs, a one-pass DFA does not expose + /// a low level DFA API. Therefore, this routine has little use other than + /// being informational. + #[inline] + pub fn stride2(&self) -> usize { + self.stride2 + } + + /// Returns the total stride for every state in this DFA. This corresponds + /// to the total number of transitions used by each state in this DFA's + /// transition table. + /// + /// Please see [`DFA::stride2`] for more information. In particular, this + /// returns the stride as the number of transitions, whereas `stride2` + /// returns it as the exponent of a power of 2. + /// + /// Note that unlike dense or sparse DFAs, a one-pass DFA does not expose + /// a low level DFA API. 
Therefore, this routine has little use other than + /// being informational. + #[inline] + pub fn stride(&self) -> usize { + 1 << self.stride2() + } + + /// Returns the memory usage, in bytes, of this DFA. + /// + /// The memory usage is computed based on the number of bytes used to + /// represent this DFA. + /// + /// This does **not** include the stack size used up by this DFA. To + /// compute that, use `std::mem::size_of::<onepass::DFA>()`. + #[inline] + pub fn memory_usage(&self) -> usize { + use core::mem::size_of; + + self.table.len() * size_of::<Transition>() + + self.starts.len() * size_of::<StateID>() + } +} + +impl DFA { + /// Executes an anchored leftmost forward search, and returns true if and + /// only if this one-pass DFA matches the given haystack. + /// + /// This routine may short circuit if it knows that scanning future + /// input will never lead to a different result. In particular, if the + /// underlying DFA enters a match state, then this routine will return + /// `true` immediately without inspecting any future input. (Consider how + /// this might make a difference given the regex `a+` on the haystack + /// `aaaaaaaaaaaaaaa`. This routine can stop after it sees the first `a`, + /// but routines like `find` need to continue searching because `+` is + /// greedy by default.) + /// + /// The given `Input` is forcefully set to use [`Anchored::Yes`] if the + /// given configuration was [`Anchored::No`] (which is the default). + /// + /// # Panics + /// + /// This routine panics if the search could not complete. This can occur + /// in the following circumstances: + /// + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. Concretely, + /// this occurs when using [`Anchored::Pattern`] without enabling + /// [`Config::starts_for_each_pattern`]. + /// + /// When a search panics, callers cannot know whether a match exists or + /// not. 
+ /// + /// Use [`DFA::try_search`] if you want to handle these panics as error + /// values instead. + /// + /// # Example + /// + /// This shows basic usage: + /// + /// ``` + /// use regex_automata::dfa::onepass::DFA; + /// + /// let re = DFA::new("foo[0-9]+bar")?; + /// let mut cache = re.create_cache(); + /// + /// assert!(re.is_match(&mut cache, "foo12345bar")); + /// assert!(!re.is_match(&mut cache, "foobar")); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: consistency with search APIs + /// + /// `is_match` is guaranteed to return `true` whenever `captures` returns + /// a match. This includes searches that are executed entirely within a + /// codepoint: + /// + /// ``` + /// use regex_automata::{dfa::onepass::DFA, Input}; + /// + /// let re = DFA::new("a*")?; + /// let mut cache = re.create_cache(); + /// + /// assert!(!re.is_match(&mut cache, Input::new("☃").span(1..2))); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Notice that when UTF-8 mode is disabled, then the above reports a + /// match because the restriction against zero-width matches that split a + /// codepoint has been lifted: + /// + /// ``` + /// use regex_automata::{dfa::onepass::DFA, nfa::thompson::NFA, Input}; + /// + /// let re = DFA::builder() + /// .thompson(NFA::config().utf8(false)) + /// .build("a*")?; + /// let mut cache = re.create_cache(); + /// + /// assert!(re.is_match(&mut cache, Input::new("☃").span(1..2))); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn is_match<'h, I: Into<Input<'h>>>( + &self, + cache: &mut Cache, + input: I, + ) -> bool { + let mut input = input.into().earliest(true); + if matches!(input.get_anchored(), Anchored::No) { + input.set_anchored(Anchored::Yes); + } + self.try_search_slots(cache, &input, &mut []).unwrap().is_some() + } + + /// Executes an anchored leftmost forward search, and returns a `Match` if + /// and only if this one-pass DFA matches the given 
haystack. + /// + /// This routine only includes the overall match span. To get access to the + /// individual spans of each capturing group, use [`DFA::captures`]. + /// + /// The given `Input` is forcefully set to use [`Anchored::Yes`] if the + /// given configuration was [`Anchored::No`] (which is the default). + /// + /// # Panics + /// + /// This routine panics if the search could not complete. This can occur + /// in the following circumstances: + /// + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. Concretely, + /// this occurs when using [`Anchored::Pattern`] without enabling + /// [`Config::starts_for_each_pattern`]. + /// + /// When a search panics, callers cannot know whether a match exists or + /// not. + /// + /// Use [`DFA::try_search`] if you want to handle these panics as error + /// values instead. + /// + /// # Example + /// + /// Leftmost first match semantics corresponds to the match with the + /// smallest starting offset, but where the end offset is determined by + /// preferring earlier branches in the original regular expression. For + /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` + /// will match `Samwise` in `Samwise`. + /// + /// Generally speaking, the "leftmost first" match is how most backtracking + /// regular expressions tend to work. This is in contrast to POSIX-style + /// regular expressions that yield "leftmost longest" matches. Namely, + /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using + /// leftmost longest semantics. (This crate does not currently support + /// leftmost longest semantics.) 
+ /// + /// ``` + /// use regex_automata::{dfa::onepass::DFA, Match}; + /// + /// let re = DFA::new("foo[0-9]+")?; + /// let mut cache = re.create_cache(); + /// let expected = Match::must(0, 0..8); + /// assert_eq!(Some(expected), re.find(&mut cache, "foo12345")); + /// + /// // Even though a match is found after reading the first byte (`a`), + /// // the leftmost first match semantics demand that we find the earliest + /// // match that prefers earlier parts of the pattern over later parts. + /// let re = DFA::new("abc|a")?; + /// let mut cache = re.create_cache(); + /// let expected = Match::must(0, 0..3); + /// assert_eq!(Some(expected), re.find(&mut cache, "abc")); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn find<'h, I: Into<Input<'h>>>( + &self, + cache: &mut Cache, + input: I, + ) -> Option<Match> { + let mut input = input.into(); + if matches!(input.get_anchored(), Anchored::No) { + input.set_anchored(Anchored::Yes); + } + if self.get_nfa().pattern_len() == 1 { + let mut slots = [None, None]; + let pid = + self.try_search_slots(cache, &input, &mut slots).unwrap()?; + let start = slots[0].unwrap().get(); + let end = slots[1].unwrap().get(); + return Some(Match::new(pid, Span { start, end })); + } + let ginfo = self.get_nfa().group_info(); + let slots_len = ginfo.implicit_slot_len(); + let mut slots = vec![None; slots_len]; + let pid = self.try_search_slots(cache, &input, &mut slots).unwrap()?; + let start = slots[pid.as_usize() * 2].unwrap().get(); + let end = slots[pid.as_usize() * 2 + 1].unwrap().get(); + Some(Match::new(pid, Span { start, end })) + } + + /// Executes an anchored leftmost forward search and writes the spans + /// of capturing groups that participated in a match into the provided + /// [`Captures`] value. If no match was found, then [`Captures::is_match`] + /// is guaranteed to return `false`. 
+ /// + /// The given `Input` is forcefully set to use [`Anchored::Yes`] if the + /// given configuration was [`Anchored::No`] (which is the default). + /// + /// # Panics + /// + /// This routine panics if the search could not complete. This can occur + /// in the following circumstances: + /// + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. Concretely, + /// this occurs when using [`Anchored::Pattern`] without enabling + /// [`Config::starts_for_each_pattern`]. + /// + /// When a search panics, callers cannot know whether a match exists or + /// not. + /// + /// Use [`DFA::try_search`] if you want to handle these panics as error + /// values instead. + /// + /// # Example + /// + /// This shows a simple example of a one-pass regex that extracts + /// capturing group spans. + /// + /// ``` + /// use regex_automata::{dfa::onepass::DFA, Match, Span}; + /// + /// let re = DFA::new( + /// // Notice that we use ASCII here. The corresponding Unicode regex + /// // is sadly not one-pass. 
+ /// "(?P<first>[[:alpha:]]+)[[:space:]]+(?P<last>[[:alpha:]]+)", + /// )?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "Bruce Springsteen", &mut caps); + /// assert_eq!(Some(Match::must(0, 0..17)), caps.get_match()); + /// assert_eq!(Some(Span::from(0..5)), caps.get_group(1)); + /// assert_eq!(Some(Span::from(6..17)), caps.get_group_by_name("last")); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn captures<'h, I: Into<Input<'h>>>( + &self, + cache: &mut Cache, + input: I, + caps: &mut Captures, + ) { + let mut input = input.into(); + if matches!(input.get_anchored(), Anchored::No) { + input.set_anchored(Anchored::Yes); + } + self.try_search(cache, &input, caps).unwrap(); + } + + /// Executes an anchored leftmost forward search and writes the spans + /// of capturing groups that participated in a match into the provided + /// [`Captures`] value. If no match was found, then [`Captures::is_match`] + /// is guaranteed to return `false`. + /// + /// The differences with [`DFA::captures`] are: + /// + /// 1. This returns an error instead of panicking if the search fails. + /// 2. Accepts an `&Input` instead of a `Into<Input>`. This permits reusing + /// the same input for multiple searches, which _may_ be important for + /// latency. + /// 3. This does not automatically change the [`Anchored`] mode from `No` + /// to `Yes`. Instead, if [`Input::anchored`] is `Anchored::No`, then an + /// error is returned. + /// + /// # Errors + /// + /// This routine errors if the search could not complete. This can occur + /// in the following circumstances: + /// + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. Concretely, + /// this occurs when using [`Anchored::Pattern`] without enabling + /// [`Config::starts_for_each_pattern`]. 
+ /// + /// When a search returns an error, callers cannot know whether a match + /// exists or not. + /// + /// # Example: specific pattern search + /// + /// This example shows how to build a multi-regex that permits searching + /// for specific patterns. Note that this is somewhat less useful than + /// in other regex engines, since a one-pass DFA by definition has no + /// ambiguity about which pattern can match at a position. That is, if it + /// were possible for two different patterns to match at the same starting + /// position, then the multi-regex would not be one-pass and construction + /// would have failed. + /// + /// Nevertheless, this can still be useful if you only care about matches + /// for a specific pattern, and want the DFA to report "no match" even if + /// some other pattern would have matched. + /// + /// Note that in order to make use of this functionality, + /// [`Config::starts_for_each_pattern`] must be enabled. It is disabled + /// by default since it may result in higher memory usage. + /// + /// ``` + /// use regex_automata::{ + /// dfa::onepass::DFA, Anchored, Input, Match, PatternID, + /// }; + /// + /// let re = DFA::builder() + /// .configure(DFA::config().starts_for_each_pattern(true)) + /// .build_many(&["[a-z]+", "[0-9]+"])?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let haystack = "123abc"; + /// let input = Input::new(haystack).anchored(Anchored::Yes); + /// + /// // A normal multi-pattern search will show pattern 1 matches. + /// re.try_search(&mut cache, &input, &mut caps)?; + /// assert_eq!(Some(Match::must(1, 0..3)), caps.get_match()); + /// + /// // If we only want to report pattern 0 matches, then we'll get no + /// // match here. 
+ /// let input = input.anchored(Anchored::Pattern(PatternID::must(0))); + /// re.try_search(&mut cache, &input, &mut caps)?; + /// assert_eq!(None, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: specifying the bounds of a search + /// + /// This example shows how providing the bounds of a search can produce + /// different results than simply sub-slicing the haystack. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{dfa::onepass::DFA, Anchored, Input, Match}; + /// + /// // one-pass DFAs fully support Unicode word boundaries! + /// // A sad joke is that a Unicode aware regex like \w+\s is not one-pass. + /// // :-( + /// let re = DFA::new(r"\b[0-9]{3}\b")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let haystack = "foo123bar"; + /// + /// // Since we sub-slice the haystack, the search doesn't know about + /// // the larger context and assumes that `123` is surrounded by word + /// // boundaries. And of course, the match position is reported relative + /// // to the sub-slice as well, which means we get `0..3` instead of + /// // `3..6`. + /// let expected = Some(Match::must(0, 0..3)); + /// let input = Input::new(&haystack[3..6]).anchored(Anchored::Yes); + /// re.try_search(&mut cache, &input, &mut caps)?; + /// assert_eq!(expected, caps.get_match()); + /// + /// // But if we provide the bounds of the search within the context of the + /// // entire haystack, then the search can take the surrounding context + /// // into account. (And if we did find a match, it would be reported + /// // as a valid offset into `haystack` instead of its sub-slice.) 
+ /// let expected = None; + /// let input = Input::new(haystack).range(3..6).anchored(Anchored::Yes); + /// re.try_search(&mut cache, &input, &mut caps)?; + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn try_search( + &self, + cache: &mut Cache, + input: &Input<'_>, + caps: &mut Captures, + ) -> Result<(), MatchError> { + let pid = self.try_search_slots(cache, input, caps.slots_mut())?; + caps.set_pattern(pid); + Ok(()) + } + + /// Executes an anchored leftmost forward search and writes the spans + /// of capturing groups that participated in a match into the provided + /// `slots`, and returns the matching pattern ID. The contents of the + /// slots for patterns other than the matching pattern are unspecified. If + /// no match was found, then `None` is returned and the contents of all + /// `slots` is unspecified. + /// + /// This is like [`DFA::try_search`], but it accepts a raw slots slice + /// instead of a `Captures` value. This is useful in contexts where you + /// don't want or need to allocate a `Captures`. + /// + /// It is legal to pass _any_ number of slots to this routine. If the regex + /// engine would otherwise write a slot offset that doesn't fit in the + /// provided slice, then it is simply skipped. In general though, there are + /// usually three slice lengths you might want to use: + /// + /// * An empty slice, if you only care about which pattern matched. + /// * A slice with + /// [`pattern_len() * 2`](crate::dfa::onepass::DFA::pattern_len) + /// slots, if you only care about the overall match spans for each matching + /// pattern. + /// * A slice with + /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which + /// permits recording match offsets for every capturing group in every + /// pattern. + /// + /// # Errors + /// + /// This routine errors if the search could not complete. 
This can occur + /// in the following circumstances: + /// + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. Concretely, + /// this occurs when using [`Anchored::Pattern`] without enabling + /// [`Config::starts_for_each_pattern`]. + /// + /// When a search returns an error, callers cannot know whether a match + /// exists or not. + /// + /// # Example + /// + /// This example shows how to find the overall match offsets in a + /// multi-pattern search without allocating a `Captures` value. Indeed, we + /// can put our slots right on the stack. + /// + /// ``` + /// use regex_automata::{dfa::onepass::DFA, Anchored, Input, PatternID}; + /// + /// let re = DFA::new_many(&[ + /// r"[a-zA-Z]+", + /// r"[0-9]+", + /// ])?; + /// let mut cache = re.create_cache(); + /// let input = Input::new("123").anchored(Anchored::Yes); + /// + /// // We only care about the overall match offsets here, so we just + /// // allocate two slots for each pattern. Each slot records the start + /// // and end of the match. + /// let mut slots = [None; 4]; + /// let pid = re.try_search_slots(&mut cache, &input, &mut slots)?; + /// assert_eq!(Some(PatternID::must(1)), pid); + /// + /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'. + /// // See 'GroupInfo' for more details on the mapping between groups and + /// // slot indices. 
+ /// let slot_start = pid.unwrap().as_usize() * 2; + /// let slot_end = slot_start + 1; + /// assert_eq!(Some(0), slots[slot_start].map(|s| s.get())); + /// assert_eq!(Some(3), slots[slot_end].map(|s| s.get())); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn try_search_slots( + &self, + cache: &mut Cache, + input: &Input<'_>, + slots: &mut [Option<NonMaxUsize>], + ) -> Result<Option<PatternID>, MatchError> { + let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); + if !utf8empty { + return self.try_search_slots_imp(cache, input, slots); + } + // See PikeVM::try_search_slots for why we do this. + let min = self.get_nfa().group_info().implicit_slot_len(); + if slots.len() >= min { + return self.try_search_slots_imp(cache, input, slots); + } + if self.get_nfa().pattern_len() == 1 { + let mut enough = [None, None]; + let got = self.try_search_slots_imp(cache, input, &mut enough)?; + // This is OK because we know `enough` is strictly bigger + // than `slots`, otherwise this special case isn't reached. + slots.copy_from_slice(&enough[..slots.len()]); + return Ok(got); + } + let mut enough = vec![None; min]; + let got = self.try_search_slots_imp(cache, input, &mut enough)?; + // This is OK because we know `enough` is strictly bigger than + // `slots`, otherwise this special case isn't reached. + slots.copy_from_slice(&enough[..slots.len()]); + Ok(got) + } + + #[inline(never)] + fn try_search_slots_imp( + &self, + cache: &mut Cache, + input: &Input<'_>, + slots: &mut [Option<NonMaxUsize>], + ) -> Result<Option<PatternID>, MatchError> { + let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); + match self.search_imp(cache, input, slots)? { + None => return Ok(None), + Some(pid) if !utf8empty => return Ok(Some(pid)), + Some(pid) => { + // These slot indices are always correct because we know our + // 'pid' is valid and thus we know that the slot indices for it + // are valid.
+ let slot_start = pid.as_usize().wrapping_mul(2); + let slot_end = slot_start.wrapping_add(1); + // OK because we know we have a match and we know our caller + // provided slots are big enough (which we make true above if + // the caller didn't). Namely, we're only here when 'utf8empty' + // is true, and when that's true, we require slots for every + // pattern. + let start = slots[slot_start].unwrap().get(); + let end = slots[slot_end].unwrap().get(); + // If our match splits a codepoint, then we cannot report it + // as a match. And since one-pass DFAs only support anchored + // searches, we don't try to skip ahead to find the next match. + // We can just quit with nothing. + if start == end && !input.is_char_boundary(start) { + return Ok(None); + } + Ok(Some(pid)) + } + } + } +} + +impl DFA { + fn search_imp( + &self, + cache: &mut Cache, + input: &Input<'_>, + slots: &mut [Option<NonMaxUsize>], + ) -> Result<Option<PatternID>, MatchError> { + // PERF: Some ideas. I ran out of steam after my initial impl to try + // many of these. + // + // 1) Try doing more state shuffling. Right now, all we do is push + // match states to the end of the transition table so that we can do + // 'if sid >= self.min_match_id' to know whether we're in a match + // state or not. But what about doing something like dense DFAs and + // pushing dead, match and states with captures/looks all toward the + // beginning of the transition table. Then we could do 'if sid <= + // self.max_special_id', in which case, we need to do some special + // handling of some sort. Otherwise, we get the happy path, just + // like in a DFA search. The main argument against this is that the + // one-pass DFA is likely to be used most often with capturing groups + // and if capturing groups are common, then this might wind up being a + // pessimization. + // + // 2) Consider moving 'PatternEpsilons' out of the transition table.
+ // It is only needed for match states and usually a small minority of + // states are match states. Therefore, we're using an extra 'u64' for + // most states. + // + // 3) I played around with the match state handling and it seems like + // there is probably a lot left on the table for improvement. The + // key tension is that the 'find_match' routine is a giant mess, but + // splitting it out into a non-inlineable function is a non-starter + // because the match state might consume input, so 'find_match' COULD + // be called quite a lot, and a function call at that point would trash + // perf. In theory, we could detect whether a match state consumes + // input and then specialize our search routine based on that. In that + // case, maybe an extra function call is OK, but even then, it might be + // too much of a latency hit. Another idea is to just try and figure + // out how to reduce the code size of 'find_match'. RE2 has a trick + // here where the match handling isn't done if we know the next byte of + // input yields a match too. Maybe we adopt that? + // + // This just might be a tricky DFA to optimize. + + if input.is_done() { + return Ok(None); + } + // We unfortunately have a bit of book-keeping to do to set things + // up. We do have to setup our cache and clear all of our slots. In + // particular, clearing the slots is necessary for the case where we + // report a match, but one of the capturing groups didn't participate + // in the match but had a span set from a previous search. That would + // be bad. In theory, we could avoid all this slot clearing if we knew + // that every slot was always activated for every match. Then we would + // know they would always be overwritten when a match is found. 
+ let explicit_slots_len = core::cmp::min( + Slots::LIMIT, + slots.len().saturating_sub(self.explicit_slot_start), + ); + cache.setup_search(explicit_slots_len); + for slot in cache.explicit_slots() { + *slot = None; + } + for slot in slots.iter_mut() { + *slot = None; + } + // We set the starting slots for every pattern up front. This does + // increase our latency somewhat, but it avoids having to do it every + // time we see a match state (which could be many times in a single + // search if the match state consumes input). + for pid in self.nfa.patterns() { + let i = pid.as_usize() * 2; + if i >= slots.len() { + break; + } + slots[i] = NonMaxUsize::new(input.start()); + } + let mut pid = None; + let mut next_sid = match input.get_anchored() { + Anchored::Yes => self.start(), + Anchored::Pattern(pid) => self.start_pattern(pid)?, + Anchored::No => { + // If the regex is itself always anchored, then we're fine, + // even if the search is configured to be unanchored. + if !self.nfa.is_always_start_anchored() { + return Err(MatchError::unsupported_anchored( + Anchored::No, + )); + } + self.start() + } + }; + let leftmost_first = + matches!(self.config.get_match_kind(), MatchKind::LeftmostFirst); + for at in input.start()..input.end() { + let sid = next_sid; + let trans = self.transition(sid, input.haystack()[at]); + next_sid = trans.state_id(); + let epsilons = trans.epsilons(); + if sid >= self.min_match_id { + if self.find_match(cache, input, at, sid, slots, &mut pid) { + if input.get_earliest() + || (leftmost_first && trans.match_wins()) + { + return Ok(pid); + } + } + } + if sid == DEAD + || (!epsilons.looks().is_empty() + && !self.nfa.look_matcher().matches_set_inline( + epsilons.looks(), + input.haystack(), + at, + )) + { + return Ok(pid); + } + epsilons.slots().apply(at, cache.explicit_slots()); + } + if next_sid >= self.min_match_id { + self.find_match( + cache, + input, + input.end(), + next_sid, + slots, + &mut pid, + ); + } + Ok(pid) + } + + /// Assumes 
'sid' is a match state and looks for whether a match can + /// be reported. If so, appropriate offsets are written to 'slots' and + /// 'matched_pid' is set to the matching pattern ID. + /// + /// Even when 'sid' is a match state, it's possible that a match won't + /// be reported. For example, when the conditional epsilon transitions + /// leading to the match state aren't satisfied at the given position in + /// the haystack. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn find_match( + &self, + cache: &mut Cache, + input: &Input<'_>, + at: usize, + sid: StateID, + slots: &mut [Option<NonMaxUsize>], + matched_pid: &mut Option<PatternID>, + ) -> bool { + debug_assert!(sid >= self.min_match_id); + let pateps = self.pattern_epsilons(sid); + let epsilons = pateps.epsilons(); + if !epsilons.looks().is_empty() + && !self.nfa.look_matcher().matches_set_inline( + epsilons.looks(), + input.haystack(), + at, + ) + { + return false; + } + let pid = pateps.pattern_id_unchecked(); + // This calculation is always correct because we know our 'pid' is + // valid and thus we know that the slot indices for it are valid. + let slot_end = pid.as_usize().wrapping_mul(2).wrapping_add(1); + // Set the implicit 'end' slot for the matching pattern. (The 'start' + // slot was set at the beginning of the search.) + if slot_end < slots.len() { + slots[slot_end] = NonMaxUsize::new(at); + } + // If the caller provided enough room, copy the previously recorded + // explicit slots from our scratch space to the caller provided slots. + // We *also* need to set any explicit slots that are active as part of + // the path to the match state. + if self.explicit_slot_start < slots.len() { + // NOTE: The 'cache.explicit_slots()' slice is setup at the + // beginning of every search such that it is guaranteed to return a + // slice of length equivalent to 'slots[explicit_slot_start..]'. + slots[self.explicit_slot_start..] 
+ .copy_from_slice(cache.explicit_slots()); + epsilons.slots().apply(at, &mut slots[self.explicit_slot_start..]); + } + *matched_pid = Some(pid); + true + } +} + +impl DFA { + /// Returns the anchored start state for matching any pattern in this DFA. + fn start(&self) -> StateID { + self.starts[0] + } + + /// Returns the anchored start state for matching the given pattern. If + /// 'starts_for_each_pattern' was not enabled, then this returns an error. + /// If the given pattern is not in this DFA, then the `DEAD` state is + /// returned (i.e., `Ok(DEAD)`). + fn start_pattern(&self, pid: PatternID) -> Result<StateID, MatchError> { + if !self.config.get_starts_for_each_pattern() { + return Err(MatchError::unsupported_anchored(Anchored::Pattern( + pid, + ))); + } + // 'starts' always has non-zero length. The first entry is always the + // anchored starting state for all patterns, and the following entries + // are optional and correspond to the anchored starting states for + // patterns at pid+1. Thus, starts.len()-1 corresponds to the total + // number of patterns that one can explicitly search for. (And it may + // be zero.) + Ok(self.starts.get(pid.one_more()).copied().unwrap_or(DEAD)) + } + + /// Returns the transition from the given state ID and byte of input. The + /// transition includes the next state ID, the slots that should be saved + /// and any conditional epsilon transitions that must be satisfied in order + /// to take this transition. + fn transition(&self, sid: StateID, byte: u8) -> Transition { + let offset = sid.as_usize() << self.stride2(); + let class = self.classes.get(byte).as_usize(); + self.table[offset + class] + } + + /// Set the transition from the given state ID and byte of input to the + /// transition given.
+ fn set_transition(&mut self, sid: StateID, byte: u8, to: Transition) { + let offset = sid.as_usize() << self.stride2(); + let class = self.classes.get(byte).as_usize(); + self.table[offset + class] = to; + } + + /// Return an iterator of "sparse" transitions for the given state ID. + /// "sparse" in this context means that consecutive transitions that are + /// equivalent are returned as one group, and transitions to the DEAD state + /// are ignored. + /// + /// This winds up being useful for debug printing, since it's much terser + /// to display runs of equivalent transitions than the transition for every + /// possible byte value. Indeed, in practice, it's very common for runs + /// of equivalent transitions to appear. + fn sparse_transitions(&self, sid: StateID) -> SparseTransitionIter<'_> { + let start = sid.as_usize() << self.stride2(); + let end = start + self.alphabet_len(); + SparseTransitionIter { + it: self.table[start..end].iter().enumerate(), + cur: None, + } + } + + /// Return the pattern epsilons for the given state ID. + /// + /// If the given state ID does not correspond to a match state ID, then the + /// pattern epsilons returned is empty. + fn pattern_epsilons(&self, sid: StateID) -> PatternEpsilons { + let offset = sid.as_usize() << self.stride2(); + PatternEpsilons(self.table[offset + self.pateps_offset].0) + } + + /// Set the pattern epsilons for the given state ID. + fn set_pattern_epsilons(&mut self, sid: StateID, pateps: PatternEpsilons) { + let offset = sid.as_usize() << self.stride2(); + self.table[offset + self.pateps_offset] = Transition(pateps.0); + } + + /// Returns the state ID prior to the one given. This returns None if the + /// given ID is the first DFA state. + fn prev_state_id(&self, id: StateID) -> Option<StateID> { + if id == DEAD { + None + } else { + // CORRECTNESS: Since 'id' is not the first state, subtracting 1 + // is always valid. 
+ Some(StateID::new_unchecked(id.as_usize().checked_sub(1).unwrap())) + } + } + + /// Returns the state ID of the last state in this DFA's transition table. + /// "last" in this context means the last state to appear in memory, i.e., + /// the one with the greatest ID. + fn last_state_id(&self) -> StateID { + // CORRECTNESS: A DFA table is always non-empty since it always at + // least contains a DEAD state. Since every state has the same stride, + // we can just compute what the "next" state ID would have been and + // then subtract 1 from it. + StateID::new_unchecked( + (self.table.len() >> self.stride2()).checked_sub(1).unwrap(), + ) + } + + /// Move the transitions from 'id1' to 'id2' and vice versa. + /// + /// WARNING: This does not update the rest of the transition table to have + /// transitions to 'id1' changed to 'id2' and vice versa. This merely moves + /// the states in memory. + pub(super) fn swap_states(&mut self, id1: StateID, id2: StateID) { + let o1 = id1.as_usize() << self.stride2(); + let o2 = id2.as_usize() << self.stride2(); + for b in 0..self.stride() { + self.table.swap(o1 + b, o2 + b); + } + } + + /// Map all state IDs in this DFA (transition table + start states) + /// according to the closure given. 
+ pub(super) fn remap(&mut self, map: impl Fn(StateID) -> StateID) { + for i in 0..self.state_len() { + let offset = i << self.stride2(); + for b in 0..self.alphabet_len() { + let next = self.table[offset + b].state_id(); + self.table[offset + b].set_state_id(map(next)); + } + } + for i in 0..self.starts.len() { + self.starts[i] = map(self.starts[i]); + } + } +} + +impl core::fmt::Debug for DFA { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + fn debug_state_transitions( + f: &mut core::fmt::Formatter, + dfa: &DFA, + sid: StateID, + ) -> core::fmt::Result { + for (i, (start, end, trans)) in + dfa.sparse_transitions(sid).enumerate() + { + let next = trans.state_id(); + if i > 0 { + write!(f, ", ")?; + } + if start == end { + write!( + f, + "{:?} => {:?}", + DebugByte(start), + next.as_usize(), + )?; + } else { + write!( + f, + "{:?}-{:?} => {:?}", + DebugByte(start), + DebugByte(end), + next.as_usize(), + )?; + } + if trans.match_wins() { + write!(f, " (MW)")?; + } + if !trans.epsilons().is_empty() { + write!(f, " ({:?})", trans.epsilons())?; + } + } + Ok(()) + } + + writeln!(f, "onepass::DFA(")?; + for index in 0..self.state_len() { + let sid = StateID::must(index); + let pateps = self.pattern_epsilons(sid); + if sid == DEAD { + write!(f, "D ")?; + } else if pateps.pattern_id().is_some() { + write!(f, "* ")?; + } else { + write!(f, " ")?; + } + write!(f, "{:06?}", sid.as_usize())?; + if !pateps.is_empty() { + write!(f, " ({:?})", pateps)?; + } + write!(f, ": ")?; + debug_state_transitions(f, self, sid)?; + write!(f, "\n")?; + } + writeln!(f, "")?; + for (i, &sid) in self.starts.iter().enumerate() { + if i == 0 { + writeln!(f, "START(ALL): {:?}", sid.as_usize())?; + } else { + writeln!( + f, + "START(pattern: {:?}): {:?}", + i - 1, + sid.as_usize(), + )?; + } + } + writeln!(f, "state length: {:?}", self.state_len())?; + writeln!(f, "pattern length: {:?}", self.pattern_len())?; + writeln!(f, ")")?; + Ok(()) + } +} + +/// An iterator over groups 
of consecutive equivalent transitions in a single +/// state. +#[derive(Debug)] +struct SparseTransitionIter<'a> { + it: core::iter::Enumerate<core::slice::Iter<'a, Transition>>, + cur: Option<(u8, u8, Transition)>, +} + +impl<'a> Iterator for SparseTransitionIter<'a> { + type Item = (u8, u8, Transition); + + fn next(&mut self) -> Option<(u8, u8, Transition)> { + while let Some((b, &trans)) = self.it.next() { + // Fine because we'll never have more than u8::MAX transitions in + // one state. + let b = b.as_u8(); + let (prev_start, prev_end, prev_trans) = match self.cur { + Some(t) => t, + None => { + self.cur = Some((b, b, trans)); + continue; + } + }; + if prev_trans == trans { + self.cur = Some((prev_start, b, prev_trans)); + } else { + self.cur = Some((b, b, trans)); + if prev_trans.state_id() != DEAD { + return Some((prev_start, prev_end, prev_trans)); + } + } + } + if let Some((start, end, trans)) = self.cur.take() { + if trans.state_id() != DEAD { + return Some((start, end, trans)); + } + } + None + } +} + +/// A cache represents mutable state that a one-pass [`DFA`] requires during a +/// search. +/// +/// For a given one-pass DFA, its corresponding cache may be created either via +/// [`DFA::create_cache`], or via [`Cache::new`]. They are equivalent in every +/// way, except the former does not require explicitly importing `Cache`. +/// +/// A particular `Cache` is coupled with the one-pass DFA from which it was +/// created. It may only be used with that one-pass DFA. A cache and its +/// allocations may be re-purposed via [`Cache::reset`], in which case, it can +/// only be used with the new one-pass DFA (and not the old one). +#[derive(Clone, Debug)] +pub struct Cache { + /// Scratch space used to store slots during a search. Basically, we use + /// the caller provided slots to store slots known when a match occurs. + /// But after a match occurs, we might continue a search but ultimately + /// fail to extend the match. 
When continuing the search, we need some + /// place to store candidate capture offsets without overwriting the slot + /// offsets recorded for the most recently seen match. + explicit_slots: Vec<Option<NonMaxUsize>>, + /// The number of slots in the caller-provided 'Captures' value for the + /// current search. This is always at most 'explicit_slots.len()', but + /// might be less than it, if the caller provided fewer slots to fill. + explicit_slot_len: usize, +} + +impl Cache { + /// Create a new [`onepass::DFA`](DFA) cache. + /// + /// A potentially more convenient routine to create a cache is + /// [`DFA::create_cache`], as it does not require also importing the + /// `Cache` type. + /// + /// If you want to reuse the returned `Cache` with some other one-pass DFA, + /// then you must call [`Cache::reset`] with the desired one-pass DFA. + pub fn new(re: &DFA) -> Cache { + let mut cache = Cache { explicit_slots: vec![], explicit_slot_len: 0 }; + cache.reset(re); + cache + } + + /// Reset this cache such that it can be used for searching with a + /// different [`onepass::DFA`](DFA). + /// + /// A cache reset permits reusing memory already allocated in this cache + /// with a different one-pass DFA. + /// + /// # Example + /// + /// This shows how to re-purpose a cache for use with a different one-pass + /// DFA. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{dfa::onepass::DFA, Match}; + /// + /// let re1 = DFA::new(r"\w")?; + /// let re2 = DFA::new(r"\W")?; + /// let mut caps1 = re1.create_captures(); + /// let mut caps2 = re2.create_captures(); + /// + /// let mut cache = re1.create_cache(); + /// assert_eq!( + /// Some(Match::must(0, 0..2)), + /// { re1.captures(&mut cache, "Δ", &mut caps1); caps1.get_match() }, + /// ); + /// + /// // Using 'cache' with re2 is not allowed. It may result in panics or + /// // incorrect results. 
In order to re-purpose the cache, we must reset + /// // it with the one-pass DFA we'd like to use it with. + /// // + /// // Similarly, after this reset, using the cache with 're1' is also not + /// // allowed. + /// re2.reset_cache(&mut cache); + /// assert_eq!( + /// Some(Match::must(0, 0..3)), + /// { re2.captures(&mut cache, "☃", &mut caps2); caps2.get_match() }, + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn reset(&mut self, re: &DFA) { + let explicit_slot_len = re.get_nfa().group_info().explicit_slot_len(); + self.explicit_slots.resize(explicit_slot_len, None); + self.explicit_slot_len = explicit_slot_len; + } + + /// Returns the heap memory usage, in bytes, of this cache. + /// + /// This does **not** include the stack size used up by this cache. To + /// compute that, use `std::mem::size_of::<Cache>()`. + pub fn memory_usage(&self) -> usize { + self.explicit_slots.len() * core::mem::size_of::<Option<NonMaxUsize>>() + } + + fn explicit_slots(&mut self) -> &mut [Option<NonMaxUsize>] { + &mut self.explicit_slots[..self.explicit_slot_len] + } + + fn setup_search(&mut self, explicit_slot_len: usize) { + self.explicit_slot_len = explicit_slot_len; + } +} + +/// Represents a single transition in a one-pass DFA. +/// +/// The high 21 bits correspond to the state ID. The bit that follows is the +/// special "match wins" flag. The remaining low 42 bits correspond to the +/// transition epsilons, which contain the slots that should be saved when +/// this transition is followed and the conditional epsilon transitions that +/// must be satisfied in order to follow this transition. 
+#[derive(Clone, Copy, Eq, PartialEq)] +struct Transition(u64); + +impl Transition { + const STATE_ID_BITS: u64 = 21; + const STATE_ID_SHIFT: u64 = 64 - Transition::STATE_ID_BITS; + const STATE_ID_LIMIT: u64 = 1 << Transition::STATE_ID_BITS; + const MATCH_WINS_SHIFT: u64 = 64 - (Transition::STATE_ID_BITS + 1); + const INFO_MASK: u64 = 0x000003FF_FFFFFFFF; + + /// Return a new transition to the given state ID with the given epsilons. + fn new(match_wins: bool, sid: StateID, epsilons: Epsilons) -> Transition { + let match_wins = + if match_wins { 1 << Transition::MATCH_WINS_SHIFT } else { 0 }; + let sid = sid.as_u64() << Transition::STATE_ID_SHIFT; + Transition(sid | match_wins | epsilons.0) + } + + /// Returns true if and only if this transition points to the DEAD state. + fn is_dead(self) -> bool { + self.state_id() == DEAD + } + + /// Return whether this transition has a "match wins" property. + /// + /// When a transition has this property, it means that if a match has been + /// found and the search uses leftmost-first semantics, then that match + /// should be returned immediately instead of continuing on. + /// + /// The "match wins" name comes from RE2, which uses a pretty much + /// identical mechanism for implementing leftmost-first semantics. + fn match_wins(&self) -> bool { + (self.0 >> Transition::MATCH_WINS_SHIFT & 1) == 1 + } + + /// Return the "next" state ID that this transition points to. + fn state_id(&self) -> StateID { + // OK because a Transition has a valid StateID in its upper bits by + // construction. The cast to usize is also correct, even on 16-bit + // targets because, again, we know the upper bits is a valid StateID, + // which can never overflow usize on any supported target. + StateID::new_unchecked( + (self.0 >> Transition::STATE_ID_SHIFT).as_usize(), + ) + } + + /// Set the "next" state ID in this transition. 
+ fn set_state_id(&mut self, sid: StateID) { + *self = Transition::new(self.match_wins(), sid, self.epsilons()); + } + + /// Return the epsilons embedded in this transition. + fn epsilons(&self) -> Epsilons { + Epsilons(self.0 & Transition::INFO_MASK) + } +} + +impl core::fmt::Debug for Transition { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + if self.is_dead() { + return write!(f, "0"); + } + write!(f, "{}", self.state_id().as_usize())?; + if self.match_wins() { + write!(f, "-MW")?; + } + if !self.epsilons().is_empty() { + write!(f, "-{:?}", self.epsilons())?; + } + Ok(()) + } +} + +/// A representation of a match state's pattern ID along with the epsilons for +/// when a match occurs. +/// +/// A match state in a one-pass DFA, unlike in a more general DFA, has exactly +/// one pattern ID. If it had more, then the original NFA would not have been +/// one-pass. +/// +/// The "epsilons" part of this corresponds to what was found in the epsilon +/// transitions between the transition taken in the last byte of input and the +/// ultimate match state. This might include saving slots and/or conditional +/// epsilon transitions that must be satisfied before one can report the match. +/// +/// Technically, every state has room for a 'PatternEpsilons', but it is only +/// ever non-empty for match states. +#[derive(Clone, Copy)] +struct PatternEpsilons(u64); + +impl PatternEpsilons { + const PATTERN_ID_BITS: u64 = 22; + const PATTERN_ID_SHIFT: u64 = 64 - PatternEpsilons::PATTERN_ID_BITS; + // A sentinel value indicating that this is not a match state. We don't + // use 0 since 0 is a valid pattern ID. + const PATTERN_ID_NONE: u64 = 0x00000000_003FFFFF; + const PATTERN_ID_LIMIT: u64 = PatternEpsilons::PATTERN_ID_NONE; + const PATTERN_ID_MASK: u64 = 0xFFFFFC00_00000000; + const EPSILONS_MASK: u64 = 0x000003FF_FFFFFFFF; + + /// Return a new empty pattern epsilons that has no pattern ID and has no + /// epsilons. This is suitable for non-match states. 
+ fn empty() -> PatternEpsilons { + PatternEpsilons( + PatternEpsilons::PATTERN_ID_NONE + << PatternEpsilons::PATTERN_ID_SHIFT, + ) + } + + /// Whether this pattern epsilons is empty or not. It's empty when it has + /// no pattern ID and an empty epsilons. + fn is_empty(self) -> bool { + self.pattern_id().is_none() && self.epsilons().is_empty() + } + + /// Return the pattern ID in this pattern epsilons if one exists. + fn pattern_id(self) -> Option<PatternID> { + let pid = self.0 >> PatternEpsilons::PATTERN_ID_SHIFT; + if pid == PatternEpsilons::PATTERN_ID_LIMIT { + None + } else { + Some(PatternID::new_unchecked(pid.as_usize())) + } + } + + /// Returns the pattern ID without checking whether it's valid. If this is + /// called and there is no pattern ID in this `PatternEpsilons`, then this + /// will likely produce an incorrect result or possibly even a panic or + /// an overflow. But safety will not be violated. + /// + /// This is useful when you know a particular state is a match state. If + /// it's a match state, then it must have a pattern ID. + fn pattern_id_unchecked(self) -> PatternID { + let pid = self.0 >> PatternEpsilons::PATTERN_ID_SHIFT; + PatternID::new_unchecked(pid.as_usize()) + } + + /// Return a new pattern epsilons with the given pattern ID, but the same + /// epsilons. + fn set_pattern_id(self, pid: PatternID) -> PatternEpsilons { + PatternEpsilons( + (pid.as_u64() << PatternEpsilons::PATTERN_ID_SHIFT) + | (self.0 & PatternEpsilons::EPSILONS_MASK), + ) + } + + /// Return the epsilons part of this pattern epsilons. + fn epsilons(self) -> Epsilons { + Epsilons(self.0 & PatternEpsilons::EPSILONS_MASK) + } + + /// Return a new pattern epsilons with the given epsilons, but the same + /// pattern ID. 
+ fn set_epsilons(self, epsilons: Epsilons) -> PatternEpsilons { + PatternEpsilons( + (self.0 & PatternEpsilons::PATTERN_ID_MASK) + | u64::from(epsilons.0), + ) + } +} + +impl core::fmt::Debug for PatternEpsilons { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + if self.is_empty() { + return write!(f, "N/A"); + } + if let Some(pid) = self.pattern_id() { + write!(f, "{}", pid.as_usize())?; + } + if !self.epsilons().is_empty() { + if self.pattern_id().is_some() { + write!(f, "/")?; + } + write!(f, "{:?}", self.epsilons())?; + } + Ok(()) + } +} + +/// Epsilons represents all of the NFA epsilons transitions that went into a +/// single transition in a single DFA state. In this case, it only represents +/// the epsilon transitions that have some kind of non-consuming side effect: +/// either the transition requires storing the current position of the search +/// into a slot, or the transition is conditional and requires the current +/// position in the input to satisfy an assertion before the transition may be +/// taken. +/// +/// This folds the cumulative effect of a group of NFA states (all connected +/// by epsilon transitions) down into a single set of bits. While these bits +/// can represent all possible conditional epsilon transitions, it only permits +/// storing up to a somewhat small number of slots. +/// +/// Epsilons is represented as a 42-bit integer. For example, it is packed into +/// the lower 42 bits of a `Transition`. (Where the high 22 bits contains a +/// `StateID` and a special "match wins" property.) +#[derive(Clone, Copy)] +struct Epsilons(u64); + +impl Epsilons { + const SLOT_MASK: u64 = 0x000003FF_FFFFFC00; + const SLOT_SHIFT: u64 = 10; + const LOOK_MASK: u64 = 0x00000000_000003FF; + + /// Create a new empty epsilons. It has no slots and no assertions that + /// need to be satisfied. + fn empty() -> Epsilons { + Epsilons(0) + } + + /// Returns true if this epsilons contains no slots and no assertions. 
+ fn is_empty(self) -> bool { + self.0 == 0 + } + + /// Returns the slot epsilon transitions. + fn slots(self) -> Slots { + Slots((self.0 >> Epsilons::SLOT_SHIFT).low_u32()) + } + + /// Set the slot epsilon transitions. + fn set_slots(self, slots: Slots) -> Epsilons { + Epsilons( + (u64::from(slots.0) << Epsilons::SLOT_SHIFT) + | (self.0 & Epsilons::LOOK_MASK), + ) + } + + /// Return the set of look-around assertions in these epsilon transitions. + fn looks(self) -> LookSet { + LookSet { bits: (self.0 & Epsilons::LOOK_MASK).low_u16() } + } + + /// Set the look-around assertions on these epsilon transitions. + fn set_looks(self, look_set: LookSet) -> Epsilons { + Epsilons((self.0 & Epsilons::SLOT_MASK) | u64::from(look_set.bits)) + } +} + +impl core::fmt::Debug for Epsilons { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + let mut wrote = false; + if !self.slots().is_empty() { + write!(f, "{:?}", self.slots())?; + wrote = true; + } + if !self.looks().is_empty() { + if wrote { + write!(f, "/")?; + } + write!(f, "{:?}", self.looks())?; + wrote = true; + } + if !wrote { + write!(f, "N/A")?; + } + Ok(()) + } +} + +/// The set of epsilon transitions indicating that the current position in a +/// search should be saved to a slot. +/// +/// This *only* represents explicit slots. So for example, the pattern +/// `[a-z]+([0-9]+)([a-z]+)` has: +/// +/// * 3 capturing groups, thus 6 slots. +/// * 1 implicit capturing group, thus 2 implicit slots. +/// * 2 explicit capturing groups, thus 4 explicit slots. +/// +/// While implicit slots are represented by epsilon transitions in an NFA, we +/// do not explicitly represent them here. Instead, implicit slots are assumed +/// to be present and handled automatically in the search code. Therefore, +/// that means we only need to represent explicit slots in our epsilon +/// transitions. +/// +/// Its representation is a bit set. 
The bit 'i' is set if and only if there +/// exists an explicit slot at index 'c', where 'c = (#patterns * 2) + i'. That +/// is, bit '0' corresponds to the first explicit slot and the first +/// explicit slot appears immediately following the last implicit slot. (If +/// this is confusing, see `GroupInfo` for more details on how slots work.) +/// +/// A single `Slots` represents all the active slots in a sub-graph of an NFA, +/// where all the states are connected by epsilon transitions. In effect, when +/// traversing the one-pass DFA during a search, all slots set in a particular +/// transition must be captured by recording the current search position. +/// +/// The API of `Slots` requires the caller to handle the explicit slot offset. +/// That is, a `Slots` doesn't know where the explicit slots start for a +/// particular NFA. Thus, if the caller sees that the bit 'i' is set, then it +/// needs to do the arithmetic above to find 'c', which is the real actual slot +/// index in the corresponding NFA. +#[derive(Clone, Copy)] +struct Slots(u32); + +impl Slots { + const LIMIT: usize = 32; + + /// Insert the slot at the given bit index. + fn insert(self, slot: usize) -> Slots { + debug_assert!(slot < Slots::LIMIT); + Slots(self.0 | (1 << slot.as_u32())) + } + + /// Remove the slot at the given bit index. + fn remove(self, slot: usize) -> Slots { + debug_assert!(slot < Slots::LIMIT); + Slots(self.0 & !(1 << slot.as_u32())) + } + + /// Returns true if and only if this set contains no slots. + fn is_empty(self) -> bool { + self.0 == 0 + } + + /// Returns an iterator over all of the set bits in this set. + fn iter(self) -> SlotsIter { + SlotsIter { slots: self } + } + + /// For the position `at` in the current haystack, copy it to + /// `caller_explicit_slots` for all slots that are in this set. + /// + /// Callers may pass a slice of any length. Slots in this set bigger than + /// the length of the given explicit slots are simply skipped. 
+ /// + /// The slice *must* correspond only to the explicit slots and the first + /// element of the slice must always correspond to the first explicit slot + /// in the corresponding NFA. + fn apply( + self, + at: usize, + caller_explicit_slots: &mut [Option<NonMaxUsize>], + ) { + if self.is_empty() { + return; + } + let at = NonMaxUsize::new(at); + for slot in self.iter() { + if slot >= caller_explicit_slots.len() { + break; + } + caller_explicit_slots[slot] = at; + } + } +} + +impl core::fmt::Debug for Slots { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "S")?; + for slot in self.iter() { + write!(f, "-{:?}", slot)?; + } + Ok(()) + } +} + +/// An iterator over all of the bits set in a slot set. +/// +/// This returns the bit index that is set, so callers may need to offset it +/// to get the actual NFA slot index. +#[derive(Debug)] +struct SlotsIter { + slots: Slots, +} + +impl Iterator for SlotsIter { + type Item = usize; + + fn next(&mut self) -> Option<usize> { + // Number of zeroes here is always <= u8::MAX, and so fits in a usize. + let slot = self.slots.0.trailing_zeros().as_usize(); + if slot >= Slots::LIMIT { + return None; + } + self.slots = self.slots.remove(slot); + Some(slot) + } +} + +/// An error that occurred during the construction of a one-pass DFA. +/// +/// This error does not provide many introspection capabilities. There are +/// generally only two things you can do with it: +/// +/// * Obtain a human readable message via its `std::fmt::Display` impl. +/// * Access an underlying [`thompson::BuildError`] type from its `source` +/// method via the `std::error::Error` trait. This error only occurs when using +/// convenience routines for building a one-pass DFA directly from a pattern +/// string. +/// +/// When the `std` feature is enabled, this implements the `std::error::Error` +/// trait. 
+#[derive(Clone, Debug)] +pub struct BuildError { + kind: BuildErrorKind, +} + +/// The kind of error that occurred during the construction of a one-pass DFA. +#[derive(Clone, Debug)] +enum BuildErrorKind { + NFA(crate::nfa::thompson::BuildError), + Word(UnicodeWordBoundaryError), + TooManyStates { limit: u64 }, + TooManyPatterns { limit: u64 }, + UnsupportedLook { look: Look }, + ExceededSizeLimit { limit: usize }, + NotOnePass { msg: &'static str }, +} + +impl BuildError { + fn nfa(err: crate::nfa::thompson::BuildError) -> BuildError { + BuildError { kind: BuildErrorKind::NFA(err) } + } + + fn word(err: UnicodeWordBoundaryError) -> BuildError { + BuildError { kind: BuildErrorKind::Word(err) } + } + + fn too_many_states(limit: u64) -> BuildError { + BuildError { kind: BuildErrorKind::TooManyStates { limit } } + } + + fn too_many_patterns(limit: u64) -> BuildError { + BuildError { kind: BuildErrorKind::TooManyPatterns { limit } } + } + + fn unsupported_look(look: Look) -> BuildError { + BuildError { kind: BuildErrorKind::UnsupportedLook { look } } + } + + fn exceeded_size_limit(limit: usize) -> BuildError { + BuildError { kind: BuildErrorKind::ExceededSizeLimit { limit } } + } + + fn not_one_pass(msg: &'static str) -> BuildError { + BuildError { kind: BuildErrorKind::NotOnePass { msg } } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for BuildError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + use self::BuildErrorKind::*; + + match self.kind { + NFA(ref err) => Some(err), + Word(ref err) => Some(err), + _ => None, + } + } +} + +impl core::fmt::Display for BuildError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + use self::BuildErrorKind::*; + + match self.kind { + NFA(_) => write!(f, "error building NFA"), + Word(_) => write!(f, "NFA contains Unicode word boundary"), + TooManyStates { limit } => write!( + f, + "one-pass DFA exceeded a limit of {:?} for number of states", + limit, + ), + 
TooManyPatterns { limit } => write!( + f, + "one-pass DFA exceeded a limit of {:?} for number of patterns", + limit, + ), + UnsupportedLook { look } => write!( + f, + "one-pass DFA does not support the {:?} assertion", + look, + ), + ExceededSizeLimit { limit } => write!( + f, + "one-pass DFA exceeded size limit of {:?} during building", + limit, + ), + NotOnePass { msg } => write!( + f, + "one-pass DFA could not be built because \ + pattern is not one-pass: {}", + msg, + ), + } + } +} + +#[cfg(all(test, feature = "syntax"))] +mod tests { + use alloc::string::ToString; + + use super::*; + + #[test] + fn fail_conflicting_transition() { + let predicate = |err: &str| err.contains("conflicting transition"); + + let err = DFA::new(r"a*[ab]").unwrap_err().to_string(); + assert!(predicate(&err), "{}", err); + } + + #[test] + fn fail_multiple_epsilon() { + let predicate = |err: &str| { + err.contains("multiple epsilon transitions to same state") + }; + + let err = DFA::new(r"(^|$)a").unwrap_err().to_string(); + assert!(predicate(&err), "{}", err); + } + + #[test] + fn fail_multiple_match() { + let predicate = |err: &str| { + err.contains("multiple epsilon transitions to match state") + }; + + let err = DFA::new_many(&[r"^", r"$"]).unwrap_err().to_string(); + assert!(predicate(&err), "{}", err); + } + + // This test is meant to build a one-pass regex with the maximum number of + // possible slots. + // + // NOTE: Remember that the slot limit only applies to explicit capturing + // groups. Any number of implicit capturing groups is supported (up to the + // maximum number of supported patterns), since implicit groups are handled + // by the search loop itself. + #[test] + fn max_slots() { + // One too many... + let pat = r"(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)(m)(n)(o)(p)(q)"; + assert!(DFA::new(pat).is_err()); + // Just right. 
+ let pat = r"(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)(m)(n)(o)(p)"; + assert!(DFA::new(pat).is_ok()); + } + + // This test ensures that the one-pass DFA works with all look-around + // assertions that we expect it to work with. + // + // The utility of this test is that each one-pass transition has a small + // amount of space to store look-around assertions. Currently, there is + // logic in the one-pass constructor to ensure there aren't more than ten + // possible assertions. And indeed, there are only ten possible assertions + // (at time of writing), so this is okay. But conceivably, more assertions + // could be added. So we check that things at least work with what we + // expect them to work with. + #[test] + fn assertions() { + // haystack anchors + assert!(DFA::new(r"^").is_ok()); + assert!(DFA::new(r"$").is_ok()); + + // line anchors + assert!(DFA::new(r"(?m)^").is_ok()); + assert!(DFA::new(r"(?m)$").is_ok()); + assert!(DFA::new(r"(?Rm)^").is_ok()); + assert!(DFA::new(r"(?Rm)$").is_ok()); + + // word boundaries + if cfg!(feature = "unicode-word-boundary") { + assert!(DFA::new(r"\b").is_ok()); + assert!(DFA::new(r"\B").is_ok()); + } + assert!(DFA::new(r"(?-u)\b").is_ok()); + assert!(DFA::new(r"(?-u)\B").is_ok()); + } + + #[cfg(not(miri))] // takes too long on miri + #[test] + fn is_one_pass() { + use crate::util::syntax; + + assert!(DFA::new(r"a*b").is_ok()); + if cfg!(feature = "unicode-perl") { + assert!(DFA::new(r"\w").is_ok()); + } + assert!(DFA::new(r"(?-u)\w*\s").is_ok()); + assert!(DFA::new(r"(?s:.)*?").is_ok()); + assert!(DFA::builder() + .syntax(syntax::Config::new().utf8(false)) + .build(r"(?s-u:.)*?") + .is_ok()); + } + + #[test] + fn is_not_one_pass() { + assert!(DFA::new(r"a*a").is_err()); + assert!(DFA::new(r"(?s-u:.)*?").is_err()); + assert!(DFA::new(r"(?s:.)*?a").is_err()); + } + + #[cfg(not(miri))] + #[test] + fn is_not_one_pass_bigger() { + assert!(DFA::new(r"\w*\s").is_err()); + } +} diff --git a/vendor/regex-automata/src/dfa/regex.rs 
b/vendor/regex-automata/src/dfa/regex.rs index d0917e17d..f39c1c055 100644 --- a/vendor/regex-automata/src/dfa/regex.rs +++ b/vendor/regex-automata/src/dfa/regex.rs @@ -18,16 +18,17 @@ See the [parent module](crate::dfa) for examples. #[cfg(feature = "alloc")] use alloc::vec::Vec; +#[cfg(feature = "dfa-build")] +use crate::dfa::dense::BuildError; use crate::{ - dfa::automaton::{Automaton, OverlappingState}, - util::prefilter::{self, Prefilter}, - MatchError, MultiMatch, + dfa::{automaton::Automaton, dense}, + util::{iter, search::Input}, + Anchored, Match, MatchError, }; #[cfg(feature = "alloc")] use crate::{ - dfa::{dense, error::Error, sparse}, - nfa::thompson, - util::matchtypes::MatchKind, + dfa::{sparse, StartKind}, + util::search::MatchKind, }; // When the alloc feature is enabled, the regex type sets its A type parameter @@ -42,20 +43,16 @@ macro_rules! define_regex_type { ($(#[$doc:meta])*) => { #[cfg(feature = "alloc")] $(#[$doc])* - pub struct Regex<A = dense::OwnedDFA, P = prefilter::None> { - prefilter: Option<P>, + pub struct Regex<A = dense::OwnedDFA> { forward: A, reverse: A, - utf8: bool, } #[cfg(not(feature = "alloc"))] $(#[$doc])* - pub struct Regex<A, P = prefilter::None> { - prefilter: Option<P>, + pub struct Regex<A> { forward: A, reverse: A, - utf8: bool, } }; } @@ -79,86 +76,26 @@ define_regex_type!( /// memory but search faster, while sparse DFAs use less memory but search /// more slowly. /// + /// # Crate features + /// + /// Note that despite what the documentation auto-generates, the _only_ + /// crate feature needed to use this type is `dfa-search`. You do _not_ + /// need to enable the `alloc` feature. + /// /// By default, a regex's automaton type parameter is set to /// `dense::DFA<Vec<u32>>` when the `alloc` feature is enabled. For most /// in-memory work loads, this is the most convenient type that gives the /// best search performance. When the `alloc` feature is disabled, no /// default type is used. 
/// - /// A `Regex` also has a `P` type parameter, which is used to select the - /// prefilter used during search. By default, no prefilter is enabled by - /// setting the type to default to [`prefilter::None`]. A prefilter can be - /// enabled by using the [`Regex::prefilter`] method. - /// /// # When should I use this? /// /// Generally speaking, if you can afford the overhead of building a full /// DFA for your regex, and you don't need things like capturing groups, /// then this is a good choice if you're looking to optimize for matching /// speed. Note however that its speed may be worse than a general purpose - /// regex engine if you don't select a good [prefilter]. - /// - /// # Earliest vs Leftmost vs Overlapping - /// - /// The search routines exposed on a `Regex` reflect three different ways - /// of searching: - /// - /// * "earliest" means to stop as soon as a match has been detected. - /// * "leftmost" means to continue matching until the underlying - /// automaton cannot advance. This reflects "standard" searching you - /// might be used to in other regex engines. e.g., This permits - /// non-greedy and greedy searching to work as you would expect. - /// * "overlapping" means to find all possible matches, even if they - /// overlap. - /// - /// Generally speaking, when doing an overlapping search, you'll want to - /// build your regex DFAs with [`MatchKind::All`] semantics. Using - /// [`MatchKind::LeftmostFirst`] semantics with overlapping searches is - /// likely to lead to odd behavior since `LeftmostFirst` specifically omits - /// some matches that can never be reported due to its semantics. - /// - /// The following example shows the differences between how these different - /// types of searches impact looking for matches of `[a-z]+` in the - /// haystack `abc`. 
- /// - /// ``` - /// use regex_automata::{dfa::{self, dense}, MatchKind, MultiMatch}; - /// - /// let pattern = r"[a-z]+"; - /// let haystack = "abc".as_bytes(); - /// - /// // With leftmost-first semantics, we test "earliest" and "leftmost". - /// let re = dfa::regex::Builder::new() - /// .dense(dense::Config::new().match_kind(MatchKind::LeftmostFirst)) - /// .build(pattern)?; - /// - /// // "earliest" searching isn't impacted by greediness - /// let mut it = re.find_earliest_iter(haystack); - /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next()); - /// assert_eq!(None, it.next()); - /// - /// // "leftmost" searching supports greediness (and non-greediness) - /// let mut it = re.find_leftmost_iter(haystack); - /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); - /// assert_eq!(None, it.next()); - /// - /// // For overlapping, we want "all" match kind semantics. - /// let re = dfa::regex::Builder::new() - /// .dense(dense::Config::new().match_kind(MatchKind::All)) - /// .build(pattern)?; - /// - /// // In the overlapping search, we find all three possible matches - /// // starting at the beginning of the haystack. - /// let mut it = re.find_overlapping_iter(haystack); - /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 0, 2)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); - /// assert_eq!(None, it.next()); - /// - /// # Ok::<(), Box<dyn std::error::Error>>(()) - /// ``` + /// regex engine if you don't provide a [`dense::Config::prefilter`] to the + /// underlying DFA. /// /// # Sparse DFAs /// @@ -203,18 +140,16 @@ define_regex_type!( /// /// # Fallibility /// - /// In non-default configurations, the DFAs generated in this module may - /// return an error during a search. 
(Currently, the only way this happens - /// is if quit bytes are added or Unicode word boundaries are heuristically - /// enabled, both of which are turned off by default.) For convenience, the - /// main search routines, like [`find_leftmost`](Regex::find_leftmost), - /// will panic if an error occurs. However, if you need to use DFAs - /// which may produce an error at search time, then there are fallible - /// equivalents of all search routines. For example, for `find_leftmost`, - /// its fallible analog is [`try_find_leftmost`](Regex::try_find_leftmost). - /// The routines prefixed with `try_` return `Result<Option<MultiMatch>, - /// MatchError>`, where as the infallible routines simply return - /// `Option<MultiMatch>`. + /// Most of the search routines defined on this type will _panic_ when the + /// underlying search fails. This might be because the DFA gave up because + /// it saw a quit byte, whether configured explicitly or via heuristic + /// Unicode word boundary support, although neither are enabled by default. + /// Or it might fail because an invalid `Input` configuration is given, + /// for example, with an unsupported [`Anchored`] mode. + /// + /// If you need to handle these error cases instead of allowing them to + /// trigger a panic, then the lower level [`Regex::try_search`] provides + /// a fallible API that never panics. /// /// # Example /// @@ -224,18 +159,19 @@ define_regex_type!( /// across a line boundary. /// /// ``` - /// use regex_automata::{dfa::{self, regex::Regex}, MatchError}; + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{dfa::{self, regex::Regex}, Input, MatchError}; /// /// let re = Regex::builder() /// .dense(dfa::dense::Config::new().quit(b'\n', true)) /// .build(r"foo\p{any}+bar")?; /// - /// let haystack = "foo\nbar".as_bytes(); + /// let input = Input::new("foo\nbar"); /// // Normally this would produce a match, since \p{any} contains '\n'. 
/// // But since we instructed the automaton to enter a quit state if a /// // '\n' is observed, this produces a match error instead. - /// let expected = MatchError::Quit { byte: 0x0A, offset: 3 }; - /// let got = re.try_find_leftmost(haystack).unwrap_err(); + /// let expected = MatchError::quit(b'\n', 3); + /// let got = re.try_search(&input).unwrap_err(); /// assert_eq!(expected, got); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) @@ -243,7 +179,7 @@ define_regex_type!( #[derive(Clone, Debug)] ); -#[cfg(feature = "alloc")] +#[cfg(all(feature = "syntax", feature = "dfa-build"))] impl Regex { /// Parse the given regular expression using the default configuration and /// return the corresponding regex. @@ -254,16 +190,16 @@ impl Regex { /// # Example /// /// ``` - /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// use regex_automata::{Match, dfa::regex::Regex}; /// /// let re = Regex::new("foo[0-9]+bar")?; /// assert_eq!( - /// Some(MultiMatch::must(0, 3, 14)), - /// re.find_leftmost(b"zzzfoo12345barzzz"), + /// Some(Match::must(0, 3..14)), + /// re.find(b"zzzfoo12345barzzz"), /// ); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` - pub fn new(pattern: &str) -> Result<Regex, Error> { + pub fn new(pattern: &str) -> Result<Regex, BuildError> { Builder::new().build(pattern) } @@ -273,26 +209,28 @@ impl Regex { /// # Example /// /// ``` - /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// use regex_automata::{Match, dfa::regex::Regex}; /// /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?; /// - /// let mut it = re.find_leftmost_iter(b"abc 1 foo 4567 0 quux"); - /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); - /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next()); - /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next()); - /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next()); 
+ /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux"); + /// assert_eq!(Some(Match::must(0, 0..3)), it.next()); + /// assert_eq!(Some(Match::must(1, 4..5)), it.next()); + /// assert_eq!(Some(Match::must(0, 6..9)), it.next()); + /// assert_eq!(Some(Match::must(1, 10..14)), it.next()); + /// assert_eq!(Some(Match::must(1, 15..16)), it.next()); + /// assert_eq!(Some(Match::must(0, 17..21)), it.next()); /// assert_eq!(None, it.next()); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` - pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<Regex, Error> { + pub fn new_many<P: AsRef<str>>( + patterns: &[P], + ) -> Result<Regex, BuildError> { Builder::new().build_many(patterns) } } -#[cfg(feature = "alloc")] +#[cfg(all(feature = "syntax", feature = "dfa-build"))] impl Regex<sparse::DFA<Vec<u8>>> { /// Parse the given regular expression using the default configuration, /// except using sparse DFAs, and return the corresponding regex. @@ -303,18 +241,18 @@ impl Regex<sparse::DFA<Vec<u8>>> { /// # Example /// /// ``` - /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// use regex_automata::{Match, dfa::regex::Regex}; /// /// let re = Regex::new_sparse("foo[0-9]+bar")?; /// assert_eq!( - /// Some(MultiMatch::must(0, 3, 14)), - /// re.find_leftmost(b"zzzfoo12345barzzz"), + /// Some(Match::must(0, 3..14)), + /// re.find(b"zzzfoo12345barzzz"), /// ); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` pub fn new_sparse( pattern: &str, - ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> { + ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> { Builder::new().build_sparse(pattern) } @@ -325,64 +263,29 @@ impl Regex<sparse::DFA<Vec<u8>>> { /// # Example /// /// ``` - /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// use regex_automata::{Match, dfa::regex::Regex}; /// /// let re = Regex::new_many_sparse(&["[a-z]+", "[0-9]+"])?; /// - /// let mut it = re.find_leftmost_iter(b"abc 1 foo 4567 0 quux"); - /// assert_eq!(Some(MultiMatch::must(0, 0, 
3)), it.next()); - /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next()); - /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next()); - /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next()); + /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux"); + /// assert_eq!(Some(Match::must(0, 0..3)), it.next()); + /// assert_eq!(Some(Match::must(1, 4..5)), it.next()); + /// assert_eq!(Some(Match::must(0, 6..9)), it.next()); + /// assert_eq!(Some(Match::must(1, 10..14)), it.next()); + /// assert_eq!(Some(Match::must(1, 15..16)), it.next()); + /// assert_eq!(Some(Match::must(0, 17..21)), it.next()); /// assert_eq!(None, it.next()); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` pub fn new_many_sparse<P: AsRef<str>>( patterns: &[P], - ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> { + ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> { Builder::new().build_many_sparse(patterns) } } /// Convenience routines for regex construction. -#[cfg(feature = "alloc")] -impl Regex { - /// Return a default configuration for a `Regex`. - /// - /// This is a convenience routine to avoid needing to import the `Config` - /// type when customizing the construction of a regex. - /// - /// # Example - /// - /// This example shows how to disable UTF-8 mode for `Regex` iteration. - /// When UTF-8 mode is disabled, the position immediately following an - /// empty match is where the next search begins, instead of the next - /// position of a UTF-8 encoded codepoint. 
- /// - /// ``` - /// use regex_automata::{dfa::regex::Regex, MultiMatch}; - /// - /// let re = Regex::builder() - /// .configure(Regex::config().utf8(false)) - /// .build(r"")?; - /// let haystack = "a☃z".as_bytes(); - /// let mut it = re.find_leftmost_iter(haystack); - /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); - /// assert_eq!(None, it.next()); - /// - /// # Ok::<(), Box<dyn std::error::Error>>(()) - /// ``` - pub fn config() -> Config { - Config::new() - } - +impl Regex<dense::DFA<&'static [u32]>> { /// Return a builder for configuring the construction of a `Regex`. /// /// This is a convenience routine to avoid needing to import the @@ -394,20 +297,18 @@ impl Regex { /// everywhere. /// /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{ - /// dfa::regex::Regex, - /// nfa::thompson, - /// MultiMatch, SyntaxConfig, + /// dfa::regex::Regex, nfa::thompson, util::syntax, Match, /// }; /// /// let re = Regex::builder() - /// .configure(Regex::config().utf8(false)) - /// .syntax(SyntaxConfig::new().utf8(false)) + /// .syntax(syntax::Config::new().utf8(false)) /// .thompson(thompson::Config::new().utf8(false)) /// .build(r"foo(?-u:[^b])ar.*")?; /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; - /// let expected = Some(MultiMatch::must(0, 1, 9)); - /// let got = re.find_leftmost(haystack); + /// let expected = Some(Match::must(0, 1..9)); + /// let got = re.find(haystack); /// assert_eq!(expected, got); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) @@ -418,7 +319,7 @@ impl Regex { } /// Standard search routines for finding and iterating over matches. 
-impl<A: Automaton, P: Prefilter> Regex<A, P> { +impl<A: Automaton> Regex<A> { /// Returns true if and only if this regex matches the given haystack. /// /// This routine may short circuit if it knows that scanning future input @@ -428,65 +329,37 @@ impl<A: Automaton, P: Prefilter> Regex<A, P> { /// /// # Panics /// - /// If the underlying DFAs return an error, then this routine panics. This - /// only occurs in non-default configurations where quit bytes are used or - /// Unicode word boundaries are heuristically enabled. - /// - /// The fallible version of this routine is - /// [`try_is_match`](Regex::try_is_match). - /// - /// # Example - /// - /// ``` - /// use regex_automata::dfa::regex::Regex; + /// This routine panics if the search could not complete. This can occur + /// in a number of circumstances: /// - /// let re = Regex::new("foo[0-9]+bar")?; - /// assert_eq!(true, re.is_match(b"foo12345bar")); - /// assert_eq!(false, re.is_match(b"foobar")); - /// # Ok::<(), Box<dyn std::error::Error>>(()) - /// ``` - pub fn is_match(&self, haystack: &[u8]) -> bool { - self.is_match_at(haystack, 0, haystack.len()) - } - - /// Returns the first position at which a match is found. + /// * The configuration of the DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the DFA quitting. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. /// - /// This routine stops scanning input in precisely the same circumstances - /// as `is_match`. The key difference is that this routine returns the - /// position at which it stopped scanning input if and only if a match - /// was found. If no match is found, then `None` is returned. + /// When a search panics, callers cannot know whether a match exists or + /// not. 
/// - /// # Panics - /// - /// If the underlying DFAs return an error, then this routine panics. This - /// only occurs in non-default configurations where quit bytes are used or - /// Unicode word boundaries are heuristically enabled. - /// - /// The fallible version of this routine is - /// [`try_find_earliest`](Regex::try_find_earliest). + /// Use [`Regex::try_search`] if you want to handle these error conditions. /// /// # Example /// /// ``` - /// use regex_automata::{MultiMatch, dfa::regex::Regex}; - /// - /// // Normally, the leftmost first match would greedily consume as many - /// // decimal digits as it could. But a match is detected as soon as one - /// // digit is seen. - /// let re = Regex::new("foo[0-9]+")?; - /// assert_eq!( - /// Some(MultiMatch::must(0, 0, 4)), - /// re.find_earliest(b"foo12345"), - /// ); + /// use regex_automata::dfa::regex::Regex; /// - /// // Normally, the end of the leftmost first match here would be 3, - /// // but the "earliest" match semantics detect a match earlier. - /// let re = Regex::new("abc|a")?; - /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), re.find_earliest(b"abc")); + /// let re = Regex::new("foo[0-9]+bar")?; + /// assert_eq!(true, re.is_match("foo12345bar")); + /// assert_eq!(false, re.is_match("foobar")); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` - pub fn find_earliest(&self, haystack: &[u8]) -> Option<MultiMatch> { - self.find_earliest_at(haystack, 0, haystack.len()) + #[inline] + pub fn is_match<'h, I: Into<Input<'h>>>(&self, input: I) -> bool { + // Not only can we do an "earliest" search, but we can avoid doing a + // reverse scan too. + let input = input.into().earliest(true); + self.forward().try_search_fwd(&input).map(|x| x.is_some()).unwrap() } /// Returns the start and end offset of the leftmost match. If no match @@ -494,131 +367,41 @@ impl<A: Automaton, P: Prefilter> Regex<A, P> { /// /// # Panics /// - /// If the underlying DFAs return an error, then this routine panics. 
This - /// only occurs in non-default configurations where quit bytes are used or - /// Unicode word boundaries are heuristically enabled. + /// This routine panics if the search could not complete. This can occur + /// in a number of circumstances: + /// + /// * The configuration of the DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the DFA quitting. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. + /// + /// When a search panics, callers cannot know whether a match exists or + /// not. /// - /// The fallible version of this routine is - /// [`try_find_leftmost`](Regex::try_find_leftmost). + /// Use [`Regex::try_search`] if you want to handle these error conditions. /// /// # Example /// /// ``` - /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// use regex_automata::{Match, dfa::regex::Regex}; /// - /// // Greediness is applied appropriately when compared to find_earliest. + /// // Greediness is applied appropriately. /// let re = Regex::new("foo[0-9]+")?; - /// assert_eq!( - /// Some(MultiMatch::must(0, 3, 11)), - /// re.find_leftmost(b"zzzfoo12345zzz"), - /// ); + /// assert_eq!(Some(Match::must(0, 3..11)), re.find("zzzfoo12345zzz")); /// /// // Even though a match is found after reading the first byte (`a`), /// // the default leftmost-first match semantics demand that we find the /// // earliest match that prefers earlier parts of the pattern over latter /// // parts. 
/// let re = Regex::new("abc|a")?; - /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), re.find_leftmost(b"abc")); - /// # Ok::<(), Box<dyn std::error::Error>>(()) - /// ``` - pub fn find_leftmost(&self, haystack: &[u8]) -> Option<MultiMatch> { - self.find_leftmost_at(haystack, 0, haystack.len()) - } - - /// Search for the first overlapping match in `haystack`. - /// - /// This routine is principally useful when searching for multiple patterns - /// on inputs where multiple patterns may match the same regions of text. - /// In particular, callers must preserve the automaton's search state from - /// prior calls so that the implementation knows where the last match - /// occurred and which pattern was reported. - /// - /// # Panics - /// - /// If the underlying DFAs return an error, then this routine panics. This - /// only occurs in non-default configurations where quit bytes are used or - /// Unicode word boundaries are heuristically enabled. - /// - /// The fallible version of this routine is - /// [`try_find_overlapping`](Regex::try_find_overlapping). - /// - /// # Example - /// - /// This example shows how to run an overlapping search with multiple - /// regexes. - /// - /// ``` - /// use regex_automata::{dfa::{self, regex::Regex}, MatchKind, MultiMatch}; - /// - /// let re = Regex::builder() - /// .dense(dfa::dense::Config::new().match_kind(MatchKind::All)) - /// .build_many(&[r"\w+$", r"\S+$"])?; - /// let haystack = "@foo".as_bytes(); - /// let mut state = dfa::OverlappingState::start(); - /// - /// let expected = Some(MultiMatch::must(1, 0, 4)); - /// let got = re.find_overlapping(haystack, &mut state); - /// assert_eq!(expected, got); - /// - /// // The first pattern also matches at the same position, so re-running - /// // the search will yield another match. Notice also that the first - /// // pattern is returned after the second. 
This is because the second - /// // pattern begins its match before the first, is therefore an earlier - /// // match and is thus reported first. - /// let expected = Some(MultiMatch::must(0, 1, 4)); - /// let got = re.find_overlapping(haystack, &mut state); - /// assert_eq!(expected, got); - /// + /// assert_eq!(Some(Match::must(0, 0..3)), re.find("abc")); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` - pub fn find_overlapping( - &self, - haystack: &[u8], - state: &mut OverlappingState, - ) -> Option<MultiMatch> { - self.find_overlapping_at(haystack, 0, haystack.len(), state) - } - - /// Returns an iterator over all non-overlapping "earliest" matches. - /// - /// Match positions are reported as soon as a match is known to occur, even - /// if the standard leftmost match would be longer. - /// - /// # Panics - /// - /// If the underlying DFAs return an error during iteration, then iteration - /// panics. This only occurs in non-default configurations where quit bytes - /// are used or Unicode word boundaries are heuristically enabled. - /// - /// The fallible version of this routine is - /// [`try_find_earliest_iter`](Regex::try_find_earliest_iter). - /// - /// # Example - /// - /// This example shows how to run an "earliest" iterator. - /// - /// ``` - /// use regex_automata::{dfa::regex::Regex, MultiMatch}; - /// - /// let re = Regex::new("[0-9]+")?; - /// let haystack = "123".as_bytes(); - /// - /// // Normally, a standard leftmost iterator would return a single - /// // match, but since "earliest" detects matches earlier, we get - /// // three matches. 
- /// let mut it = re.find_earliest_iter(haystack); - /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next()); - /// assert_eq!(None, it.next()); - /// - /// # Ok::<(), Box<dyn std::error::Error>>(()) - /// ``` - pub fn find_earliest_iter<'r, 't>( - &'r self, - haystack: &'t [u8], - ) -> FindEarliestMatches<'r, 't, A, P> { - FindEarliestMatches::new(self, haystack) + #[inline] + pub fn find<'h, I: Into<Input<'h>>>(&self, input: I) -> Option<Match> { + self.try_search(&input.into()).unwrap() } /// Returns an iterator over all non-overlapping leftmost matches in the @@ -628,621 +411,119 @@ impl<A: Automaton, P: Prefilter> Regex<A, P> { /// /// # Panics /// - /// If the underlying DFAs return an error during iteration, then iteration - /// panics. This only occurs in non-default configurations where quit bytes - /// are used or Unicode word boundaries are heuristically enabled. + /// If the search returns an error during iteration, then iteration + /// panics. See [`Regex::find`] for the panic conditions. /// - /// The fallible version of this routine is - /// [`try_find_leftmost_iter`](Regex::try_find_leftmost_iter). + /// Use [`Regex::try_search`] with + /// [`util::iter::Searcher`](crate::util::iter::Searcher) if you want to + /// handle these error conditions. 
/// /// # Example /// /// ``` - /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// use regex_automata::{Match, dfa::regex::Regex}; /// /// let re = Regex::new("foo[0-9]+")?; - /// let text = b"foo1 foo12 foo123"; - /// let matches: Vec<MultiMatch> = re.find_leftmost_iter(text).collect(); + /// let text = "foo1 foo12 foo123"; + /// let matches: Vec<Match> = re.find_iter(text).collect(); /// assert_eq!(matches, vec![ - /// MultiMatch::must(0, 0, 4), - /// MultiMatch::must(0, 5, 10), - /// MultiMatch::must(0, 11, 17), + /// Match::must(0, 0..4), + /// Match::must(0, 5..10), + /// Match::must(0, 11..17), /// ]); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` - pub fn find_leftmost_iter<'r, 't>( - &'r self, - haystack: &'t [u8], - ) -> FindLeftmostMatches<'r, 't, A, P> { - FindLeftmostMatches::new(self, haystack) - } - - /// Returns an iterator over all overlapping matches in the given haystack. - /// - /// This routine is principally useful when searching for multiple patterns - /// on inputs where multiple patterns may match the same regions of text. - /// The iterator takes care of handling the overlapping state that must be - /// threaded through every search. - /// - /// # Panics - /// - /// If the underlying DFAs return an error during iteration, then iteration - /// panics. This only occurs in non-default configurations where quit bytes - /// are used or Unicode word boundaries are heuristically enabled. - /// - /// The fallible version of this routine is - /// [`try_find_overlapping_iter`](Regex::try_find_overlapping_iter). - /// - /// # Example - /// - /// This example shows how to run an overlapping search with multiple - /// regexes. 
- /// - /// ``` - /// use regex_automata::{dfa::{self, regex::Regex}, MatchKind, MultiMatch}; - /// - /// let re = Regex::builder() - /// .dense(dfa::dense::Config::new().match_kind(MatchKind::All)) - /// .build_many(&[r"\w+$", r"\S+$"])?; - /// let haystack = "@foo".as_bytes(); - /// - /// let mut it = re.find_overlapping_iter(haystack); - /// assert_eq!(Some(MultiMatch::must(1, 0, 4)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 1, 4)), it.next()); - /// assert_eq!(None, it.next()); - /// - /// # Ok::<(), Box<dyn std::error::Error>>(()) - /// ``` - pub fn find_overlapping_iter<'r, 't>( - &'r self, - haystack: &'t [u8], - ) -> FindOverlappingMatches<'r, 't, A, P> { - FindOverlappingMatches::new(self, haystack) - } -} - -/// Lower level infallible search routines that permit controlling where -/// the search starts and ends in a particular sequence. This is useful for -/// executing searches that need to take surrounding context into account. This -/// is required for correctly implementing iteration because of look-around -/// operators (`^`, `$`, `\b`). -impl<A: Automaton, P: Prefilter> Regex<A, P> { - /// Returns true if and only if this regex matches the given haystack. - /// - /// This routine may short circuit if it knows that scanning future input - /// will never lead to a different result. In particular, if the underlying - /// DFA enters a match state or a dead state, then this routine will return - /// `true` or `false`, respectively, without inspecting any future input. - /// - /// # Searching a substring of the haystack - /// - /// Being an "at" search routine, this permits callers to search a - /// substring of `haystack` by specifying a range in `haystack`. - /// Why expose this as an API instead of just asking callers to use - /// `&input[start..end]`? The reason is that regex matching often wants - /// to take the surrounding context into account in order to handle - /// look-around (`^`, `$` and `\b`). 
- /// - /// # Panics - /// - /// If the underlying DFAs return an error, then this routine panics. This - /// only occurs in non-default configurations where quit bytes are used or - /// Unicode word boundaries are heuristically enabled. - /// - /// The fallible version of this routine is - /// [`try_is_match_at`](Regex::try_is_match_at). - pub fn is_match_at( - &self, - haystack: &[u8], - start: usize, - end: usize, - ) -> bool { - self.try_is_match_at(haystack, start, end).unwrap() - } - - /// Returns the first position at which a match is found. - /// - /// This routine stops scanning input in precisely the same circumstances - /// as `is_match`. The key difference is that this routine returns the - /// position at which it stopped scanning input if and only if a match - /// was found. If no match is found, then `None` is returned. - /// - /// # Searching a substring of the haystack - /// - /// Being an "at" search routine, this permits callers to search a - /// substring of `haystack` by specifying a range in `haystack`. - /// Why expose this as an API instead of just asking callers to use - /// `&input[start..end]`? The reason is that regex matching often wants - /// to take the surrounding context into account in order to handle - /// look-around (`^`, `$` and `\b`). - /// - /// This is useful when implementing an iterator over matches - /// within the same haystack, which cannot be done correctly by simply - /// providing a subslice of `haystack`. - /// - /// # Panics - /// - /// If the underlying DFAs return an error, then this routine panics. This - /// only occurs in non-default configurations where quit bytes are used or - /// Unicode word boundaries are heuristically enabled. - /// - /// The fallible version of this routine is - /// [`try_find_earliest_at`](Regex::try_find_earliest_at). 
- pub fn find_earliest_at( - &self, - haystack: &[u8], - start: usize, - end: usize, - ) -> Option<MultiMatch> { - self.try_find_earliest_at(haystack, start, end).unwrap() - } - - /// Returns the same as `find_leftmost`, but starts the search at the given - /// offset. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, if the DFA is anchored, then - /// a match can only occur when `start == 0`. - /// - /// # Searching a substring of the haystack - /// - /// Being an "at" search routine, this permits callers to search a - /// substring of `haystack` by specifying a range in `haystack`. - /// Why expose this as an API instead of just asking callers to use - /// `&input[start..end]`? The reason is that regex matching often wants - /// to take the surrounding context into account in order to handle - /// look-around (`^`, `$` and `\b`). - /// - /// This is useful when implementing an iterator over matches within the - /// same haystack, which cannot be done correctly by simply providing a - /// subslice of `haystack`. - /// - /// # Panics - /// - /// If the underlying DFAs return an error, then this routine panics. This - /// only occurs in non-default configurations where quit bytes are used or - /// Unicode word boundaries are heuristically enabled. - /// - /// The fallible version of this routine is - /// [`try_find_leftmost_at`](Regex::try_find_leftmost_at). - pub fn find_leftmost_at( - &self, - haystack: &[u8], - start: usize, - end: usize, - ) -> Option<MultiMatch> { - self.try_find_leftmost_at(haystack, start, end).unwrap() - } - - /// Search for the first overlapping match within a given range of - /// `haystack`. - /// - /// This routine is principally useful when searching for multiple patterns - /// on inputs where multiple patterns may match the same regions of text. 
- /// In particular, callers must preserve the automaton's search state from - /// prior calls so that the implementation knows where the last match - /// occurred and which pattern was reported. - /// - /// # Searching a substring of the haystack - /// - /// Being an "at" search routine, this permits callers to search a - /// substring of `haystack` by specifying a range in `haystack`. - /// Why expose this as an API instead of just asking callers to use - /// `&input[start..end]`? The reason is that regex matching often wants - /// to take the surrounding context into account in order to handle - /// look-around (`^`, `$` and `\b`). - /// - /// This is useful when implementing an iterator over matches - /// within the same haystack, which cannot be done correctly by simply - /// providing a subslice of `haystack`. - /// - /// # Panics - /// - /// If the underlying DFAs return an error, then this routine panics. This - /// only occurs in non-default configurations where quit bytes are used or - /// Unicode word boundaries are heuristically enabled. - /// - /// The fallible version of this routine is - /// [`try_find_overlapping_at`](Regex::try_find_overlapping_at). - pub fn find_overlapping_at( - &self, - haystack: &[u8], - start: usize, - end: usize, - state: &mut OverlappingState, - ) -> Option<MultiMatch> { - self.try_find_overlapping_at(haystack, start, end, state).unwrap() - } -} - -/// Fallible search routines. These may return an error when the underlying -/// DFAs have been configured in a way that permits them to fail during a -/// search. -/// -/// Errors during search only occur when the DFA has been explicitly -/// configured to do so, usually by specifying one or more "quit" bytes or by -/// heuristically enabling Unicode word boundaries. -/// -/// Errors will never be returned using the default configuration. So these -/// fallible routines are only needed for particular configurations. 
-impl<A: Automaton, P: Prefilter> Regex<A, P> { - /// Returns true if and only if this regex matches the given haystack. - /// - /// This routine may short circuit if it knows that scanning future input - /// will never lead to a different result. In particular, if the underlying - /// DFA enters a match state or a dead state, then this routine will return - /// `true` or `false`, respectively, without inspecting any future input. - /// - /// # Errors - /// - /// This routine only errors if the search could not complete. For - /// DFA-based regexes, this only occurs in a non-default configuration - /// where quit bytes are used or Unicode word boundaries are heuristically - /// enabled. - /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// The infallible (panics on error) version of this routine is - /// [`is_match`](Regex::is_match). - pub fn try_is_match(&self, haystack: &[u8]) -> Result<bool, MatchError> { - self.try_is_match_at(haystack, 0, haystack.len()) - } - - /// Returns the first position at which a match is found. - /// - /// This routine stops scanning input in precisely the same circumstances - /// as `is_match`. The key difference is that this routine returns the - /// position at which it stopped scanning input if and only if a match - /// was found. If no match is found, then `None` is returned. - /// - /// # Errors - /// - /// This routine only errors if the search could not complete. For - /// DFA-based regexes, this only occurs in a non-default configuration - /// where quit bytes are used or Unicode word boundaries are heuristically - /// enabled. - /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// The infallible (panics on error) version of this routine is - /// [`find_earliest`](Regex::find_earliest). 
- pub fn try_find_earliest( - &self, - haystack: &[u8], - ) -> Result<Option<MultiMatch>, MatchError> { - self.try_find_earliest_at(haystack, 0, haystack.len()) - } - - /// Returns the start and end offset of the leftmost match. If no match - /// exists, then `None` is returned. - /// - /// # Errors - /// - /// This routine only errors if the search could not complete. For - /// DFA-based regexes, this only occurs in a non-default configuration - /// where quit bytes are used or Unicode word boundaries are heuristically - /// enabled. - /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// The infallible (panics on error) version of this routine is - /// [`find_leftmost`](Regex::find_leftmost). - pub fn try_find_leftmost( - &self, - haystack: &[u8], - ) -> Result<Option<MultiMatch>, MatchError> { - self.try_find_leftmost_at(haystack, 0, haystack.len()) - } - - /// Search for the first overlapping match in `haystack`. - /// - /// This routine is principally useful when searching for multiple patterns - /// on inputs where multiple patterns may match the same regions of text. - /// In particular, callers must preserve the automaton's search state from - /// prior calls so that the implementation knows where the last match - /// occurred and which pattern was reported. - /// - /// # Errors - /// - /// This routine only errors if the search could not complete. For - /// DFA-based regexes, this only occurs in a non-default configuration - /// where quit bytes are used or Unicode word boundaries are heuristically - /// enabled. - /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// The infallible (panics on error) version of this routine is - /// [`find_overlapping`](Regex::find_overlapping). 
- pub fn try_find_overlapping( - &self, - haystack: &[u8], - state: &mut OverlappingState, - ) -> Result<Option<MultiMatch>, MatchError> { - self.try_find_overlapping_at(haystack, 0, haystack.len(), state) - } - - /// Returns an iterator over all non-overlapping "earliest" matches. - /// - /// Match positions are reported as soon as a match is known to occur, even - /// if the standard leftmost match would be longer. - /// - /// # Errors - /// - /// This iterator only yields errors if the search could not complete. For - /// DFA-based regexes, this only occurs in a non-default configuration - /// where quit bytes are used or Unicode word boundaries are heuristically - /// enabled. - /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// The infallible (panics on error) version of this routine is - /// [`find_earliest_iter`](Regex::find_earliest_iter). - pub fn try_find_earliest_iter<'r, 't>( - &'r self, - haystack: &'t [u8], - ) -> TryFindEarliestMatches<'r, 't, A, P> { - TryFindEarliestMatches::new(self, haystack) - } - - /// Returns an iterator over all non-overlapping leftmost matches in the - /// given bytes. If no match exists, then the iterator yields no elements. - /// - /// This corresponds to the "standard" regex search iterator. - /// - /// # Errors - /// - /// This iterator only yields errors if the search could not complete. For - /// DFA-based regexes, this only occurs in a non-default configuration - /// where quit bytes are used or Unicode word boundaries are heuristically - /// enabled. - /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// The infallible (panics on error) version of this routine is - /// [`find_leftmost_iter`](Regex::find_leftmost_iter). 
- pub fn try_find_leftmost_iter<'r, 't>( - &'r self, - haystack: &'t [u8], - ) -> TryFindLeftmostMatches<'r, 't, A, P> { - TryFindLeftmostMatches::new(self, haystack) - } - - /// Returns an iterator over all overlapping matches in the given haystack. - /// - /// This routine is principally useful when searching for multiple patterns - /// on inputs where multiple patterns may match the same regions of text. - /// The iterator takes care of handling the overlapping state that must be - /// threaded through every search. - /// - /// # Errors - /// - /// This iterator only yields errors if the search could not complete. For - /// DFA-based regexes, this only occurs in a non-default configuration - /// where quit bytes are used or Unicode word boundaries are heuristically - /// enabled. - /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// The infallible (panics on error) version of this routine is - /// [`find_overlapping_iter`](Regex::find_overlapping_iter). - pub fn try_find_overlapping_iter<'r, 't>( + #[inline] + pub fn find_iter<'r, 'h, I: Into<Input<'h>>>( &'r self, - haystack: &'t [u8], - ) -> TryFindOverlappingMatches<'r, 't, A, P> { - TryFindOverlappingMatches::new(self, haystack) + input: I, + ) -> FindMatches<'r, 'h, A> { + let it = iter::Searcher::new(input.into()); + FindMatches { re: self, it } } } /// Lower level fallible search routines that permit controlling where the /// search starts and ends in a particular sequence. -impl<A: Automaton, P: Prefilter> Regex<A, P> { - /// Returns true if and only if this regex matches the given haystack. - /// - /// This routine may short circuit if it knows that scanning future input - /// will never lead to a different result. In particular, if the underlying - /// DFA enters a match state or a dead state, then this routine will return - /// `true` or `false`, respectively, without inspecting any future input. 
- /// - /// # Searching a substring of the haystack - /// - /// Being an "at" search routine, this permits callers to search a - /// substring of `haystack` by specifying a range in `haystack`. - /// Why expose this as an API instead of just asking callers to use - /// `&input[start..end]`? The reason is that regex matching often wants - /// to take the surrounding context into account in order to handle - /// look-around (`^`, `$` and `\b`). - /// - /// # Errors - /// - /// This routine only errors if the search could not complete. For - /// DFA-based regexes, this only occurs in a non-default configuration - /// where quit bytes are used, Unicode word boundaries are heuristically - /// enabled or limits are set on the number of times the lazy DFA's cache - /// may be cleared. - /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// The infallible (panics on error) version of this routine is - /// [`is_match_at`](Regex::is_match_at). - pub fn try_is_match_at( - &self, - haystack: &[u8], - start: usize, - end: usize, - ) -> Result<bool, MatchError> { - self.forward() - .find_earliest_fwd_at( - self.scanner().as_mut(), - None, - haystack, - start, - end, - ) - .map(|x| x.is_some()) - } - - /// Returns the first position at which a match is found. - /// - /// This routine stops scanning input in precisely the same circumstances - /// as `is_match`. The key difference is that this routine returns the - /// position at which it stopped scanning input if and only if a match - /// was found. If no match is found, then `None` is returned. - /// - /// # Searching a substring of the haystack - /// - /// Being an "at" search routine, this permits callers to search a - /// substring of `haystack` by specifying a range in `haystack`. - /// Why expose this as an API instead of just asking callers to use - /// `&input[start..end]`? 
The reason is that regex matching often wants - /// to take the surrounding context into account in order to handle - /// look-around (`^`, `$` and `\b`). - /// - /// This is useful when implementing an iterator over matches - /// within the same haystack, which cannot be done correctly by simply - /// providing a subslice of `haystack`. - /// - /// # Errors - /// - /// This routine only errors if the search could not complete. For - /// DFA-based regexes, this only occurs in a non-default configuration - /// where quit bytes are used or Unicode word boundaries are heuristically - /// enabled. - /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// The infallible (panics on error) version of this routine is - /// [`find_earliest_at`](Regex::find_earliest_at). - pub fn try_find_earliest_at( - &self, - haystack: &[u8], - start: usize, - end: usize, - ) -> Result<Option<MultiMatch>, MatchError> { - self.try_find_earliest_at_imp( - self.scanner().as_mut(), - haystack, - start, - end, - ) - } - - /// The implementation of "earliest" searching, where a prefilter scanner - /// may be given. - fn try_find_earliest_at_imp( - &self, - pre: Option<&mut prefilter::Scanner>, - haystack: &[u8], - start: usize, - end: usize, - ) -> Result<Option<MultiMatch>, MatchError> { - // N.B. We use `&&A` here to call `Automaton` methods, which ensures - // that we always use the `impl Automaton for &A` for calling methods. - // Since this is the usual way that automata are used, this helps - // reduce the number of monomorphized copies of the search code. - let (fwd, rev) = (self.forward(), self.reverse()); - let end = match (&fwd) - .find_earliest_fwd_at(pre, None, haystack, start, end)? - { - None => return Ok(None), - Some(end) => end, - }; - // N.B. The only time we need to tell the reverse searcher the pattern - // to match is in the overlapping case, since it's ambiguous. 
In the - // leftmost case, I have tentatively convinced myself that it isn't - // necessary and the reverse search will always find the same pattern - // to match as the forward search. But I lack a rigorous proof. - let start = (&rev) - .find_earliest_rev_at(None, haystack, start, end.offset())? - .expect("reverse search must match if forward search does"); - assert_eq!( - start.pattern(), - end.pattern(), - "forward and reverse search must match same pattern" - ); - assert!(start.offset() <= end.offset()); - Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) - } - +impl<A: Automaton> Regex<A> { /// Returns the start and end offset of the leftmost match. If no match /// exists, then `None` is returned. /// - /// # Searching a substring of the haystack + /// This is like [`Regex::find`] but with two differences: /// - /// Being an "at" search routine, this permits callers to search a - /// substring of `haystack` by specifying a range in `haystack`. - /// Why expose this as an API instead of just asking callers to use - /// `&input[start..end]`? The reason is that regex matching often wants - /// to take the surrounding context into account in order to handle - /// look-around (`^`, `$` and `\b`). - /// - /// This is useful when implementing an iterator over matches - /// within the same haystack, which cannot be done correctly by simply - /// providing a subslice of `haystack`. + /// 1. It is not generic over `Into<Input>` and instead accepts a + /// `&Input`. This permits reusing the same `Input` for multiple searches + /// without needing to create a new one. This _may_ help with latency. + /// 2. It returns an error if the search could not complete where as + /// [`Regex::find`] will panic. /// /// # Errors /// - /// This routine only errors if the search could not complete. For - /// DFA-based regexes, this only occurs in a non-default configuration - /// where quit bytes are used or Unicode word boundaries are heuristically - /// enabled. 
+ /// This routine errors if the search could not complete. This can occur + /// in the following circumstances: /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. + /// * The configuration of the DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the DFA quitting. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. /// - /// The infallible (panics on error) version of this routine is - /// [`find_leftmost_at`](Regex::find_leftmost_at). - pub fn try_find_leftmost_at( - &self, - haystack: &[u8], - start: usize, - end: usize, - ) -> Result<Option<MultiMatch>, MatchError> { - self.try_find_leftmost_at_imp( - self.scanner().as_mut(), - haystack, - start, - end, - ) - } - - /// The implementation of leftmost searching, where a prefilter scanner - /// may be given. - fn try_find_leftmost_at_imp( + /// When a search returns an error, callers cannot know whether a match + /// exists or not. + #[inline] + pub fn try_search( &self, - scanner: Option<&mut prefilter::Scanner>, - haystack: &[u8], - start: usize, - end: usize, - ) -> Result<Option<MultiMatch>, MatchError> { - // N.B. We use `&&A` here to call `Automaton` methods, which ensures - // that we always use the `impl Automaton for &A` for calling methods. - // Since this is the usual way that automata are used, this helps - // reduce the number of monomorphized copies of the search code. + input: &Input<'_>, + ) -> Result<Option<Match>, MatchError> { let (fwd, rev) = (self.forward(), self.reverse()); - let end = match (&fwd) - .find_leftmost_fwd_at(scanner, None, haystack, start, end)? - { + let end = match fwd.try_search_fwd(input)? { None => return Ok(None), Some(end) => end, }; - // N.B. 
The only time we need to tell the reverse searcher the pattern - // to match is in the overlapping case, since it's ambiguous. In the - // leftmost case, I have tentatively convinced myself that it isn't - // necessary and the reverse search will always find the same pattern - // to match as the forward search. But I lack a rigorous proof. Why not - // just provide the pattern anyway? Well, if it is needed, then leaving - // it out gives us a chance to find a witness. - let start = (&rev) - .find_leftmost_rev_at(None, haystack, start, end.offset())? + // This special cases an empty match at the beginning of the search. If + // our end matches our start, then since a reverse DFA can't match past + // the start, it must follow that our starting position is also our end + // position. So short circuit and skip the reverse search. + if input.start() == end.offset() { + return Ok(Some(Match::new( + end.pattern(), + end.offset()..end.offset(), + ))); + } + // We can also skip the reverse search if we know our search was + // anchored. This occurs either when the input config is anchored or + // when we know the regex itself is anchored. In this case, we know the + // start of the match, if one is found, must be the start of the + // search. + if self.is_anchored(input) { + return Ok(Some(Match::new( + end.pattern(), + input.start()..end.offset(), + ))); + } + // N.B. I have tentatively convinced myself that it isn't necessary + // to specify the specific pattern for the reverse search since the + // reverse search will always find the same pattern to match as the + // forward search. But I lack a rigorous proof. Why not just provide + // the pattern anyway? Well, if it is needed, then leaving it out + // gives us a chance to find a witness. (Also, if we don't need to + // specify the pattern, then we don't need to build the reverse DFA + // with 'starts_for_each_pattern' enabled.) 
+ // + // We also need to be careful to disable 'earliest' for the reverse + // search, since it could be enabled for the forward search. In the + // reverse case, to satisfy "leftmost" criteria, we need to match + // as much as we can. We also need to be careful to make the search + // anchored. We don't want the reverse search to report any matches + // other than the one beginning at the end of our forward search. + let revsearch = input + .clone() + .span(input.start()..end.offset()) + .anchored(Anchored::Yes) + .earliest(false); + let start = rev + .try_search_rev(&revsearch)? .expect("reverse search must match if forward search does"); assert_eq!( start.pattern(), @@ -1250,132 +531,22 @@ impl<A: Automaton, P: Prefilter> Regex<A, P> { "forward and reverse search must match same pattern", ); assert!(start.offset() <= end.offset()); - Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) - } - - /// Search for the first overlapping match within a given range of - /// `haystack`. - /// - /// This routine is principally useful when searching for multiple patterns - /// on inputs where multiple patterns may match the same regions of text. - /// In particular, callers must preserve the automaton's search state from - /// prior calls so that the implementation knows where the last match - /// occurred and which pattern was reported. - /// - /// # Searching a substring of the haystack - /// - /// Being an "at" search routine, this permits callers to search a - /// substring of `haystack` by specifying a range in `haystack`. - /// Why expose this as an API instead of just asking callers to use - /// `&input[start..end]`? The reason is that regex matching often wants - /// to take the surrounding context into account in order to handle - /// look-around (`^`, `$` and `\b`). 
- /// - /// This is useful when implementing an iterator over matches - /// within the same haystack, which cannot be done correctly by simply - /// providing a subslice of `haystack`. - /// - /// # Errors - /// - /// This routine only errors if the search could not complete. For - /// DFA-based regexes, this only occurs in a non-default configuration - /// where quit bytes are used or Unicode word boundaries are heuristically - /// enabled. - /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// The infallible (panics on error) version of this routine is - /// [`find_overlapping_at`](Regex::find_overlapping_at). - pub fn try_find_overlapping_at( - &self, - haystack: &[u8], - start: usize, - end: usize, - state: &mut OverlappingState, - ) -> Result<Option<MultiMatch>, MatchError> { - self.try_find_overlapping_at_imp( - self.scanner().as_mut(), - haystack, - start, - end, - state, - ) + Ok(Some(Match::new(end.pattern(), start.offset()..end.offset()))) } - /// The implementation of overlapping search at a given range in - /// `haystack`, where `scanner` is a prefilter (if active) and `state` is - /// the current state of the search. - fn try_find_overlapping_at_imp( - &self, - scanner: Option<&mut prefilter::Scanner>, - haystack: &[u8], - start: usize, - end: usize, - state: &mut OverlappingState, - ) -> Result<Option<MultiMatch>, MatchError> { - // N.B. We use `&&A` here to call `Automaton` methods, which ensures - // that we always use the `impl Automaton for &A` for calling methods. - // Since this is the usual way that automata are used, this helps - // reduce the number of monomorphized copies of the search code. - let (fwd, rev) = (self.forward(), self.reverse()); - // TODO: Decide whether it's worth making this assert work. It doesn't - // work currently because 'has_starts_for_each_pattern' isn't on the - // Automaton trait. Without this assert, we still get a panic, but it's - // a bit more inscrutable. 
- // assert!( - // rev.has_starts_for_each_pattern(), - // "overlapping searches require that the reverse DFA is \ - // compiled with the 'starts_for_each_pattern' option", - // ); - let end = match (&fwd).find_overlapping_fwd_at( - scanner, None, haystack, start, end, state, - )? { - None => return Ok(None), - Some(end) => end, - }; - // Unlike the leftmost cases, the reverse overlapping search may match - // a different pattern than the forward search. See test failures when - // using `None` instead of `Some(end.pattern())` below. Thus, we must - // run our reverse search using the pattern that matched in the forward - // direction. - let start = (&rev) - .find_leftmost_rev_at( - Some(end.pattern()), - haystack, - 0, - end.offset(), - )? - .expect("reverse search must match if forward search does"); - assert!(start.offset() <= end.offset()); - assert_eq!(start.pattern(), end.pattern()); - Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) + /// Returns true if either the given input specifies an anchored search + /// or if the underlying DFA is always anchored. + fn is_anchored(&self, input: &Input<'_>) -> bool { + match input.get_anchored() { + Anchored::No => self.forward().is_always_start_anchored(), + Anchored::Yes | Anchored::Pattern(_) => true, + } } } /// Non-search APIs for querying information about the regex and setting a /// prefilter. -impl<A: Automaton, P: Prefilter> Regex<A, P> { - /// Attach the given prefilter to this regex. - pub fn with_prefilter<Q: Prefilter>(self, prefilter: Q) -> Regex<A, Q> { - Regex { - prefilter: Some(prefilter), - forward: self.forward, - reverse: self.reverse, - utf8: self.utf8, - } - } - - /// Remove any prefilter from this regex. - pub fn without_prefilter(self) -> Regex<A> { - Regex { - prefilter: None, - forward: self.forward, - reverse: self.reverse, - utf8: self.utf8, - } - } - +impl<A: Automaton> Regex<A> { /// Return the underlying DFA responsible for forward matching. 
/// /// This is useful for accessing the underlying DFA and converting it to @@ -1399,471 +570,48 @@ impl<A: Automaton, P: Prefilter> Regex<A, P> { /// # Example /// /// ``` - /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::dfa::regex::Regex; /// /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?; - /// assert_eq!(3, re.pattern_count()); + /// assert_eq!(3, re.pattern_len()); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` - pub fn pattern_count(&self) -> usize { - assert_eq!( - self.forward().pattern_count(), - self.reverse().pattern_count() - ); - self.forward().pattern_count() - } - - /// Convenience function for returning this regex's prefilter as a trait - /// object. - /// - /// If this regex doesn't have a prefilter, then `None` is returned. - pub fn prefilter(&self) -> Option<&dyn Prefilter> { - match self.prefilter { - None => None, - Some(ref x) => Some(&*x), - } - } - - /// Convenience function for returning a prefilter scanner. - fn scanner(&self) -> Option<prefilter::Scanner> { - self.prefilter().map(prefilter::Scanner::new) + pub fn pattern_len(&self) -> usize { + assert_eq!(self.forward().pattern_len(), self.reverse().pattern_len()); + self.forward().pattern_len() } } -/// An iterator over all non-overlapping earliest matches for a particular -/// infallible search. +/// An iterator over all non-overlapping matches for an infallible search. /// -/// The iterator yields a [`MultiMatch`] value until no more matches could be -/// found. If the underlying search returns an error, then this panics. +/// The iterator yields a [`Match`] value until no more matches could be found. +/// If the underlying regex engine returns an error, then a panic occurs. /// -/// `A` is the type used to represent the underlying DFAs used by the regex, -/// while `P` is the type of prefilter used, if any. 
The lifetime variables are -/// as follows: +/// The type parameters are as follows: /// -/// * `'r` is the lifetime of the regular expression itself. -/// * `'t` is the lifetime of the text being searched. -#[derive(Clone, Debug)] -pub struct FindEarliestMatches<'r, 't, A, P>( - TryFindEarliestMatches<'r, 't, A, P>, -); - -impl<'r, 't, A: Automaton, P: Prefilter> FindEarliestMatches<'r, 't, A, P> { - fn new( - re: &'r Regex<A, P>, - text: &'t [u8], - ) -> FindEarliestMatches<'r, 't, A, P> { - FindEarliestMatches(TryFindEarliestMatches::new(re, text)) - } -} - -impl<'r, 't, A: Automaton, P: Prefilter> Iterator - for FindEarliestMatches<'r, 't, A, P> -{ - type Item = MultiMatch; - - fn next(&mut self) -> Option<MultiMatch> { - next_unwrap(self.0.next()) - } -} - -/// An iterator over all non-overlapping leftmost matches for a particular -/// infallible search. +/// * `A` represents the type of the underlying DFA that implements the +/// [`Automaton`] trait. /// -/// The iterator yields a [`MultiMatch`] value until no more matches could be -/// found. If the underlying search returns an error, then this panics. +/// The lifetime parameters are as follows: /// -/// `A` is the type used to represent the underlying DFAs used by the regex, -/// while `P` is the type of prefilter used, if any. The lifetime variables are -/// as follows: +/// * `'h` represents the lifetime of the haystack being searched. +/// * `'r` represents the lifetime of the regex object itself. /// -/// * `'r` is the lifetime of the regular expression itself. -/// * `'t` is the lifetime of the text being searched. 
-#[derive(Clone, Debug)] -pub struct FindLeftmostMatches<'r, 't, A, P>( - TryFindLeftmostMatches<'r, 't, A, P>, -); - -impl<'r, 't, A: Automaton, P: Prefilter> FindLeftmostMatches<'r, 't, A, P> { - fn new( - re: &'r Regex<A, P>, - text: &'t [u8], - ) -> FindLeftmostMatches<'r, 't, A, P> { - FindLeftmostMatches(TryFindLeftmostMatches::new(re, text)) - } -} - -impl<'r, 't, A: Automaton, P: Prefilter> Iterator - for FindLeftmostMatches<'r, 't, A, P> -{ - type Item = MultiMatch; - - fn next(&mut self) -> Option<MultiMatch> { - next_unwrap(self.0.next()) - } -} - -/// An iterator over all overlapping matches for a particular infallible -/// search. -/// -/// The iterator yields a [`MultiMatch`] value until no more matches could be -/// found. If the underlying search returns an error, then this panics. -/// -/// `A` is the type used to represent the underlying DFAs used by the regex, -/// while `P` is the type of prefilter used, if any. The lifetime variables are -/// as follows: -/// -/// * `'r` is the lifetime of the regular expression itself. -/// * `'t` is the lifetime of the text being searched. -#[derive(Clone, Debug)] -pub struct FindOverlappingMatches<'r, 't, A: Automaton, P>( - TryFindOverlappingMatches<'r, 't, A, P>, -); - -impl<'r, 't, A: Automaton, P: Prefilter> FindOverlappingMatches<'r, 't, A, P> { - fn new( - re: &'r Regex<A, P>, - text: &'t [u8], - ) -> FindOverlappingMatches<'r, 't, A, P> { - FindOverlappingMatches(TryFindOverlappingMatches::new(re, text)) - } -} - -impl<'r, 't, A: Automaton, P: Prefilter> Iterator - for FindOverlappingMatches<'r, 't, A, P> -{ - type Item = MultiMatch; - - fn next(&mut self) -> Option<MultiMatch> { - next_unwrap(self.0.next()) - } -} - -/// An iterator over all non-overlapping earliest matches for a particular -/// fallible search. -/// -/// The iterator yields a [`MultiMatch`] value until no more matches could be -/// found. 
-/// -/// `A` is the type used to represent the underlying DFAs used by the regex, -/// while `P` is the type of prefilter used, if any. The lifetime variables are -/// as follows: -/// -/// * `'r` is the lifetime of the regular expression itself. -/// * `'t` is the lifetime of the text being searched. -#[derive(Clone, Debug)] -pub struct TryFindEarliestMatches<'r, 't, A, P> { - re: &'r Regex<A, P>, - scanner: Option<prefilter::Scanner<'r>>, - text: &'t [u8], - last_end: usize, - last_match: Option<usize>, -} - -impl<'r, 't, A: Automaton, P: Prefilter> TryFindEarliestMatches<'r, 't, A, P> { - fn new( - re: &'r Regex<A, P>, - text: &'t [u8], - ) -> TryFindEarliestMatches<'r, 't, A, P> { - let scanner = re.scanner(); - TryFindEarliestMatches { - re, - scanner, - text, - last_end: 0, - last_match: None, - } - } -} - -impl<'r, 't, A: Automaton, P: Prefilter> Iterator - for TryFindEarliestMatches<'r, 't, A, P> -{ - type Item = Result<MultiMatch, MatchError>; - - fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> { - if self.last_end > self.text.len() { - return None; - } - let result = self.re.try_find_earliest_at_imp( - self.scanner.as_mut(), - self.text, - self.last_end, - self.text.len(), - ); - let m = match result { - Err(err) => return Some(Err(err)), - Ok(None) => return None, - Ok(Some(m)) => m, - }; - if m.is_empty() { - // This is an empty match. To ensure we make progress, start - // the next search at the smallest possible starting position - // of the next match following this one. - self.last_end = if self.re.utf8 { - crate::util::next_utf8(self.text, m.end()) - } else { - m.end() + 1 - }; - // Don't accept empty matches immediately following a match. - // Just move on to the next match. - if Some(m.end()) == self.last_match { - return self.next(); - } - } else { - self.last_end = m.end(); - } - self.last_match = Some(m.end()); - Some(Ok(m)) - } -} - -/// An iterator over all non-overlapping leftmost matches for a particular -/// fallible search. 
-/// -/// The iterator yields a [`MultiMatch`] value until no more matches could be -/// found. -/// -/// `A` is the type used to represent the underlying DFAs used by the regex, -/// while `P` is the type of prefilter used, if any. The lifetime variables are -/// as follows: -/// -/// * `'r` is the lifetime of the regular expression itself. -/// * `'t` is the lifetime of the text being searched. -#[derive(Clone, Debug)] -pub struct TryFindLeftmostMatches<'r, 't, A, P> { - re: &'r Regex<A, P>, - scanner: Option<prefilter::Scanner<'r>>, - text: &'t [u8], - last_end: usize, - last_match: Option<usize>, -} - -impl<'r, 't, A: Automaton, P: Prefilter> TryFindLeftmostMatches<'r, 't, A, P> { - fn new( - re: &'r Regex<A, P>, - text: &'t [u8], - ) -> TryFindLeftmostMatches<'r, 't, A, P> { - let scanner = re.scanner(); - TryFindLeftmostMatches { - re, - scanner, - text, - last_end: 0, - last_match: None, - } - } -} - -impl<'r, 't, A: Automaton, P: Prefilter> Iterator - for TryFindLeftmostMatches<'r, 't, A, P> -{ - type Item = Result<MultiMatch, MatchError>; - - fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> { - if self.last_end > self.text.len() { - return None; - } - let result = self.re.try_find_leftmost_at_imp( - self.scanner.as_mut(), - self.text, - self.last_end, - self.text.len(), - ); - let m = match result { - Err(err) => return Some(Err(err)), - Ok(None) => return None, - Ok(Some(m)) => m, - }; - if m.is_empty() { - // This is an empty match. To ensure we make progress, start - // the next search at the smallest possible starting position - // of the next match following this one. - self.last_end = if self.re.utf8 { - crate::util::next_utf8(self.text, m.end()) - } else { - m.end() + 1 - }; - // Don't accept empty matches immediately following a match. - // Just move on to the next match. 
- if Some(m.end()) == self.last_match { - return self.next(); - } - } else { - self.last_end = m.end(); - } - self.last_match = Some(m.end()); - Some(Ok(m)) - } -} - -/// An iterator over all overlapping matches for a particular fallible search. -/// -/// The iterator yields a [`MultiMatch`] value until no more matches could be -/// found. -/// -/// `A` is the type used to represent the underlying DFAs used by the regex, -/// while `P` is the type of prefilter used, if any. The lifetime variables are -/// as follows: -/// -/// * `'r` is the lifetime of the regular expression itself. -/// * `'t` is the lifetime of the text being searched. -#[derive(Clone, Debug)] -pub struct TryFindOverlappingMatches<'r, 't, A: Automaton, P> { - re: &'r Regex<A, P>, - scanner: Option<prefilter::Scanner<'r>>, - text: &'t [u8], - last_end: usize, - state: OverlappingState, -} - -impl<'r, 't, A: Automaton, P: Prefilter> - TryFindOverlappingMatches<'r, 't, A, P> -{ - fn new( - re: &'r Regex<A, P>, - text: &'t [u8], - ) -> TryFindOverlappingMatches<'r, 't, A, P> { - let scanner = re.scanner(); - TryFindOverlappingMatches { - re, - scanner, - text, - last_end: 0, - state: OverlappingState::start(), - } - } +/// This iterator can be created with the [`Regex::find_iter`] method. 
+#[derive(Debug)] +pub struct FindMatches<'r, 'h, A> { + re: &'r Regex<A>, + it: iter::Searcher<'h>, } -impl<'r, 't, A: Automaton, P: Prefilter> Iterator - for TryFindOverlappingMatches<'r, 't, A, P> -{ - type Item = Result<MultiMatch, MatchError>; +impl<'r, 'h, A: Automaton> Iterator for FindMatches<'r, 'h, A> { + type Item = Match; - fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> { - if self.last_end > self.text.len() { - return None; - } - let result = self.re.try_find_overlapping_at_imp( - self.scanner.as_mut(), - self.text, - self.last_end, - self.text.len(), - &mut self.state, - ); - let m = match result { - Err(err) => return Some(Err(err)), - Ok(None) => return None, - Ok(Some(m)) => m, - }; - // Unlike the non-overlapping case, we're OK with empty matches at this - // level. In particular, the overlapping search algorithm is itself - // responsible for ensuring that progress is always made. - self.last_end = m.end(); - Some(Ok(m)) - } -} - -/// The configuration used for compiling a DFA-backed regex. -/// -/// A regex configuration is a simple data object that is typically used with -/// [`Builder::configure`]. -#[cfg(feature = "alloc")] -#[derive(Clone, Copy, Debug, Default)] -pub struct Config { - utf8: Option<bool>, -} - -#[cfg(feature = "alloc")] -impl Config { - /// Return a new default regex compiler configuration. - pub fn new() -> Config { - Config::default() - } - - /// Whether to enable UTF-8 mode or not. - /// - /// When UTF-8 mode is enabled (the default) and an empty match is seen, - /// the iterators on [`Regex`] will always start the next search at the - /// next UTF-8 encoded codepoint when searching valid UTF-8. When UTF-8 - /// mode is disabled, such searches are begun at the next byte offset. - /// - /// If this mode is enabled and invalid UTF-8 is given to search, then - /// behavior is unspecified. 
- /// - /// Generally speaking, one should enable this when - /// [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) - /// and - /// [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) - /// are enabled, and disable it otherwise. - /// - /// # Example - /// - /// This example demonstrates the differences between when this option is - /// enabled and disabled. The differences only arise when the regex can - /// return matches of length zero. - /// - /// In this first snippet, we show the results when UTF-8 mode is disabled. - /// - /// ``` - /// use regex_automata::{dfa::regex::Regex, MultiMatch}; - /// - /// let re = Regex::builder() - /// .configure(Regex::config().utf8(false)) - /// .build(r"")?; - /// let haystack = "a☃z".as_bytes(); - /// let mut it = re.find_leftmost_iter(haystack); - /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); - /// assert_eq!(None, it.next()); - /// - /// # Ok::<(), Box<dyn std::error::Error>>(()) - /// ``` - /// - /// And in this snippet, we execute the same search on the same haystack, - /// but with UTF-8 mode enabled. Notice that byte offsets that would - /// otherwise split the encoding of `☃` are not returned. 
- /// - /// ``` - /// use regex_automata::{dfa::regex::Regex, MultiMatch}; - /// - /// let re = Regex::builder() - /// .configure(Regex::config().utf8(true)) - /// .build(r"")?; - /// let haystack = "a☃z".as_bytes(); - /// let mut it = re.find_leftmost_iter(haystack); - /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); - /// assert_eq!(None, it.next()); - /// - /// # Ok::<(), Box<dyn std::error::Error>>(()) - /// ``` - pub fn utf8(mut self, yes: bool) -> Config { - self.utf8 = Some(yes); - self - } - - /// Returns true if and only if this configuration has UTF-8 mode enabled. - /// - /// When UTF-8 mode is enabled and an empty match is seen, the iterators on - /// [`Regex`] will always start the next search at the next UTF-8 encoded - /// codepoint. When UTF-8 mode is disabled, such searches are begun at the - /// next byte offset. - pub fn get_utf8(&self) -> bool { - self.utf8.unwrap_or(true) - } - - /// Overwrite the default configuration such that the options in `o` are - /// always used. If an option in `o` is not set, then the corresponding - /// option in `self` is used. If it's not set in `self` either, then it - /// remains not set. - pub(crate) fn overwrite(self, o: Config) -> Config { - Config { utf8: o.utf8.or(self.utf8) } + #[inline] + fn next(&mut self) -> Option<Match> { + let FindMatches { re, ref mut it } = *self; + it.advance(|input| re.try_search(input)) } } @@ -1874,17 +622,15 @@ impl Config { /// itself. This builder is different from a general purpose regex builder in /// that it permits fine grain configuration of the construction process. The /// trade off for this is complexity, and the possibility of setting a -/// configuration that might not make sense. For example, there are three +/// configuration that might not make sense. 
For example, there are two /// different UTF-8 modes: /// -/// * [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) controls whether the -/// pattern itself can contain sub-expressions that match invalid UTF-8. -/// * [`nfa::thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) -/// controls whether the implicit unanchored prefix added to the NFA can -/// match through invalid UTF-8 or not. -/// * [`Config::utf8`] controls how the regex iterators themselves advance -/// the starting position of the next search when a match with zero length is -/// found. +/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls +/// whether the pattern itself can contain sub-expressions that match invalid +/// UTF-8. +/// * [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) controls +/// how the regex iterators themselves advance the starting position of the +/// next search when a match with zero length is found. /// /// Generally speaking, callers will want to either enable all of these or /// disable all of these. @@ -1919,57 +665,51 @@ impl Config { /// /// # Example /// -/// This example shows how to disable UTF-8 mode in the syntax, the NFA and -/// the regex itself. This is generally what you want for matching on -/// arbitrary bytes. +/// This example shows how to disable UTF-8 mode in the syntax and the regex +/// itself. This is generally what you want for matching on arbitrary bytes. 
/// /// ``` +/// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{ -/// dfa::regex::Regex, nfa::thompson, MultiMatch, SyntaxConfig +/// dfa::regex::Regex, nfa::thompson, util::syntax, Match, /// }; /// /// let re = Regex::builder() -/// .configure(Regex::config().utf8(false)) -/// .syntax(SyntaxConfig::new().utf8(false)) +/// .syntax(syntax::Config::new().utf8(false)) /// .thompson(thompson::Config::new().utf8(false)) /// .build(r"foo(?-u:[^b])ar.*")?; /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; -/// let expected = Some(MultiMatch::must(0, 1, 9)); -/// let got = re.find_leftmost(haystack); +/// let expected = Some(Match::must(0, 1..9)); +/// let got = re.find(haystack); /// assert_eq!(expected, got); /// // Notice that `(?-u:[^b])` matches invalid UTF-8, /// // but the subsequent `.*` does not! Disabling UTF-8 -/// // on the syntax permits this. Notice also that the -/// // search was unanchored and skipped over invalid UTF-8. -/// // Disabling UTF-8 on the Thompson NFA permits this. -/// // -/// // N.B. This example does not show the impact of -/// // disabling UTF-8 mode on Config, since that -/// // only impacts regexes that can produce matches of -/// // length 0. +/// // on the syntax permits this. /// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` -#[cfg(feature = "alloc")] #[derive(Clone, Debug)] pub struct Builder { - config: Config, + #[cfg(feature = "dfa-build")] dfa: dense::Builder, } -#[cfg(feature = "alloc")] impl Builder { /// Create a new regex builder with the default configuration. pub fn new() -> Builder { - Builder { config: Config::default(), dfa: dense::Builder::new() } + Builder { + #[cfg(feature = "dfa-build")] + dfa: dense::Builder::new(), + } } /// Build a regex from the given pattern. /// /// If there was a problem parsing or compiling the pattern, then an error /// is returned. 
- pub fn build(&self, pattern: &str) -> Result<Regex, Error> { + #[cfg(all(feature = "syntax", feature = "dfa-build"))] + pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> { self.build_many(&[pattern]) } @@ -1977,38 +717,42 @@ impl Builder { /// /// If there was a problem parsing or compiling the pattern, then an error /// is returned. + #[cfg(all(feature = "syntax", feature = "dfa-build"))] pub fn build_sparse( &self, pattern: &str, - ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> { + ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> { self.build_many_sparse(&[pattern]) } /// Build a regex from the given patterns. + #[cfg(all(feature = "syntax", feature = "dfa-build"))] pub fn build_many<P: AsRef<str>>( &self, patterns: &[P], - ) -> Result<Regex, Error> { + ) -> Result<Regex, BuildError> { let forward = self.dfa.build_many(patterns)?; let reverse = self .dfa .clone() .configure( dense::Config::new() - .anchored(true) - .match_kind(MatchKind::All) - .starts_for_each_pattern(true), + .prefilter(None) + .specialize_start_states(false) + .start_kind(StartKind::Anchored) + .match_kind(MatchKind::All), ) - .thompson(thompson::Config::new().reverse(true)) + .thompson(crate::nfa::thompson::Config::new().reverse(true)) .build_many(patterns)?; Ok(self.build_from_dfas(forward, reverse)) } /// Build a sparse regex from the given patterns. + #[cfg(all(feature = "syntax", feature = "dfa-build"))] pub fn build_many_sparse<P: AsRef<str>>( &self, patterns: &[P], - ) -> Result<Regex<sparse::DFA<Vec<u8>>>, Error> { + ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> { let re = self.build_many(patterns)?; let forward = re.forward().to_sparse()?; let reverse = re.reverse().to_sparse()?; @@ -2028,16 +772,14 @@ impl Builder { /// * It should be anchored. /// * It should use [`MatchKind::All`] semantics. /// * It should match in reverse. - /// * It should have anchored start states compiled for each pattern. 
/// * Otherwise, its configuration should match the forward DFA. /// - /// If these conditions are satisfied, then behavior of searches is + /// If these conditions aren't satisfied, then the behavior of searches is /// unspecified. /// - /// Note that when using this constructor, only the configuration from - /// [`Config`] is applied. The only configuration settings on this builder - /// only apply when the builder owns the construction of the DFAs - /// themselves. + /// Note that when using this constructor, no configuration is applied. + /// Since this routine provides the DFAs to the builder, there is no + /// opportunity to apply other configuration options. /// /// # Example /// @@ -2079,35 +821,33 @@ impl Builder { forward: A, reverse: A, ) -> Regex<A> { - let utf8 = self.config.get_utf8(); - Regex { prefilter: None, forward, reverse, utf8 } - } - - /// Apply the given regex configuration options to this builder. - pub fn configure(&mut self, config: Config) -> &mut Builder { - self.config = self.config.overwrite(config); - self + Regex { forward, reverse } } /// Set the syntax configuration for this builder using - /// [`SyntaxConfig`](crate::SyntaxConfig). + /// [`syntax::Config`](crate::util::syntax::Config). /// /// This permits setting things like case insensitivity, Unicode and multi /// line mode. + #[cfg(all(feature = "syntax", feature = "dfa-build"))] pub fn syntax( &mut self, - config: crate::util::syntax::SyntaxConfig, + config: crate::util::syntax::Config, ) -> &mut Builder { self.dfa.syntax(config); self } /// Set the Thompson NFA configuration for this builder using - /// [`nfa::thompson::Config`](thompson::Config). + /// [`nfa::thompson::Config`](crate::nfa::thompson::Config). /// /// This permits setting things like whether additional time should be /// spent shrinking the size of the NFA. 
- pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { + #[cfg(all(feature = "syntax", feature = "dfa-build"))] + pub fn thompson( + &mut self, + config: crate::nfa::thompson::Config, + ) -> &mut Builder { self.dfa.thompson(config); self } @@ -2117,30 +857,15 @@ impl Builder { /// /// This permits setting things like whether the underlying DFAs should /// be minimized. + #[cfg(feature = "dfa-build")] pub fn dense(&mut self, config: dense::Config) -> &mut Builder { self.dfa.configure(config); self } } -#[cfg(feature = "alloc")] impl Default for Builder { fn default() -> Builder { Builder::new() } } - -#[inline(always)] -fn next_unwrap( - item: Option<Result<MultiMatch, MatchError>>, -) -> Option<MultiMatch> { - match item { - None => None, - Some(Ok(m)) => Some(m), - Some(Err(err)) => panic!( - "unexpected regex search error: {}\n\ - to handle search errors, use try_ methods", - err, - ), - } -} diff --git a/vendor/regex-automata/src/dfa/remapper.rs b/vendor/regex-automata/src/dfa/remapper.rs new file mode 100644 index 000000000..6e4964672 --- /dev/null +++ b/vendor/regex-automata/src/dfa/remapper.rs @@ -0,0 +1,242 @@ +use alloc::vec::Vec; + +use crate::util::primitives::StateID; + +/// Remappable is a tightly coupled abstraction that facilitates remapping +/// state identifiers in DFAs. +/// +/// The main idea behind remapping state IDs is that DFAs often need to check +/// if a certain state is a "special" state of some kind (like a match state) +/// during a search. Since this is extremely perf critical code, we want this +/// check to be as fast as possible. Partitioning state IDs into, for example, +/// into "non-match" and "match" states means one can tell if a state is a +/// match state via a simple comparison of the state ID. +/// +/// The issue is that during the DFA construction process, it's not +/// particularly easy to partition the states. 
Instead, the simplest thing is +/// to often just do a pass over all of the states and shuffle them into their +/// desired partitionings. To do that, we need a mechanism for swapping states. +/// Hence, this abstraction. +/// +/// Normally, for such little code, I would just duplicate it. But this is a +/// key optimization and the implementation is a bit subtle. So the abstraction +/// is basically a ham-fisted attempt at DRY. The only place we use this is in +/// the dense and one-pass DFAs. +/// +/// See also src/dfa/special.rs for a more detailed explanation of how dense +/// DFAs are partitioned. +pub(super) trait Remappable: core::fmt::Debug { + /// Return the total number of states. + fn state_len(&self) -> usize; + /// Return the power-of-2 exponent that yields the stride. The pertinent + /// laws here are, where N=stride2: 2^N=stride and len(alphabet) <= stride. + fn stride2(&self) -> usize; + /// Swap the states pointed to by the given IDs. The underlying finite + /// state machine should be mutated such that all of the transitions in + /// `id1` are now in the memory region where the transitions for `id2` + /// were, and all of the transitions in `id2` are now in the memory region + /// where the transitions for `id1` were. + /// + /// Essentially, this "moves" `id1` to `id2` and `id2` to `id1`. + /// + /// It is expected that, after calling this, the underlying value will be + /// left in an inconsistent state, since any other transitions pointing to, + /// e.g., `id1` need to be updated to point to `id2`, since that's where + /// `id1` moved to. + /// + /// In order to "fix" the underlying inconsistent state, a `Remapper` + /// should be used to guarantee that `remap` is called at the appropriate + /// time. + fn swap_states(&mut self, id1: StateID, id2: StateID); + /// This must remap every single state ID in the underlying value according + /// to the function given. 
For example, in a DFA, this should remap every + /// transition and every starting state ID. + fn remap(&mut self, map: impl Fn(StateID) -> StateID); +} + +/// Remapper is an abstraction the manages the remapping of state IDs in a +/// finite state machine. This is useful when one wants to shuffle states into +/// different positions in the machine. +/// +/// One of the key complexities this manages is the ability to correctly move +/// one state multiple times. +/// +/// Once shuffling is complete, `remap` must be called, which will rewrite +/// all pertinent transitions to updated state IDs. Neglecting to call `remap` +/// will almost certainly result in a corrupt machine. +#[derive(Debug)] +pub(super) struct Remapper { + /// A map from the index of a state to its pre-multiplied identifier. + /// + /// When a state is swapped with another, then their corresponding + /// locations in this map are also swapped. Thus, its new position will + /// still point to its old pre-multiplied StateID. + /// + /// While there is a bit more to it, this then allows us to rewrite the + /// state IDs in a DFA's transition table in a single pass. This is done + /// by iterating over every ID in this map, then iterating over each + /// transition for the state at that ID and re-mapping the transition from + /// `old_id` to `map[dfa.to_index(old_id)]`. That is, we find the position + /// in this map where `old_id` *started*, and set it to where it ended up + /// after all swaps have been completed. + map: Vec<StateID>, + /// A mapper from state index to state ID (and back). + idxmap: IndexMapper, +} + +impl Remapper { + /// Create a new remapper from the given remappable implementation. The + /// remapper can then be used to swap states. The remappable value given + /// here must the same one given to `swap` and `remap`. 
+ pub(super) fn new(r: &impl Remappable) -> Remapper { + let idxmap = IndexMapper { stride2: r.stride2() }; + let map = (0..r.state_len()).map(|i| idxmap.to_state_id(i)).collect(); + Remapper { map, idxmap } + } + + /// Swap two states. Once this is called, callers must follow through to + /// call `remap`, or else it's possible for the underlying remappable + /// value to be in a corrupt state. + pub(super) fn swap( + &mut self, + r: &mut impl Remappable, + id1: StateID, + id2: StateID, + ) { + if id1 == id2 { + return; + } + r.swap_states(id1, id2); + self.map.swap(self.idxmap.to_index(id1), self.idxmap.to_index(id2)); + } + + /// Complete the remapping process by rewriting all state IDs in the + /// remappable value according to the swaps performed. + pub(super) fn remap(mut self, r: &mut impl Remappable) { + // Update the map to account for states that have been swapped + // multiple times. For example, if (A, C) and (C, G) are swapped, then + // transitions previously pointing to A should now point to G. But if + // we don't update our map, they will erroneously be set to C. All we + // do is follow the swaps in our map until we see our original state + // ID. + // + // The intuition here is to think about how changes are made to the + // map: only through pairwise swaps. That means that starting at any + // given state, it is always possible to find the loop back to that + // state by following the swaps represented in the map (which might be + // 0 swaps). + // + // We are also careful to clone the map before starting in order to + // freeze it. We use the frozen map to find our loops, since we need to + // update our map as well. Without freezing it, our updates could break + // the loops referenced above and produce incorrect results. 
+ let oldmap = self.map.clone(); + for i in 0..r.state_len() { + let cur_id = self.idxmap.to_state_id(i); + let mut new_id = oldmap[i]; + if cur_id == new_id { + continue; + } + loop { + let id = oldmap[self.idxmap.to_index(new_id)]; + if cur_id == id { + self.map[i] = new_id; + break; + } + new_id = id; + } + } + r.remap(|next| self.map[self.idxmap.to_index(next)]); + } +} + +/// A simple type for mapping between state indices and state IDs. +/// +/// The reason why this exists is because state IDs are "premultiplied." That +/// is, in order to get to the transitions for a particular state, one need +/// only use the state ID as-is, instead of having to multiple it by transition +/// table's stride. +/// +/// The downside of this is that it's inconvenient to map between state IDs +/// using a dense map, e.g., Vec<StateID>. That's because state IDs look like +/// `0`, `0+stride`, `0+2*stride`, `0+3*stride`, etc., instead of `0`, `1`, +/// `2`, `3`, etc. +/// +/// Since our state IDs are premultiplied, we can convert back-and-forth +/// between IDs and indices by simply unmultiplying the IDs and multiplying the +/// indices. +#[derive(Debug)] +struct IndexMapper { + /// The power of 2 corresponding to the stride of the corresponding + /// transition table. 'id >> stride2' de-multiplies an ID while 'index << + /// stride2' pre-multiplies an index to an ID. + stride2: usize, +} + +impl IndexMapper { + /// Convert a state ID to a state index. + fn to_index(&self, id: StateID) -> usize { + id.as_usize() >> self.stride2 + } + + /// Convert a state index to a state ID. + fn to_state_id(&self, index: usize) -> StateID { + // CORRECTNESS: If the given index is not valid, then it is not + // required for this to panic or return a valid state ID. We'll "just" + // wind up with panics or silent logic errors at some other point. 
+ StateID::new_unchecked(index << self.stride2) + } +} + +#[cfg(feature = "dfa-build")] +mod dense { + use crate::{dfa::dense::OwnedDFA, util::primitives::StateID}; + + use super::Remappable; + + impl Remappable for OwnedDFA { + fn state_len(&self) -> usize { + OwnedDFA::state_len(self) + } + + fn stride2(&self) -> usize { + OwnedDFA::stride2(self) + } + + fn swap_states(&mut self, id1: StateID, id2: StateID) { + OwnedDFA::swap_states(self, id1, id2) + } + + fn remap(&mut self, map: impl Fn(StateID) -> StateID) { + OwnedDFA::remap(self, map) + } + } +} + +#[cfg(feature = "dfa-onepass")] +mod onepass { + use crate::{dfa::onepass::DFA, util::primitives::StateID}; + + use super::Remappable; + + impl Remappable for DFA { + fn state_len(&self) -> usize { + DFA::state_len(self) + } + + fn stride2(&self) -> usize { + // We don't do pre-multiplication for the one-pass DFA, so + // returning 0 has the effect of making state IDs and state indices + // equivalent. + 0 + } + + fn swap_states(&mut self, id1: StateID, id2: StateID) { + DFA::swap_states(self, id1, id2) + } + + fn remap(&mut self, map: impl Fn(StateID) -> StateID) { + DFA::remap(self, map) + } + } +} diff --git a/vendor/regex-automata/src/dfa/search.rs b/vendor/regex-automata/src/dfa/search.rs index 492414981..8c012a594 100644 --- a/vendor/regex-automata/src/dfa/search.rs +++ b/vendor/regex-automata/src/dfa/search.rs @@ -1,493 +1,654 @@ use crate::{ dfa::{ accel, - automaton::{Automaton, OverlappingState, StateMatch}, + automaton::{Automaton, OverlappingState}, }, util::{ - id::{PatternID, StateID}, - matchtypes::HalfMatch, - prefilter, MATCH_OFFSET, + prefilter::Prefilter, + primitives::StateID, + search::{Anchored, HalfMatch, Input, Span}, }, MatchError, }; #[inline(never)] -pub fn find_earliest_fwd<A: Automaton + ?Sized>( - pre: Option<&mut prefilter::Scanner>, +pub fn find_fwd<A: Automaton + ?Sized>( dfa: &A, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, + input: &Input<'_>, ) 
-> Result<Option<HalfMatch>, MatchError> { - // Searching with a pattern ID is always anchored, so we should never use - // a prefilter. - if pre.is_some() && pattern_id.is_none() { - find_fwd(pre, true, dfa, pattern_id, bytes, start, end) - } else { - find_fwd(None, true, dfa, pattern_id, bytes, start, end) + if input.is_done() { + return Ok(None); } -} - -#[inline(never)] -pub fn find_leftmost_fwd<A: Automaton + ?Sized>( - pre: Option<&mut prefilter::Scanner>, - dfa: &A, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, -) -> Result<Option<HalfMatch>, MatchError> { + let pre = if input.get_anchored().is_anchored() { + None + } else { + dfa.get_prefilter() + }; // Searching with a pattern ID is always anchored, so we should never use // a prefilter. - if pre.is_some() && pattern_id.is_none() { - find_fwd(pre, false, dfa, pattern_id, bytes, start, end) + if pre.is_some() { + if input.get_earliest() { + find_fwd_imp(dfa, input, pre, true) + } else { + find_fwd_imp(dfa, input, pre, false) + } } else { - find_fwd(None, false, dfa, pattern_id, bytes, start, end) + if input.get_earliest() { + find_fwd_imp(dfa, input, None, true) + } else { + find_fwd_imp(dfa, input, None, false) + } } } -/// This is marked as `inline(always)` specifically because it supports -/// multiple modes of searching. Namely, the 'pre' and 'earliest' parameters -/// getting inlined eliminate some critical branches. To avoid bloating binary -/// size, we only call this function in a fixed number of places. 
-#[inline(always)] -fn find_fwd<A: Automaton + ?Sized>( - mut pre: Option<&mut prefilter::Scanner>, - earliest: bool, +#[cfg_attr(feature = "perf-inline", inline(always))] +fn find_fwd_imp<A: Automaton + ?Sized>( dfa: &A, - pattern_id: Option<PatternID>, - haystack: &[u8], - start: usize, - end: usize, + input: &Input<'_>, + pre: Option<&'_ Prefilter>, + earliest: bool, ) -> Result<Option<HalfMatch>, MatchError> { - assert!(start <= end); - assert!(start <= haystack.len()); - assert!(end <= haystack.len()); - - // Why do this? This lets 'bytes[at]' work without bounds checks below. - // It seems the assert on 'end <= haystack.len()' above is otherwise - // not enough. Why not just make 'bytes' scoped this way anyway? Well, - // 'eoi_fwd' (below) might actually want to try to access the byte at 'end' - // for resolving look-ahead. - let bytes = &haystack[..end]; + // See 'prefilter_restart' docs for explanation. + let universal_start = dfa.universal_start_state(Anchored::No).is_some(); + let mut mat = None; + let mut sid = init_fwd(dfa, input)?; + let mut at = input.start(); + // This could just be a closure, but then I think it would be unsound + // because it would need to be safe to invoke. This way, the lack of safety + // is clearer in the code below. + macro_rules! next_unchecked { + ($sid:expr, $at:expr) => {{ + let byte = *input.haystack().get_unchecked($at); + dfa.next_state_unchecked($sid, byte) + }}; + } - let mut state = init_fwd(dfa, pattern_id, haystack, start, end)?; - let mut last_match = None; - let mut at = start; - if let Some(ref mut pre) = pre { + if let Some(ref pre) = pre { + let span = Span::from(at..input.end()); // If a prefilter doesn't report false positives, then we don't need to // touch the DFA at all. However, since all matches include the pattern // ID, and the prefilter infrastructure doesn't report pattern IDs, we // limit this optimization to cases where there is exactly one pattern. 
// In that case, any match must be the 0th pattern. - if dfa.pattern_count() == 1 && !pre.reports_false_positives() { - return Ok(pre.next_candidate(bytes, at).into_option().map( - |offset| HalfMatch { pattern: PatternID::ZERO, offset }, - )); - } else if pre.is_effective(at) { - match pre.next_candidate(bytes, at).into_option() { - None => return Ok(None), - Some(i) => { - at = i; + match pre.find(input.haystack(), span) { + None => return Ok(mat), + Some(ref span) => { + at = span.start; + if !universal_start { + sid = prefilter_restart(dfa, &input, at)?; } } } } - while at < end { - let byte = bytes[at]; - state = dfa.next_state(state, byte); - at += 1; - if dfa.is_special_state(state) { - if dfa.is_start_state(state) { - if let Some(ref mut pre) = pre { - if pre.is_effective(at) { - match pre.next_candidate(bytes, at).into_option() { - None => return Ok(None), - Some(i) => { - at = i; + while at < input.end() { + // SAFETY: There are two safety invariants we need to uphold here in + // the loops below: that 'sid' and 'prev_sid' are valid state IDs + // for this DFA, and that 'at' is a valid index into 'haystack'. + // For the former, we rely on the invariant that next_state* and + // start_state_forward always returns a valid state ID (given a valid + // state ID in the former case). For the latter safety invariant, we + // always guard unchecked access with a check that 'at' is less than + // 'end', where 'end <= haystack.len()'. In the unrolled loop below, we + // ensure that 'at' is always in bounds. + // + // PERF: See a similar comment in src/hybrid/search.rs that justifies + // this extra work to make the search loop fast. The same reasoning and + // benchmarks apply here. 
+ let mut prev_sid; + while at < input.end() { + prev_sid = unsafe { next_unchecked!(sid, at) }; + if dfa.is_special_state(prev_sid) || at + 3 >= input.end() { + core::mem::swap(&mut prev_sid, &mut sid); + break; + } + at += 1; + + sid = unsafe { next_unchecked!(prev_sid, at) }; + if dfa.is_special_state(sid) { + break; + } + at += 1; + + prev_sid = unsafe { next_unchecked!(sid, at) }; + if dfa.is_special_state(prev_sid) { + core::mem::swap(&mut prev_sid, &mut sid); + break; + } + at += 1; + + sid = unsafe { next_unchecked!(prev_sid, at) }; + if dfa.is_special_state(sid) { + break; + } + at += 1; + } + if dfa.is_special_state(sid) { + if dfa.is_start_state(sid) { + if let Some(ref pre) = pre { + let span = Span::from(at..input.end()); + match pre.find(input.haystack(), span) { + None => return Ok(mat), + Some(ref span) => { + // We want to skip any update to 'at' below + // at the end of this iteration and just + // jump immediately back to the next state + // transition at the leading position of the + // candidate match. + // + // ... but only if we actually made progress + // with our prefilter, otherwise if the start + // state has a self-loop, we can get stuck. 
+ if span.start > at { + at = span.start; + if !universal_start { + sid = prefilter_restart(dfa, &input, at)?; + } + continue; } } } - } else if dfa.is_accel_state(state) { - let needles = dfa.accelerator(state); - at = accel::find_fwd(needles, bytes, at) - .unwrap_or(bytes.len()); + } else if dfa.is_accel_state(sid) { + let needles = dfa.accelerator(sid); + at = accel::find_fwd(needles, input.haystack(), at + 1) + .unwrap_or(input.end()); + continue; } - } else if dfa.is_match_state(state) { - last_match = Some(HalfMatch { - pattern: dfa.match_pattern(state, 0), - offset: at - MATCH_OFFSET, - }); + } else if dfa.is_match_state(sid) { + let pattern = dfa.match_pattern(sid, 0); + mat = Some(HalfMatch::new(pattern, at)); if earliest { - return Ok(last_match); + return Ok(mat); } - if dfa.is_accel_state(state) { - let needles = dfa.accelerator(state); - at = accel::find_fwd(needles, bytes, at) - .unwrap_or(bytes.len()); + if dfa.is_accel_state(sid) { + let needles = dfa.accelerator(sid); + at = accel::find_fwd(needles, input.haystack(), at + 1) + .unwrap_or(input.end()); + continue; } - } else if dfa.is_accel_state(state) { - let needs = dfa.accelerator(state); - at = accel::find_fwd(needs, bytes, at).unwrap_or(bytes.len()); - } else if dfa.is_dead_state(state) { - return Ok(last_match); + } else if dfa.is_accel_state(sid) { + let needs = dfa.accelerator(sid); + at = accel::find_fwd(needs, input.haystack(), at + 1) + .unwrap_or(input.end()); + continue; + } else if dfa.is_dead_state(sid) { + return Ok(mat); } else { - debug_assert!(dfa.is_quit_state(state)); - if last_match.is_some() { - return Ok(last_match); - } - return Err(MatchError::Quit { byte, offset: at - 1 }); + // It's important that this is a debug_assert, since this can + // actually be tripped even if DFA::from_bytes succeeds and + // returns a supposedly valid DFA. 
+ debug_assert!(dfa.is_quit_state(sid)); + return Err(MatchError::quit(input.haystack()[at], at)); } } - while at < end && dfa.next_state(state, bytes[at]) == state { - at += 1; - } + at += 1; } - Ok(eoi_fwd(dfa, haystack, end, &mut state)?.or(last_match)) + eoi_fwd(dfa, input, &mut sid, &mut mat)?; + Ok(mat) } #[inline(never)] -pub fn find_earliest_rev<A: Automaton + ?Sized>( +pub fn find_rev<A: Automaton + ?Sized>( dfa: &A, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, + input: &Input<'_>, ) -> Result<Option<HalfMatch>, MatchError> { - find_rev(true, dfa, pattern_id, bytes, start, end) + if input.is_done() { + return Ok(None); + } + if input.get_earliest() { + find_rev_imp(dfa, input, true) + } else { + find_rev_imp(dfa, input, false) + } } -#[inline(never)] -pub fn find_leftmost_rev<A: Automaton + ?Sized>( +#[cfg_attr(feature = "perf-inline", inline(always))] +fn find_rev_imp<A: Automaton + ?Sized>( dfa: &A, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, -) -> Result<Option<HalfMatch>, MatchError> { - find_rev(false, dfa, pattern_id, bytes, start, end) -} - -/// This is marked as `inline(always)` specifically because it supports -/// multiple modes of searching. Namely, the 'earliest' boolean getting inlined -/// permits eliminating a few crucial branches. -#[inline(always)] -fn find_rev<A: Automaton + ?Sized>( + input: &Input<'_>, earliest: bool, - dfa: &A, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, ) -> Result<Option<HalfMatch>, MatchError> { - assert!(start <= end); - assert!(start <= bytes.len()); - assert!(end <= bytes.len()); + let mut mat = None; + let mut sid = init_rev(dfa, input)?; + // In reverse search, the loop below can't handle the case of searching an + // empty slice. Ideally we could write something congruent to the forward + // search, i.e., 'while at >= start', but 'start' might be 0. 
Since we use + // an unsigned offset, 'at >= 0' is trivially always true. We could avoid + // this extra case handling by using a signed offset, but Rust makes it + // annoying to do. So... We just handle the empty case separately. + if input.start() == input.end() { + eoi_rev(dfa, input, &mut sid, &mut mat)?; + return Ok(mat); + } - let mut state = init_rev(dfa, pattern_id, bytes, start, end)?; - let mut last_match = None; - let mut at = end; - while at > start { - at -= 1; - while at > start && dfa.next_state(state, bytes[at]) == state { + let mut at = input.end() - 1; + macro_rules! next_unchecked { + ($sid:expr, $at:expr) => {{ + let byte = *input.haystack().get_unchecked($at); + dfa.next_state_unchecked($sid, byte) + }}; + } + loop { + // SAFETY: See comments in 'find_fwd' for a safety argument. + let mut prev_sid; + while at >= input.start() { + prev_sid = unsafe { next_unchecked!(sid, at) }; + if dfa.is_special_state(prev_sid) + || at <= input.start().saturating_add(3) + { + core::mem::swap(&mut prev_sid, &mut sid); + break; + } + at -= 1; + + sid = unsafe { next_unchecked!(prev_sid, at) }; + if dfa.is_special_state(sid) { + break; + } at -= 1; - } - let byte = bytes[at]; - state = dfa.next_state(state, byte); - if dfa.is_special_state(state) { - if dfa.is_start_state(state) { - if dfa.is_accel_state(state) { - let needles = dfa.accelerator(state); - at = accel::find_rev(needles, bytes, at) + prev_sid = unsafe { next_unchecked!(sid, at) }; + if dfa.is_special_state(prev_sid) { + core::mem::swap(&mut prev_sid, &mut sid); + break; + } + at -= 1; + + sid = unsafe { next_unchecked!(prev_sid, at) }; + if dfa.is_special_state(sid) { + break; + } + at -= 1; + } + if dfa.is_special_state(sid) { + if dfa.is_start_state(sid) { + if dfa.is_accel_state(sid) { + let needles = dfa.accelerator(sid); + at = accel::find_rev(needles, input.haystack(), at) .map(|i| i + 1) - .unwrap_or(0); + .unwrap_or(input.start()); } - } else if dfa.is_match_state(state) { - last_match = 
Some(HalfMatch { - pattern: dfa.match_pattern(state, 0), - offset: at + MATCH_OFFSET, - }); + } else if dfa.is_match_state(sid) { + let pattern = dfa.match_pattern(sid, 0); + // Since reverse searches report the beginning of a match + // and the beginning is inclusive (not exclusive like the + // end of a match), we add 1 to make it inclusive. + mat = Some(HalfMatch::new(pattern, at + 1)); if earliest { - return Ok(last_match); + return Ok(mat); } - if dfa.is_accel_state(state) { - let needles = dfa.accelerator(state); - at = accel::find_rev(needles, bytes, at) + if dfa.is_accel_state(sid) { + let needles = dfa.accelerator(sid); + at = accel::find_rev(needles, input.haystack(), at) .map(|i| i + 1) - .unwrap_or(0); + .unwrap_or(input.start()); } - } else if dfa.is_accel_state(state) { - let needles = dfa.accelerator(state); - at = accel::find_rev(needles, bytes, at) + } else if dfa.is_accel_state(sid) { + let needles = dfa.accelerator(sid); + // If the accelerator returns nothing, why don't we quit the + // search? Well, if the accelerator doesn't find anything, that + // doesn't mean we don't have a match. It just means that we + // can't leave the current state given one of the 255 possible + // byte values. However, there might be an EOI transition. So + // we set 'at' to the end of the haystack, which will cause + // this loop to stop and fall down into the EOI transition. 
+ at = accel::find_rev(needles, input.haystack(), at) .map(|i| i + 1) - .unwrap_or(0); - } else if dfa.is_dead_state(state) { - return Ok(last_match); + .unwrap_or(input.start()); + } else if dfa.is_dead_state(sid) { + return Ok(mat); } else { - debug_assert!(dfa.is_quit_state(state)); - if last_match.is_some() { - return Ok(last_match); - } - return Err(MatchError::Quit { byte, offset: at }); + debug_assert!(dfa.is_quit_state(sid)); + return Err(MatchError::quit(input.haystack()[at], at)); } } + if at == input.start() { + break; + } + at -= 1; } - Ok(eoi_rev(dfa, bytes, start, state)?.or(last_match)) + eoi_rev(dfa, input, &mut sid, &mut mat)?; + Ok(mat) } #[inline(never)] pub fn find_overlapping_fwd<A: Automaton + ?Sized>( - pre: Option<&mut prefilter::Scanner>, dfa: &A, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, - caller_state: &mut OverlappingState, -) -> Result<Option<HalfMatch>, MatchError> { - // Searching with a pattern ID is always anchored, so we should only ever - // use a prefilter when no pattern ID is given. - if pre.is_some() && pattern_id.is_none() { - find_overlapping_fwd_imp( - pre, - dfa, - pattern_id, - bytes, - start, - end, - caller_state, - ) + input: &Input<'_>, + state: &mut OverlappingState, +) -> Result<(), MatchError> { + state.mat = None; + if input.is_done() { + return Ok(()); + } + let pre = if input.get_anchored().is_anchored() { + None + } else { + dfa.get_prefilter() + }; + if pre.is_some() { + find_overlapping_fwd_imp(dfa, input, pre, state) } else { - find_overlapping_fwd_imp( - None, - dfa, - pattern_id, - bytes, - start, - end, - caller_state, - ) + find_overlapping_fwd_imp(dfa, input, None, state) } } -/// This is marked as `inline(always)` specifically because it supports -/// multiple modes of searching. Namely, the 'pre' prefilter getting inlined -/// permits eliminating a few crucial branches and reduces code size when it is -/// not used. 
-#[inline(always)] +#[cfg_attr(feature = "perf-inline", inline(always))] fn find_overlapping_fwd_imp<A: Automaton + ?Sized>( - mut pre: Option<&mut prefilter::Scanner>, dfa: &A, - pattern_id: Option<PatternID>, - bytes: &[u8], - mut start: usize, - end: usize, - caller_state: &mut OverlappingState, -) -> Result<Option<HalfMatch>, MatchError> { - assert!(start <= end); - assert!(start <= bytes.len()); - assert!(end <= bytes.len()); - - let mut state = match caller_state.id() { - None => init_fwd(dfa, pattern_id, bytes, start, end)?, - Some(id) => { - if let Some(last) = caller_state.last_match() { - let match_count = dfa.match_count(id); - if last.match_index < match_count { - let m = HalfMatch { - pattern: dfa.match_pattern(id, last.match_index), - offset: last.offset, - }; - last.match_index += 1; - return Ok(Some(m)); + input: &Input<'_>, + pre: Option<&'_ Prefilter>, + state: &mut OverlappingState, +) -> Result<(), MatchError> { + // See 'prefilter_restart' docs for explanation. + let universal_start = dfa.universal_start_state(Anchored::No).is_some(); + let mut sid = match state.id { + None => { + state.at = input.start(); + init_fwd(dfa, input)? + } + Some(sid) => { + if let Some(match_index) = state.next_match_index { + let match_len = dfa.match_len(sid); + if match_index < match_len { + state.next_match_index = Some(match_index + 1); + let pattern = dfa.match_pattern(sid, match_index); + state.mat = Some(HalfMatch::new(pattern, state.at)); + return Ok(()); } } - - // This is a subtle but critical detail. If the caller provides a - // non-None state ID, then it must be the case that the state ID - // corresponds to one set by this function. The state ID therefore - // corresponds to a match state, a dead state or some other state. - // However, "some other" state _only_ occurs when the input has - // been exhausted because the only way to stop before then is to - // see a match or a dead/quit state. 
- // - // If the input is exhausted or if it's a dead state, then - // incrementing the starting position has no relevance on - // correctness, since the loop below will either not execute - // at all or will immediately stop due to being in a dead state. - // (Once in a dead state it is impossible to leave it.) - // - // Therefore, the only case we need to consider is when - // caller_state is a match state. In this case, since our machines - // support the ability to delay a match by a certain number of - // bytes (to support look-around), it follows that we actually - // consumed that many additional bytes on our previous search. When - // the caller resumes their search to find subsequent matches, they - // will use the ending location from the previous match as the next - // starting point, which is `MATCH_OFFSET` bytes PRIOR to where - // we scanned to on the previous search. Therefore, we need to - // compensate by bumping `start` up by `MATCH_OFFSET` bytes. - // - // Incidentally, since MATCH_OFFSET is non-zero, this also makes - // dealing with empty matches convenient. Namely, callers needn't - // special case them when implementing an iterator. Instead, this - // ensures that forward progress is always made. - start += MATCH_OFFSET; - id + // Once we've reported all matches at a given position, we need to + // advance the search to the next position. 
+ state.at += 1; + if state.at > input.end() { + return Ok(()); + } + sid } }; - let mut at = start; - while at < end { - let byte = bytes[at]; - state = dfa.next_state(state, byte); - at += 1; - if dfa.is_special_state(state) { - caller_state.set_id(state); - if dfa.is_start_state(state) { - if let Some(ref mut pre) = pre { - if pre.is_effective(at) { - match pre.next_candidate(bytes, at).into_option() { - None => return Ok(None), - Some(i) => { - at = i; + // NOTE: We don't optimize the crap out of this routine primarily because + // it seems like most find_overlapping searches will have higher match + // counts, and thus, throughput is perhaps not as important. But if you + // have a use case for something faster, feel free to file an issue. + while state.at < input.end() { + sid = dfa.next_state(sid, input.haystack()[state.at]); + if dfa.is_special_state(sid) { + state.id = Some(sid); + if dfa.is_start_state(sid) { + if let Some(ref pre) = pre { + let span = Span::from(state.at..input.end()); + match pre.find(input.haystack(), span) { + None => return Ok(()), + Some(ref span) => { + if span.start > state.at { + state.at = span.start; + if !universal_start { + sid = prefilter_restart( + dfa, &input, state.at, + )?; + } + continue; } } } - } else if dfa.is_accel_state(state) { - let needles = dfa.accelerator(state); - at = accel::find_fwd(needles, bytes, at) - .unwrap_or(bytes.len()); + } else if dfa.is_accel_state(sid) { + let needles = dfa.accelerator(sid); + state.at = accel::find_fwd( + needles, + input.haystack(), + state.at + 1, + ) + .unwrap_or(input.end()); + continue; + } + } else if dfa.is_match_state(sid) { + state.next_match_index = Some(1); + let pattern = dfa.match_pattern(sid, 0); + state.mat = Some(HalfMatch::new(pattern, state.at)); + return Ok(()); + } else if dfa.is_accel_state(sid) { + let needs = dfa.accelerator(sid); + // If the accelerator returns nothing, why don't we quit the + // search? 
Well, if the accelerator doesn't find anything, that + // doesn't mean we don't have a match. It just means that we + // can't leave the current state given one of the 255 possible + // byte values. However, there might be an EOI transition. So + // we set 'at' to the end of the haystack, which will cause + // this loop to stop and fall down into the EOI transition. + state.at = + accel::find_fwd(needs, input.haystack(), state.at + 1) + .unwrap_or(input.end()); + continue; + } else if dfa.is_dead_state(sid) { + return Ok(()); + } else { + debug_assert!(dfa.is_quit_state(sid)); + return Err(MatchError::quit( + input.haystack()[state.at], + state.at, + )); + } + } + state.at += 1; + } + + let result = eoi_fwd(dfa, input, &mut sid, &mut state.mat); + state.id = Some(sid); + if state.mat.is_some() { + // '1' is always correct here since if we get to this point, this + // always corresponds to the first (index '0') match discovered at + // this position. So the next match to report at this position (if + // it exists) is at index '1'. + state.next_match_index = Some(1); + } + result +} + +#[inline(never)] +pub(crate) fn find_overlapping_rev<A: Automaton + ?Sized>( + dfa: &A, + input: &Input<'_>, + state: &mut OverlappingState, +) -> Result<(), MatchError> { + state.mat = None; + if input.is_done() { + return Ok(()); + } + let mut sid = match state.id { + None => { + let sid = init_rev(dfa, input)?; + state.id = Some(sid); + if input.start() == input.end() { + state.rev_eoi = true; + } else { + state.at = input.end() - 1; + } + sid + } + Some(sid) => { + if let Some(match_index) = state.next_match_index { + let match_len = dfa.match_len(sid); + if match_index < match_len { + state.next_match_index = Some(match_index + 1); + let pattern = dfa.match_pattern(sid, match_index); + state.mat = Some(HalfMatch::new(pattern, state.at)); + return Ok(()); + } + } + // Once we've reported all matches at a given position, we need + // to advance the search to the next position. 
However, if we've + // already followed the EOI transition, then we know we're done + // with the search and there cannot be any more matches to report. + if state.rev_eoi { + return Ok(()); + } else if state.at == input.start() { + // At this point, we should follow the EOI transition. This + // will cause us the skip the main loop below and fall through + // to the final 'eoi_rev' transition. + state.rev_eoi = true; + } else { + // We haven't hit the end of the search yet, so move on. + state.at -= 1; + } + sid + } + }; + while !state.rev_eoi { + sid = dfa.next_state(sid, input.haystack()[state.at]); + if dfa.is_special_state(sid) { + state.id = Some(sid); + if dfa.is_start_state(sid) { + if dfa.is_accel_state(sid) { + let needles = dfa.accelerator(sid); + state.at = + accel::find_rev(needles, input.haystack(), state.at) + .map(|i| i + 1) + .unwrap_or(input.start()); } - } else if dfa.is_match_state(state) { - let offset = at - MATCH_OFFSET; - caller_state - .set_last_match(StateMatch { match_index: 1, offset }); - return Ok(Some(HalfMatch { - pattern: dfa.match_pattern(state, 0), - offset, - })); - } else if dfa.is_accel_state(state) { - let needs = dfa.accelerator(state); - at = accel::find_fwd(needs, bytes, at).unwrap_or(bytes.len()); - } else if dfa.is_dead_state(state) { - return Ok(None); + } else if dfa.is_match_state(sid) { + state.next_match_index = Some(1); + let pattern = dfa.match_pattern(sid, 0); + state.mat = Some(HalfMatch::new(pattern, state.at + 1)); + return Ok(()); + } else if dfa.is_accel_state(sid) { + let needles = dfa.accelerator(sid); + // If the accelerator returns nothing, why don't we quit the + // search? Well, if the accelerator doesn't find anything, that + // doesn't mean we don't have a match. It just means that we + // can't leave the current state given one of the 255 possible + // byte values. However, there might be an EOI transition. 
So + // we set 'at' to the end of the haystack, which will cause + // this loop to stop and fall down into the EOI transition. + state.at = + accel::find_rev(needles, input.haystack(), state.at) + .map(|i| i + 1) + .unwrap_or(input.start()); + } else if dfa.is_dead_state(sid) { + return Ok(()); } else { - debug_assert!(dfa.is_quit_state(state)); - return Err(MatchError::Quit { byte, offset: at - 1 }); + debug_assert!(dfa.is_quit_state(sid)); + return Err(MatchError::quit( + input.haystack()[state.at], + state.at, + )); } } + if state.at == input.start() { + break; + } + state.at -= 1; } - let result = eoi_fwd(dfa, bytes, end, &mut state); - caller_state.set_id(state); - if let Ok(Some(ref last_match)) = result { - caller_state.set_last_match(StateMatch { - match_index: 1, - offset: last_match.offset(), - }); + let result = eoi_rev(dfa, input, &mut sid, &mut state.mat); + state.rev_eoi = true; + state.id = Some(sid); + if state.mat.is_some() { + // '1' is always correct here since if we get to this point, this + // always corresponds to the first (index '0') match discovered at + // this position. So the next match to report at this position (if + // it exists) is at index '1'. + state.next_match_index = Some(1); } result } +#[cfg_attr(feature = "perf-inline", inline(always))] fn init_fwd<A: Automaton + ?Sized>( dfa: &A, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, + input: &Input<'_>, ) -> Result<StateID, MatchError> { - let state = dfa.start_state_forward(pattern_id, bytes, start, end); + let sid = dfa.start_state_forward(input)?; // Start states can never be match states, since all matches are delayed // by 1 byte. 
- assert!(!dfa.is_match_state(state)); - Ok(state) + debug_assert!(!dfa.is_match_state(sid)); + Ok(sid) } +#[cfg_attr(feature = "perf-inline", inline(always))] fn init_rev<A: Automaton + ?Sized>( dfa: &A, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, + input: &Input<'_>, ) -> Result<StateID, MatchError> { - let state = dfa.start_state_reverse(pattern_id, bytes, start, end); + let sid = dfa.start_state_reverse(input)?; // Start states can never be match states, since all matches are delayed // by 1 byte. - assert!(!dfa.is_match_state(state)); - Ok(state) + debug_assert!(!dfa.is_match_state(sid)); + Ok(sid) } +#[cfg_attr(feature = "perf-inline", inline(always))] fn eoi_fwd<A: Automaton + ?Sized>( dfa: &A, - bytes: &[u8], - end: usize, - state: &mut StateID, -) -> Result<Option<HalfMatch>, MatchError> { - match bytes.get(end) { + input: &Input<'_>, + sid: &mut StateID, + mat: &mut Option<HalfMatch>, +) -> Result<(), MatchError> { + let sp = input.get_span(); + match input.haystack().get(sp.end) { Some(&b) => { - *state = dfa.next_state(*state, b); - if dfa.is_match_state(*state) { - Ok(Some(HalfMatch { - pattern: dfa.match_pattern(*state, 0), - offset: end, - })) - } else { - Ok(None) + *sid = dfa.next_state(*sid, b); + if dfa.is_match_state(*sid) { + let pattern = dfa.match_pattern(*sid, 0); + *mat = Some(HalfMatch::new(pattern, sp.end)); + } else if dfa.is_quit_state(*sid) { + return Err(MatchError::quit(b, sp.end)); } } None => { - *state = dfa.next_eoi_state(*state); - if dfa.is_match_state(*state) { - Ok(Some(HalfMatch { - pattern: dfa.match_pattern(*state, 0), - offset: bytes.len(), - })) - } else { - Ok(None) + *sid = dfa.next_eoi_state(*sid); + if dfa.is_match_state(*sid) { + let pattern = dfa.match_pattern(*sid, 0); + *mat = Some(HalfMatch::new(pattern, input.haystack().len())); } + // N.B. We don't have to check 'is_quit' here because the EOI + // transition can never lead to a quit state. 
+ debug_assert!(!dfa.is_quit_state(*sid)); } } + Ok(()) } +#[cfg_attr(feature = "perf-inline", inline(always))] fn eoi_rev<A: Automaton + ?Sized>( dfa: &A, - bytes: &[u8], - start: usize, - state: StateID, -) -> Result<Option<HalfMatch>, MatchError> { - if start > 0 { - let state = dfa.next_state(state, bytes[start - 1]); - if dfa.is_match_state(state) { - Ok(Some(HalfMatch { - pattern: dfa.match_pattern(state, 0), - offset: start, - })) - } else { - Ok(None) + input: &Input<'_>, + sid: &mut StateID, + mat: &mut Option<HalfMatch>, +) -> Result<(), MatchError> { + let sp = input.get_span(); + if sp.start > 0 { + let byte = input.haystack()[sp.start - 1]; + *sid = dfa.next_state(*sid, byte); + if dfa.is_match_state(*sid) { + let pattern = dfa.match_pattern(*sid, 0); + *mat = Some(HalfMatch::new(pattern, sp.start)); + } else if dfa.is_quit_state(*sid) { + return Err(MatchError::quit(byte, sp.start - 1)); } } else { - let state = dfa.next_eoi_state(state); - if dfa.is_match_state(state) { - Ok(Some(HalfMatch { - pattern: dfa.match_pattern(state, 0), - offset: 0, - })) - } else { - Ok(None) + *sid = dfa.next_eoi_state(*sid); + if dfa.is_match_state(*sid) { + let pattern = dfa.match_pattern(*sid, 0); + *mat = Some(HalfMatch::new(pattern, 0)); } + // N.B. We don't have to check 'is_quit' here because the EOI + // transition can never lead to a quit state. + debug_assert!(!dfa.is_quit_state(*sid)); } + Ok(()) } -// Currently unused, but is useful to keep around. This was originally used -// when the code above used raw pointers for its main loop. -// /// Returns the distance between the given pointer and the start of `bytes`. -// /// This assumes that the given pointer points to somewhere in the `bytes` -// /// slice given. 
-// fn offset(bytes: &[u8], p: *const u8) -> usize { -// debug_assert!(bytes.as_ptr() <= p); -// debug_assert!(bytes[bytes.len()..].as_ptr() >= p); -// ((p as isize) - (bytes.as_ptr() as isize)) as usize -// } +/// Re-compute the starting state that a DFA should be in after finding a +/// prefilter candidate match at the position `at`. +/// +/// The function with the same name has a bit more docs in hybrid/search.rs. +#[cfg_attr(feature = "perf-inline", inline(always))] +fn prefilter_restart<A: Automaton + ?Sized>( + dfa: &A, + input: &Input<'_>, + at: usize, +) -> Result<StateID, MatchError> { + let mut input = input.clone(); + input.set_start(at); + init_fwd(dfa, &input) +} diff --git a/vendor/regex-automata/src/dfa/search_unsafe.rs b/vendor/regex-automata/src/dfa/search_unsafe.rs deleted file mode 100644 index ea1c29ff7..000000000 --- a/vendor/regex-automata/src/dfa/search_unsafe.rs +++ /dev/null @@ -1,321 +0,0 @@ -use crate::dfa::automaton::{Automaton, State}; -use crate::MatchError; - -/// This is marked as `inline(always)` specifically because it supports -/// multiple modes of searching. Namely, the 'earliest' boolean getting inlined -/// permits eliminating a few crucial branches. 
-#[inline(always)] -pub fn find_fwd<A: Automaton + ?Sized>( - dfa: &A, - bytes: &[u8], - start: usize, - end: usize, - earliest: bool, -) -> Result<Option<usize>, MatchError> { - assert!(start <= end); - assert!(start <= bytes.len()); - assert!(end <= bytes.len()); - - let (mut state, mut last_match) = init_fwd(dfa, bytes, start, end)?; - if earliest && last_match.is_some() { - return Ok(last_match); - } - - let mut at = start; - while at < end { - let byte = bytes[at]; - state = dfa.next_state(state, byte); - at += 1; - if dfa.is_special_state(state) { - if dfa.is_dead_state(state) { - return Ok(last_match); - } else if dfa.is_quit_state(state) { - return Err(MatchError::Quit { byte, offset: at - 1 }); - } - last_match = Some(at - dfa.match_offset()); - if earliest { - return Ok(last_match); - } - } - } - /* - unsafe { - let mut p = bytes.as_ptr().add(start); - while p < bytes[end..].as_ptr() { - let byte = *p; - state = dfa.next_state_unchecked(state, byte); - p = p.add(1); - if dfa.is_special_state(state) { - if dfa.is_dead_state(state) { - return Ok(last_match); - } else if dfa.is_quit_state(state) { - return Err(MatchError::Quit { - byte, - offset: offset(bytes, p) - 1, - }); - } - last_match = Some(offset(bytes, p) - dfa.match_offset()); - if earliest { - return Ok(last_match); - } - } - } - } - */ - Ok(eof_fwd(dfa, bytes, end, &mut state)?.or(last_match)) -} - -/// This is marked as `inline(always)` specifically because it supports -/// multiple modes of searching. Namely, the 'earliest' boolean getting inlined -/// permits eliminating a few crucial branches. 
-#[inline(always)] -pub fn find_rev<A: Automaton + ?Sized>( - dfa: &A, - bytes: &[u8], - start: usize, - end: usize, - earliest: bool, -) -> Result<Option<usize>, MatchError> { - assert!(start <= end); - assert!(start <= bytes.len()); - assert!(end <= bytes.len()); - - let (mut state, mut last_match) = init_rev(dfa, bytes, start, end)?; - if earliest && last_match.is_some() { - return Ok(last_match); - } - - let mut at = end; - while at > start { - at -= 1; - let byte = bytes[at]; - state = dfa.next_state(state, byte); - if dfa.is_special_state(state) { - if dfa.is_dead_state(state) { - return Ok(last_match); - } else if dfa.is_quit_state(state) { - return Err(MatchError::Quit { byte, offset: at }); - } - last_match = Some(at + dfa.match_offset()); - if earliest { - return Ok(last_match); - } - } - } - /* - unsafe { - let mut p = bytes.as_ptr().add(end); - while p > bytes[start..].as_ptr() { - p = p.sub(1); - let byte = *p; - state = dfa.next_state_unchecked(state, byte); - if dfa.is_special_state(state) { - if dfa.is_dead_state(state) { - return Ok(last_match); - } else if dfa.is_quit_state(state) { - return Err(MatchError::Quit { - byte, - offset: offset(bytes, p), - }); - } - last_match = Some(offset(bytes, p) + dfa.match_offset()); - if earliest { - return Ok(last_match); - } - } - } - } - */ - Ok(eof_rev(dfa, state, bytes, start)?.or(last_match)) -} - -pub fn find_overlapping_fwd<A: Automaton + ?Sized>( - dfa: &A, - bytes: &[u8], - mut start: usize, - end: usize, - caller_state: &mut State<A::ID>, -) -> Result<Option<usize>, MatchError> { - assert!(start <= end); - assert!(start <= bytes.len()); - assert!(end <= bytes.len()); - - let (mut state, mut last_match) = match caller_state.as_option() { - None => init_fwd(dfa, bytes, start, end)?, - Some(id) => { - // This is a subtle but critical detail. If the caller provides a - // non-None state ID, then it must be the case that the state ID - // corresponds to one set by this function. 
The state ID therefore - // corresponds to a match state, a dead state or some other state. - // However, "some other" state _only_ occurs when the input has - // been exhausted because the only way to stop before then is to - // see a match or a dead/quit state. - // - // If the input is exhausted or if it's a dead state, then - // incrementing the starting position has no relevance on - // correctness, since the loop below will either not execute - // at all or will immediately stop due to being in a dead state. - // (Once in a dead state it is impossible to leave it.) - // - // Therefore, the only case we need to consider is when - // caller_state is a match state. In this case, since our machines - // support the ability to delay a match by a certain number of - // bytes (to support look-around), it follows that we actually - // consumed that many additional bytes on our previous search. When - // the caller resumes their search to find subsequent matches, they - // will use the ending location from the previous match as the next - // starting point, which is `match_offset` bytes PRIOR to where - // we scanned to on the previous search. Therefore, we need to - // compensate by bumping `start` up by `match_offset` bytes. - start += dfa.match_offset(); - // Since match_offset could be any arbitrary value and we use - // `start` in pointer arithmetic below, we check that we are still - // in bounds. Otherwise, we could materialize a pointer that is - // more than one past the end point of `bytes`, which is UB. 
- if start > end { - return Ok(None); - } - (id, None) - } - }; - if last_match.is_some() { - caller_state.set(state); - return Ok(last_match); - } - - let mut at = start; - while at < end { - let byte = bytes[at]; - state = dfa.next_state(state, byte); - at += 1; - if dfa.is_special_state(state) { - caller_state.set(state); - if dfa.is_dead_state(state) { - return Ok(None); - } else if dfa.is_quit_state(state) { - return Err(MatchError::Quit { byte, offset: at - 1 }); - } else { - return Ok(Some(at - dfa.match_offset())); - } - } - } - /* - // SAFETY: Other than the normal pointer arithmetic happening here, a - // unique aspect of safety for this function is the fact that the caller - // can provide the state that the search routine will start with. If this - // state were invalid, it would be possible to incorrectly index the - // transition table. We however prevent this from happening by guaranteeing - // that State is valid. Namely, callers cannot mutate a State. All they can - // do is create a "start" state or otherwise reuse a previously set state. - // Since callers can't mutate a state, it follows that a previously set - // state can only be retrieved by crate internal functions. Therefore, our - // use of it is safe since this code will only ever set the provided state - // to a valid state. 
- unsafe { - let mut p = bytes.as_ptr().add(start); - while p < bytes[end..].as_ptr() { - let byte = *p; - state = dfa.next_state_unchecked(state, byte); - p = p.add(1); - if dfa.is_special_state(state) { - caller_state.set(state); - return if dfa.is_dead_state(state) { - Ok(None) - } else if dfa.is_quit_state(state) { - Err(MatchError::Quit { byte, offset: offset(bytes, p) - 1 }) - } else { - Ok(Some(offset(bytes, p) - dfa.match_offset())) - }; - } - } - } - */ - - let result = eof_fwd(dfa, bytes, end, &mut state); - caller_state.set(state); - result -} - -fn init_fwd<A: Automaton + ?Sized>( - dfa: &A, - bytes: &[u8], - start: usize, - end: usize, -) -> Result<(A::ID, Option<usize>), MatchError> { - let state = dfa.start_state_forward(bytes, start, end); - if dfa.is_match_state(state) { - Ok((state, Some(start - dfa.match_offset()))) - } else { - Ok((state, None)) - } -} - -fn init_rev<A: Automaton + ?Sized>( - dfa: &A, - bytes: &[u8], - start: usize, - end: usize, -) -> Result<(A::ID, Option<usize>), MatchError> { - let state = dfa.start_state_reverse(bytes, start, end); - if dfa.is_match_state(state) { - Ok((state, Some(end + dfa.match_offset()))) - } else { - Ok((state, None)) - } -} - -fn eof_fwd<A: Automaton + ?Sized>( - dfa: &A, - bytes: &[u8], - end: usize, - state: &mut A::ID, -) -> Result<Option<usize>, MatchError> { - match bytes.get(end) { - Some(&b) => { - *state = dfa.next_state(*state, b); - if dfa.is_match_state(*state) { - Ok(Some(end)) - } else { - Ok(None) - } - } - None => { - *state = dfa.next_eof_state(*state); - if dfa.is_match_state(*state) { - Ok(Some(bytes.len())) - } else { - Ok(None) - } - } - } -} - -fn eof_rev<A: Automaton + ?Sized>( - dfa: &A, - state: A::ID, - bytes: &[u8], - start: usize, -) -> Result<Option<usize>, MatchError> { - if start > 0 { - if dfa.is_match_state(dfa.next_state(state, bytes[start - 1])) { - Ok(Some(start)) - } else { - Ok(None) - } - } else { - if dfa.is_match_state(dfa.next_eof_state(state)) { - Ok(Some(0)) 
- } else { - Ok(None) - } - } -} - -/// Returns the distance between the given pointer and the start of `bytes`. -/// This assumes that the given pointer points to somewhere in the `bytes` -/// slice given. -fn offset(bytes: &[u8], p: *const u8) -> usize { - debug_assert!(bytes.as_ptr() <= p); - debug_assert!(bytes[bytes.len()..].as_ptr() >= p); - ((p as isize) - (bytes.as_ptr() as isize)) as usize -} diff --git a/vendor/regex-automata/src/dfa/sparse.rs b/vendor/regex-automata/src/dfa/sparse.rs index 346606987..5d8ec2340 100644 --- a/vendor/regex-automata/src/dfa/sparse.rs +++ b/vendor/regex-automata/src/dfa/sparse.rs @@ -14,7 +14,7 @@ example, this configures a sparse DFA to do an overlapping search: ``` use regex_automata::{ dfa::{Automaton, OverlappingState, dense}, - HalfMatch, MatchKind, + HalfMatch, Input, MatchKind, }; let dense_re = dense::Builder::new() @@ -23,25 +23,21 @@ let dense_re = dense::Builder::new() let sparse_re = dense_re.to_sparse()?; // Setup our haystack and initial start state. -let haystack = b"Samwise"; +let input = Input::new("Samwise"); let mut state = OverlappingState::start(); // First, 'Sam' will match. -let end1 = sparse_re.find_overlapping_fwd_at( - None, None, haystack, 0, haystack.len(), &mut state, -)?; -assert_eq!(end1, Some(HalfMatch::must(0, 3))); +sparse_re.try_search_overlapping_fwd(&input, &mut state)?; +assert_eq!(Some(HalfMatch::must(0, 3)), state.get_match()); // And now 'Samwise' will match. 
-let end2 = sparse_re.find_overlapping_fwd_at( - None, None, haystack, 3, haystack.len(), &mut state, -)?; -assert_eq!(end2, Some(HalfMatch::must(0, 7))); +sparse_re.try_search_overlapping_fwd(&input, &mut state)?; +assert_eq!(Some(HalfMatch::must(0, 7)), state.get_match()); # Ok::<(), Box<dyn std::error::Error>>(()) ``` */ -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] use core::iter; use core::{ convert::{TryFrom, TryInto}, @@ -49,23 +45,27 @@ use core::{ mem::size_of, }; -#[cfg(feature = "alloc")] -use alloc::{collections::BTreeSet, vec, vec::Vec}; +#[cfg(feature = "dfa-build")] +use alloc::{vec, vec::Vec}; -#[cfg(feature = "alloc")] -use crate::dfa::{dense, error::Error}; +#[cfg(feature = "dfa-build")] +use crate::dfa::dense::{self, BuildError}; use crate::{ dfa::{ automaton::{fmt_state_indicator, Automaton}, + dense::Flags, special::Special, - DEAD, + StartKind, DEAD, }, util::{ - alphabet::ByteClasses, - bytes::{self, DeserializeError, Endian, SerializeError}, - id::{PatternID, StateID}, - start::Start, - DebugByte, + alphabet::{ByteClasses, ByteSet}, + escape::DebugByte, + int::{Pointer, Usize, U16, U32}, + prefilter::Prefilter, + primitives::{PatternID, StateID}, + search::{Anchored, Input, MatchError}, + start::{Start, StartByteMap}, + wire::{self, DeserializeError, Endian, SerializeError}, }, }; @@ -107,14 +107,11 @@ const VERSION: u32 = 2; /// for searching. 
For example: /// /// ``` -/// use regex_automata::{ -/// dfa::{Automaton, sparse::DFA}, -/// HalfMatch, -/// }; +/// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; /// /// let dfa = DFA::new("foo[0-9]+")?; -/// let expected = HalfMatch::must(0, 8); -/// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); +/// let expected = Some(HalfMatch::must(0, 8)); +/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` #[derive(Clone)] @@ -130,12 +127,15 @@ pub struct DFA<T> { // // That is, a lot of the complexity is pushed down into how each state // itself is represented. - trans: Transitions<T>, - starts: StartTable<T>, + tt: Transitions<T>, + st: StartTable<T>, special: Special, + pre: Option<Prefilter>, + quitset: ByteSet, + flags: Flags, } -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] impl DFA<Vec<u8>> { /// Parse the given regular expression using a default configuration and /// return the corresponding sparse DFA. 
@@ -149,18 +149,16 @@ impl DFA<Vec<u8>> { /// # Example /// /// ``` - /// use regex_automata::{ - /// dfa::{Automaton, sparse}, - /// HalfMatch, - /// }; + /// use regex_automata::{dfa::{Automaton, sparse}, HalfMatch, Input}; /// /// let dfa = sparse::DFA::new("foo[0-9]+bar")?; /// - /// let expected = HalfMatch::must(0, 11); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?); + /// let expected = Some(HalfMatch::must(0, 11)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` - pub fn new(pattern: &str) -> Result<DFA<Vec<u8>>, Error> { + #[cfg(feature = "syntax")] + pub fn new(pattern: &str) -> Result<DFA<Vec<u8>>, BuildError> { dense::Builder::new() .build(pattern) .and_then(|dense| dense.to_sparse()) @@ -178,26 +176,24 @@ impl DFA<Vec<u8>> { /// # Example /// /// ``` - /// use regex_automata::{ - /// dfa::{Automaton, sparse}, - /// HalfMatch, - /// }; + /// use regex_automata::{dfa::{Automaton, sparse}, HalfMatch, Input}; /// /// let dfa = sparse::DFA::new_many(&["[0-9]+", "[a-z]+"])?; - /// let expected = HalfMatch::must(1, 3); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?); + /// let expected = Some(HalfMatch::must(1, 3)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` + #[cfg(feature = "syntax")] pub fn new_many<P: AsRef<str>>( patterns: &[P], - ) -> Result<DFA<Vec<u8>>, Error> { + ) -> Result<DFA<Vec<u8>>, BuildError> { dense::Builder::new() .build_many(patterns) .and_then(|dense| dense.to_sparse()) } } -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] impl DFA<Vec<u8>> { /// Create a new DFA that matches every input. 
/// @@ -206,17 +202,17 @@ impl DFA<Vec<u8>> { /// ``` /// use regex_automata::{ /// dfa::{Automaton, sparse}, - /// HalfMatch, + /// HalfMatch, Input, /// }; /// /// let dfa = sparse::DFA::always_match()?; /// - /// let expected = HalfMatch::must(0, 0); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"")?); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo")?); + /// let expected = Some(HalfMatch::must(0, 0)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new(""))?); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo"))?); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` - pub fn always_match() -> Result<DFA<Vec<u8>>, Error> { + pub fn always_match() -> Result<DFA<Vec<u8>>, BuildError> { dense::DFA::always_match()?.to_sparse() } @@ -225,21 +221,21 @@ impl DFA<Vec<u8>> { /// # Example /// /// ``` - /// use regex_automata::dfa::{Automaton, sparse}; + /// use regex_automata::{dfa::{Automaton, sparse}, Input}; /// /// let dfa = sparse::DFA::never_match()?; - /// assert_eq!(None, dfa.find_leftmost_fwd(b"")?); - /// assert_eq!(None, dfa.find_leftmost_fwd(b"foo")?); + /// assert_eq!(None, dfa.try_search_fwd(&Input::new(""))?); + /// assert_eq!(None, dfa.try_search_fwd(&Input::new("foo"))?); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` - pub fn never_match() -> Result<DFA<Vec<u8>>, Error> { + pub fn never_match() -> Result<DFA<Vec<u8>>, BuildError> { dense::DFA::never_match()?.to_sparse() } /// The implementation for constructing a sparse DFA from a dense DFA. pub(crate) fn from_dense<T: AsRef<[u32]>>( dfa: &dense::DFA<T>, - ) -> Result<DFA<Vec<u8>>, Error> { + ) -> Result<DFA<Vec<u8>>, BuildError> { // In order to build the transition table, we need to be able to write // state identifiers for each of the "next" transitions in each state. // Our state identifiers correspond to the byte offset in the @@ -249,35 +245,35 @@ impl DFA<Vec<u8>> { // of the transition table happens in two passes. 
// // In the first pass, we fill out the shell of each state, which - // includes the transition count, the input byte ranges and zero-filled - // space for the transitions and accelerators, if present. In this - // first pass, we also build up a map from the state identifier index - // of the dense DFA to the state identifier in this sparse DFA. + // includes the transition length, the input byte ranges and + // zero-filled space for the transitions and accelerators, if present. + // In this first pass, we also build up a map from the state identifier + // index of the dense DFA to the state identifier in this sparse DFA. // // In the second pass, we fill in the transitions based on the map // built in the first pass. // The capacity given here reflects a minimum. (Well, the true minimum // is likely even bigger, but hopefully this saves a few reallocs.) - let mut sparse = Vec::with_capacity(StateID::SIZE * dfa.state_count()); + let mut sparse = Vec::with_capacity(StateID::SIZE * dfa.state_len()); // This maps state indices from the dense DFA to StateIDs in the sparse // DFA. We build out this map on the first pass, and then use it in the // second pass to back-fill our transitions. 
- let mut remap: Vec<StateID> = vec![DEAD; dfa.state_count()]; + let mut remap: Vec<StateID> = vec![DEAD; dfa.state_len()]; for state in dfa.states() { let pos = sparse.len(); - remap[dfa.to_index(state.id())] = - StateID::new(pos).map_err(|_| Error::too_many_states())?; - // zero-filled space for the transition count + remap[dfa.to_index(state.id())] = StateID::new(pos) + .map_err(|_| BuildError::too_many_states())?; + // zero-filled space for the transition length sparse.push(0); sparse.push(0); - let mut transition_count = 0; + let mut transition_len = 0; for (unit1, unit2, _) in state.sparse_transitions() { match (unit1.as_u8(), unit2.as_u8()) { (Some(b1), Some(b2)) => { - transition_count += 1; + transition_len += 1; sparse.push(b1); sparse.push(b2); } @@ -298,40 +294,40 @@ impl DFA<Vec<u8>> { // N.B. The loop above is not guaranteed to yield the EOI // transition, since it may point to a DEAD state. By putting // it here, we always write the EOI transition, and thus - // guarantee that our transition count is >0. Why do we always + // guarantee that our transition length is >0. Why do we always // need the EOI transition? Because in order to implement // Automaton::next_eoi_state, this lets us just ask for the last // transition. There are probably other/better ways to do this. - transition_count += 1; + transition_len += 1; sparse.push(0); sparse.push(0); - // Check some assumptions about transition count. + // Check some assumptions about transition length. assert_ne!( - transition_count, 0, - "transition count should be non-zero", + transition_len, 0, + "transition length should be non-zero", ); assert!( - transition_count <= 257, - "expected transition count {} to be <= 257", - transition_count, + transition_len <= 257, + "expected transition length {} to be <= 257", + transition_len, ); - // Fill in the transition count. - // Since transition count is always <= 257, we use the most + // Fill in the transition length. 
+ // Since transition length is always <= 257, we use the most // significant bit to indicate whether this is a match state or // not. let ntrans = if dfa.is_match_state(state.id()) { - transition_count | (1 << 15) + transition_len | (1 << 15) } else { - transition_count + transition_len }; - bytes::NE::write_u16(ntrans, &mut sparse[pos..]); + wire::NE::write_u16(ntrans, &mut sparse[pos..]); // zero-fill the actual transitions. - // Unwraps are OK since transition_count <= 257 and our minimum + // Unwraps are OK since transition_length <= 257 and our minimum // support usize size is 16-bits. - let zeros = usize::try_from(transition_count) + let zeros = usize::try_from(transition_len) .unwrap() .checked_mul(StateID::SIZE) .unwrap(); @@ -355,18 +351,18 @@ impl DFA<Vec<u8>> { sparse.extend(iter::repeat(0).take(zeros)); // Now write the length prefix. - bytes::NE::write_u32( + wire::NE::write_u32( // Will never fail since u32::MAX is invalid pattern ID. // Thus, the number of pattern IDs is representable by a // u32. - plen.try_into().expect("pattern ID count fits in u32"), + plen.try_into().expect("pattern ID length fits in u32"), &mut sparse[pos..], ); pos += size_of::<u32>(); // Now write the pattern IDs. for &pid in dfa.pattern_id_slice(state.id()) { - pos += bytes::write_pattern_id::<bytes::NE>( + pos += wire::write_pattern_id::<wire::NE>( pid, &mut sparse[pos..], ); @@ -384,28 +380,31 @@ impl DFA<Vec<u8>> { } let mut new = DFA { - trans: Transitions { + tt: Transitions { sparse, classes: dfa.byte_classes().clone(), - count: dfa.state_count(), - patterns: dfa.pattern_count(), + state_len: dfa.state_len(), + pattern_len: dfa.pattern_len(), }, - starts: StartTable::from_dense_dfa(dfa, &remap)?, + st: StartTable::from_dense_dfa(dfa, &remap)?, special: dfa.special().remap(|id| remap[dfa.to_index(id)]), + pre: dfa.get_prefilter().map(|p| p.clone()), + quitset: dfa.quitset().clone(), + flags: dfa.flags().clone(), }; // And here's our second pass. 
Iterate over all of the dense states // again, and update the transitions in each of the states in the // sparse DFA. for old_state in dfa.states() { let new_id = remap[dfa.to_index(old_state.id())]; - let mut new_state = new.trans.state_mut(new_id); + let mut new_state = new.tt.state_mut(new_id); let sparse = old_state.sparse_transitions(); for (i, (_, _, next)) in sparse.enumerate() { let next = remap[dfa.to_index(next)]; new_state.set_next_at(i, next); } } - trace!( + debug!( "created sparse DFA, memory usage: {} (dense memory usage: {})", new.memory_usage(), dfa.memory_usage(), @@ -419,9 +418,12 @@ impl<T: AsRef<[u8]>> DFA<T> { /// DFA returned always uses `&[u8]` for its transitions. pub fn as_ref<'a>(&'a self) -> DFA<&'a [u8]> { DFA { - trans: self.trans.as_ref(), - starts: self.starts.as_ref(), + tt: self.tt.as_ref(), + st: self.st.as_ref(), special: self.special, + pre: self.pre.clone(), + quitset: self.quitset, + flags: self.flags, } } @@ -431,36 +433,67 @@ impl<T: AsRef<[u8]>> DFA<T> { /// Effectively, this returns a sparse DFA whose transitions live on the /// heap. #[cfg(feature = "alloc")] - pub fn to_owned(&self) -> DFA<Vec<u8>> { + pub fn to_owned(&self) -> DFA<alloc::vec::Vec<u8>> { DFA { - trans: self.trans.to_owned(), - starts: self.starts.to_owned(), + tt: self.tt.to_owned(), + st: self.st.to_owned(), special: self.special, + pre: self.pre.clone(), + quitset: self.quitset, + flags: self.flags, } } - /// Returns the memory usage, in bytes, of this DFA. + /// Returns the starting state configuration for this DFA. /// - /// The memory usage is computed based on the number of bytes used to - /// represent this DFA. - /// - /// This does **not** include the stack size used up by this DFA. To - /// compute that, use `std::mem::size_of::<sparse::DFA>()`. 
- pub fn memory_usage(&self) -> usize { - self.trans.memory_usage() + self.starts.memory_usage() + /// The default is [`StartKind::Both`], which means the DFA supports both + /// unanchored and anchored searches. However, this can generally lead to + /// bigger DFAs. Therefore, a DFA might be compiled with support for just + /// unanchored or anchored searches. In that case, running a search with + /// an unsupported configuration will panic. + pub fn start_kind(&self) -> StartKind { + self.st.kind } /// Returns true only if this DFA has starting states for each pattern. /// /// When a DFA has starting states for each pattern, then a search with the /// DFA can be configured to only look for anchored matches of a specific - /// pattern. Specifically, APIs like [`Automaton::find_earliest_fwd_at`] - /// can accept a non-None `pattern_id` if and only if this method returns - /// true. Otherwise, calling `find_earliest_fwd_at` will panic. + /// pattern. Specifically, APIs like [`Automaton::try_search_fwd`] can + /// accept a [`Anchored::Pattern`] if and only if this method returns true. + /// Otherwise, an error will be returned. /// /// Note that if the DFA is empty, this always returns false. - pub fn has_starts_for_each_pattern(&self) -> bool { - self.starts.patterns > 0 + pub fn starts_for_each_pattern(&self) -> bool { + self.st.pattern_len.is_some() + } + + /// Returns the equivalence classes that make up the alphabet for this DFA. + /// + /// Unless [`dense::Config::byte_classes`] was disabled, it is possible + /// that multiple distinct bytes are grouped into the same equivalence + /// class if it is impossible for them to discriminate between a match and + /// a non-match. This has the effect of reducing the overall alphabet size + /// and in turn potentially substantially reducing the size of the DFA's + /// transition table. 
+ /// + /// The downside of using equivalence classes like this is that every state + /// transition will automatically use this map to convert an arbitrary + /// byte to its corresponding equivalence class. In practice this has a + /// negligible impact on performance. + pub fn byte_classes(&self) -> &ByteClasses { + &self.tt.classes + } + + /// Returns the memory usage, in bytes, of this DFA. + /// + /// The memory usage is computed based on the number of bytes used to + /// represent this DFA. + /// + /// This does **not** include the stack size used up by this DFA. To + /// compute that, use `std::mem::size_of::<sparse::DFA>()`. + pub fn memory_usage(&self) -> usize { + self.tt.memory_usage() + self.st.memory_usage() } } @@ -488,10 +521,7 @@ impl<T: AsRef<[u8]>> DFA<T> { /// This example shows how to serialize and deserialize a DFA: /// /// ``` - /// use regex_automata::{ - /// dfa::{Automaton, sparse::DFA}, - /// HalfMatch, - /// }; + /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; /// /// // Compile our original DFA. /// let original_dfa = DFA::new("foo[0-9]+")?; @@ -503,13 +533,13 @@ impl<T: AsRef<[u8]>> DFA<T> { /// // ignore it. 
/// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0; /// - /// let expected = HalfMatch::must(0, 8); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` - #[cfg(feature = "alloc")] + #[cfg(feature = "dfa-build")] pub fn to_bytes_little_endian(&self) -> Vec<u8> { - self.to_bytes::<bytes::LE>() + self.to_bytes::<wire::LE>() } /// Serialize this DFA as raw bytes to a `Vec<u8>` in big endian @@ -533,10 +563,7 @@ impl<T: AsRef<[u8]>> DFA<T> { /// This example shows how to serialize and deserialize a DFA: /// /// ``` - /// use regex_automata::{ - /// dfa::{Automaton, sparse::DFA}, - /// HalfMatch, - /// }; + /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; /// /// // Compile our original DFA. /// let original_dfa = DFA::new("foo[0-9]+")?; @@ -548,13 +575,13 @@ impl<T: AsRef<[u8]>> DFA<T> { /// // ignore it. /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0; /// - /// let expected = HalfMatch::must(0, 8); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` - #[cfg(feature = "alloc")] + #[cfg(feature = "dfa-build")] pub fn to_bytes_big_endian(&self) -> Vec<u8> { - self.to_bytes::<bytes::BE>() + self.to_bytes::<wire::BE>() } /// Serialize this DFA as raw bytes to a `Vec<u8>` in native endian @@ -587,10 +614,7 @@ impl<T: AsRef<[u8]>> DFA<T> { /// This example shows how to serialize and deserialize a DFA: /// /// ``` - /// use regex_automata::{ - /// dfa::{Automaton, sparse::DFA}, - /// HalfMatch, - /// }; + /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; /// /// // Compile our original DFA. 
/// let original_dfa = DFA::new("foo[0-9]+")?; @@ -600,18 +624,18 @@ impl<T: AsRef<[u8]>> DFA<T> { /// // ignore it. /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0; /// - /// let expected = HalfMatch::must(0, 8); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` - #[cfg(feature = "alloc")] + #[cfg(feature = "dfa-build")] pub fn to_bytes_native_endian(&self) -> Vec<u8> { - self.to_bytes::<bytes::NE>() + self.to_bytes::<wire::NE>() } /// The implementation of the public `to_bytes` serialization methods, /// which is generic over endianness. - #[cfg(feature = "alloc")] + #[cfg(feature = "dfa-build")] fn to_bytes<E: Endian>(&self) -> Vec<u8> { let mut buf = vec![0; self.write_to_len()]; // This should always succeed since the only possible serialization @@ -645,10 +669,7 @@ impl<T: AsRef<[u8]>> DFA<T> { /// dynamic memory allocation. /// /// ``` - /// use regex_automata::{ - /// dfa::{Automaton, sparse::DFA}, - /// HalfMatch, - /// }; + /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; /// /// // Compile our original DFA. 
/// let original_dfa = DFA::new("foo[0-9]+")?; @@ -660,15 +681,15 @@ impl<T: AsRef<[u8]>> DFA<T> { /// let written = original_dfa.write_to_native_endian(&mut buf)?; /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0; /// - /// let expected = HalfMatch::must(0, 8); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` pub fn write_to_little_endian( &self, dst: &mut [u8], ) -> Result<usize, SerializeError> { - self.write_to::<bytes::LE>(dst) + self.write_to::<wire::LE>(dst) } /// Serialize this DFA as raw bytes to the given slice, in big endian @@ -695,10 +716,7 @@ impl<T: AsRef<[u8]>> DFA<T> { /// dynamic memory allocation. /// /// ``` - /// use regex_automata::{ - /// dfa::{Automaton, sparse::DFA}, - /// HalfMatch, - /// }; + /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; /// /// // Compile our original DFA. /// let original_dfa = DFA::new("foo[0-9]+")?; @@ -710,15 +728,15 @@ impl<T: AsRef<[u8]>> DFA<T> { /// let written = original_dfa.write_to_native_endian(&mut buf)?; /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0; /// - /// let expected = HalfMatch::must(0, 8); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` pub fn write_to_big_endian( &self, dst: &mut [u8], ) -> Result<usize, SerializeError> { - self.write_to::<bytes::BE>(dst) + self.write_to::<wire::BE>(dst) } /// Serialize this DFA as raw bytes to the given slice, in native endian @@ -754,10 +772,7 @@ impl<T: AsRef<[u8]>> DFA<T> { /// dynamic memory allocation. 
/// /// ``` - /// use regex_automata::{ - /// dfa::{Automaton, sparse::DFA}, - /// HalfMatch, - /// }; + /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; /// /// // Compile our original DFA. /// let original_dfa = DFA::new("foo[0-9]+")?; @@ -767,15 +782,15 @@ impl<T: AsRef<[u8]>> DFA<T> { /// let written = original_dfa.write_to_native_endian(&mut buf)?; /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0; /// - /// let expected = HalfMatch::must(0, 8); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` pub fn write_to_native_endian( &self, dst: &mut [u8], ) -> Result<usize, SerializeError> { - self.write_to::<bytes::NE>(dst) + self.write_to::<wire::NE>(dst) } /// The implementation of the public `write_to` serialization methods, @@ -785,17 +800,19 @@ impl<T: AsRef<[u8]>> DFA<T> { dst: &mut [u8], ) -> Result<usize, SerializeError> { let mut nw = 0; - nw += bytes::write_label(LABEL, &mut dst[nw..])?; - nw += bytes::write_endianness_check::<E>(&mut dst[nw..])?; - nw += bytes::write_version::<E>(VERSION, &mut dst[nw..])?; + nw += wire::write_label(LABEL, &mut dst[nw..])?; + nw += wire::write_endianness_check::<E>(&mut dst[nw..])?; + nw += wire::write_version::<E>(VERSION, &mut dst[nw..])?; nw += { // Currently unused, intended for future flexibility E::write_u32(0, &mut dst[nw..]); size_of::<u32>() }; - nw += self.trans.write_to::<E>(&mut dst[nw..])?; - nw += self.starts.write_to::<E>(&mut dst[nw..])?; + nw += self.flags.write_to::<E>(&mut dst[nw..])?; + nw += self.tt.write_to::<E>(&mut dst[nw..])?; + nw += self.st.write_to::<E>(&mut dst[nw..])?; nw += self.special.write_to::<E>(&mut dst[nw..])?; + nw += self.quitset.write_to::<E>(&mut dst[nw..])?; Ok(nw) } @@ -817,10 +834,7 @@ impl<T: AsRef<[u8]>> DFA<T> { /// a sparse DFA. 
/// /// ``` - /// use regex_automata::{ - /// dfa::{Automaton, sparse::DFA}, - /// HalfMatch, - /// }; + /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; /// /// // Compile our original DFA. /// let original_dfa = DFA::new("foo[0-9]+")?; @@ -829,18 +843,20 @@ impl<T: AsRef<[u8]>> DFA<T> { /// let written = original_dfa.write_to_native_endian(&mut buf)?; /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0; /// - /// let expected = HalfMatch::must(0, 8); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` pub fn write_to_len(&self) -> usize { - bytes::write_label_len(LABEL) - + bytes::write_endianness_check_len() - + bytes::write_version_len() + wire::write_label_len(LABEL) + + wire::write_endianness_check_len() + + wire::write_version_len() + size_of::<u32>() // unused, intended for future flexibility - + self.trans.write_to_len() - + self.starts.write_to_len() + + self.flags.write_to_len() + + self.tt.write_to_len() + + self.st.write_to_len() + self.special.write_to_len() + + self.quitset.write_to_len() } } @@ -901,17 +917,14 @@ impl<'a> DFA<&'a [u8]> { /// and then use it for searching. 
/// /// ``` - /// use regex_automata::{ - /// dfa::{Automaton, sparse::DFA}, - /// HalfMatch, - /// }; + /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; /// /// let initial = DFA::new("foo[0-9]+")?; /// let bytes = initial.to_bytes_native_endian(); /// let dfa: DFA<&[u8]> = DFA::from_bytes(&bytes)?.0; /// - /// let expected = HalfMatch::must(0, 8); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` /// @@ -927,7 +940,7 @@ impl<'a> DFA<&'a [u8]> { /// a file: /// /// ```no_run - /// use regex_automata::dfa::{Automaton, sparse::DFA}; + /// use regex_automata::dfa::sparse::DFA; /// /// let dfa = DFA::new("foo[0-9]+")?; /// @@ -943,23 +956,22 @@ impl<'a> DFA<&'a [u8]> { /// /// And now the second part is embedding the DFA into the compiled program /// and deserializing it at runtime on first use. We use conditional - /// compilation to choose the correct endianness. As mentioned above, we - /// do not need to employ any special tricks to ensure a proper alignment, - /// since a sparse DFA has no alignment requirements. + /// compilation to choose the correct endianness. We do not need to employ + /// any special tricks to ensure a proper alignment, since a sparse DFA has + /// no alignment requirements. /// /// ```no_run /// use regex_automata::{ - /// dfa::{Automaton, sparse}, - /// HalfMatch, + /// dfa::{Automaton, sparse::DFA}, + /// util::lazy::Lazy, + /// HalfMatch, Input, /// }; /// - /// type DFA = sparse::DFA<&'static [u8]>; - /// - /// fn get_foo() -> &'static DFA { - /// use std::cell::Cell; - /// use std::mem::MaybeUninit; - /// use std::sync::Once; - /// + /// // This crate provides its own "lazy" type, kind of like + /// // lazy_static! or once_cell::sync::Lazy. 
But it works in no-alloc + /// // no-std environments and let's us write this using completely + /// // safe code. + /// static RE: Lazy<DFA<&'static [u8]>> = Lazy::new(|| { /// # const _: &str = stringify! { /// #[cfg(target_endian = "big")] /// static BYTES: &[u8] = include_bytes!("foo.bigendian.dfa"); @@ -968,33 +980,13 @@ impl<'a> DFA<&'a [u8]> { /// # }; /// # static BYTES: &[u8] = b""; /// - /// struct Lazy(Cell<MaybeUninit<DFA>>); - /// // SAFETY: This is safe because DFA impls Sync. - /// unsafe impl Sync for Lazy {} - /// - /// static INIT: Once = Once::new(); - /// static DFA: Lazy = Lazy(Cell::new(MaybeUninit::uninit())); - /// - /// INIT.call_once(|| { - /// let (dfa, _) = DFA::from_bytes(BYTES) - /// .expect("serialized DFA should be valid"); - /// // SAFETY: This is guaranteed to only execute once, and all - /// // we do with the pointer is write the DFA to it. - /// unsafe { - /// (*DFA.0.as_ptr()).as_mut_ptr().write(dfa); - /// } - /// }); - /// // SAFETY: DFA is guaranteed to by initialized via INIT and is - /// // stored in static memory. - /// unsafe { - /// let dfa = (*DFA.0.as_ptr()).as_ptr(); - /// std::mem::transmute::<*const DFA, &'static DFA>(dfa) - /// } - /// } - /// - /// let dfa = get_foo(); - /// let expected = HalfMatch::must(0, 8); - /// assert_eq!(Ok(Some(expected)), dfa.find_leftmost_fwd(b"foo12345")); + /// let (dfa, _) = DFA::from_bytes(BYTES) + /// .expect("serialized DFA should be valid"); + /// dfa + /// }); + /// + /// let expected = Ok(Some(HalfMatch::must(0, 8))); + /// assert_eq!(expected, RE.try_search_fwd(&Input::new("foo12345"))); /// ``` /// /// Alternatively, consider using @@ -1009,8 +1001,8 @@ impl<'a> DFA<&'a [u8]> { // (by trying to decode every state) and start state ID list below. If // either validation fails, then we return an error. let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? 
}; - dfa.trans.validate()?; - dfa.starts.validate(&dfa.trans)?; + dfa.tt.validate(&dfa.special)?; + dfa.st.validate(&dfa.special, &dfa.tt)?; // N.B. dfa.special doesn't have a way to do unchecked deserialization, // so it has already been validated. Ok((dfa, nread)) @@ -1029,23 +1021,20 @@ impl<'a> DFA<&'a [u8]> { /// /// # Safety /// - /// This routine is unsafe because it permits callers to provide + /// This routine is not safe because it permits callers to provide /// arbitrary transitions with possibly incorrect state identifiers. While /// the various serialization routines will never return an incorrect - /// DFA, there is no guarantee that the bytes provided here - /// are correct. While `from_bytes_unchecked` will still do several forms - /// of basic validation, this routine does not check that the transitions - /// themselves are correct. Given an incorrect transition table, it is - /// possible for the search routines to access out-of-bounds memory because - /// of explicit bounds check elision. + /// DFA, there is no guarantee that the bytes provided here are correct. + /// While `from_bytes_unchecked` will still do several forms of basic + /// validation, this routine does not check that the transitions themselves + /// are correct. Given an incorrect transition table, it is possible for + /// the search routines to access out-of-bounds memory because of explicit + /// bounds check elision. /// /// # Example /// /// ``` - /// use regex_automata::{ - /// dfa::{Automaton, sparse::DFA}, - /// HalfMatch, - /// }; + /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; /// /// let initial = DFA::new("foo[0-9]+")?; /// let bytes = initial.to_bytes_native_endian(); @@ -1053,8 +1042,8 @@ impl<'a> DFA<&'a [u8]> { /// // directly from a compatible serialization routine. 
/// let dfa: DFA<&[u8]> = unsafe { DFA::from_bytes_unchecked(&bytes)?.0 }; /// - /// let expected = HalfMatch::must(0, 8); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?); + /// let expected = Some(HalfMatch::must(0, 8)); + /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` pub unsafe fn from_bytes_unchecked( @@ -1062,56 +1051,70 @@ impl<'a> DFA<&'a [u8]> { ) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> { let mut nr = 0; - nr += bytes::read_label(&slice[nr..], LABEL)?; - nr += bytes::read_endianness_check(&slice[nr..])?; - nr += bytes::read_version(&slice[nr..], VERSION)?; + nr += wire::read_label(&slice[nr..], LABEL)?; + nr += wire::read_endianness_check(&slice[nr..])?; + nr += wire::read_version(&slice[nr..], VERSION)?; - let _unused = bytes::try_read_u32(&slice[nr..], "unused space")?; + let _unused = wire::try_read_u32(&slice[nr..], "unused space")?; nr += size_of::<u32>(); - let (trans, nread) = Transitions::from_bytes_unchecked(&slice[nr..])?; + let (flags, nread) = Flags::from_bytes(&slice[nr..])?; + nr += nread; + + let (tt, nread) = Transitions::from_bytes_unchecked(&slice[nr..])?; nr += nread; - let (starts, nread) = StartTable::from_bytes_unchecked(&slice[nr..])?; + let (st, nread) = StartTable::from_bytes_unchecked(&slice[nr..])?; nr += nread; let (special, nread) = Special::from_bytes(&slice[nr..])?; nr += nread; - if special.max.as_usize() >= trans.sparse().len() { + if special.max.as_usize() >= tt.sparse().len() { return Err(DeserializeError::generic( "max should not be greater than or equal to sparse bytes", )); } - Ok((DFA { trans, starts, special }, nr)) + let (quitset, nread) = ByteSet::from_bytes(&slice[nr..])?; + nr += nread; + + // Prefilters don't support serialization, so they're always absent. 
+ let pre = None; + Ok((DFA { tt, st, special, pre, quitset, flags }, nr)) } } impl<T: AsRef<[u8]>> fmt::Debug for DFA<T> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { writeln!(f, "sparse::DFA(")?; - for state in self.trans.states() { + for state in self.tt.states() { fmt_state_indicator(f, self, state.id())?; - writeln!(f, "{:06?}: {:?}", state.id(), state)?; + writeln!(f, "{:06?}: {:?}", state.id().as_usize(), state)?; } writeln!(f, "")?; - for (i, (start_id, sty, pid)) in self.starts.iter().enumerate() { - if i % self.starts.stride == 0 { - match pid { - None => writeln!(f, "START-GROUP(ALL)")?, - Some(pid) => { - writeln!(f, "START_GROUP(pattern: {:?})", pid)? - } + for (i, (start_id, anchored, sty)) in self.st.iter().enumerate() { + if i % self.st.stride == 0 { + match anchored { + Anchored::No => writeln!(f, "START-GROUP(unanchored)")?, + Anchored::Yes => writeln!(f, "START-GROUP(anchored)")?, + Anchored::Pattern(pid) => writeln!( + f, + "START_GROUP(pattern: {:?})", + pid.as_usize() + )?, } } writeln!(f, " {:?} => {:06?}", sty, start_id.as_usize())?; } - writeln!(f, "state count: {:?}", self.trans.count)?; + writeln!(f, "state length: {:?}", self.tt.state_len)?; + writeln!(f, "pattern length: {:?}", self.pattern_len())?; + writeln!(f, "flags: {:?}", self.flags)?; writeln!(f, ")")?; Ok(()) } } +// SAFETY: We assert that our implementation of each method is correct. unsafe impl<T: AsRef<[u8]>> Automaton for DFA<T> { #[inline] fn is_special_state(&self, id: StateID) -> bool { @@ -1145,10 +1148,10 @@ unsafe impl<T: AsRef<[u8]>> Automaton for DFA<T> { // This is marked as inline to help dramatically boost sparse searching, // which decodes each state it enters to follow the next transition. 
- #[inline(always)] + #[cfg_attr(feature = "perf-inline", inline(always))] fn next_state(&self, current: StateID, input: u8) -> StateID { - let input = self.trans.classes.get(input); - self.trans.state(current).next(input) + let input = self.tt.classes.get(input); + self.tt.state(current).next(input) } #[inline] @@ -1162,17 +1165,17 @@ unsafe impl<T: AsRef<[u8]>> Automaton for DFA<T> { #[inline] fn next_eoi_state(&self, current: StateID) -> StateID { - self.trans.state(current).next_eoi() + self.tt.state(current).next_eoi() } #[inline] - fn pattern_count(&self) -> usize { - self.trans.patterns + fn pattern_len(&self) -> usize { + self.tt.pattern_len } #[inline] - fn match_count(&self, id: StateID) -> usize { - self.trans.state(id).pattern_count() + fn match_len(&self, id: StateID) -> usize { + self.tt.state(id).pattern_len() } #[inline] @@ -1182,39 +1185,76 @@ unsafe impl<T: AsRef<[u8]>> Automaton for DFA<T> { // that finds the pattern ID from the state machine, which requires // a bit of slicing/pointer-chasing. This optimization tends to only // matter when matches are frequent. 
- if self.trans.patterns == 1 { + if self.tt.pattern_len == 1 { return PatternID::ZERO; } - self.trans.state(id).pattern_id(match_index) + self.tt.state(id).pattern_id(match_index) + } + + #[inline] + fn has_empty(&self) -> bool { + self.flags.has_empty + } + + #[inline] + fn is_utf8(&self) -> bool { + self.flags.is_utf8 + } + + #[inline] + fn is_always_start_anchored(&self) -> bool { + self.flags.is_always_start_anchored } #[inline] fn start_state_forward( &self, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, - ) -> StateID { - let index = Start::from_position_fwd(bytes, start, end); - self.starts.start(index, pattern_id) + input: &Input<'_>, + ) -> Result<StateID, MatchError> { + if !self.quitset.is_empty() && input.start() > 0 { + let offset = input.start() - 1; + let byte = input.haystack()[offset]; + if self.quitset.contains(byte) { + return Err(MatchError::quit(byte, offset)); + } + } + let start = self.st.start_map.fwd(&input); + self.st.start(input, start) } #[inline] fn start_state_reverse( &self, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, - ) -> StateID { - let index = Start::from_position_rev(bytes, start, end); - self.starts.start(index, pattern_id) + input: &Input<'_>, + ) -> Result<StateID, MatchError> { + if !self.quitset.is_empty() && input.end() < input.haystack().len() { + let offset = input.end(); + let byte = input.haystack()[offset]; + if self.quitset.contains(byte) { + return Err(MatchError::quit(byte, offset)); + } + } + let start = self.st.start_map.rev(&input); + self.st.start(input, start) + } + + #[inline] + fn universal_start_state(&self, mode: Anchored) -> Option<StateID> { + match mode { + Anchored::No => self.st.universal_start_unanchored, + Anchored::Yes => self.st.universal_start_anchored, + Anchored::Pattern(_) => None, + } } #[inline] fn accelerator(&self, id: StateID) -> &[u8] { - self.trans.state(id).accelerator() + self.tt.state(id).accelerator() + } + + 
#[inline] + fn get_prefilter(&self) -> Option<&Prefilter> { + self.pre.as_ref() } } @@ -1278,43 +1318,38 @@ struct Transitions<T> { /// least one state---the dead state---even the empty DFA. In particular, /// the dead state always has ID 0 and is correspondingly always the first /// state. The dead state is never a match state. - count: usize, + state_len: usize, /// The total number of unique patterns represented by these match states. - patterns: usize, + pattern_len: usize, } impl<'a> Transitions<&'a [u8]> { unsafe fn from_bytes_unchecked( mut slice: &'a [u8], ) -> Result<(Transitions<&'a [u8]>, usize), DeserializeError> { - let slice_start = slice.as_ptr() as usize; + let slice_start = slice.as_ptr().as_usize(); - let (state_count, nr) = - bytes::try_read_u32_as_usize(&slice, "state count")?; + let (state_len, nr) = + wire::try_read_u32_as_usize(&slice, "state length")?; slice = &slice[nr..]; - let (pattern_count, nr) = - bytes::try_read_u32_as_usize(&slice, "pattern count")?; + let (pattern_len, nr) = + wire::try_read_u32_as_usize(&slice, "pattern length")?; slice = &slice[nr..]; let (classes, nr) = ByteClasses::from_bytes(&slice)?; slice = &slice[nr..]; let (len, nr) = - bytes::try_read_u32_as_usize(&slice, "sparse transitions length")?; + wire::try_read_u32_as_usize(&slice, "sparse transitions length")?; slice = &slice[nr..]; - bytes::check_slice_len(slice, len, "sparse states byte length")?; + wire::check_slice_len(slice, len, "sparse states byte length")?; let sparse = &slice[..len]; slice = &slice[len..]; - let trans = Transitions { - sparse, - classes, - count: state_count, - patterns: pattern_count, - }; - Ok((trans, slice.as_ptr() as usize - slice_start)) + let trans = Transitions { sparse, classes, state_len, pattern_len }; + Ok((trans, slice.as_ptr().as_usize() - slice_start)) } } @@ -1334,12 +1369,12 @@ impl<T: AsRef<[u8]>> Transitions<T> { } dst = &mut dst[..nwrite]; - // write state count - E::write_u32(u32::try_from(self.count).unwrap(), dst); + 
// write state length + E::write_u32(u32::try_from(self.state_len).unwrap(), dst); dst = &mut dst[size_of::<u32>()..]; - // write pattern count - E::write_u32(u32::try_from(self.patterns).unwrap(), dst); + // write pattern length + E::write_u32(u32::try_from(self.pattern_len).unwrap(), dst); dst = &mut dst[size_of::<u32>()..]; // write byte class map @@ -1351,15 +1386,22 @@ impl<T: AsRef<[u8]>> Transitions<T> { dst = &mut dst[size_of::<u32>()..]; // write actual transitions - dst.copy_from_slice(self.sparse()); + let mut id = DEAD; + while id.as_usize() < self.sparse().len() { + let state = self.state(id); + let n = state.write_to::<E>(&mut dst)?; + dst = &mut dst[n..]; + // The next ID is the offset immediately following `state`. + id = StateID::new(id.as_usize() + state.write_to_len()).unwrap(); + } Ok(nwrite) } /// Returns the number of bytes the serialized form of this transition /// table will use. fn write_to_len(&self) -> usize { - size_of::<u32>() // state count - + size_of::<u32>() // pattern count + size_of::<u32>() // state length + + size_of::<u32>() // pattern length + self.classes.write_to_len() + size_of::<u32>() // sparse transitions length + self.sparse().len() @@ -1369,7 +1411,7 @@ impl<T: AsRef<[u8]>> Transitions<T> { /// /// That is, every state ID can be used to correctly index a state in this /// table. - fn validate(&self) -> Result<(), DeserializeError> { + fn validate(&self, sp: &Special) -> Result<(), DeserializeError> { // In order to validate everything, we not only need to make sure we // can decode every state, but that every transition in every state // points to a valid state. There are many duplicative transitions, so @@ -1381,10 +1423,22 @@ impl<T: AsRef<[u8]>> Transitions<T> { // whether doing something more clever is worth it just yet. If you're // profiling this code and need it to run faster, please file an issue. // + // OK, so we also use this to record the set of valid state IDs. 
Since + // it is possible for a transition to point to an invalid state ID that + // still (somehow) deserializes to a valid state. So we need to make + // sure our transitions are limited to actually correct state IDs. + // The problem is, I'm not sure how to do this verification step in + // no-std no-alloc mode. I think we'd *have* to store the set of valid + // state IDs in the DFA itself. For now, we don't do this verification + // in no-std no-alloc mode. The worst thing that can happen is an + // incorrect result. But no panics or memory safety problems should + // result. Because we still do validate that the state itself is + // "valid" in the sense that everything it points to actually exists. + // // ---AG struct Seen { #[cfg(feature = "alloc")] - set: BTreeSet<StateID>, + set: alloc::collections::BTreeSet<StateID>, #[cfg(not(feature = "alloc"))] set: core::marker::PhantomData<StateID>, } @@ -1392,7 +1446,7 @@ impl<T: AsRef<[u8]>> Transitions<T> { #[cfg(feature = "alloc")] impl Seen { fn new() -> Seen { - Seen { set: BTreeSet::new() } + Seen { set: alloc::collections::BTreeSet::new() } } fn insert(&mut self, id: StateID) { self.set.insert(id); @@ -1416,38 +1470,78 @@ impl<T: AsRef<[u8]>> Transitions<T> { let mut verified: Seen = Seen::new(); // We need to make sure that we decode the correct number of states. // Otherwise, an empty set of transitions would validate even if the - // recorded state count is non-empty. - let mut count = 0; + // recorded state length is non-empty. + let mut len = 0; // We can't use the self.states() iterator because it assumes the state // encodings are valid. It could panic if they aren't. let mut id = DEAD; while id.as_usize() < self.sparse().len() { - let state = self.try_state(id)?; + // Before we even decode the state, we check that the ID itself + // is well formed. That is, if it's a special state then it must + // actually be a quit, dead, accel, match or start state. 
+ if sp.is_special_state(id) { + let is_actually_special = sp.is_dead_state(id) + || sp.is_quit_state(id) + || sp.is_match_state(id) + || sp.is_start_state(id) + || sp.is_accel_state(id); + if !is_actually_special { + // This is kind of a cryptic error message... + return Err(DeserializeError::generic( + "found sparse state tagged as special but \ + wasn't actually special", + )); + } + } + let state = self.try_state(sp, id)?; verified.insert(id); // The next ID should be the offset immediately following `state`. - id = StateID::new(bytes::add( + id = StateID::new(wire::add( id.as_usize(), - state.bytes_len(), + state.write_to_len(), "next state ID offset", )?) .map_err(|err| { DeserializeError::state_id_error(err, "next state ID offset") })?; - count += 1; - - // Now check that all transitions in this state are correct. + len += 1; + } + // Now that we've checked that all top-level states are correct and + // importantly, collected a set of valid state IDs, we have all the + // information we need to check that all transitions are correct too. + // + // Note that we can't use `valid_ids` to iterate because it will + // be empty in no-std no-alloc contexts. (And yes, that means our + // verification isn't quite as good.) We can use `self.states()` + // though at least, since we know that all states can at least be + // decoded and traversed correctly. + for state in self.states() { + // Check that all transitions in this state are correct. for i in 0..state.ntrans { let to = state.next_at(i); - if verified.contains(&to) { - continue; + // For no-alloc, we just check that the state can decode. It is + // technically possible that the state ID could still point to + // a non-existent state even if it decodes (fuzzing proved this + // to be true), but it shouldn't result in any memory unsafety + // or panics in non-debug mode. 
+ #[cfg(not(feature = "alloc"))] + { + let _ = self.try_state(sp, to)?; + } + #[cfg(feature = "alloc")] + { + if !verified.contains(&to) { + return Err(DeserializeError::generic( + "found transition that points to a \ + non-existent state", + )); + } } - let _ = self.try_state(to)?; - verified.insert(id); } } - if count != self.count { + if len != self.state_len { return Err(DeserializeError::generic( - "mismatching sparse state count", + "mismatching sparse state length", )); } Ok(()) @@ -1458,19 +1552,19 @@ impl<T: AsRef<[u8]>> Transitions<T> { Transitions { sparse: self.sparse(), classes: self.classes.clone(), - count: self.count, - patterns: self.patterns, + state_len: self.state_len, + pattern_len: self.pattern_len, } } /// Converts these transitions to an owned value. #[cfg(feature = "alloc")] - fn to_owned(&self) -> Transitions<Vec<u8>> { + fn to_owned(&self) -> Transitions<alloc::vec::Vec<u8>> { Transitions { sparse: self.sparse().to_vec(), classes: self.classes.clone(), - count: self.count, - patterns: self.patterns, + state_len: self.state_len, + pattern_len: self.pattern_len, } } @@ -1483,10 +1577,10 @@ impl<T: AsRef<[u8]>> Transitions<T> { /// functions involved are also inlined, which should hopefully eliminate /// a lot of the extraneous decoding that is never needed just to follow /// the next transition. 
- #[inline(always)] + #[cfg_attr(feature = "perf-inline", inline(always))] fn state(&self, id: StateID) -> State<'_> { let mut state = &self.sparse()[id.as_usize()..]; - let mut ntrans = bytes::read_u16(&state) as usize; + let mut ntrans = wire::read_u16(&state).as_usize(); let is_match = (1 << 15) & ntrans != 0; ntrans &= !(1 << 15); state = &state[2..]; @@ -1494,13 +1588,13 @@ impl<T: AsRef<[u8]>> Transitions<T> { let (input_ranges, state) = state.split_at(ntrans * 2); let (next, state) = state.split_at(ntrans * StateID::SIZE); let (pattern_ids, state) = if is_match { - let npats = bytes::read_u32(&state) as usize; + let npats = wire::read_u32(&state).as_usize(); state[4..].split_at(npats * 4) } else { (&[][..], state) }; - let accel_len = state[0] as usize; + let accel_len = usize::from(state[0]); let accel = &state[1..accel_len + 1]; State { id, is_match, ntrans, input_ranges, next, pattern_ids, accel } } @@ -1513,27 +1607,44 @@ impl<T: AsRef<[u8]>> Transitions<T> { /// all of its data is consistent. It does not verify that its state ID /// transitions point to valid states themselves, nor does it verify that /// every pattern ID is valid. - fn try_state(&self, id: StateID) -> Result<State<'_>, DeserializeError> { + fn try_state( + &self, + sp: &Special, + id: StateID, + ) -> Result<State<'_>, DeserializeError> { if id.as_usize() > self.sparse().len() { - return Err(DeserializeError::generic("invalid sparse state ID")); + return Err(DeserializeError::generic( + "invalid caller provided sparse state ID", + )); } let mut state = &self.sparse()[id.as_usize()..]; // Encoding format starts with a u16 that stores the total number of // transitions in this state. 
let (mut ntrans, _) = - bytes::try_read_u16_as_usize(state, "state transition count")?; + wire::try_read_u16_as_usize(state, "state transition length")?; let is_match = ((1 << 15) & ntrans) != 0; ntrans &= !(1 << 15); state = &state[2..]; if ntrans > 257 || ntrans == 0 { - return Err(DeserializeError::generic("invalid transition count")); + return Err(DeserializeError::generic( + "invalid transition length", + )); + } + if is_match && !sp.is_match_state(id) { + return Err(DeserializeError::generic( + "state marked as match but not in match ID range", + )); + } else if !is_match && sp.is_match_state(id) { + return Err(DeserializeError::generic( + "state in match ID range but not marked as match state", + )); } // Each transition has two pieces: an inclusive range of bytes on which // it is defined, and the state ID that those bytes transition to. The // pairs come first, followed by a corresponding sequence of state IDs. let input_ranges_len = ntrans.checked_mul(2).unwrap(); - bytes::check_slice_len(state, input_ranges_len, "sparse byte pairs")?; + wire::check_slice_len(state, input_ranges_len, "sparse byte pairs")?; let (input_ranges, state) = state.split_at(input_ranges_len); // Every range should be of the form A-B, where A<=B. for pair in input_ranges.chunks(2) { @@ -1549,13 +1660,13 @@ impl<T: AsRef<[u8]>> Transitions<T> { let next_len = ntrans .checked_mul(self.id_len()) .expect("state size * #trans should always fit in a usize"); - bytes::check_slice_len(state, next_len, "sparse trans state IDs")?; + wire::check_slice_len(state, next_len, "sparse trans state IDs")?; let (next, state) = state.split_at(next_len); // We can at least verify that every state ID is in bounds. 
for idbytes in next.chunks(self.id_len()) { let (id, _) = - bytes::read_state_id(idbytes, "sparse state ID in try_state")?; - bytes::check_slice_len( + wire::read_state_id(idbytes, "sparse state ID in try_state")?; + wire::check_slice_len( self.sparse(), id.as_usize(), "invalid sparse state ID", @@ -1567,19 +1678,24 @@ impl<T: AsRef<[u8]>> Transitions<T> { // encoded 32-bit integers. let (pattern_ids, state) = if is_match { let (npats, nr) = - bytes::try_read_u32_as_usize(state, "pattern ID count")?; + wire::try_read_u32_as_usize(state, "pattern ID length")?; let state = &state[nr..]; + if npats == 0 { + return Err(DeserializeError::generic( + "state marked as a match, but has no pattern IDs", + )); + } let pattern_ids_len = - bytes::mul(npats, 4, "sparse pattern ID byte length")?; - bytes::check_slice_len( + wire::mul(npats, 4, "sparse pattern ID byte length")?; + wire::check_slice_len( state, pattern_ids_len, "sparse pattern IDs", )?; let (pattern_ids, state) = state.split_at(pattern_ids_len); for patbytes in pattern_ids.chunks(PatternID::SIZE) { - bytes::read_pattern_id( + wire::read_pattern_id( patbytes, "sparse pattern ID in try_state", )?; @@ -1597,21 +1713,30 @@ impl<T: AsRef<[u8]>> Transitions<T> { if state.is_empty() { return Err(DeserializeError::generic("no accelerator length")); } - let (accel_len, state) = (state[0] as usize, &state[1..]); + let (accel_len, state) = (usize::from(state[0]), &state[1..]); if accel_len > 3 { return Err(DeserializeError::generic( "sparse invalid accelerator length", )); + } else if accel_len == 0 && sp.is_accel_state(id) { + return Err(DeserializeError::generic( + "got no accelerators in state, but in accelerator ID range", + )); + } else if accel_len > 0 && !sp.is_accel_state(id) { + return Err(DeserializeError::generic( + "state in accelerator ID range, but has no accelerators", + )); } - bytes::check_slice_len( + + wire::check_slice_len( state, accel_len, "sparse corrupt accelerator length", )?; let (accel, _) = 
(&state[..accel_len], &state[accel_len..]); - Ok(State { + let state = State { id, is_match, ntrans, @@ -1619,7 +1744,13 @@ impl<T: AsRef<[u8]>> Transitions<T> { next, pattern_ids, accel, - }) + }; + if sp.is_quit_state(state.next_at(state.ntrans - 1)) { + return Err(DeserializeError::generic( + "state with EOI transition to quit state is illegal", + )); + } + Ok(state) } /// Return an iterator over all of the states in this DFA. @@ -1648,13 +1779,13 @@ impl<T: AsRef<[u8]>> Transitions<T> { } } -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] impl<T: AsMut<[u8]>> Transitions<T> { /// Return a convenient mutable representation of the given state. /// This panics if the state is invalid. fn state_mut(&mut self, id: StateID) -> StateMut<'_> { let mut state = &mut self.sparse_mut()[id.as_usize()..]; - let mut ntrans = bytes::read_u16(&state) as usize; + let mut ntrans = wire::read_u16(&state).as_usize(); let is_match = (1 << 15) & ntrans != 0; ntrans &= !(1 << 15); state = &mut state[2..]; @@ -1662,13 +1793,13 @@ impl<T: AsMut<[u8]>> Transitions<T> { let (input_ranges, state) = state.split_at_mut(ntrans * 2); let (next, state) = state.split_at_mut(ntrans * StateID::SIZE); let (pattern_ids, state) = if is_match { - let npats = bytes::read_u32(&state) as usize; + let npats = wire::read_u32(&state).as_usize(); state[4..].split_at_mut(npats * 4) } else { (&mut [][..], state) }; - let accel_len = state[0] as usize; + let accel_len = usize::from(state[0]); let accel = &mut state[1..accel_len + 1]; StateMut { id, @@ -1702,53 +1833,85 @@ struct StartTable<T> { /// In practice, T is either Vec<u8> or &[u8] and has no alignment /// requirements. /// - /// The first `stride` (currently always 4) entries always correspond to - /// the start states for the entire DFA. 
After that, there are - /// `stride * patterns` state IDs, where `patterns` may be zero in the - /// case of a DFA with no patterns or in the case where the DFA was built - /// without enabling starting states for each pattern. + /// The first `2 * stride` (currently always 8) entries always correspond + /// to the starts states for the entire DFA, with the first 4 entries being + /// for unanchored searches and the second 4 entries being for anchored + /// searches. To keep things simple, we always use 8 entries even if the + /// `StartKind` is not both. + /// + /// After that, there are `stride * patterns` state IDs, where `patterns` + /// may be zero in the case of a DFA with no patterns or in the case where + /// the DFA was built without enabling starting states for each pattern. table: T, + /// The starting state configuration supported. When 'both', both + /// unanchored and anchored searches work. When 'unanchored', anchored + /// searches panic. When 'anchored', unanchored searches panic. + kind: StartKind, + /// The start state configuration for every possible byte. + start_map: StartByteMap, /// The number of starting state IDs per pattern. stride: usize, /// The total number of patterns for which starting states are encoded. - /// This may be zero for non-empty DFAs when the DFA was built without - /// start states for each pattern. - patterns: usize, + /// This is `None` for DFAs that were built without start states for each + /// pattern. Thus, one cannot use this field to say how many patterns + /// are in the DFA in all cases. It is specific to how many patterns are + /// represented in this start table. + pattern_len: Option<usize>, + /// The universal starting state for unanchored searches. This is only + /// present when the DFA supports unanchored searches and when all starting + /// state IDs for an unanchored search are equivalent. + universal_start_unanchored: Option<StateID>, + /// The universal starting state for anchored searches. 
This is only + /// present when the DFA supports anchored searches and when all starting + /// state IDs for an anchored search are equivalent. + universal_start_anchored: Option<StateID>, } -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] impl StartTable<Vec<u8>> { - fn new(patterns: usize) -> StartTable<Vec<u8>> { - let stride = Start::count(); + fn new<T: AsRef<[u32]>>( + dfa: &dense::DFA<T>, + pattern_len: Option<usize>, + ) -> StartTable<Vec<u8>> { + let stride = Start::len(); // This is OK since the only way we're here is if a dense DFA could be // constructed successfully, which uses the same space. let len = stride - .checked_mul(patterns) + .checked_mul(pattern_len.unwrap_or(0)) .unwrap() - .checked_add(stride) + .checked_add(stride.checked_mul(2).unwrap()) .unwrap() .checked_mul(StateID::SIZE) .unwrap(); - StartTable { table: vec![0; len], stride, patterns } + StartTable { + table: vec![0; len], + kind: dfa.start_kind(), + start_map: dfa.start_map().clone(), + stride, + pattern_len, + universal_start_unanchored: dfa + .universal_start_state(Anchored::No), + universal_start_anchored: dfa.universal_start_state(Anchored::Yes), + } } fn from_dense_dfa<T: AsRef<[u32]>>( dfa: &dense::DFA<T>, remap: &[StateID], - ) -> Result<StartTable<Vec<u8>>, Error> { + ) -> Result<StartTable<Vec<u8>>, BuildError> { // Unless the DFA has start states compiled for each pattern, then // as far as the starting state table is concerned, there are zero // patterns to account for. It will instead only store starting states // for the entire DFA. 
- let start_pattern_count = if dfa.has_starts_for_each_pattern() { - dfa.pattern_count() + let start_pattern_len = if dfa.starts_for_each_pattern() { + Some(dfa.pattern_len()) } else { - 0 + None }; - let mut sl = StartTable::new(start_pattern_count); - for (old_start_id, sty, pid) in dfa.starts() { + let mut sl = StartTable::new(dfa, start_pattern_len); + for (old_start_id, anchored, sty) in dfa.starts() { let new_start_id = remap[dfa.to_index(old_start_id)]; - sl.set_start(sty, pid, new_start_id); + sl.set_start(anchored, sty, new_start_id); } Ok(sl) } @@ -1758,53 +1921,98 @@ impl<'a> StartTable<&'a [u8]> { unsafe fn from_bytes_unchecked( mut slice: &'a [u8], ) -> Result<(StartTable<&'a [u8]>, usize), DeserializeError> { - let slice_start = slice.as_ptr() as usize; + let slice_start = slice.as_ptr().as_usize(); - let (stride, nr) = - bytes::try_read_u32_as_usize(slice, "sparse start table stride")?; + let (kind, nr) = StartKind::from_bytes(slice)?; slice = &slice[nr..]; - let (patterns, nr) = bytes::try_read_u32_as_usize( - slice, - "sparse start table patterns", - )?; + let (start_map, nr) = StartByteMap::from_bytes(slice)?; slice = &slice[nr..]; - if stride != Start::count() { + let (stride, nr) = + wire::try_read_u32_as_usize(slice, "sparse start table stride")?; + slice = &slice[nr..]; + if stride != Start::len() { return Err(DeserializeError::generic( "invalid sparse starting table stride", )); } - if patterns > PatternID::LIMIT { + + let (maybe_pattern_len, nr) = + wire::try_read_u32_as_usize(slice, "sparse start table patterns")?; + slice = &slice[nr..]; + let pattern_len = if maybe_pattern_len.as_u32() == u32::MAX { + None + } else { + Some(maybe_pattern_len) + }; + if pattern_len.map_or(false, |len| len > PatternID::LIMIT) { return Err(DeserializeError::generic( "sparse invalid number of patterns", )); } - let pattern_table_size = - bytes::mul(stride, patterns, "sparse invalid pattern count")?; + + let (universal_unanchored, nr) = + 
wire::try_read_u32(slice, "universal unanchored start")?; + slice = &slice[nr..]; + let universal_start_unanchored = if universal_unanchored == u32::MAX { + None + } else { + Some(StateID::try_from(universal_unanchored).map_err(|e| { + DeserializeError::state_id_error( + e, + "universal unanchored start", + ) + })?) + }; + + let (universal_anchored, nr) = + wire::try_read_u32(slice, "universal anchored start")?; + slice = &slice[nr..]; + let universal_start_anchored = if universal_anchored == u32::MAX { + None + } else { + Some(StateID::try_from(universal_anchored).map_err(|e| { + DeserializeError::state_id_error(e, "universal anchored start") + })?) + }; + + let pattern_table_size = wire::mul( + stride, + pattern_len.unwrap_or(0), + "sparse invalid pattern length", + )?; // Our start states always start with a single stride of start states // for the entire automaton which permit it to match any pattern. What // follows it are an optional set of start states for each pattern. - let start_state_count = bytes::add( - stride, + let start_state_len = wire::add( + wire::mul(2, stride, "start state stride too big")?, pattern_table_size, "sparse invalid 'any' pattern starts size", )?; - let table_bytes_len = bytes::mul( - start_state_count, + let table_bytes_len = wire::mul( + start_state_len, StateID::SIZE, "sparse pattern table bytes length", )?; - bytes::check_slice_len( + wire::check_slice_len( slice, table_bytes_len, "sparse start ID table", )?; - let table_bytes = &slice[..table_bytes_len]; + let table = &slice[..table_bytes_len]; slice = &slice[table_bytes_len..]; - let sl = StartTable { table: table_bytes, stride, patterns }; - Ok((sl, slice.as_ptr() as usize - slice_start)) + let sl = StartTable { + table, + kind, + start_map, + stride, + pattern_len, + universal_start_unanchored, + universal_start_anchored, + }; + Ok((sl, slice.as_ptr().as_usize() - slice_start)) } } @@ -1821,22 +2029,51 @@ impl<T: AsRef<[u8]>> StartTable<T> { } dst = &mut dst[..nwrite]; + // 
write start kind + let nw = self.kind.write_to::<E>(dst)?; + dst = &mut dst[nw..]; + // write start byte map + let nw = self.start_map.write_to(dst)?; + dst = &mut dst[nw..]; // write stride E::write_u32(u32::try_from(self.stride).unwrap(), dst); dst = &mut dst[size_of::<u32>()..]; - // write pattern count - E::write_u32(u32::try_from(self.patterns).unwrap(), dst); + // write pattern length + E::write_u32( + u32::try_from(self.pattern_len.unwrap_or(0xFFFF_FFFF)).unwrap(), + dst, + ); + dst = &mut dst[size_of::<u32>()..]; + // write universal start unanchored state id, u32::MAX if absent + E::write_u32( + self.universal_start_unanchored + .map_or(u32::MAX, |sid| sid.as_u32()), + dst, + ); + dst = &mut dst[size_of::<u32>()..]; + // write universal start anchored state id, u32::MAX if absent + E::write_u32( + self.universal_start_anchored.map_or(u32::MAX, |sid| sid.as_u32()), + dst, + ); dst = &mut dst[size_of::<u32>()..]; // write start IDs - dst.copy_from_slice(self.table()); + for (sid, _, _) in self.iter() { + E::write_u32(sid.as_u32(), dst); + dst = &mut dst[StateID::SIZE..]; + } Ok(nwrite) } /// Returns the number of bytes the serialized form of this transition /// table will use. fn write_to_len(&self) -> usize { - size_of::<u32>() // stride + self.kind.write_to_len() + + self.start_map.write_to_len() + + size_of::<u32>() // stride + size_of::<u32>() // # patterns + + size_of::<u32>() // universal unanchored start + + size_of::<u32>() // universal anchored start + self.table().len() } @@ -1846,10 +2083,29 @@ impl<T: AsRef<[u8]>> StartTable<T> { /// state in the DFA's sparse transitions. fn validate( &self, + sp: &Special, trans: &Transitions<T>, ) -> Result<(), DeserializeError> { for (id, _, _) in self.iter() { - let _ = trans.try_state(id)?; + if sp.is_match_state(id) { + return Err(DeserializeError::generic( + "start states cannot be match states", + )); + } + // Confirm that the start state points to a valid state. 
+ let state = trans.try_state(sp, id)?; + // And like for the transition table, confirm that the transitions + // on all start states themselves point to a valid state. + // + // It'd probably be better to integrate this validation with the + // transition table, or otherwise store a sorted sequence of all + // valid state IDs in the sparse DFA itself. That way, we could + // check that every pointer to a state corresponds precisely to a + // correct and valid state. + for i in 0..state.ntrans { + let to = state.next_at(i); + let _ = trans.try_state(sp, to)?; + } } Ok(()) } @@ -1858,18 +2114,26 @@ impl<T: AsRef<[u8]>> StartTable<T> { fn as_ref(&self) -> StartTable<&'_ [u8]> { StartTable { table: self.table(), + kind: self.kind, + start_map: self.start_map.clone(), stride: self.stride, - patterns: self.patterns, + pattern_len: self.pattern_len, + universal_start_unanchored: self.universal_start_unanchored, + universal_start_anchored: self.universal_start_anchored, } } /// Converts this start list to an owned value. #[cfg(feature = "alloc")] - fn to_owned(&self) -> StartTable<Vec<u8>> { + fn to_owned(&self) -> StartTable<alloc::vec::Vec<u8>> { StartTable { table: self.table().to_vec(), + kind: self.kind, + start_map: self.start_map.clone(), stride: self.stride, - patterns: self.patterns, + pattern_len: self.pattern_len, + universal_start_unanchored: self.universal_start_unanchored, + universal_start_anchored: self.universal_start_anchored, } } @@ -1879,26 +2143,45 @@ impl<T: AsRef<[u8]>> StartTable<T> { /// starting state for the given pattern is returned. If this start table /// does not have individual starting states for each pattern, then this /// panics. 
- fn start(&self, index: Start, pattern_id: Option<PatternID>) -> StateID { - let start_index = index.as_usize(); - let index = match pattern_id { - None => start_index, - Some(pid) => { - let pid = pid.as_usize(); - assert!(pid < self.patterns, "invalid pattern ID {:?}", pid); - self.stride - .checked_mul(pid) - .unwrap() - .checked_add(self.stride) - .unwrap() - .checked_add(start_index) - .unwrap() + fn start( + &self, + input: &Input<'_>, + start: Start, + ) -> Result<StateID, MatchError> { + let start_index = start.as_usize(); + let mode = input.get_anchored(); + let index = match mode { + Anchored::No => { + if !self.kind.has_unanchored() { + return Err(MatchError::unsupported_anchored(mode)); + } + start_index + } + Anchored::Yes => { + if !self.kind.has_anchored() { + return Err(MatchError::unsupported_anchored(mode)); + } + self.stride + start_index + } + Anchored::Pattern(pid) => { + let len = match self.pattern_len { + None => { + return Err(MatchError::unsupported_anchored(mode)) + } + Some(len) => len, + }; + if pid.as_usize() >= len { + return Ok(DEAD); + } + (2 * self.stride) + + (self.stride * pid.as_usize()) + + start_index } }; let start = index * StateID::SIZE; // This OK since we're allowed to assume that the start table contains // valid StateIDs. - bytes::read_state_id_unchecked(&self.table()[start..]).0 + Ok(wire::read_state_id_unchecked(&self.table()[start..]).0) } /// Return an iterator over all start IDs in this table. @@ -1924,27 +2207,26 @@ impl<T: AsRef<[u8]>> StartTable<T> { } } -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] impl<T: AsMut<[u8]>> StartTable<T> { /// Set the start state for the given index and pattern. /// /// If the pattern ID or state ID are not valid, then this will panic. 
- fn set_start( - &mut self, - index: Start, - pattern_id: Option<PatternID>, - id: StateID, - ) { - let start_index = index.as_usize(); - let index = match pattern_id { - None => start_index, - Some(pid) => { + fn set_start(&mut self, anchored: Anchored, start: Start, id: StateID) { + let start_index = start.as_usize(); + let index = match anchored { + Anchored::No => start_index, + Anchored::Yes => self.stride + start_index, + Anchored::Pattern(pid) => { let pid = pid.as_usize(); - assert!(pid < self.patterns, "invalid pattern ID {:?}", pid); + let len = self + .pattern_len + .expect("start states for each pattern enabled"); + assert!(pid < len, "invalid pattern ID {:?}", pid); self.stride .checked_mul(pid) .unwrap() - .checked_add(self.stride) + .checked_add(self.stride.checked_mul(2).unwrap()) .unwrap() .checked_add(start_index) .unwrap() @@ -1952,7 +2234,7 @@ impl<T: AsMut<[u8]>> StartTable<T> { }; let start = index * StateID::SIZE; let end = start + StateID::SIZE; - bytes::write_state_id::<bytes::NE>( + wire::write_state_id::<wire::NE>( id, &mut self.table.as_mut()[start..end], ); @@ -1966,9 +2248,9 @@ struct StartStateIter<'a, T> { } impl<'a, T: AsRef<[u8]>> Iterator for StartStateIter<'a, T> { - type Item = (StateID, Start, Option<PatternID>); + type Item = (StateID, Anchored, Start); - fn next(&mut self) -> Option<(StateID, Start, Option<PatternID>)> { + fn next(&mut self) -> Option<(StateID, Anchored, Start)> { let i = self.i; if i >= self.st.len() { return None; @@ -1978,18 +2260,13 @@ impl<'a, T: AsRef<[u8]>> Iterator for StartStateIter<'a, T> { // This unwrap is okay since the stride of any DFA must always match // the number of start state types. let start_type = Start::from_usize(i % self.st.stride).unwrap(); - let pid = if i < self.st.stride { - // This means we don't have start states for each pattern. 
- None + let anchored = if i < self.st.stride { + Anchored::No + } else if i < (2 * self.st.stride) { + Anchored::Yes } else { - // These unwraps are OK since we may assume our table and stride - // is correct. - let pid = i - .checked_sub(self.st.stride) - .unwrap() - .checked_div(self.st.stride) - .unwrap(); - Some(PatternID::new(pid).unwrap()) + let pid = (i - (2 * self.st.stride)) / self.st.stride; + Anchored::Pattern(PatternID::new(pid).unwrap()) }; let start = i * StateID::SIZE; let end = start + StateID::SIZE; @@ -1997,7 +2274,7 @@ impl<'a, T: AsRef<[u8]>> Iterator for StartStateIter<'a, T> { // This is OK since we're allowed to assume that any IDs in this start // table are correct and valid for this DFA. let id = StateID::from_ne_bytes_unchecked(bytes); - Some((id, start_type, pid)) + Some((id, anchored, start_type)) } } @@ -2024,7 +2301,7 @@ impl<'a, T: AsRef<[u8]>> Iterator for StateIter<'a, T> { return None; } let state = self.trans.state(StateID::new_unchecked(self.id)); - self.id = self.id + state.bytes_len(); + self.id = self.id + state.write_to_len(); Some(state) } } @@ -2071,7 +2348,7 @@ impl<'a> State<'a> { /// /// This is marked as inline to help dramatically boost sparse searching, /// which decodes each state it enters to follow the next transition. - #[inline(always)] + #[cfg_attr(feature = "perf-inline", inline(always))] fn next(&self, input: u8) -> StateID { // This straight linear search was observed to be much better than // binary search on ASCII haystacks, likely because a binary search @@ -2120,19 +2397,66 @@ impl<'a> State<'a> { /// is invalid, then this panics. fn pattern_id(&self, match_index: usize) -> PatternID { let start = match_index * PatternID::SIZE; - bytes::read_pattern_id_unchecked(&self.pattern_ids[start..]).0 + wire::read_pattern_id_unchecked(&self.pattern_ids[start..]).0 } /// Returns the total number of pattern IDs for this state. This is always /// zero when `is_match` is false. 
- fn pattern_count(&self) -> usize { + fn pattern_len(&self) -> usize { assert_eq!(0, self.pattern_ids.len() % 4); self.pattern_ids.len() / 4 } + /// Return an accelerator for this state. + fn accelerator(&self) -> &'a [u8] { + self.accel + } + + /// Write the raw representation of this state to the given buffer using + /// the given endianness. + fn write_to<E: Endian>( + &self, + mut dst: &mut [u8], + ) -> Result<usize, SerializeError> { + let nwrite = self.write_to_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small( + "sparse state transitions", + )); + } + + let ntrans = + if self.is_match { self.ntrans | (1 << 15) } else { self.ntrans }; + E::write_u16(u16::try_from(ntrans).unwrap(), dst); + dst = &mut dst[size_of::<u16>()..]; + + dst[..self.input_ranges.len()].copy_from_slice(self.input_ranges); + dst = &mut dst[self.input_ranges.len()..]; + + for i in 0..self.ntrans { + E::write_u32(self.next_at(i).as_u32(), dst); + dst = &mut dst[StateID::SIZE..]; + } + + if self.is_match { + E::write_u32(u32::try_from(self.pattern_len()).unwrap(), dst); + dst = &mut dst[size_of::<u32>()..]; + for i in 0..self.pattern_len() { + let pid = self.pattern_id(i); + E::write_u32(pid.as_u32(), dst); + dst = &mut dst[PatternID::SIZE..]; + } + } + + dst[0] = u8::try_from(self.accel.len()).unwrap(); + dst[1..][..self.accel.len()].copy_from_slice(self.accel); + + Ok(nwrite) + } + /// Return the total number of bytes that this state consumes in its /// encoded form. - fn bytes_len(&self) -> usize { + fn write_to_len(&self) -> usize { let mut len = 2 + (self.ntrans * 2) + (self.ntrans * StateID::SIZE) @@ -2142,11 +2466,6 @@ impl<'a> State<'a> { } len } - - /// Return an accelerator for this state. 
- fn accelerator(&self) -> &'a [u8] { - self.accel - } } impl<'a> fmt::Debug for State<'a> { @@ -2163,14 +2482,14 @@ impl<'a> fmt::Debug for State<'a> { } let (start, end) = self.range(i); if start == end { - write!(f, "{:?} => {:?}", DebugByte(start), next)?; + write!(f, "{:?} => {:?}", DebugByte(start), next.as_usize())?; } else { write!( f, "{:?}-{:?} => {:?}", DebugByte(start), DebugByte(end), - next, + next.as_usize(), )?; } printed = true; @@ -2180,7 +2499,7 @@ impl<'a> fmt::Debug for State<'a> { if printed { write!(f, ", ")?; } - write!(f, "EOI => {:?}", eoi)?; + write!(f, "EOI => {:?}", eoi.as_usize())?; } Ok(()) } @@ -2188,7 +2507,7 @@ impl<'a> fmt::Debug for State<'a> { /// A representation of a mutable sparse DFA state that can be cheaply /// materialized from a state identifier. -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] struct StateMut<'a> { /// The identifier of this state. id: StateID, @@ -2216,17 +2535,17 @@ struct StateMut<'a> { accel: &'a mut [u8], } -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] impl<'a> StateMut<'a> { /// Sets the ith transition to the given state. fn set_next_at(&mut self, i: usize, next: StateID) { let start = i * StateID::SIZE; let end = start + StateID::SIZE; - bytes::write_state_id::<bytes::NE>(next, &mut self.next[start..end]); + wire::write_state_id::<wire::NE>(next, &mut self.next[start..end]); } } -#[cfg(feature = "alloc")] +#[cfg(feature = "dfa-build")] impl<'a> fmt::Debug for StateMut<'a> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let state = State { @@ -2242,6 +2561,7 @@ impl<'a> fmt::Debug for StateMut<'a> { } } +/* /// A binary search routine specialized specifically to a sparse DFA state's /// transitions. Specifically, the transitions are defined as a set of pairs /// of input bytes that delineate an inclusive range of bytes. 
If the input @@ -2261,8 +2581,7 @@ impl<'a> fmt::Debug for StateMut<'a> { /// guaranteed to be safe and is thus UB (since I don't think the in-memory /// representation of `(u8, u8)` has been nailed down). One could define a /// repr(C) type, but the casting doesn't seem justified. -#[allow(dead_code)] -#[inline(always)] +#[cfg_attr(feature = "perf-inline", inline(always))] fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option<usize> { debug_assert!(ranges.len() % 2 == 0, "ranges must have even length"); debug_assert!(ranges.len() <= 512, "ranges should be short"); @@ -2281,3 +2600,57 @@ fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option<usize> { } None } +*/ + +#[cfg(all(test, feature = "syntax", feature = "dfa-build"))] +mod tests { + use crate::{ + dfa::{dense::DFA, Automaton}, + nfa::thompson, + Input, MatchError, + }; + + // See the analogous test in src/hybrid/dfa.rs and src/dfa/dense.rs. + #[test] + fn heuristic_unicode_forward() { + let dfa = DFA::builder() + .configure(DFA::config().unicode_word_boundary(true)) + .thompson(thompson::Config::new().reverse(true)) + .build(r"\b[0-9]+\b") + .unwrap() + .to_sparse() + .unwrap(); + + let input = Input::new("β123").range(2..); + let expected = MatchError::quit(0xB2, 1); + let got = dfa.try_search_fwd(&input); + assert_eq!(Err(expected), got); + + let input = Input::new("123β").range(..3); + let expected = MatchError::quit(0xCE, 3); + let got = dfa.try_search_fwd(&input); + assert_eq!(Err(expected), got); + } + + // See the analogous test in src/hybrid/dfa.rs and src/dfa/dense.rs. 
+ #[test] + fn heuristic_unicode_reverse() { + let dfa = DFA::builder() + .configure(DFA::config().unicode_word_boundary(true)) + .thompson(thompson::Config::new().reverse(true)) + .build(r"\b[0-9]+\b") + .unwrap() + .to_sparse() + .unwrap(); + + let input = Input::new("β123").range(2..); + let expected = MatchError::quit(0xB2, 1); + let got = dfa.try_search_rev(&input); + assert_eq!(Err(expected), got); + + let input = Input::new("123β").range(..3); + let expected = MatchError::quit(0xCE, 3); + let got = dfa.try_search_rev(&input); + assert_eq!(Err(expected), got); + } +} diff --git a/vendor/regex-automata/src/dfa/special.rs b/vendor/regex-automata/src/dfa/special.rs index 3db95a707..a831df5c5 100644 --- a/vendor/regex-automata/src/dfa/special.rs +++ b/vendor/regex-automata/src/dfa/special.rs @@ -1,8 +1,8 @@ use crate::{ dfa::DEAD, util::{ - bytes::{self, DeserializeError, Endian, SerializeError}, - id::StateID, + primitives::StateID, + wire::{self, DeserializeError, Endian, SerializeError}, }, }; @@ -21,7 +21,7 @@ macro_rules! err { // has run. The dead state always has an ID of 0. i.e., It is always the // first state in a DFA. // * quit - A state that is entered whenever a byte is seen that should cause -// a DFA to give up and stop searching. This results in a MatchError::Quit +// a DFA to give up and stop searching. This results in a MatchError::quit // error being returned at search time. The default configuration for a DFA // has no quit bytes, which means this state is unreachable by default, // although it is always present for reasons of implementation simplicity. @@ -101,7 +101,7 @@ macro_rules! err { // # A quit state means we give up. If he DFA has no quit state, // # then special.quit_id == 0 == dead, which is handled by the // # conditional above. 
-// return Err(MatchError::Quit { byte, offset: offset - 1 }) +// return Err(MatchError::quit { byte, offset: offset - 1 }) // if special.min_match <= current_state <= special.max_match: // last_match = Some(offset) // if special.min_accel <= current_state <= special.max_accel: @@ -157,34 +157,34 @@ macro_rules! err { // |----------------------------|------------------------ // special non-special* #[derive(Clone, Copy, Debug)] -pub struct Special { +pub(crate) struct Special { /// The identifier of the last special state in a DFA. A state is special /// if and only if its identifier is less than or equal to `max`. - pub max: StateID, + pub(crate) max: StateID, /// The identifier of the quit state in a DFA. (There is no analogous field /// for the dead state since the dead state's ID is always zero, regardless /// of state ID size.) - pub quit_id: StateID, + pub(crate) quit_id: StateID, /// The identifier of the first match state. - pub min_match: StateID, + pub(crate) min_match: StateID, /// The identifier of the last match state. - pub max_match: StateID, + pub(crate) max_match: StateID, /// The identifier of the first accelerated state. - pub min_accel: StateID, + pub(crate) min_accel: StateID, /// The identifier of the last accelerated state. - pub max_accel: StateID, + pub(crate) max_accel: StateID, /// The identifier of the first start state. - pub min_start: StateID, + pub(crate) min_start: StateID, /// The identifier of the last start state. - pub max_start: StateID, + pub(crate) max_start: StateID, } impl Special { /// Creates a new set of special ranges for a DFA. All ranges are initially /// set to only contain the dead state. This is interpreted as an empty /// range. - #[cfg(feature = "alloc")] - pub fn new() -> Special { + #[cfg(feature = "dfa-build")] + pub(crate) fn new() -> Special { Special { max: DEAD, quit_id: DEAD, @@ -198,8 +198,8 @@ impl Special { } /// Remaps all of the special state identifiers using the function given. 
- #[cfg(feature = "alloc")] - pub fn remap(&self, map: impl Fn(StateID) -> StateID) -> Special { + #[cfg(feature = "dfa-build")] + pub(crate) fn remap(&self, map: impl Fn(StateID) -> StateID) -> Special { Special { max: map(self.max), quit_id: map(self.quit_id), @@ -220,14 +220,14 @@ impl Special { /// /// Upon success, this returns the number of bytes read in addition to the /// special state IDs themselves. - pub fn from_bytes( + pub(crate) fn from_bytes( mut slice: &[u8], ) -> Result<(Special, usize), DeserializeError> { - bytes::check_slice_len(slice, 8 * StateID::SIZE, "special states")?; + wire::check_slice_len(slice, 8 * StateID::SIZE, "special states")?; let mut nread = 0; let mut read_id = |what| -> Result<StateID, DeserializeError> { - let (id, nr) = bytes::try_read_state_id(slice, what)?; + let (id, nr) = wire::try_read_state_id(slice, what)?; nread += nr; slice = &slice[StateID::SIZE..]; Ok(id) @@ -259,7 +259,7 @@ impl Special { /// Validate that the information describing special states satisfies /// all known invariants. - pub fn validate(&self) -> Result<(), DeserializeError> { + pub(crate) fn validate(&self) -> Result<(), DeserializeError> { // Check that both ends of the range are DEAD or neither are. if self.min_match == DEAD && self.max_match != DEAD { err!("min_match is DEAD, but max_match is not"); @@ -329,18 +329,18 @@ impl Special { } /// Validate that the special state information is compatible with the - /// given state count. - pub fn validate_state_count( + /// given state len. + pub(crate) fn validate_state_len( &self, - count: usize, + len: usize, stride2: usize, ) -> Result<(), DeserializeError> { // We assume that 'validate' has already passed, so we know that 'max' - // is truly the max. So all we need to check is that the max state - // ID is less than the state ID count. The max legal value here is - // count-1, which occurs when there are no non-special states. 
- if (self.max.as_usize() >> stride2) >= count { - err!("max should not be greater than or equal to state count"); + // is truly the max. So all we need to check is that the max state ID + // is less than the state ID len. The max legal value here is len-1, + // which occurs when there are no non-special states. + if (self.max.as_usize() >> stride2) >= len { + err!("max should not be greater than or equal to state length"); } Ok(()) } @@ -350,11 +350,11 @@ impl Special { /// this will return an error. The number of bytes written is returned /// on success. The number of bytes written is guaranteed to be a multiple /// of 8. - pub fn write_to<E: Endian>( + pub(crate) fn write_to<E: Endian>( &self, dst: &mut [u8], ) -> Result<usize, SerializeError> { - use crate::util::bytes::write_state_id as write; + use crate::util::wire::write_state_id as write; if dst.len() < self.write_to_len() { return Err(SerializeError::buffer_too_small("special state ids")); @@ -384,14 +384,14 @@ impl Special { } /// Returns the total number of bytes written by `write_to`. - pub fn write_to_len(&self) -> usize { + pub(crate) fn write_to_len(&self) -> usize { 8 * StateID::SIZE } /// Sets the maximum special state ID based on the current values. This /// should be used once all possible state IDs are set. - #[cfg(feature = "alloc")] - pub fn set_max(&mut self) { + #[cfg(feature = "dfa-build")] + pub(crate) fn set_max(&mut self) { use core::cmp::max; self.max = max( self.quit_id, @@ -399,45 +399,62 @@ impl Special { ); } + /// Sets the maximum special state ID such that starting states are not + /// considered "special." This also marks the min/max starting states as + /// DEAD such that 'is_start_state' always returns false, even if the state + /// is actually a starting state. + /// + /// This is useful when there is no prefilter set. 
It will avoid + /// ping-ponging between the hot path in the DFA search code and the start + /// state handling code, which is typically only useful for executing a + /// prefilter. + #[cfg(feature = "dfa-build")] + pub(crate) fn set_no_special_start_states(&mut self) { + use core::cmp::max; + self.max = max(self.quit_id, max(self.max_match, self.max_accel)); + self.min_start = DEAD; + self.max_start = DEAD; + } + /// Returns true if and only if the given state ID is a special state. #[inline] - pub fn is_special_state(&self, id: StateID) -> bool { + pub(crate) fn is_special_state(&self, id: StateID) -> bool { id <= self.max } /// Returns true if and only if the given state ID is a dead state. #[inline] - pub fn is_dead_state(&self, id: StateID) -> bool { + pub(crate) fn is_dead_state(&self, id: StateID) -> bool { id == DEAD } /// Returns true if and only if the given state ID is a quit state. #[inline] - pub fn is_quit_state(&self, id: StateID) -> bool { + pub(crate) fn is_quit_state(&self, id: StateID) -> bool { !self.is_dead_state(id) && self.quit_id == id } /// Returns true if and only if the given state ID is a match state. #[inline] - pub fn is_match_state(&self, id: StateID) -> bool { + pub(crate) fn is_match_state(&self, id: StateID) -> bool { !self.is_dead_state(id) && self.min_match <= id && id <= self.max_match } /// Returns true if and only if the given state ID is an accel state. #[inline] - pub fn is_accel_state(&self, id: StateID) -> bool { + pub(crate) fn is_accel_state(&self, id: StateID) -> bool { !self.is_dead_state(id) && self.min_accel <= id && id <= self.max_accel } /// Returns true if and only if the given state ID is a start state. #[inline] - pub fn is_start_state(&self, id: StateID) -> bool { + pub(crate) fn is_start_state(&self, id: StateID) -> bool { !self.is_dead_state(id) && self.min_start <= id && id <= self.max_start } /// Returns the total number of match states for a dense table based DFA. 
#[inline] - pub fn match_len(&self, stride: usize) -> usize { + pub(crate) fn match_len(&self, stride: usize) -> usize { if self.matches() { (self.max_match.as_usize() - self.min_match.as_usize() + stride) / stride @@ -448,13 +465,13 @@ impl Special { /// Returns true if and only if there is at least one match state. #[inline] - pub fn matches(&self) -> bool { + pub(crate) fn matches(&self) -> bool { self.min_match != DEAD } /// Returns the total number of accel states. - #[cfg(feature = "alloc")] - pub fn accel_len(&self, stride: usize) -> usize { + #[cfg(feature = "dfa-build")] + pub(crate) fn accel_len(&self, stride: usize) -> usize { if self.accels() { (self.max_accel.as_usize() - self.min_accel.as_usize() + stride) / stride @@ -465,13 +482,13 @@ impl Special { /// Returns true if and only if there is at least one accel state. #[inline] - pub fn accels(&self) -> bool { + pub(crate) fn accels(&self) -> bool { self.min_accel != DEAD } /// Returns true if and only if there is at least one start state. #[inline] - pub fn starts(&self) -> bool { + pub(crate) fn starts(&self) -> bool { self.min_start != DEAD } } diff --git a/vendor/regex-automata/src/dfa/start.rs b/vendor/regex-automata/src/dfa/start.rs new file mode 100644 index 000000000..fddc702df --- /dev/null +++ b/vendor/regex-automata/src/dfa/start.rs @@ -0,0 +1,74 @@ +use core::mem::size_of; + +use crate::util::wire::{self, DeserializeError, Endian, SerializeError}; + +/// The kind of anchored starting configurations to support in a DFA. +/// +/// Fully compiled DFAs need to be explicitly configured as to which anchored +/// starting configurations to support. The reason for not just supporting +/// everything unconditionally is that it can use more resources (such as +/// memory and build time). The downside of this is that if you try to execute +/// a search using an [`Anchored`](crate::Anchored) mode that is not supported +/// by the DFA, then the search will return an error. 
+#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum StartKind { + /// Support both anchored and unanchored searches. + Both, + /// Support only unanchored searches. Requesting an anchored search will + /// panic. + /// + /// Note that even if an unanchored search is requested, the pattern itself + /// may still be anchored. For example, `^abc` will only match `abc` at the + /// start of a haystack. This will remain true, even if the regex engine + /// only supported unanchored searches. + Unanchored, + /// Support only anchored searches. Requesting an unanchored search will + /// panic. + Anchored, +} + +impl StartKind { + pub(crate) fn from_bytes( + slice: &[u8], + ) -> Result<(StartKind, usize), DeserializeError> { + wire::check_slice_len(slice, size_of::<u32>(), "start kind bytes")?; + let (n, nr) = wire::try_read_u32(slice, "start kind integer")?; + match n { + 0 => Ok((StartKind::Both, nr)), + 1 => Ok((StartKind::Unanchored, nr)), + 2 => Ok((StartKind::Anchored, nr)), + _ => Err(DeserializeError::generic("unrecognized start kind")), + } + } + + pub(crate) fn write_to<E: Endian>( + &self, + dst: &mut [u8], + ) -> Result<usize, SerializeError> { + let nwrite = self.write_to_len(); + if dst.len() < nwrite { + return Err(SerializeError::buffer_too_small("start kind")); + } + let n = match *self { + StartKind::Both => 0, + StartKind::Unanchored => 1, + StartKind::Anchored => 2, + }; + E::write_u32(n, dst); + Ok(nwrite) + } + + pub(crate) fn write_to_len(&self) -> usize { + size_of::<u32>() + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn has_unanchored(&self) -> bool { + matches!(*self, StartKind::Both | StartKind::Unanchored) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub(crate) fn has_anchored(&self) -> bool { + matches!(*self, StartKind::Both | StartKind::Anchored) + } +} diff --git a/vendor/regex-automata/src/dfa/transducer.rs b/vendor/regex-automata/src/dfa/transducer.rs deleted file mode 100644 index 
58b34e00a..000000000 --- a/vendor/regex-automata/src/dfa/transducer.rs +++ /dev/null @@ -1,207 +0,0 @@ -use crate::{ - dfa::{automaton::Automaton, dense, sparse}, - util::id::StateID, -}; - -impl<T: AsRef<[u32]>> fst::Automaton for dense::DFA<T> { - type State = StateID; - - #[inline] - fn start(&self) -> StateID { - self.start_state_forward(None, &[], 0, 0) - } - - #[inline] - fn is_match(&self, state: &StateID) -> bool { - self.is_match_state(*state) - } - - #[inline] - fn accept(&self, state: &StateID, byte: u8) -> StateID { - if fst::Automaton::is_match(self, state) { - return *state; - } - self.next_state(*state, byte) - } - - #[inline] - fn accept_eof(&self, state: &StateID) -> Option<StateID> { - if fst::Automaton::is_match(self, state) { - return Some(*state); - } - Some(self.next_eoi_state(*state)) - } - - #[inline] - fn can_match(&self, state: &StateID) -> bool { - !self.is_dead_state(*state) - } -} - -impl<T: AsRef<[u8]>> fst::Automaton for sparse::DFA<T> { - type State = StateID; - - #[inline] - fn start(&self) -> StateID { - self.start_state_forward(None, &[], 0, 0) - } - - #[inline] - fn is_match(&self, state: &StateID) -> bool { - self.is_match_state(*state) - } - - #[inline] - fn accept(&self, state: &StateID, byte: u8) -> StateID { - if fst::Automaton::is_match(self, state) { - return *state; - } - self.next_state(*state, byte) - } - - #[inline] - fn accept_eof(&self, state: &StateID) -> Option<StateID> { - if fst::Automaton::is_match(self, state) { - return Some(*state); - } - Some(self.next_eoi_state(*state)) - } - - #[inline] - fn can_match(&self, state: &StateID) -> bool { - !self.is_dead_state(*state) - } -} - -#[cfg(test)] -mod tests { - use bstr::BString; - use fst::{Automaton, IntoStreamer, Set, Streamer}; - - use crate::dfa::{dense, sparse}; - - fn search<A: Automaton, D: AsRef<[u8]>>( - set: &Set<D>, - aut: A, - ) -> Vec<BString> { - let mut stream = set.search(aut).into_stream(); - - let mut results = vec![]; - while let Some(key) = 
stream.next() { - results.push(BString::from(key)); - } - results - } - - #[test] - fn dense_anywhere() { - let set = - Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) - .unwrap(); - let dfa = dense::DFA::new("ba.*").unwrap(); - let got = search(&set, &dfa); - assert_eq!(got, vec!["bar", "baz", "xba", "xbax"]); - } - - #[test] - fn dense_anchored() { - let set = - Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) - .unwrap(); - let dfa = dense::Builder::new() - .configure(dense::Config::new().anchored(true)) - .build("ba.*") - .unwrap(); - let got = search(&set, &dfa); - assert_eq!(got, vec!["bar", "baz"]); - } - - #[test] - fn dense_assertions_start() { - let set = - Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) - .unwrap(); - let dfa = dense::Builder::new().build("^ba.*").unwrap(); - let got = search(&set, &dfa); - assert_eq!(got, vec!["bar", "baz"]); - } - - #[test] - fn dense_assertions_end() { - let set = - Set::from_iter(&["a", "bar", "bax", "wat", "xba", "xbax", "z"]) - .unwrap(); - let dfa = dense::Builder::new().build(".*x$").unwrap(); - let got = search(&set, &dfa); - assert_eq!(got, vec!["bax", "xbax"]); - } - - #[test] - fn dense_assertions_word() { - let set = - Set::from_iter(&["foo", "foox", "xfoo", "zzz foo zzz"]).unwrap(); - let dfa = dense::Builder::new().build(r"(?-u)\bfoo\b").unwrap(); - let got = search(&set, &dfa); - assert_eq!(got, vec!["foo", "zzz foo zzz"]); - } - - #[test] - fn sparse_anywhere() { - let set = - Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) - .unwrap(); - let dfa = sparse::DFA::new("ba.*").unwrap(); - let got = search(&set, &dfa); - assert_eq!(got, vec!["bar", "baz", "xba", "xbax"]); - } - - #[test] - fn sparse_anchored() { - let set = - Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) - .unwrap(); - let dfa = dense::Builder::new() - .configure(dense::Config::new().anchored(true)) - .build("ba.*") - .unwrap() - .to_sparse() - .unwrap(); - let 
got = search(&set, &dfa); - assert_eq!(got, vec!["bar", "baz"]); - } - - #[test] - fn sparse_assertions_start() { - let set = - Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"]) - .unwrap(); - let dfa = - dense::Builder::new().build("^ba.*").unwrap().to_sparse().unwrap(); - let got = search(&set, &dfa); - assert_eq!(got, vec!["bar", "baz"]); - } - - #[test] - fn sparse_assertions_end() { - let set = - Set::from_iter(&["a", "bar", "bax", "wat", "xba", "xbax", "z"]) - .unwrap(); - let dfa = - dense::Builder::new().build(".*x$").unwrap().to_sparse().unwrap(); - let got = search(&set, &dfa); - assert_eq!(got, vec!["bax", "xbax"]); - } - - #[test] - fn sparse_assertions_word() { - let set = - Set::from_iter(&["foo", "foox", "xfoo", "zzz foo zzz"]).unwrap(); - let dfa = dense::Builder::new() - .build(r"(?-u)\bfoo\b") - .unwrap() - .to_sparse() - .unwrap(); - let got = search(&set, &dfa); - assert_eq!(got, vec!["foo", "zzz foo zzz"]); - } -} |