diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 12:18:32 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-17 12:18:32 +0000 |
commit | 4547b622d8d29df964fa2914213088b148c498fc (patch) | |
tree | 9fc6b25f3c3add6b745be9a2400a6e96140046e9 /vendor/regex-automata/src/hybrid | |
parent | Releasing progress-linux version 1.66.0+dfsg1-1~progress7.99u1. (diff) | |
download | rustc-4547b622d8d29df964fa2914213088b148c498fc.tar.xz rustc-4547b622d8d29df964fa2914213088b148c498fc.zip |
Merging upstream version 1.67.1+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/regex-automata/src/hybrid')
-rw-r--r-- | vendor/regex-automata/src/hybrid/dfa.rs | 3817 | ||||
-rw-r--r-- | vendor/regex-automata/src/hybrid/error.rs | 130 | ||||
-rw-r--r-- | vendor/regex-automata/src/hybrid/id.rs | 415 | ||||
-rw-r--r-- | vendor/regex-automata/src/hybrid/mod.rs | 179 | ||||
-rw-r--r-- | vendor/regex-automata/src/hybrid/regex.rs | 2124 | ||||
-rw-r--r-- | vendor/regex-automata/src/hybrid/search.rs | 663 |
6 files changed, 7328 insertions, 0 deletions
diff --git a/vendor/regex-automata/src/hybrid/dfa.rs b/vendor/regex-automata/src/hybrid/dfa.rs new file mode 100644 index 000000000..1fbce5f5f --- /dev/null +++ b/vendor/regex-automata/src/hybrid/dfa.rs @@ -0,0 +1,3817 @@ +/*! +Types and routines specific to lazy DFAs. + +This module is the home of [`hybrid::dfa::DFA`](DFA). + +This module also contains a [`hybrid::dfa::Builder`](Builder) and a +[`hybrid::dfa::Config`](Config) for configuring and building a lazy DFA. +*/ + +use core::{borrow::Borrow, iter, mem::size_of}; + +use alloc::{sync::Arc, vec::Vec}; + +use crate::{ + hybrid::{ + error::{BuildError, CacheError}, + id::{LazyStateID, LazyStateIDError, OverlappingState}, + search, + }, + nfa::thompson, + util::{ + alphabet::{self, ByteClasses, ByteSet}, + determinize::{self, State, StateBuilderEmpty, StateBuilderNFA}, + id::{PatternID, StateID as NFAStateID}, + matchtypes::{HalfMatch, MatchError, MatchKind}, + prefilter, + sparse_set::SparseSets, + start::Start, + }, +}; + +/// The mininum number of states that a lazy DFA's cache size must support. +/// +/// This is checked at time of construction to ensure that at least some small +/// number of states can fit in the given capacity allotment. If we can't fit +/// at least this number of states, then the thinking is that it's pretty +/// senseless to use the lazy DFA. More to the point, parts of the code do +/// assume that the cache can fit at least some small number of states. +const MIN_STATES: usize = 5; + +/// A hybrid NFA/DFA (also called a "lazy DFA") for regex searching. +/// +/// A lazy DFA is a DFA that builds itself at search time. It otherwise has +/// very similar characteristics as a [`dense::DFA`](crate::dfa::dense::DFA). +/// Indeed, both support precisely the same regex features with precisely the +/// same semantics. +/// +/// Where as a `dense::DFA` must be completely built to handle any input before +/// it may be used for search, a lazy DFA starts off effectively empty. 
During +/// a search, a lazy DFA will build itself depending on whether it has already +/// computed the next transition or not. If it has, then it looks a lot like +/// a `dense::DFA` internally: it does a very fast table based access to find +/// the next transition. Otherwise, if the state hasn't been computed, then it +/// does determinization _for that specific transition_ to compute the next DFA +/// state. +/// +/// The main selling point of a lazy DFA is that, in practice, it has +/// the performance profile of a `dense::DFA` without the weakness of it +/// taking worst case exponential time to build. Indeed, for each byte of +/// input, the lazy DFA will construct as most one new DFA state. Thus, a +/// lazy DFA achieves worst case `O(mn)` time for regex search (where `m ~ +/// pattern.len()` and `n ~ haystack.len()`). +/// +/// The main downsides of a lazy DFA are: +/// +/// 1. It requires mutable "cache" space during search. This is where the +/// transition table, among other things, is stored. +/// 2. In pathological cases (e.g., if the cache is too small), it will run +/// out of room and either require a bigger cache capacity or will repeatedly +/// clear the cache and thus repeatedly regenerate DFA states. Overall, this +/// will tend to be slower than a typical NFA simulation. +/// +/// # Capabilities +/// +/// Like a `dense::DFA`, a single lazy DFA fundamentally supports the following +/// operations: +/// +/// 1. Detection of a match. +/// 2. Location of the end of a match. +/// 3. In the case of a lazy DFA with multiple patterns, which pattern matched +/// is reported as well. +/// +/// A notable absence from the above list of capabilities is the location of +/// the *start* of a match. In order to provide both the start and end of +/// a match, *two* lazy DFAs are required. This functionality is provided by a +/// [`Regex`](crate::hybrid::regex::Regex). 
+/// +/// # Example +/// +/// This shows how to build a lazy DFA with the default configuration and +/// execute a search. Notice how, in contrast to a `dense::DFA`, we must create +/// a cache and pass it to our search routine. +/// +/// ``` +/// use regex_automata::{hybrid::dfa::DFA, HalfMatch}; +/// +/// let dfa = DFA::new("foo[0-9]+")?; +/// let mut cache = dfa.create_cache(); +/// +/// let expected = Some(HalfMatch::must(0, 8)); +/// assert_eq!(expected, dfa.find_leftmost_fwd(&mut cache, b"foo12345")?); +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct DFA { + nfa: Arc<thompson::NFA>, + stride2: usize, + classes: ByteClasses, + quitset: ByteSet, + anchored: bool, + match_kind: MatchKind, + starts_for_each_pattern: bool, + cache_capacity: usize, + minimum_cache_clear_count: Option<usize>, +} + +impl DFA { + /// Parse the given regular expression using a default configuration and + /// return the corresponding lazy DFA. + /// + /// If you want a non-default configuration, then use the [`Builder`] to + /// set your own configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch}; + /// + /// let dfa = DFA::new("foo[0-9]+bar")?; + /// let mut cache = dfa.create_cache(); + /// + /// let expected = HalfMatch::must(0, 11); + /// assert_eq!( + /// Some(expected), + /// dfa.find_leftmost_fwd(&mut cache, b"foo12345bar")?, + /// ); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn new(pattern: &str) -> Result<DFA, BuildError> { + DFA::builder().build(pattern) + } + + /// Parse the given regular expressions using a default configuration and + /// return the corresponding lazy multi-DFA. + /// + /// If you want a non-default configuration, then use the [`Builder`] to + /// set your own configuration. 
+ /// + /// # Example + /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch}; + /// + /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+"])?; + /// let mut cache = dfa.create_cache(); + /// + /// let expected = HalfMatch::must(1, 3); + /// assert_eq!( + /// Some(expected), + /// dfa.find_leftmost_fwd(&mut cache, b"foo12345bar")?, + /// ); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<DFA, BuildError> { + DFA::builder().build_many(patterns) + } + + /// Create a new lazy DFA that matches every input. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch}; + /// + /// let dfa = DFA::always_match()?; + /// let mut cache = dfa.create_cache(); + /// + /// let expected = HalfMatch::must(0, 0); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(&mut cache, b"")?); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(&mut cache, b"foo")?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn always_match() -> Result<DFA, BuildError> { + let nfa = thompson::NFA::always_match(); + Builder::new().build_from_nfa(Arc::new(nfa)) + } + + /// Create a new lazy DFA that never matches any input. + /// + /// # Example + /// + /// ``` + /// use regex_automata::hybrid::dfa::DFA; + /// + /// let dfa = DFA::never_match()?; + /// let mut cache = dfa.create_cache(); + /// + /// assert_eq!(None, dfa.find_leftmost_fwd(&mut cache, b"")?); + /// assert_eq!(None, dfa.find_leftmost_fwd(&mut cache, b"foo")?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn never_match() -> Result<DFA, BuildError> { + let nfa = thompson::NFA::never_match(); + Builder::new().build_from_nfa(Arc::new(nfa)) + } + + /// Return a default configuration for a `DFA`. + /// + /// This is a convenience routine to avoid needing to import the `Config` + /// type when customizing the construction of a lazy DFA. 
+ /// + /// # Example + /// + /// This example shows how to build a lazy DFA that only executes searches + /// in anchored mode. + /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch}; + /// + /// let re = DFA::builder() + /// .configure(DFA::config().anchored(true)) + /// .build(r"[0-9]+")?; + /// let mut cache = re.create_cache(); + /// + /// let haystack = "abc123xyz".as_bytes(); + /// assert_eq!(None, re.find_leftmost_fwd(&mut cache, haystack)?); + /// assert_eq!( + /// Some(HalfMatch::must(0, 3)), + /// re.find_leftmost_fwd(&mut cache, &haystack[3..6])?, + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn config() -> Config { + Config::new() + } + + /// Return a builder for configuring the construction of a `Regex`. + /// + /// This is a convenience routine to avoid needing to import the + /// [`Builder`] type in common cases. + /// + /// # Example + /// + /// This example shows how to use the builder to disable UTF-8 mode + /// everywhere for lazy DFAs. This includes disabling it for both the + /// concrete syntax (e.g., `.` matches any byte and Unicode character + /// classes like `\p{Letter}` are not allowed) and for the unanchored + /// search prefix. The latter enables the regex to match anywhere in a + /// sequence of arbitrary bytes. (Typically, the unanchored search prefix + /// will only permit matching valid UTF-8.) 
+ /// + /// ``` + /// use regex_automata::{ + /// hybrid::dfa::DFA, + /// nfa::thompson, + /// HalfMatch, SyntaxConfig, + /// }; + /// + /// let re = DFA::builder() + /// .syntax(SyntaxConfig::new().utf8(false)) + /// .thompson(thompson::Config::new().utf8(false)) + /// .build(r"foo(?-u:[^b])ar.*")?; + /// let mut cache = re.create_cache(); + /// + /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; + /// let expected = Some(HalfMatch::must(0, 9)); + /// let got = re.find_leftmost_fwd(&mut cache, haystack)?; + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn builder() -> Builder { + Builder::new() + } + + /// Create a new cache for this lazy DFA. + /// + /// The cache returned should only be used for searches for this + /// lazy DFA. If you want to reuse the cache for another DFA, then + /// you must call [`Cache::reset`] with that DFA (or, equivalently, + /// [`DFA::reset_cache`]). + pub fn create_cache(&self) -> Cache { + Cache::new(self) + } + + /// Reset the given cache such that it can be used for searching with the + /// this lazy DFA (and only this DFA). + /// + /// A cache reset permits reusing memory already allocated in this cache + /// with a different lazy DFA. + /// + /// Resetting a cache sets its "clear count" to 0. This is relevant if the + /// lazy DFA has been configured to "give up" after it has cleared the + /// cache a certain number of times. + /// + /// Any lazy state ID generated by the cache prior to resetting it is + /// invalid after the reset. + /// + /// # Example + /// + /// This shows how to re-purpose a cache for use with a different DFA. 
+ /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch}; + /// + /// let dfa1 = DFA::new(r"\w")?; + /// let dfa2 = DFA::new(r"\W")?; + /// + /// let mut cache = dfa1.create_cache(); + /// assert_eq!( + /// Some(HalfMatch::must(0, 2)), + /// dfa1.find_leftmost_fwd(&mut cache, "Δ".as_bytes())?, + /// ); + /// + /// // Using 'cache' with dfa2 is not allowed. It may result in panics or + /// // incorrect results. In order to re-purpose the cache, we must reset + /// // it with the DFA we'd like to use it with. + /// // + /// // Similarly, after this reset, using the cache with 'dfa1' is also not + /// // allowed. + /// dfa2.reset_cache(&mut cache); + /// assert_eq!( + /// Some(HalfMatch::must(0, 3)), + /// dfa2.find_leftmost_fwd(&mut cache, "☃".as_bytes())?, + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn reset_cache(&self, cache: &mut Cache) { + Lazy::new(self, cache).reset_cache() + } + + /// Returns the total number of patterns compiled into this lazy DFA. + /// + /// In the case of a DFA that contains no patterns, this returns `0`. 
+ /// + /// # Example + /// + /// This example shows the pattern count for a DFA that never matches: + /// + /// ``` + /// use regex_automata::hybrid::dfa::DFA; + /// + /// let dfa = DFA::never_match()?; + /// assert_eq!(dfa.pattern_count(), 0); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// And another example for a DFA that matches at every position: + /// + /// ``` + /// use regex_automata::hybrid::dfa::DFA; + /// + /// let dfa = DFA::always_match()?; + /// assert_eq!(dfa.pattern_count(), 1); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// And finally, a DFA that was constructed from multiple patterns: + /// + /// ``` + /// use regex_automata::hybrid::dfa::DFA; + /// + /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?; + /// assert_eq!(dfa.pattern_count(), 3); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn pattern_count(&self) -> usize { + self.nfa.pattern_len() + } + + /// Returns a reference to the underlying NFA. + pub fn nfa(&self) -> &Arc<thompson::NFA> { + &self.nfa + } + + /// Returns the stride, as a base-2 exponent, required for these + /// equivalence classes. + /// + /// The stride is always the smallest power of 2 that is greater than or + /// equal to the alphabet length. This is done so that converting between + /// state IDs and indices can be done with shifts alone, which is much + /// faster than integer division. + fn stride2(&self) -> usize { + self.stride2 + } + + /// Returns the total stride for every state in this lazy DFA. This + /// corresponds to the total number of transitions used by each state in + /// this DFA's transition table. + fn stride(&self) -> usize { + 1 << self.stride2() + } + + /// Returns the total number of elements in the alphabet for this + /// transition table. This is always less than or equal to `self.stride()`. + /// It is only equal when the alphabet length is a power of 2. Otherwise, + /// it is always strictly less. 
+ fn alphabet_len(&self) -> usize { + self.classes.alphabet_len() + } + + /// Returns the memory usage, in bytes, of this lazy DFA. + /// + /// This does **not** include the stack size used up by this lazy DFA. To + /// compute that, use `std::mem::size_of::<DFA>()`. This also does + /// not include the size of the `Cache` used. + pub fn memory_usage(&self) -> usize { + // Everything else is on the stack. + self.nfa.memory_usage() + } +} + +impl DFA { + /// Executes a forward search and returns the end position of the first + /// match that is found as early as possible. If no match exists, then + /// `None` is returned. + /// + /// This routine stops scanning input as soon as the search observes a + /// match state. This is useful for implementing boolean `is_match`-like + /// routines, where as little work is done as possible. + /// + /// See [`DFA::find_earliest_fwd_at`] for additional functionality, such as + /// providing a prefilter, a specific pattern to match and the bounds of + /// the search within the haystack. This routine is meant as a convenience + /// for common cases where the additional functionality is not needed. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// lazy DFAs generated by this crate, this only occurs in non-default + /// configurations where quit bytes are used, Unicode word boundaries are + /// heuristically enabled or limits are set on the number of times the lazy + /// DFA's cache may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Example + /// + /// This example demonstrates how the position returned might differ from + /// what one might expect when executing a traditional leftmost search. 
+ /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch}; + /// + /// let dfa = DFA::new("foo[0-9]+")?; + /// let mut cache = dfa.create_cache(); + /// // Normally, the end of the leftmost first match here would be 8, + /// // corresponding to the end of the input. But the "earliest" semantics + /// // this routine cause it to stop as soon as a match is known, which + /// // occurs once 'foo[0-9]' has matched. + /// let expected = HalfMatch::must(0, 4); + /// assert_eq!( + /// Some(expected), + /// dfa.find_earliest_fwd(&mut cache, b"foo12345")?, + /// ); + /// + /// let dfa = DFA::new("abc|a")?; + /// let mut cache = dfa.create_cache(); + /// // Normally, the end of the leftmost first match here would be 3, + /// // but the shortest match semantics detect a match earlier. + /// let expected = HalfMatch::must(0, 1); + /// assert_eq!(Some(expected), dfa.find_earliest_fwd(&mut cache, b"abc")?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn find_earliest_fwd( + &self, + cache: &mut Cache, + bytes: &[u8], + ) -> Result<Option<HalfMatch>, MatchError> { + self.find_earliest_fwd_at(cache, None, None, bytes, 0, bytes.len()) + } + + /// Executes a reverse search and returns the start position of the first + /// match that is found as early as possible. If no match exists, then + /// `None` is returned. + /// + /// This routine stops scanning input as soon as the search observes a + /// match state. + /// + /// Note that while it is not technically necessary to build a reverse + /// automaton to use a reverse search, it is likely that you'll want to do + /// so. Namely, the typical use of a reverse search is to find the starting + /// location of a match once its end is discovered from a forward search. A + /// reverse DFA automaton can be built by configuring the intermediate NFA + /// to be reversed via + /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse). 
+ /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// lazy DFAs generated by this crate, this only occurs in non-default + /// configurations where quit bytes are used, Unicode word boundaries are + /// heuristically enabled or limits are set on the number of times the lazy + /// DFA's cache may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Example + /// + /// This example demonstrates how the position returned might differ from + /// what one might expect when executing a traditional leftmost reverse + /// search. + /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, nfa::thompson, HalfMatch}; + /// + /// let dfa = DFA::builder() + /// .thompson(thompson::Config::new().reverse(true)) + /// .build("[a-z]+[0-9]+")?; + /// let mut cache = dfa.create_cache(); + /// // Normally, the end of the leftmost first match here would be 0, + /// // corresponding to the beginning of the input. But the "earliest" + /// // semantics of this routine cause it to stop as soon as a match is + /// // known, which occurs once '[a-z][0-9]+' has matched. + /// let expected = HalfMatch::must(0, 2); + /// assert_eq!( + /// Some(expected), + /// dfa.find_earliest_rev(&mut cache, b"foo12345")?, + /// ); + /// + /// let dfa = DFA::builder() + /// .thompson(thompson::Config::new().reverse(true)) + /// .build("abc|c")?; + /// let mut cache = dfa.create_cache(); + /// // Normally, the end of the leftmost first match here would be 0, + /// // but the shortest match semantics detect a match earlier. 
+ /// let expected = HalfMatch::must(0, 2); + /// assert_eq!(Some(expected), dfa.find_earliest_rev(&mut cache, b"abc")?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn find_earliest_rev( + &self, + cache: &mut Cache, + bytes: &[u8], + ) -> Result<Option<HalfMatch>, MatchError> { + self.find_earliest_rev_at(cache, None, bytes, 0, bytes.len()) + } + + /// Executes a forward search and returns the end position of the leftmost + /// match that is found. If no match exists, then `None` is returned. + /// + /// In particular, this method continues searching even after it enters + /// a match state. The search only terminates once it has reached the + /// end of the input or when it has entered a dead or quit state. Upon + /// termination, the position of the last byte seen while still in a match + /// state is returned. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// lazy DFAs generated by this crate, this only occurs in non-default + /// configurations where quit bytes are used, Unicode word boundaries are + /// heuristically enabled or limits are set on the number of times the lazy + /// DFA's cache may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Example + /// + /// Leftmost first match semantics corresponds to the match with the + /// smallest starting offset, but where the end offset is determined by + /// preferring earlier branches in the original regular expression. For + /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` + /// will match `Samwise` in `Samwise`. + /// + /// Generally speaking, the "leftmost first" match is how most backtracking + /// regular expressions tend to work. This is in contrast to POSIX-style + /// regular expressions that yield "leftmost longest" matches. 
Namely, + /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using + /// leftmost longest semantics. (This crate does not currently support + /// leftmost longest semantics.) + /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch}; + /// + /// let dfa = DFA::new("foo[0-9]+")?; + /// let mut cache = dfa.create_cache(); + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!( + /// Some(expected), + /// dfa.find_leftmost_fwd(&mut cache, b"foo12345")?, + /// ); + /// + /// // Even though a match is found after reading the first byte (`a`), + /// // the leftmost first match semantics demand that we find the earliest + /// // match that prefers earlier parts of the pattern over latter parts. + /// let dfa = DFA::new("abc|a")?; + /// let mut cache = dfa.create_cache(); + /// let expected = HalfMatch::must(0, 3); + /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(&mut cache, b"abc")?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn find_leftmost_fwd( + &self, + cache: &mut Cache, + bytes: &[u8], + ) -> Result<Option<HalfMatch>, MatchError> { + self.find_leftmost_fwd_at(cache, None, None, bytes, 0, bytes.len()) + } + + /// Executes a reverse search and returns the start of the position of the + /// leftmost match that is found. If no match exists, then `None` is + /// returned. + /// + /// In particular, this method continues searching even after it enters + /// a match state. The search only terminates once it has reached the + /// end of the input or when it has entered a dead or quit state. Upon + /// termination, the position of the last byte seen while still in a match + /// state is returned. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. 
For + /// lazy DFAs generated by this crate, this only occurs in non-default + /// configurations where quit bytes are used, Unicode word boundaries are + /// heuristically enabled or limits are set on the number of times the lazy + /// DFA's cache may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Example + /// + /// In particular, this routine is principally + /// useful when used in conjunction with the + /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::revers + /// e) configuration. In general, it's unlikely to be correct to use both + /// `find_leftmost_fwd` and `find_leftmost_rev` with the same DFA since + /// any particular DFA will only support searching in one direction with + /// respect to the pattern. + /// + /// ``` + /// use regex_automata::{nfa::thompson, hybrid::dfa::DFA, HalfMatch}; + /// + /// let dfa = DFA::builder() + /// .thompson(thompson::Config::new().reverse(true)) + /// .build("foo[0-9]+")?; + /// let mut cache = dfa.create_cache(); + /// let expected = HalfMatch::must(0, 0); + /// assert_eq!( + /// Some(expected), + /// dfa.find_leftmost_rev(&mut cache, b"foo12345")?, + /// ); + /// + /// // Even though a match is found after reading the last byte (`c`), + /// // the leftmost first match semantics demand that we find the earliest + /// // match that prefers earlier parts of the pattern over latter parts. 
+ /// let dfa = DFA::builder() + /// .thompson(thompson::Config::new().reverse(true)) + /// .build("abc|c")?; + /// let mut cache = dfa.create_cache(); + /// let expected = HalfMatch::must(0, 0); + /// assert_eq!(Some(expected), dfa.find_leftmost_rev(&mut cache, b"abc")?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn find_leftmost_rev( + &self, + cache: &mut Cache, + bytes: &[u8], + ) -> Result<Option<HalfMatch>, MatchError> { + self.find_leftmost_rev_at(cache, None, bytes, 0, bytes.len()) + } + + /// Executes an overlapping forward search and returns the end position of + /// matches as they are found. If no match exists, then `None` is returned. + /// + /// This routine is principally only useful when searching for multiple + /// patterns on inputs where multiple patterns may match the same regions + /// of text. In particular, callers must preserve the automaton's search + /// state from prior calls so that the implementation knows where the last + /// match occurred. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// lazy DFAs generated by this crate, this only occurs in non-default + /// configurations where quit bytes are used, Unicode word boundaries are + /// heuristically enabled or limits are set on the number of times the lazy + /// DFA's cache may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Example + /// + /// This example shows how to run a basic overlapping search. Notice + /// that we build the automaton with a `MatchKind::All` configuration. + /// Overlapping searches are unlikely to work as one would expect when + /// using the default `MatchKind::LeftmostFirst` match semantics, since + /// leftmost-first matching is fundamentally incompatible with overlapping + /// searches. 
Namely, overlapping searches need to report matches as they + /// are seen, where as leftmost-first searches will continue searching even + /// after a match has been observed in order to find the conventional end + /// position of the match. More concretely, leftmost-first searches use + /// dead states to terminate a search after a specific match can no longer + /// be extended. Overlapping searches instead do the opposite by continuing + /// the search to find totally new matches (potentially of other patterns). + /// + /// ``` + /// use regex_automata::{ + /// hybrid::{dfa::DFA, OverlappingState}, + /// HalfMatch, + /// MatchKind, + /// }; + /// + /// let dfa = DFA::builder() + /// .configure(DFA::config().match_kind(MatchKind::All)) + /// .build_many(&[r"\w+$", r"\S+$"])?; + /// let mut cache = dfa.create_cache(); + /// + /// let haystack = "@foo".as_bytes(); + /// let mut state = OverlappingState::start(); + /// + /// let expected = Some(HalfMatch::must(1, 4)); + /// let got = dfa.find_overlapping_fwd(&mut cache, haystack, &mut state)?; + /// assert_eq!(expected, got); + /// + /// // The first pattern also matches at the same position, so re-running + /// // the search will yield another match. Notice also that the first + /// // pattern is returned after the second. This is because the second + /// // pattern begins its match before the first, is therefore an earlier + /// // match and is thus reported first. 
+ /// let expected = Some(HalfMatch::must(0, 4)); + /// let got = dfa.find_overlapping_fwd(&mut cache, haystack, &mut state)?; + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn find_overlapping_fwd( + &self, + cache: &mut Cache, + bytes: &[u8], + state: &mut OverlappingState, + ) -> Result<Option<HalfMatch>, MatchError> { + self.find_overlapping_fwd_at( + cache, + None, + None, + bytes, + 0, + bytes.len(), + state, + ) + } + + /// Executes a forward search and returns the end position of the first + /// match that is found as early as possible. If no match exists, then + /// `None` is returned. + /// + /// This routine stops scanning input as soon as the search observes a + /// match state. This is useful for implementing boolean `is_match`-like + /// routines, where as little work is done as possible. + /// + /// This is like [`DFA::find_earliest_fwd`], except it provides some + /// additional control over how the search is executed: + /// + /// * `pre` is a prefilter scanner that, when given, is used whenever the + /// DFA enters its starting state. This is meant to speed up searches where + /// one or a small number of literal prefixes are known. + /// * `pattern_id` specifies a specific pattern in the DFA to run an + /// anchored search for. If not given, then a search for any pattern is + /// performed. For lazy DFAs, [`Config::starts_for_each_pattern`] must be + /// enabled to use this functionality. + /// * `start` and `end` permit searching a specific region of the haystack + /// `bytes`. This is useful when implementing an iterator over matches + /// within the same haystack, which cannot be done correctly by simply + /// providing a subslice of `bytes`. (Because the existence of look-around + /// operations such as `\b`, `^` and `$` need to take the surrounding + /// context into account. This cannot be done if the haystack doesn't + /// contain it.) 
+ /// + /// The examples below demonstrate each of these additional parameters. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// lazy DFAs generated by this crate, this only occurs in non-default + /// configurations where quit bytes are used, Unicode word boundaries are + /// heuristically enabled or limits are set on the number of times the lazy + /// DFA's cache may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Panics + /// + /// This routine panics if a `pattern_id` is given and this lazy DFA does + /// not support specific pattern searches. + /// + /// It also panics if the given haystack range is not valid. + /// + /// # Example: prefilter + /// + /// This example shows how to provide a prefilter for a pattern where all + /// matches start with a `z` byte. + /// + /// ``` + /// use regex_automata::{ + /// hybrid::dfa::DFA, + /// util::prefilter::{Candidate, Prefilter, Scanner, State}, + /// HalfMatch, + /// }; + /// + /// #[derive(Debug)] + /// pub struct ZPrefilter; + /// + /// impl Prefilter for ZPrefilter { + /// fn next_candidate( + /// &self, + /// _: &mut State, + /// haystack: &[u8], + /// at: usize, + /// ) -> Candidate { + /// // Try changing b'z' to b'q' and observe this test fail since + /// // the prefilter will skip right over the match. + /// match haystack.iter().position(|&b| b == b'z') { + /// None => Candidate::None, + /// Some(i) => Candidate::PossibleStartOfMatch(at + i), + /// } + /// } + /// + /// fn heap_bytes(&self) -> usize { + /// 0 + /// } + /// } + /// + /// let dfa = DFA::new("z[0-9]{3}")?; + /// let mut cache = dfa.create_cache(); + /// + /// let haystack = "foobar z123 q123".as_bytes(); + /// // A scanner executes a prefilter while tracking some state that helps + /// // determine whether a prefilter is still "effective" or not. 
+ /// let mut scanner = Scanner::new(&ZPrefilter); + /// + /// let expected = Some(HalfMatch::must(0, 11)); + /// let got = dfa.find_earliest_fwd_at( + /// &mut cache, + /// Some(&mut scanner), + /// None, + /// haystack, + /// 0, + /// haystack.len(), + /// )?; + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: specific pattern search + /// + /// This example shows how to build a lazy multi-DFA that permits searching + /// for specific patterns. + /// + /// ``` + /// use regex_automata::{ + /// hybrid::dfa::DFA, + /// HalfMatch, + /// PatternID, + /// }; + /// + /// let dfa = DFA::builder() + /// .configure(DFA::config().starts_for_each_pattern(true)) + /// .build_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?; + /// let mut cache = dfa.create_cache(); + /// let haystack = "foo123".as_bytes(); + /// + /// // Since we are using the default leftmost-first match and both + /// // patterns match at the same starting position, only the first pattern + /// // will be returned in this case when doing a search for any of the + /// // patterns. + /// let expected = Some(HalfMatch::must(0, 6)); + /// let got = dfa.find_earliest_fwd_at( + /// &mut cache, + /// None, + /// None, + /// haystack, + /// 0, + /// haystack.len(), + /// )?; + /// assert_eq!(expected, got); + /// + /// // But if we want to check whether some other pattern matches, then we + /// // can provide its pattern ID. + /// let expected = Some(HalfMatch::must(1, 6)); + /// let got = dfa.find_earliest_fwd_at( + /// &mut cache, + /// None, + /// Some(PatternID::must(1)), + /// haystack, + /// 0, + /// haystack.len(), + /// )?; + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: specifying the bounds of a search + /// + /// This example shows how providing the bounds of a search can produce + /// different results than simply sub-slicing the haystack. 
+ /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch}; + /// + /// // N.B. We disable Unicode here so that we use a simple ASCII word + /// // boundary. Alternatively, we could enable heuristic support for + /// // Unicode word boundaries since our haystack is pure ASCII. + /// let dfa = DFA::new(r"(?-u)\b[0-9]{3}\b")?; + /// let mut cache = dfa.create_cache(); + /// let haystack = "foo123bar".as_bytes(); + /// + /// // Since we sub-slice the haystack, the search doesn't know about the + /// // larger context and assumes that `123` is surrounded by word + /// // boundaries. And of course, the match position is reported relative + /// // to the sub-slice as well, which means we get `3` instead of `6`. + /// let expected = Some(HalfMatch::must(0, 3)); + /// let got = dfa.find_earliest_fwd_at( + /// &mut cache, + /// None, + /// None, + /// &haystack[3..6], + /// 0, + /// haystack[3..6].len(), + /// )?; + /// assert_eq!(expected, got); + /// + /// // But if we provide the bounds of the search within the context of the + /// // entire haystack, then the search can take the surrounding context + /// // into account. (And if we did find a match, it would be reported + /// // as a valid offset into `haystack` instead of its sub-slice.) + /// let expected = None; + /// let got = dfa.find_earliest_fwd_at( + /// &mut cache, + /// None, + /// None, + /// haystack, + /// 3, + /// 6, + /// )?; + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn find_earliest_fwd_at( + &self, + cache: &mut Cache, + pre: Option<&mut prefilter::Scanner>, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, + ) -> Result<Option<HalfMatch>, MatchError> { + search::find_earliest_fwd( + pre, self, cache, pattern_id, bytes, start, end, + ) + } + + /// Executes a reverse search and returns the start position of the first + /// match that is found as early as possible. 
If no match exists, then + /// `None` is returned. + /// + /// This routine stops scanning input as soon as the search observes a + /// match state. + /// + /// This is like [`DFA::find_earliest_rev`], except it provides some + /// additional control over how the search is executed. See the + /// documentation of [`DFA::find_earliest_fwd_at`] for more details + /// on the additional parameters along with examples of their usage. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// lazy DFAs generated by this crate, this only occurs in non-default + /// configurations where quit bytes are used, Unicode word boundaries are + /// heuristically enabled or limits are set on the number of times the lazy + /// DFA's cache may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Panics + /// + /// This routine panics if a `pattern_id` is given and the underlying + /// DFA does not support specific pattern searches. + /// + /// It also panics if the given haystack range is not valid. + #[inline] + pub fn find_earliest_rev_at( + &self, + cache: &mut Cache, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, + ) -> Result<Option<HalfMatch>, MatchError> { + search::find_earliest_rev(self, cache, pattern_id, bytes, start, end) + } + + /// Executes a forward search and returns the end position of the leftmost + /// match that is found. If no match exists, then `None` is returned. + /// + /// This is like [`DFA::find_leftmost_fwd`], except it provides some + /// additional control over how the search is executed. See the + /// documentation of [`DFA::find_earliest_fwd_at`] for more details on the + /// additional parameters along with examples of their usage. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. 
For + /// lazy DFAs generated by this crate, this only occurs in non-default + /// configurations where quit bytes are used, Unicode word boundaries are + /// heuristically enabled or limits are set on the number of times the lazy + /// DFA's cache may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Panics + /// + /// This routine panics if a `pattern_id` is given and the underlying + /// DFA does not support specific pattern searches. + /// + /// It also panics if the given haystack range is not valid. + #[inline] + pub fn find_leftmost_fwd_at( + &self, + cache: &mut Cache, + pre: Option<&mut prefilter::Scanner>, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, + ) -> Result<Option<HalfMatch>, MatchError> { + search::find_leftmost_fwd( + pre, self, cache, pattern_id, bytes, start, end, + ) + } + + /// Executes a reverse search and returns the start of the position of the + /// leftmost match that is found. If no match exists, then `None` is + /// returned. + /// + /// This is like [`DFA::find_leftmost_rev`], except it provides some + /// additional control over how the search is executed. See the + /// documentation of [`DFA::find_earliest_fwd_at`] for more details on the + /// additional parameters along with examples of their usage. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// lazy DFAs generated by this crate, this only occurs in non-default + /// configurations where quit bytes are used, Unicode word boundaries are + /// heuristically enabled or limits are set on the number of times the lazy + /// DFA's cache may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Panics + /// + /// This routine panics if a `pattern_id` is given and the underlying + /// DFA does not support specific pattern searches. 
+ /// + /// It also panics if the given haystack range is not valid. + #[inline] + pub fn find_leftmost_rev_at( + &self, + cache: &mut Cache, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, + ) -> Result<Option<HalfMatch>, MatchError> { + search::find_leftmost_rev(self, cache, pattern_id, bytes, start, end) + } + + /// Executes an overlapping forward search and returns the end position of + /// matches as they are found. If no match exists, then `None` is returned. + /// + /// This routine is principally only useful when searching for multiple + /// patterns on inputs where multiple patterns may match the same regions + /// of text. In particular, callers must preserve the automaton's search + /// state from prior calls so that the implementation knows where the last + /// match occurred. + /// + /// This is like [`DFA::find_overlapping_fwd`], except it provides + /// some additional control over how the search is executed. See the + /// documentation of [`DFA::find_earliest_fwd_at`] for more details + /// on the additional parameters along with examples of their usage. + /// + /// When using this routine to implement an iterator of overlapping + /// matches, the `start` of the search should always be set to the end + /// of the last match. If more patterns match at the previous location, + /// then they will be immediately returned. (This is tracked by the given + /// overlapping state.) Otherwise, the search continues at the starting + /// position given. + /// + /// If for some reason you want the search to forget about its previous + /// state and restart the search at a particular position, then setting the + /// state to [`OverlappingState::start`] will accomplish that. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. 
For
+ /// lazy DFAs generated by this crate, this only occurs in non-default
+ /// configurations where quit bytes are used, Unicode word boundaries are
+ /// heuristically enabled or limits are set on the number of times the lazy
+ /// DFA's cache may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Panics
+ ///
+ /// This routine panics if a `pattern_id` is given and the underlying
+ /// DFA does not support specific pattern searches.
+ ///
+ /// It also panics if the given haystack range is not valid.
+ #[inline]
+ pub fn find_overlapping_fwd_at(
+ &self,
+ cache: &mut Cache,
+ pre: Option<&mut prefilter::Scanner>,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ state: &mut OverlappingState,
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ search::find_overlapping_fwd(
+ pre, self, cache, pattern_id, bytes, start, end, state,
+ )
+ }
+}
+
+impl DFA {
+ /// Transitions from the current state to the next state, given the next
+ /// byte of input.
+ ///
+ /// The given cache is used to either reuse pre-computed state
+ /// transitions, or to store this newly computed transition for future
+ /// reuse. Thus, this routine guarantees that it will never return a state
+ /// ID that has an "unknown" tag.
+ ///
+ /// # State identifier validity
+ ///
+ /// The only valid value for `current` is the lazy state ID returned
+ /// by the most recent call to `next_state`, `next_state_untagged`,
+ /// `next_state_untagged_unchecked`, `start_state_forward` or
+ /// `start_state_reverse` for the given `cache`. Any state ID returned from
+ /// prior calls to these routines (with the same `cache`) is considered
+ /// invalid (even if it gives an appearance of working). State IDs returned
+ /// from _any_ prior call for different `cache` values are also always
+ /// invalid.
+ ///
+ /// The returned ID is always a valid ID when `current` refers to a valid
+ /// ID. 
Moreover, this routine is defined for all possible values of + /// `input`. + /// + /// These validity rules are not checked, even in debug mode. Callers are + /// required to uphold these rules themselves. + /// + /// Violating these state ID validity rules will not sacrifice memory + /// safety, but _may_ produce an incorrect result or a panic. + /// + /// # Panics + /// + /// If the given ID does not refer to a valid state, then this routine + /// may panic but it also may not panic and instead return an invalid or + /// incorrect ID. + /// + /// # Example + /// + /// This shows a simplistic example for walking a lazy DFA for a given + /// haystack by using the `next_state` method. + /// + /// ``` + /// use regex_automata::hybrid::dfa::DFA; + /// + /// let dfa = DFA::new(r"[a-z]+r")?; + /// let mut cache = dfa.create_cache(); + /// let haystack = "bar".as_bytes(); + /// + /// // The start state is determined by inspecting the position and the + /// // initial bytes of the haystack. + /// let mut sid = dfa.start_state_forward( + /// &mut cache, None, haystack, 0, haystack.len(), + /// )?; + /// // Walk all the bytes in the haystack. + /// for &b in haystack { + /// sid = dfa.next_state(&mut cache, sid, b)?; + /// } + /// // Matches are always delayed by 1 byte, so we must explicitly walk the + /// // special "EOI" transition at the end of the search. 
+ /// sid = dfa.next_eoi_state(&mut cache, sid)?;
+ /// assert!(sid.is_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn next_state(
+ &self,
+ cache: &mut Cache,
+ current: LazyStateID,
+ input: u8,
+ ) -> Result<LazyStateID, CacheError> {
+ let class = usize::from(self.classes.get(input));
+ let offset = current.as_usize_untagged() + class;
+ let sid = cache.trans[offset];
+ if !sid.is_unknown() {
+ return Ok(sid);
+ }
+ let unit = alphabet::Unit::u8(input);
+ Lazy::new(self, cache).cache_next_state(current, unit)
+ }
+
+ /// Transitions from the current state to the next state, given the next
+ /// byte of input and a state ID that is not tagged.
+ ///
+ /// The only reason to use this routine is performance. In particular, the
+ /// `next_state` method needs to do some additional checks, among them is
+ /// to account for identifiers to states that are not yet computed. In
+ /// such a case, the transition is computed on the fly. However, if it is
+ /// known that the `current` state ID is untagged, then these checks can be
+ /// omitted.
+ ///
+ /// Since this routine does not compute states on the fly, it does not
+ /// modify the cache and thus cannot return an error. Consequently, `cache`
+ /// does not need to be mutable and it is possible for this routine to
+ /// return a state ID corresponding to the special "unknown" state. In
+ /// this case, it is the caller's responsibility to use the prior state
+ /// ID and `input` with `next_state` in order to force the computation of
+ /// the unknown transition. Otherwise, trying to use the "unknown" state
+ /// ID will just result in transitioning back to itself, and thus never
+ /// terminating. (This is technically a special exemption to the state ID
+ /// validity rules, but is permissible since this routine is guaranteed to
+ /// never mutate the given `cache`, and thus the identifier is guaranteed
+ /// to remain valid.) 
+ ///
+ /// See [`LazyStateID`] for more details on what it means for a state ID
+ /// to be tagged. Also, see
+ /// [`next_state_untagged_unchecked`](DFA::next_state_untagged_unchecked)
+ /// for this same idea, but with bounds checks forcefully elided.
+ ///
+ /// # State identifier validity
+ ///
+ /// The only valid value for `current` is an **untagged** lazy
+ /// state ID returned by the most recent call to `next_state`,
+ /// `next_state_untagged`, `next_state_untagged_unchecked`,
+ /// `start_state_forward` or `start_state_reverse` for the given `cache`.
+ /// Any state ID returned from prior calls to these routines (with the
+ /// same `cache`) is considered invalid (even if it gives an appearance
+ /// of working). State IDs returned from _any_ prior call for different
+ /// `cache` values are also always invalid.
+ ///
+ /// The returned ID is always a valid ID when `current` refers to a valid
+ /// ID, although it may be tagged. Moreover, this routine is defined for
+ /// all possible values of `input`.
+ ///
+ /// Not all validity rules are checked, even in debug mode. Callers are
+ /// required to uphold these rules themselves.
+ ///
+ /// Violating these state ID validity rules will not sacrifice memory
+ /// safety, but _may_ produce an incorrect result or a panic.
+ ///
+ /// # Panics
+ ///
+ /// If the given ID does not refer to a valid state, then this routine
+ /// may panic but it also may not panic and instead return an invalid or
+ /// incorrect ID.
+ ///
+ /// # Example
+ ///
+ /// This shows a simplistic example for walking a lazy DFA for a given
+ /// haystack by using the `next_state_untagged` method where possible.
+ ///
+ /// ```
+ /// use regex_automata::hybrid::dfa::DFA;
+ ///
+ /// let dfa = DFA::new(r"[a-z]+r")?;
+ /// let mut cache = dfa.create_cache();
+ /// let haystack = "bar".as_bytes();
+ ///
+ /// // The start state is determined by inspecting the position and the
+ /// // initial bytes of the haystack. 
+ /// let mut sid = dfa.start_state_forward( + /// &mut cache, None, haystack, 0, haystack.len(), + /// )?; + /// // Walk all the bytes in the haystack. + /// let mut at = 0; + /// while at < haystack.len() { + /// if sid.is_tagged() { + /// sid = dfa.next_state(&mut cache, sid, haystack[at])?; + /// } else { + /// let mut prev_sid = sid; + /// // We attempt to chew through as much as we can while moving + /// // through untagged state IDs. Thus, the transition function + /// // does less work on average per byte. (Unrolling this loop + /// // may help even more.) + /// while at < haystack.len() { + /// prev_sid = sid; + /// sid = dfa.next_state_untagged( + /// &mut cache, sid, haystack[at], + /// ); + /// at += 1; + /// if sid.is_tagged() { + /// break; + /// } + /// } + /// // We must ensure that we never proceed to the next iteration + /// // with an unknown state ID. If we don't account for this + /// // case, then search isn't guaranteed to terminate since all + /// // transitions on unknown states loop back to itself. + /// if sid.is_unknown() { + /// sid = dfa.next_state( + /// &mut cache, prev_sid, haystack[at - 1], + /// )?; + /// } + /// } + /// } + /// // Matches are always delayed by 1 byte, so we must explicitly walk the + /// // special "EOI" transition at the end of the search. + /// sid = dfa.next_eoi_state(&mut cache, sid)?; + /// assert!(sid.is_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn next_state_untagged( + &self, + cache: &Cache, + current: LazyStateID, + input: u8, + ) -> LazyStateID { + debug_assert!(!current.is_tagged()); + let class = usize::from(self.classes.get(input)); + let offset = current.as_usize_unchecked() + class; + cache.trans[offset] + } + + /// Transitions from the current state to the next state, eliding bounds + /// checks, given the next byte of input and a state ID that is not tagged. + /// + /// The only reason to use this routine is performance. 
In particular, the
+ /// `next_state` method needs to do some additional checks, among them is
+ /// to account for identifiers to states that are not yet computed. In
+ /// such a case, the transition is computed on the fly. However, if it is
+ /// known that the `current` state ID is untagged, then these checks can be
+ /// omitted.
+ ///
+ /// Since this routine does not compute states on the fly, it does not
+ /// modify the cache and thus cannot return an error. Consequently, `cache`
+ /// does not need to be mutable and it is possible for this routine to
+ /// return a state ID corresponding to the special "unknown" state. In
+ /// this case, it is the caller's responsibility to use the prior state
+ /// ID and `input` with `next_state` in order to force the computation of
+ /// the unknown transition. Otherwise, trying to use the "unknown" state
+ /// ID will just result in transitioning back to itself, and thus never
+ /// terminating. (This is technically a special exemption to the state ID
+ /// validity rules, but is permissible since this routine is guaranteed to
+ /// never mutate the given `cache`, and thus the identifier is guaranteed
+ /// to remain valid.)
+ ///
+ /// See [`LazyStateID`] for more details on what it means for a state ID
+ /// to be tagged. Also, see
+ /// [`next_state_untagged`](DFA::next_state_untagged)
+ /// for this same idea, but with memory safety guaranteed by retaining
+ /// bounds checks.
+ ///
+ /// # State identifier validity
+ ///
+ /// The only valid value for `current` is an **untagged** lazy
+ /// state ID returned by the most recent call to `next_state`,
+ /// `next_state_untagged`, `next_state_untagged_unchecked`,
+ /// `start_state_forward` or `start_state_reverse` for the given `cache`.
+ /// Any state ID returned from prior calls to these routines (with the
+ /// same `cache`) is considered invalid (even if it gives an appearance
+ /// of working). 
State IDs returned from _any_ prior call for different + /// `cache` values are also always invalid. + /// + /// The returned ID is always a valid ID when `current` refers to a valid + /// ID, although it may be tagged. Moreover, this routine is defined for + /// all possible values of `input`. + /// + /// Not all validity rules are checked, even in debug mode. Callers are + /// required to uphold these rules themselves. + /// + /// Violating these state ID validity rules will not sacrifice memory + /// safety, but _may_ produce an incorrect result or a panic. + /// + /// # Safety + /// + /// Callers of this method must guarantee that `current` refers to a valid + /// state ID according to the rules described above. If `current` is not a + /// valid state ID for this automaton, then calling this routine may result + /// in undefined behavior. + /// + /// If `current` is valid, then the ID returned is valid for all possible + /// values of `input`. + #[inline] + pub unsafe fn next_state_untagged_unchecked( + &self, + cache: &Cache, + current: LazyStateID, + input: u8, + ) -> LazyStateID { + debug_assert!(!current.is_tagged()); + let class = usize::from(self.classes.get(input)); + let offset = current.as_usize_unchecked() + class; + *cache.trans.get_unchecked(offset) + } + + /// Transitions from the current state to the next state for the special + /// EOI symbol. + /// + /// The given cache is used to either reuse pre-computed state + /// transitions, or to store this newly computed transition for future + /// reuse. Thus, this routine guarantees that it will never return a state + /// ID that has an "unknown" tag. + /// + /// This routine must be called at the end of every search in a correct + /// implementation of search. Namely, lazy DFAs in this crate delay matches + /// by one byte in order to support look-around operators. Thus, after + /// reaching the end of a haystack, a search implementation must follow one + /// last EOI transition. 
+ ///
+ /// It is best to think of EOI as an additional symbol in the alphabet of a
+ /// DFA that is distinct from every other symbol. That is, the alphabet of
+ /// lazy DFAs in this crate has a logical size of 257 instead of 256, where
+ /// 256 corresponds to every possible inhabitant of `u8`. (In practice, the
+ /// physical alphabet size may be smaller because of alphabet compression
+ /// via equivalence classes, but EOI is always represented somehow in the
+ /// alphabet.)
+ ///
+ /// # State identifier validity
+ ///
+ /// The only valid value for `current` is the lazy state ID returned
+ /// by the most recent call to `next_state`, `next_state_untagged`,
+ /// `next_state_untagged_unchecked`, `start_state_forward` or
+ /// `start_state_reverse` for the given `cache`. Any state ID returned from
+ /// prior calls to these routines (with the same `cache`) is considered
+ /// invalid (even if it gives an appearance of working). State IDs returned
+ /// from _any_ prior call for different `cache` values are also always
+ /// invalid.
+ ///
+ /// The returned ID is always a valid ID when `current` refers to a valid
+ /// ID.
+ ///
+ /// These validity rules are not checked, even in debug mode. Callers are
+ /// required to uphold these rules themselves.
+ ///
+ /// Violating these state ID validity rules will not sacrifice memory
+ /// safety, but _may_ produce an incorrect result or a panic.
+ ///
+ /// # Panics
+ ///
+ /// If the given ID does not refer to a valid state, then this routine
+ /// may panic but it also may not panic and instead return an invalid or
+ /// incorrect ID.
+ ///
+ /// # Example
+ ///
+ /// This shows a simplistic example for walking a DFA for a given haystack,
+ /// and then finishing the search with the final EOI transition. 
+ /// + /// ``` + /// use regex_automata::hybrid::dfa::DFA; + /// + /// let dfa = DFA::new(r"[a-z]+r")?; + /// let mut cache = dfa.create_cache(); + /// let haystack = "bar".as_bytes(); + /// + /// // The start state is determined by inspecting the position and the + /// // initial bytes of the haystack. + /// let mut sid = dfa.start_state_forward( + /// &mut cache, None, haystack, 0, haystack.len(), + /// )?; + /// // Walk all the bytes in the haystack. + /// for &b in haystack { + /// sid = dfa.next_state(&mut cache, sid, b)?; + /// } + /// // Matches are always delayed by 1 byte, so we must explicitly walk + /// // the special "EOI" transition at the end of the search. Without this + /// // final transition, the assert below will fail since the DFA will not + /// // have entered a match state yet! + /// sid = dfa.next_eoi_state(&mut cache, sid)?; + /// assert!(sid.is_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn next_eoi_state( + &self, + cache: &mut Cache, + current: LazyStateID, + ) -> Result<LazyStateID, CacheError> { + let eoi = self.classes.eoi().as_usize(); + let offset = current.as_usize_untagged() + eoi; + let sid = cache.trans[offset]; + if !sid.is_unknown() { + return Ok(sid); + } + let unit = self.classes.eoi(); + Lazy::new(self, cache).cache_next_state(current, unit) + } + + /// Return the ID of the start state for this lazy DFA when executing a + /// forward search. + /// + /// Unlike typical DFA implementations, the start state for DFAs in this + /// crate is dependent on a few different factors: + /// + /// * The pattern ID, if present. When the underlying DFA has been + /// configured with multiple patterns _and_ the DFA has been configured to + /// build an anchored start state for each pattern, then a pattern ID may + /// be specified to execute an anchored search for that specific pattern. 
+ /// If `pattern_id` is invalid or if the DFA isn't configured to build + /// start states for each pattern, then implementations must panic. DFAs in + /// this crate can be configured to build start states for each pattern via + /// [`Config::starts_for_each_pattern`]. + /// * When `start > 0`, the byte at index `start - 1` may influence the + /// start state if the regex uses `^` or `\b`. + /// * Similarly, when `start == 0`, it may influence the start state when + /// the regex uses `^` or `\A`. + /// * Currently, `end` is unused. + /// * Whether the search is a forward or reverse search. This routine can + /// only be used for forward searches. + /// + /// # Panics + /// + /// This panics if `start..end` is not a valid sub-slice of `bytes`. This + /// also panics if `pattern_id` is non-None and does not refer to a valid + /// pattern, or if the DFA was not configured to build anchored start + /// states for each pattern. + #[inline] + pub fn start_state_forward( + &self, + cache: &mut Cache, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, + ) -> Result<LazyStateID, CacheError> { + let mut lazy = Lazy::new(self, cache); + let start_type = Start::from_position_fwd(bytes, start, end); + let sid = lazy.as_ref().get_cached_start_id(pattern_id, start_type); + if !sid.is_unknown() { + return Ok(sid); + } + lazy.cache_start_group(pattern_id, start_type) + } + + /// Return the ID of the start state for this lazy DFA when executing a + /// reverse search. + /// + /// Unlike typical DFA implementations, the start state for DFAs in this + /// crate is dependent on a few different factors: + /// + /// * The pattern ID, if present. When the underlying DFA has been + /// configured with multiple patterns _and_ the DFA has been configured to + /// build an anchored start state for each pattern, then a pattern ID may + /// be specified to execute an anchored search for that specific pattern. 
    /// If `pattern_id` is invalid or if the DFA isn't configured to build
    /// start states for each pattern, then implementations must panic. DFAs in
    /// this crate can be configured to build start states for each pattern via
    /// [`Config::starts_for_each_pattern`].
    /// * When `end < bytes.len()`, the byte at index `end` may influence the
    /// start state if the regex uses `$` or `\b`.
    /// * Similarly, when `end == bytes.len()`, it may influence the start
    /// state when the regex uses `$` or `\z`.
    /// * Currently, `start` is unused.
    /// * Whether the search is a forward or reverse search. This routine can
    /// only be used for reverse searches.
    ///
    /// # Panics
    ///
    /// This panics if `start..end` is not a valid sub-slice of `bytes`. This
    /// also panics if `pattern_id` is non-None and does not refer to a valid
    /// pattern, or if the DFA was not configured to build anchored start
    /// states for each pattern.
    #[inline]
    pub fn start_state_reverse(
        &self,
        cache: &mut Cache,
        pattern_id: Option<PatternID>,
        bytes: &[u8],
        start: usize,
        end: usize,
    ) -> Result<LazyStateID, CacheError> {
        let mut lazy = Lazy::new(self, cache);
        let start_type = Start::from_position_rev(bytes, start, end);
        // Fast path: if this starting configuration has already been
        // determinized, return the cached ID without touching the lazy
        // machinery.
        let sid = lazy.as_ref().get_cached_start_id(pattern_id, start_type);
        if !sid.is_unknown() {
            return Ok(sid);
        }
        lazy.cache_start_group(pattern_id, start_type)
    }

    /// Returns the total number of patterns that match in this state.
    ///
    /// If the lazy DFA was compiled with one pattern, then this must
    /// necessarily always return `1` for all match states.
    ///
    /// A lazy DFA guarantees that [`DFA::match_pattern`] can be called with
    /// indices up to (but not including) the count returned by this routine
    /// without panicking.
    ///
    /// If the given state is not a match state, then this may either panic
    /// or return an incorrect result.
    ///
    /// # Example
    ///
    /// This example shows a simple instance of implementing overlapping
    /// matches. In particular, it shows not only how to determine how many
    /// patterns have matched in a particular state, but also how to access
    /// which specific patterns have matched.
    ///
    /// Notice that we must use [`MatchKind::All`](crate::MatchKind::All)
    /// when building the DFA. If we used
    /// [`MatchKind::LeftmostFirst`](crate::MatchKind::LeftmostFirst)
    /// instead, then the DFA would not be constructed in a way that supports
    /// overlapping matches. (It would only report a single pattern that
    /// matches at any particular point in time.)
    ///
    /// Another thing to take note of is the patterns used and the order in
    /// which the pattern IDs are reported. In the example below, pattern `3`
    /// is yielded first. Why? Because it corresponds to the match that
    /// appears first. Namely, the `@` symbol is part of `\S+` but not part
    /// of any of the other patterns. Since the `\S+` pattern has a match that
    /// starts to the left of any other pattern, its ID is returned before any
    /// other.
    ///
    /// ```
    /// use regex_automata::{hybrid::dfa::DFA, MatchKind};
    ///
    /// let dfa = DFA::builder()
    ///     .configure(DFA::config().match_kind(MatchKind::All))
    ///     .build_many(&[
    ///         r"\w+", r"[a-z]+", r"[A-Z]+", r"\S+",
    ///     ])?;
    /// let mut cache = dfa.create_cache();
    /// let haystack = "@bar".as_bytes();
    ///
    /// // The start state is determined by inspecting the position and the
    /// // initial bytes of the haystack.
    /// let mut sid = dfa.start_state_forward(
    ///     &mut cache, None, haystack, 0, haystack.len(),
    /// )?;
    /// // Walk all the bytes in the haystack.
    /// for &b in haystack {
    ///     sid = dfa.next_state(&mut cache, sid, b)?;
    /// }
    /// sid = dfa.next_eoi_state(&mut cache, sid)?;
    ///
    /// assert!(sid.is_match());
    /// assert_eq!(dfa.match_count(&mut cache, sid), 3);
    /// // The following calls are guaranteed to not panic since `match_count`
    /// // returned `3` above.
    /// assert_eq!(dfa.match_pattern(&mut cache, sid, 0).as_usize(), 3);
    /// assert_eq!(dfa.match_pattern(&mut cache, sid, 1).as_usize(), 0);
    /// assert_eq!(dfa.match_pattern(&mut cache, sid, 2).as_usize(), 1);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn match_count(&self, cache: &Cache, id: LazyStateID) -> usize {
        assert!(id.is_match());
        LazyRef::new(self, cache).get_cached_state(id).match_count()
    }

    /// Returns the pattern ID corresponding to the given match index in the
    /// given state.
    ///
    /// See [`DFA::match_count`] for an example of how to use this method
    /// correctly. Note that if you know your lazy DFA is configured with a
    /// single pattern, then this routine is never necessary since it will
    /// always return a pattern ID of `0` for an index of `0` when `id`
    /// corresponds to a match state.
    ///
    /// Typically, this routine is used when implementing an overlapping
    /// search, as the example for `DFA::match_count` does.
    ///
    /// # Panics
    ///
    /// If the state ID is not a match state or if the match index is out
    /// of bounds for the given state, then this routine may either panic
    /// or produce an incorrect result. If the state ID is correct and the
    /// match index is correct, then this routine always produces a valid
    /// `PatternID`.
    #[inline]
    pub fn match_pattern(
        &self,
        cache: &Cache,
        id: LazyStateID,
        match_index: usize,
    ) -> PatternID {
        // This is an optimization for the very common case of a DFA with a
        // single pattern. This conditional avoids a somewhat more costly path
        // that finds the pattern ID from the corresponding `State`, which
        // requires a bit of slicing/pointer-chasing. This optimization tends
        // to only matter when matches are frequent.
        if self.pattern_count() == 1 {
            return PatternID::ZERO;
        }
        LazyRef::new(self, cache)
            .get_cached_state(id)
            .match_pattern(match_index)
    }
}

/// A cache represents a partially computed DFA.
///
/// A cache is the key component that differentiates a classical DFA and a
/// hybrid NFA/DFA (also called a "lazy DFA"). Where a classical DFA builds a
/// complete transition table that can handle all possible inputs, a hybrid
/// NFA/DFA starts with an empty transition table and builds only the parts
/// required during search. The parts that are built are stored in a cache. For
/// this reason, a cache is a required parameter for nearly every operation on
/// a [`DFA`].
///
/// Caches can be created from their corresponding DFA via
/// [`DFA::create_cache`]. A cache can only be used with either the DFA that
/// created it, or the DFA that was most recently used to reset it with
/// [`Cache::reset`]. Using a cache with any other DFA may result in panics
/// or incorrect results.
#[derive(Clone, Debug)]
pub struct Cache {
    // N.B. If you're looking to understand how determinization works, it
    // is probably simpler to first grok src/dfa/determinize.rs, since that
    // doesn't have the "laziness" component.
    /// The transition table.
    ///
    /// Given a `current` LazyStateID and an `input` byte, the next state can
    /// be computed via `trans[untagged(current) + equiv_class(input)]`. Notice
    /// that no multiplication is used. That's because state identifiers are
    /// "premultiplied."
    ///
    /// Note that the next state may be the "unknown" state. In this case, the
    /// next state is not known and determinization for `current` on `input`
    /// must be performed.
    trans: Vec<LazyStateID>,
    /// The starting states for this DFA.
    ///
    /// These are computed lazily. Initially, these are all set to "unknown"
    /// lazy state IDs.
    ///
    /// When 'starts_for_each_pattern' is disabled (the default), then the
    /// size of this is constrained to the possible starting configurations
    /// based on the search parameters. (At time of writing, that's 4.)
    /// However, when starting states for each pattern is enabled, then there
    /// are N additional groups of starting states, where each group reflects
    /// the different possible configurations and N is the number of patterns.
    starts: Vec<LazyStateID>,
    /// A sequence of NFA/DFA powerset states that have been computed for this
    /// lazy DFA. This sequence is indexable by untagged LazyStateIDs. (Every
    /// tagged LazyStateID can be used to index this sequence by converting it
    /// to its untagged form.)
    states: Vec<State>,
    /// A map from states to their corresponding IDs. This map may be accessed
    /// via the raw byte representation of a state, which means that a `State`
    /// does not need to be allocated to determine whether it already exists
    /// in this map. Indeed, the existence of such a state is what determines
    /// whether we allocate a new `State` or not.
    ///
    /// The higher level idea here is that we do just enough determinization
    /// for a state to check whether we've already computed it. If we have,
    /// then we can save a little (albeit not much) work. The real savings is
    /// in memory usage. If we never checked for trivially duplicate states,
    /// then our memory usage would explode to unreasonable levels.
    states_to_id: StateMap,
    /// Sparse sets used to track which NFA states have been visited during
    /// various traversals.
    sparses: SparseSets,
    /// Scratch space for traversing the NFA graph. (We use space on the heap
    /// instead of the call stack.)
    stack: Vec<NFAStateID>,
    /// Scratch space for building a NFA/DFA powerset state. This is used to
    /// help amortize allocation since not every powerset state generated is
    /// added to the cache. In particular, if it already exists in the cache,
    /// then there is no need to allocate a new `State` for it.
    scratch_state_builder: StateBuilderEmpty,
    /// A simple abstraction for handling the saving of at most a single state
    /// across a cache clearing. This is required for correctness. Namely, if
    /// adding a new state after clearing the cache fails, then the caller
    /// must retain the ability to continue using the state ID given. The
    /// state corresponding to the state ID is what we preserve across cache
    /// clearings.
    state_saver: StateSaver,
    /// The memory usage, in bytes, used by 'states' and 'states_to_id'. We
    /// track this as new states are added since states use a variable amount
    /// of heap. Tracking this as we add states makes it possible to compute
    /// the total amount of memory used by the determinizer in constant time.
    memory_usage_state: usize,
    /// The number of times the cache has been cleared. When a minimum cache
    /// clear count is set, then the cache will return an error instead of
    /// clearing the cache if the count has been exceeded.
    clear_count: usize,
}

impl Cache {
    /// Create a new cache for the given lazy DFA.
    ///
    /// The cache returned should only be used for searches for the given DFA.
    /// If you want to reuse the cache for another DFA, then you must call
    /// [`Cache::reset`] with that DFA.
    pub fn new(dfa: &DFA) -> Cache {
        let mut cache = Cache {
            trans: alloc::vec![],
            starts: alloc::vec![],
            states: alloc::vec![],
            states_to_id: StateMap::new(),
            sparses: SparseSets::new(dfa.nfa.len()),
            stack: alloc::vec![],
            scratch_state_builder: StateBuilderEmpty::new(),
            state_saver: StateSaver::none(),
            memory_usage_state: 0,
            clear_count: 0,
        };
        // Seed the cache with the sentinel states and the (empty) start
        // state table before handing it back.
        Lazy { dfa, cache: &mut cache }.init_cache();
        cache
    }

    /// Reset this cache such that it can be used for searching with the given
    /// lazy DFA (and only that DFA).
    ///
    /// A cache reset permits reusing memory already allocated in this cache
    /// with a different lazy DFA.
    ///
    /// Resetting a cache sets its "clear count" to 0. This is relevant if the
    /// lazy DFA has been configured to "give up" after it has cleared the
    /// cache a certain number of times.
    ///
    /// Any lazy state ID generated by the cache prior to resetting it is
    /// invalid after the reset.
    ///
    /// # Example
    ///
    /// This shows how to re-purpose a cache for use with a different DFA.
    ///
    /// ```
    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
    ///
    /// let dfa1 = DFA::new(r"\w")?;
    /// let dfa2 = DFA::new(r"\W")?;
    ///
    /// let mut cache = dfa1.create_cache();
    /// assert_eq!(
    ///     Some(HalfMatch::must(0, 2)),
    ///     dfa1.find_leftmost_fwd(&mut cache, "Δ".as_bytes())?,
    /// );
    ///
    /// // Using 'cache' with dfa2 is not allowed. It may result in panics or
    /// // incorrect results. In order to re-purpose the cache, we must reset
    /// // it with the DFA we'd like to use it with.
    /// //
    /// // Similarly, after this reset, using the cache with 'dfa1' is also not
    /// // allowed.
    /// cache.reset(&dfa2);
    /// assert_eq!(
    ///     Some(HalfMatch::must(0, 3)),
    ///     dfa2.find_leftmost_fwd(&mut cache, "☃".as_bytes())?,
    /// );
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn reset(&mut self, dfa: &DFA) {
        Lazy::new(dfa, self).reset_cache()
    }

    /// Returns the total number of times this cache has been cleared since it
    /// was either created or last reset.
    ///
    /// This is useful for informational purposes or if you want to change
    /// search strategies based on the number of times the cache has been
    /// cleared.
    pub fn clear_count(&self) -> usize {
        self.clear_count
    }

    /// Returns the heap memory usage, in bytes, of this cache.
    ///
    /// This does **not** include the stack size used up by this cache. To
    /// compute that, use `std::mem::size_of::<Cache>()`.
    pub fn memory_usage(&self) -> usize {
        const ID_SIZE: usize = size_of::<LazyStateID>();
        const STATE_SIZE: usize = size_of::<State>();

        self.trans.len() * ID_SIZE
        + self.starts.len() * ID_SIZE
        + self.states.len() * STATE_SIZE
        // Maps likely use more memory than this, but it's probably close.
        + self.states_to_id.len() * (STATE_SIZE + ID_SIZE)
        + self.sparses.memory_usage()
        + self.stack.capacity() * ID_SIZE
        + self.scratch_state_builder.capacity()
        // Heap memory used by 'State' in both 'states' and 'states_to_id'.
        + self.memory_usage_state
    }
}

/// A map from states to state identifiers. When using std, we use a standard
/// hashmap, since it's a bit faster for this use case. (Other maps, like
/// ones based on FNV, have not yet been benchmarked.)
///
/// The main purpose of this map is to reuse states where possible. This won't
/// fully minimize the DFA, but it works well in a lot of cases.
#[cfg(feature = "std")]
type StateMap = std::collections::HashMap<State, LazyStateID>;
#[cfg(not(feature = "std"))]
type StateMap = alloc::collections::BTreeMap<State, LazyStateID>;

/// A type that groups methods that require the base NFA/DFA and writable
/// access to the cache.
#[derive(Debug)]
struct Lazy<'i, 'c> {
    dfa: &'i DFA,
    cache: &'c mut Cache,
}

impl<'i, 'c> Lazy<'i, 'c> {
    /// Creates a new 'Lazy' wrapper for a DFA and its corresponding cache.
    fn new(dfa: &'i DFA, cache: &'c mut Cache) -> Lazy<'i, 'c> {
        Lazy { dfa, cache }
    }

    /// Return an immutable view by downgrading a writable cache to a read-only
    /// cache.
    fn as_ref<'a>(&'a self) -> LazyRef<'i, 'a> {
        LazyRef::new(self.dfa, self.cache)
    }

    /// This is marked as 'inline(never)' to avoid bloating methods on 'DFA'
    /// like 'next_state' and 'next_eoi_state' that are called in critical
    /// areas. The idea is to let the optimizer focus on the other areas of
    /// those methods as the hot path.
    ///
    /// Here's an example that justifies 'inline(never)'
    ///
    /// ```ignore
    /// regex-cli find hybrid dfa \
    ///     @all-codepoints-utf8-100x '\pL{100}' --cache-capacity 10000000
    /// ```
    ///
    /// Where 'all-codepoints-utf8-100x' is the UTF-8 encoding of every
    /// codepoint, in sequence, repeated 100 times.
    ///
    /// With 'inline(never)' hyperfine reports 1.1s per run. With
    /// 'inline(always)', hyperfine reports 1.23s. So that's a 10% improvement.
    #[inline(never)]
    fn cache_next_state(
        &mut self,
        mut current: LazyStateID,
        unit: alphabet::Unit,
    ) -> Result<LazyStateID, CacheError> {
        let stride2 = self.dfa.stride2();
        let empty_builder = self.get_state_builder();
        // Run one step of determinization: compute the powerset state that
        // 'current' transitions to on 'unit'.
        let builder = determinize::next(
            &self.dfa.nfa,
            self.dfa.match_kind,
            &mut self.cache.sparses,
            &mut self.cache.stack,
            &self.cache.states[current.as_usize_untagged() >> stride2],
            unit,
            empty_builder,
        );
        // If adding the new state would overflow the cache, then the cache
        // will be cleared below. 'current' must survive the clearing, so mark
        // it for saving and re-discover its (possibly changed) ID afterwards.
        let save_state = !self.as_ref().state_builder_fits_in_cache(&builder);
        if save_state {
            self.save_state(current);
        }
        let next = self.add_builder_state(builder, |sid| sid)?;
        if save_state {
            current = self.saved_state_id();
        }
        // This is the payoff. The next time 'next_state' is called with this
        // state and alphabet unit, it will find this transition and avoid
        // having to re-determinize this transition.
        self.set_transition(current, unit, next);
        Ok(next)
    }

    /// Compute and cache the starting state for the given pattern ID (if
    /// present) and the starting configuration.
    ///
    /// This panics if a pattern ID is given and the DFA isn't configured to
    /// build anchored start states for each pattern.
    ///
    /// This will never return an unknown lazy state ID.
    ///
    /// If caching this state would otherwise result in a cache that has been
    /// cleared too many times, then an error is returned.
    fn cache_start_group(
        &mut self,
        pattern_id: Option<PatternID>,
        start: Start,
    ) -> Result<LazyStateID, CacheError> {
        // Pick the NFA entry point: per-pattern anchored, globally anchored,
        // or unanchored, depending on the request and the DFA configuration.
        let nfa_start_id = match pattern_id {
            Some(pid) => {
                assert!(
                    self.dfa.starts_for_each_pattern,
                    "attempted to search for a specific pattern \
                     without enabling starts_for_each_pattern",
                );
                self.dfa.nfa.start_pattern(pid)
            }
            None if self.dfa.anchored => self.dfa.nfa.start_anchored(),
            None => self.dfa.nfa.start_unanchored(),
        };

        let id = self.cache_start_one(nfa_start_id, start)?;
        self.set_start_state(pattern_id, start, id);
        Ok(id)
    }

    /// Compute and cache the starting state for the given NFA state ID and the
    /// starting configuration. The NFA state ID might be one of the following:
    ///
    /// 1) An unanchored start state to match any pattern.
    /// 2) An anchored start state to match any pattern.
    /// 3) An anchored start state for a particular pattern.
    ///
    /// This will never return an unknown lazy state ID.
    ///
    /// If caching this state would otherwise result in a cache that has been
    /// cleared too many times, then an error is returned.
    fn cache_start_one(
        &mut self,
        nfa_start_id: NFAStateID,
        start: Start,
    ) -> Result<LazyStateID, CacheError> {
        let mut builder_matches = self.get_state_builder().into_matches();
        determinize::set_lookbehind_from_start(&start, &mut builder_matches);
        self.cache.sparses.set1.clear();
        // Gather every NFA state reachable via epsilon transitions from the
        // start state, honoring the look-behind assertions computed above.
        determinize::epsilon_closure(
            self.dfa.nfa.borrow(),
            nfa_start_id,
            *builder_matches.look_have(),
            &mut self.cache.stack,
            &mut self.cache.sparses.set1,
        );
        let mut builder = builder_matches.into_nfa();
        determinize::add_nfa_states(
            self.dfa.nfa.borrow(),
            &self.cache.sparses.set1,
            &mut builder,
        );
        self.add_builder_state(builder, |id| id.to_start())
    }

    /// Either add the given builder state to this cache, or return an ID to an
    /// equivalent state already in this cache.
    ///
    /// In the case where no equivalent state exists, the idmap function given
    /// may be used to transform the identifier allocated. This is useful if
    /// the caller needs to tag the ID with additional information.
    ///
    /// This will never return an unknown lazy state ID.
    ///
    /// If caching this state would otherwise result in a cache that has been
    /// cleared too many times, then an error is returned.
    fn add_builder_state(
        &mut self,
        builder: StateBuilderNFA,
        idmap: impl Fn(LazyStateID) -> LazyStateID,
    ) -> Result<LazyStateID, CacheError> {
        if let Some(&cached_id) =
            self.cache.states_to_id.get(builder.as_bytes())
        {
            // Since we have a cached state, put the constructed state's
            // memory back into our scratch space, so that it can be reused.
            self.put_state_builder(builder);
            return Ok(cached_id);
        }
        let result = self.add_state(builder.to_state(), idmap);
        self.put_state_builder(builder);
        result
    }

    /// Allocate a new state ID and add the given state to this cache.
    ///
    /// The idmap function given may be used to transform the identifier
    /// allocated. This is useful if the caller needs to tag the ID with
    /// additional information.
    ///
    /// This will never return an unknown lazy state ID.
    ///
    /// If caching this state would otherwise result in a cache that has been
    /// cleared too many times, then an error is returned.
    fn add_state(
        &mut self,
        state: State,
        idmap: impl Fn(LazyStateID) -> LazyStateID,
    ) -> Result<LazyStateID, CacheError> {
        if !self.as_ref().state_fits_in_cache(&state) {
            self.try_clear_cache()?;
        }
        // It's important for this to come second, since the above may clear
        // the cache. If we clear the cache after ID generation, then the ID
        // is likely bunk since it would have been generated based on a larger
        // transition table.
        let mut id = idmap(self.next_state_id()?);
        if state.is_match() {
            id = id.to_match();
        }
        // Add room in the transition table. Since this is a fresh state, all
        // of its transitions are unknown.
        self.cache.trans.extend(
            iter::repeat(self.as_ref().unknown_id()).take(self.dfa.stride()),
        );
        // When we add a sentinel state, we never want to set any quit
        // transitions. Technically, this is harmless, since sentinel states
        // have all of their transitions set to loop back to themselves. But
        // when creating sentinel states before the quit sentinel state,
        // this will try to call 'set_transition' on a state ID that doesn't
        // actually exist yet, which isn't allowed. So we just skip doing so
        // entirely.
        if !self.dfa.quitset.is_empty() && !self.as_ref().is_sentinel(id) {
            let quit_id = self.as_ref().quit_id();
            for b in self.dfa.quitset.iter() {
                self.set_transition(id, alphabet::Unit::u8(b), quit_id);
            }
        }
        self.cache.memory_usage_state += state.memory_usage();
        self.cache.states.push(state.clone());
        self.cache.states_to_id.insert(state, id);
        Ok(id)
    }

    /// Allocate a new state ID.
    ///
    /// This will never return an unknown lazy state ID.
    ///
    /// If caching this state would otherwise result in a cache that has been
    /// cleared too many times, then an error is returned.
    fn next_state_id(&mut self) -> Result<LazyStateID, CacheError> {
        let sid = match LazyStateID::new(self.cache.trans.len()) {
            Ok(sid) => sid,
            Err(_) => {
                self.try_clear_cache()?;
                // This has to pass since we check that ID capacity at
                // construction time can fit at least MIN_STATES states.
                LazyStateID::new(self.cache.trans.len()).unwrap()
            }
        };
        Ok(sid)
    }

    /// Attempt to clear the cache used by this lazy DFA.
    ///
    /// If clearing the cache exceeds the minimum number of required cache
    /// clearings, then this will return a cache error. In this case,
    /// callers should bubble this up as the cache can't be used until it is
    /// reset. Implementations of search should convert this error into a
    /// `MatchError::GaveUp`.
    ///
    /// If 'self.state_saver' is set to save a state, then this state is
    /// persisted through cache clearing. Otherwise, the cache is returned to
    /// its state after initialization with two exceptions: its clear count
    /// is incremented and some of its memory likely has additional capacity.
    /// That is, clearing a cache does _not_ release memory.
    ///
    /// Otherwise, any lazy state ID generated by the cache prior to resetting
    /// it is invalid after the reset.
    fn try_clear_cache(&mut self) -> Result<(), CacheError> {
        // Currently, the only heuristic we use is the minimum cache clear
        // count. If we pass that minimum, then we give up.
        //
        // It would be good to also add a heuristic based on "bytes searched
        // per generated state," but this requires API design work. Namely,
        // we really do not want to add a counter increment to the transition
        // function, which implies we need to expose APIs to update the number
        // of bytes searched by implementers of the search routines. And that
        // doesn't seem great... But we should do it if this heuristic isn't
        // enough. (The original lazy DFA implementation in the 'regex' crate
        // had this heuristic, since the lazy DFA was coupled with the search
        // routines.)
        if let Some(min_count) = self.dfa.minimum_cache_clear_count {
            if self.cache.clear_count >= min_count {
                return Err(CacheError::too_many_cache_clears());
            }
        }
        self.clear_cache();
        Ok(())
    }

    /// Clears _and_ resets the cache. Resetting the cache means that no
    /// states are persisted and the clear count is reset to 0. No heap memory
    /// is released.
    ///
    /// Note that the caller may reset a cache with a different DFA than what
    /// it was created from. In which case, the cache can now be used with the
    /// new DFA (and not the old DFA).
    fn reset_cache(&mut self) {
        self.cache.state_saver = StateSaver::none();
        self.clear_cache();
        // If a new DFA is used, it might have a different number of NFA
        // states, so we need to make sure our sparse sets have the appropriate
        // size.
        self.cache.sparses.resize(self.dfa.nfa.len());
        self.cache.clear_count = 0;
    }

    /// Clear the cache used by this lazy DFA.
    ///
    /// Unlike 'try_clear_cache', this is infallible and always clears the
    /// cache. Callers that want to respect the minimum cache clear count
    /// heuristic should use 'try_clear_cache' instead.
    ///
    /// If 'self.state_saver' is set to save a state, then this state is
    /// persisted through cache clearing. Otherwise, the cache is returned to
    /// its state after initialization with two exceptions: its clear count
    /// is incremented and some of its memory likely has additional capacity.
    /// That is, clearing a cache does _not_ release memory.
    ///
    /// Otherwise, any lazy state ID generated by the cache prior to resetting
    /// it is invalid after the reset.
    fn clear_cache(&mut self) {
        self.cache.trans.clear();
        self.cache.starts.clear();
        self.cache.states.clear();
        self.cache.states_to_id.clear();
        self.cache.memory_usage_state = 0;
        self.cache.clear_count += 1;
        trace!(
            "lazy DFA cache has been cleared (count: {})",
            self.cache.clear_count
        );
        self.init_cache();
        // If the state we want to save is one of the sentinel
        // (unknown/dead/quit) states, then 'init_cache' adds those back, and
        // their identifier values remain invariant. So there's no need to add
        // it again. (And indeed, doing so would be incorrect!)
        if let Some((old_id, state)) = self.cache.state_saver.take_to_save() {
            // If the state is one of the special sentinel states, then it is
            // automatically added by cache initialization and its ID always
            // remains the same. With that said, this should never occur since
            // the sentinel states are all loop states back to themselves. So
            // we should never be in a position where we're attempting to save
            // a sentinel state since we never compute transitions out of a
            // sentinel state.
            assert!(
                !self.as_ref().is_sentinel(old_id),
                "cannot save sentinel state"
            );
            let new_id = self
                .add_state(state, |id| {
                    if old_id.is_start() {
                        id.to_start()
                    } else {
                        id
                    }
                })
                // The unwrap here is OK because lazy DFA creation ensures that
                // we have room in the cache to add MIN_STATES states. Since
                // 'init_cache' above adds 3, this adds a 4th.
                .expect("adding one state after cache clear must work");
            self.cache.state_saver = StateSaver::Saved(new_id);
        }
    }

    /// Initialize this cache from emptiness to a place where it can be used
    /// for search.
    ///
    /// This is called both at cache creation time and after the cache has been
    /// cleared.
    ///
    /// Primarily, this adds the three sentinel states and allocates some
    /// initial memory.
    fn init_cache(&mut self) {
        // One group of start states for the "any pattern" searches, plus one
        // additional group per pattern when per-pattern starts are enabled.
        let mut starts_len = Start::count();
        if self.dfa.starts_for_each_pattern {
            starts_len += Start::count() * self.dfa.pattern_count();
        }
        self.cache
            .starts
            .extend(iter::repeat(self.as_ref().unknown_id()).take(starts_len));
        // This is the set of NFA states that corresponds to each of our three
        // sentinel states: the empty set.
        let dead = State::dead();
        // This sets up some states that we use as sentinels that are present
        // in every DFA. While it would be technically possible to implement
        // this DFA without explicitly putting these states in the transition
        // table, this is convenient to do to make `next_state` correct for all
        // valid state IDs without needing explicit conditionals to special
        // case these sentinel states.
        //
        // All three of these states are "dead" states. That is, all of
        // them transition only to themselves. So once you enter one of
        // these states, it's impossible to leave them. Thus, any correct
        // search routine must explicitly check for these state types. (Sans
        // `unknown`, since that is only used internally to represent missing
        // states.)
        let unk_id =
            self.add_state(dead.clone(), |id| id.to_unknown()).unwrap();
        let dead_id = self.add_state(dead.clone(), |id| id.to_dead()).unwrap();
        let quit_id = self.add_state(dead.clone(), |id| id.to_quit()).unwrap();
        assert_eq!(unk_id, self.as_ref().unknown_id());
        assert_eq!(dead_id, self.as_ref().dead_id());
        assert_eq!(quit_id, self.as_ref().quit_id());
        // The idea here is that if you start in an unknown/dead/quit state and
        // try to transition on them, then you should end up where you started.
        self.set_all_transitions(unk_id, unk_id);
        self.set_all_transitions(dead_id, dead_id);
        self.set_all_transitions(quit_id, quit_id);
        // All of these states are technically equivalent from the FSM
        // perspective, so putting all three of them in the cache isn't
        // possible. (They are distinct merely because we use their
        // identifiers as sentinels to mean something, as indicated by the
        // names.) Moreover, we wouldn't want to do that. Unknown and quit
        // states are special in that they are artificial constructions of
        // this implementation. But dead states are a natural part of
        // determinization. When you reach a point in the NFA where you cannot
        // go anywhere else, a dead state will naturally arise and we MUST
        // reuse the canonical dead state that we've created here. Why? Because
        // it is the state ID that tells the search routine whether a state is
        // dead or not, and thus, whether to stop the search. Having a bunch of
        // distinct dead states would be quite wasteful!
        self.cache.states_to_id.insert(dead, dead_id);
    }

    /// Save the state corresponding to the ID given such that the state
    /// persists through a cache clearing.
    ///
    /// While the state may persist, the ID may not. In order to discover the
    /// new state ID, one must call 'saved_state_id' after a cache clearing.
    fn save_state(&mut self, id: LazyStateID) {
        let state = self.as_ref().get_cached_state(id).clone();
        self.cache.state_saver = StateSaver::ToSave { id, state };
    }

    /// Returns the updated lazy state ID for a state that was persisted
    /// through a cache clearing.
    ///
    /// It is only correct to call this routine when both a state has been
    /// saved and the cache has just been cleared. Otherwise, this panics.
    fn saved_state_id(&mut self) -> LazyStateID {
        self.cache
            .state_saver
            .take_saved()
            .expect("state saver does not have saved state ID")
    }

    /// Set all transitions on the state 'from' to 'to'.
    fn set_all_transitions(&mut self, from: LazyStateID, to: LazyStateID) {
        for unit in self.dfa.classes.representatives() {
            self.set_transition(from, unit, to);
        }
    }

    /// Set the transition on 'from' for 'unit' to 'to'.
    ///
    /// This panics if either 'from' or 'to' is invalid.
    ///
    /// All unit values are OK.
    fn set_transition(
        &mut self,
        from: LazyStateID,
        unit: alphabet::Unit,
        to: LazyStateID,
    ) {
        assert!(self.as_ref().is_valid(from), "invalid 'from' id: {:?}", from);
        assert!(self.as_ref().is_valid(to), "invalid 'to' id: {:?}", to);
        // State IDs are premultiplied, so the transition slot is the untagged
        // ID plus the input's equivalence class.
        let offset =
            from.as_usize_untagged() + self.dfa.classes.get_by_unit(unit);
        self.cache.trans[offset] = to;
    }

    /// Set the start ID for the given pattern ID (if given) and starting
    /// configuration to the ID given.
+ /// + /// This panics if 'id' is not valid or if a pattern ID is given and + /// 'starts_for_each_pattern' is not enabled. + fn set_start_state( + &mut self, + pattern_id: Option<PatternID>, + start: Start, + id: LazyStateID, + ) { + assert!(self.as_ref().is_valid(id)); + let start_index = start.as_usize(); + let index = match pattern_id { + None => start_index, + Some(pid) => { + assert!( + self.dfa.starts_for_each_pattern, + "attempted to search for a specific pattern \ + without enabling starts_for_each_pattern", + ); + let pid = pid.as_usize(); + Start::count() + (Start::count() * pid) + start_index + } + }; + self.cache.starts[index] = id; + } + + /// Returns a state builder from this DFA that might have existing + /// capacity. This helps avoid allocs in cases where a state is built that + /// turns out to already be cached. + /// + /// Callers must put the state builder back with 'put_state_builder', + /// otherwise the allocation reuse won't work. + fn get_state_builder(&mut self) -> StateBuilderEmpty { + core::mem::replace( + &mut self.cache.scratch_state_builder, + StateBuilderEmpty::new(), + ) + } + + /// Puts the given state builder back into this DFA for reuse. + /// + /// Note that building a 'State' from a builder always creates a new alloc, + /// so callers should always put the builder back. + fn put_state_builder(&mut self, builder: StateBuilderNFA) { + let _ = core::mem::replace( + &mut self.cache.scratch_state_builder, + builder.clear(), + ); + } +} + +/// A type that groups methods that require the base NFA/DFA and read-only +/// access to the cache. +#[derive(Debug)] +struct LazyRef<'i, 'c> { + dfa: &'i DFA, + cache: &'c Cache, +} + +impl<'i, 'c> LazyRef<'i, 'c> { + /// Creates a new 'Lazy' wrapper for a DFA and its corresponding cache. + fn new(dfa: &'i DFA, cache: &'c Cache) -> LazyRef<'i, 'c> { + LazyRef { dfa, cache } + } + + /// Return the ID of the start state for the given configuration. 
    ///
    /// If the start state has not yet been computed, then this returns an
    /// unknown lazy state ID.
    fn get_cached_start_id(
        &self,
        pattern_id: Option<PatternID>,
        start: Start,
    ) -> LazyStateID {
        let start_index = start.as_usize();
        // Same layout as 'set_start_state': "any pattern" slots first, then
        // Start::count() slots per pattern.
        let index = match pattern_id {
            None => start_index,
            Some(pid) => {
                let pid = pid.as_usize();
                assert!(
                    pid < self.dfa.pattern_count(),
                    "invalid pattern ID: {:?}",
                    pid
                );
                Start::count() + (Start::count() * pid) + start_index
            }
        };
        self.cache.starts[index]
    }

    /// Return the cached NFA/DFA powerset state for the given ID.
    ///
    /// This panics if the given ID does not address a valid state.
    fn get_cached_state(&self, sid: LazyStateID) -> &State {
        // Untagged lazy state IDs are premultiplied by the stride, so
        // shifting right by stride2 recovers the index into 'cache.states'.
        let index = sid.as_usize_untagged() >> self.dfa.stride2();
        &self.cache.states[index]
    }

    /// Returns true if and only if the given ID corresponds to a "sentinel"
    /// state.
    ///
    /// A sentinel state is a state that signifies a special condition of
    /// search, and where every transition maps back to itself. See LazyStateID
    /// for more details. Note that start and match states are _not_ sentinels
    /// since they may otherwise be real states with non-trivial transitions.
    /// The purposes of sentinel states is purely to indicate something. Their
    /// transitions are not meant to be followed.
    fn is_sentinel(&self, id: LazyStateID) -> bool {
        id == self.unknown_id() || id == self.dead_id() || id == self.quit_id()
    }

    /// Returns the ID of the unknown state for this lazy DFA.
    fn unknown_id(&self) -> LazyStateID {
        // This unwrap is OK since 0 is always a valid state ID.
        LazyStateID::new(0).unwrap().to_unknown()
    }

    /// Returns the ID of the dead state for this lazy DFA.
    fn dead_id(&self) -> LazyStateID {
        // This unwrap is OK since the maximum value here is 1 * 512 = 512,
        // which is <= 2047 (the maximum state ID on 16-bit systems). Where
        // 512 is the worst case for our equivalence classes (every byte is a
        // distinct class).
        LazyStateID::new(1 << self.dfa.stride2()).unwrap().to_dead()
    }

    /// Returns the ID of the quit state for this lazy DFA.
    fn quit_id(&self) -> LazyStateID {
        // This unwrap is OK since the maximum value here is 2 * 512 = 1024,
        // which is <= 2047 (the maximum state ID on 16-bit systems). Where
        // 512 is the worst case for our equivalence classes (every byte is a
        // distinct class).
        LazyStateID::new(2 << self.dfa.stride2()).unwrap().to_quit()
    }

    /// Returns true if and only if the given ID is valid.
    ///
    /// An ID is valid if it is both a valid index into the transition table
    /// and is a multiple of the DFA's stride.
    fn is_valid(&self, id: LazyStateID) -> bool {
        let id = id.as_usize_untagged();
        id < self.cache.trans.len() && id % self.dfa.stride() == 0
    }

    /// Returns true if adding the state given would fit in this cache.
    fn state_fits_in_cache(&self, state: &State) -> bool {
        let needed = self.cache.memory_usage()
            + self.memory_usage_for_one_more_state(state.memory_usage());
        needed <= self.dfa.cache_capacity
    }

    /// Returns true if adding the state to be built by the given builder would
    /// fit in this cache.
    fn state_builder_fits_in_cache(&self, state: &StateBuilderNFA) -> bool {
        let needed = self.cache.memory_usage()
            + self.memory_usage_for_one_more_state(state.as_bytes().len());
        needed <= self.dfa.cache_capacity
    }

    /// Returns the additional memory usage, in bytes, required to add one more
    /// state to this cache. The given size should be the heap size, in bytes,
    /// that would be used by the new state being added.
    fn memory_usage_for_one_more_state(
        &self,
        state_heap_size: usize,
    ) -> usize {
        const ID_SIZE: usize = size_of::<LazyStateID>();
        const STATE_SIZE: usize = size_of::<State>();

        self.dfa.stride() * ID_SIZE // additional space needed in trans table
        + STATE_SIZE // space in cache.states
        + (STATE_SIZE + ID_SIZE) // space in cache.states_to_id
        + state_heap_size // heap memory used by state itself
    }
}

/// A simple type that encapsulates the saving of a state ID through a cache
/// clearing.
///
/// A state ID can be marked for saving with ToSave, while a state ID can be
/// saved itself with Saved.
#[derive(Clone, Debug)]
enum StateSaver {
    /// An empty state saver. In this case, no states (other than the special
    /// sentinel states) are preserved after clearing the cache.
    None,
    /// An ID of a state (and the state itself) that should be preserved after
    /// the lazy DFA's cache has been cleared. After clearing, the updated ID
    /// is stored in 'Saved' since it may have changed.
    ToSave { id: LazyStateID, state: State },
    /// An ID of a state that has been persisted through a lazy DFA
    /// cache clearing. The ID recorded here corresponds to an ID that was
    /// once marked as ToSave. The IDs are likely not equivalent even though
    /// the states they point to are.
    Saved(LazyStateID),
}

impl StateSaver {
    /// Create an empty state saver.
    fn none() -> StateSaver {
        StateSaver::None
    }

    /// Replace this state saver with an empty saver, and if this saver is a
    /// request to save a state, return that request.
    fn take_to_save(&mut self) -> Option<(LazyStateID, State)> {
        match core::mem::replace(self, StateSaver::None) {
            StateSaver::None | StateSaver::Saved(_) => None,
            StateSaver::ToSave { id, state } => Some((id, state)),
        }
    }

    /// Replace this state saver with an empty saver, and if this saver is a
    /// saved state (or a request to save a state), return that state's ID.
    ///
    /// The idea here is that a request to save a state isn't necessarily
    /// honored because it might not be needed. e.g., Some higher level code
    /// might request a state to be saved on the off chance that the cache gets
    /// cleared when a new state is added at a lower level. But if that new
    /// state is never added, then the cache is never cleared and the state and
    /// its ID remain unchanged.
    fn take_saved(&mut self) -> Option<LazyStateID> {
        match core::mem::replace(self, StateSaver::None) {
            StateSaver::None => None,
            StateSaver::Saved(id) | StateSaver::ToSave { id, .. } => Some(id),
        }
    }
}

/// The configuration used for building a lazy DFA.
///
/// As a convenience, [`DFA::config`] is an alias for [`Config::new`]. The
/// advantage of the former is that it often lets you avoid importing the
/// `Config` type directly.
///
/// A lazy DFA configuration is a simple data object that is typically used
/// with [`Builder::configure`].
///
/// The default configuration guarantees that a search will _never_ return
/// a [`MatchError`] for any haystack or pattern. Setting a quit byte with
/// [`Config::quit`], enabling heuristic support for Unicode word boundaries
/// with [`Config::unicode_word_boundary`], or setting a minimum cache clear
/// count with [`Config::minimum_cache_clear_count`] can in turn cause a search
/// to return an error. See the corresponding configuration options for more
/// details on when those error conditions arise.
#[derive(Clone, Copy, Debug, Default)]
pub struct Config {
    // As with other configuration types in this crate, we put all our knobs
    // in options so that we can distinguish between "default" and "not set."
    // This makes it possible to easily combine multiple configurations
    // without default values overwriting explicitly specified values. See the
    // 'overwrite' method.
    //
    // For docs on the fields below, see the corresponding method setters.
    anchored: Option<bool>,
    match_kind: Option<MatchKind>,
    starts_for_each_pattern: Option<bool>,
    byte_classes: Option<bool>,
    unicode_word_boundary: Option<bool>,
    quitset: Option<ByteSet>,
    cache_capacity: Option<usize>,
    skip_cache_capacity_check: Option<bool>,
    minimum_cache_clear_count: Option<Option<usize>>,
}

impl Config {
    /// Return a new default lazy DFA builder configuration.
    pub fn new() -> Config {
        Config::default()
    }

    /// Set whether matching must be anchored at the beginning of the input.
    ///
    /// When enabled, a match must begin at the start of a search. When
    /// disabled (the default), the lazy DFA will act as if the pattern started
    /// with a `(?s:.)*?`, which enables a match to appear anywhere.
    ///
    /// Note that if you want to run both anchored and unanchored
    /// searches without building multiple automatons, you can enable the
    /// [`Config::starts_for_each_pattern`] configuration instead. This will
    /// permit unanchored any-pattern searches and pattern-specific anchored
    /// searches. See the documentation for that configuration for an example.
    ///
    /// By default this is disabled.
    ///
    /// **WARNING:** this is subtly different than using a `^` at the start of
    /// your regex. A `^` forces a regex to match exclusively at the start of
    /// input, regardless of where you begin your search. In contrast, enabling
    /// this option will allow your regex to match anywhere in your input,
    /// but the match must start at the beginning of a search. (Most of the
    /// higher level convenience search routines make "start of input" and
    /// "start of search" equivalent, but some routines allow treating these as
    /// orthogonal.)
    ///
    /// For example, consider the haystack `aba` and the following searches:
    ///
    /// 1. The regex `^a` is compiled with `anchored=false` and searches
    ///    `aba` starting at position `2`. Since `^` requires the match to
    ///    start at the beginning of the input and `2 > 0`, no match is found.
    /// 2. The regex `a` is compiled with `anchored=true` and searches `aba`
    ///    starting at position `2`. This reports a match at `[2, 3]` since
    ///    the match starts where the search started. Since there is no `^`,
    ///    there is no requirement for the match to start at the beginning of
    ///    the input.
    /// 3. The regex `a` is compiled with `anchored=true` and searches `aba`
    ///    starting at position `1`. Since `b` corresponds to position `1` and
    ///    since the regex is anchored, it finds no match.
    /// 4. The regex `a` is compiled with `anchored=false` and searches `aba`
    ///    starting at position `1`. Since the regex is neither anchored nor
    ///    starts with `^`, the regex is compiled with an implicit `(?s:.)*?`
    ///    prefix that permits it to match anywhere. Thus, it reports a match
    ///    at `[2, 3]`.
    ///
    /// # Example
    ///
    /// This demonstrates the differences between an anchored search and
    /// a pattern that begins with `^` (as described in the above warning
    /// message).
    ///
    /// ```
    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
    ///
    /// let haystack = "aba".as_bytes();
    ///
    /// let dfa = DFA::builder()
    ///     .configure(DFA::config().anchored(false)) // default
    ///     .build(r"^a")?;
    /// let mut cache = dfa.create_cache();
    /// let got = dfa.find_leftmost_fwd_at(
    ///     &mut cache, None, None, haystack, 2, 3,
    /// )?;
    /// // No match is found because 2 is not the beginning of the haystack,
    /// // which is what ^ requires.
    /// let expected = None;
    /// assert_eq!(expected, got);
    ///
    /// let dfa = DFA::builder()
    ///     .configure(DFA::config().anchored(true))
    ///     .build(r"a")?;
    /// let mut cache = dfa.create_cache();
    /// let got = dfa.find_leftmost_fwd_at(
    ///     &mut cache, None, None, haystack, 2, 3,
    /// )?;
    /// // An anchored search can still match anywhere in the haystack, it just
    /// // must begin at the start of the search which is '2' in this case.
    /// let expected = Some(HalfMatch::must(0, 3));
    /// assert_eq!(expected, got);
    ///
    /// let dfa = DFA::builder()
    ///     .configure(DFA::config().anchored(true))
    ///     .build(r"a")?;
    /// let mut cache = dfa.create_cache();
    /// let got = dfa.find_leftmost_fwd_at(
    ///     &mut cache, None, None, haystack, 1, 3,
    /// )?;
    /// // No match is found since we start searching at offset 1 which
    /// // corresponds to 'b'. Since there is no '(?s:.)*?' prefix, no match
    /// // is found.
    /// let expected = None;
    /// assert_eq!(expected, got);
    ///
    /// let dfa = DFA::builder()
    ///     .configure(DFA::config().anchored(false))
    ///     .build(r"a")?;
    /// let mut cache = dfa.create_cache();
    /// let got = dfa.find_leftmost_fwd_at(
    ///     &mut cache, None, None, haystack, 1, 3,
    /// )?;
    /// // Since anchored=false, an implicit '(?s:.)*?' prefix was added to the
    /// // pattern. Even though the search starts at 'b', the 'match anything'
    /// // prefix allows the search to match 'a'.
    /// let expected = Some(HalfMatch::must(0, 3));
    /// assert_eq!(expected, got);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn anchored(mut self, yes: bool) -> Config {
        self.anchored = Some(yes);
        self
    }

    /// Set the desired match semantics.
    ///
    /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the
    /// match semantics of Perl-like regex engines. That is, when multiple
    /// patterns would match at the same leftmost position, the pattern that
    /// appears first in the concrete syntax is chosen.
    ///
    /// Currently, the only other kind of match semantics supported is
    /// [`MatchKind::All`]. This corresponds to classical DFA construction
    /// where all possible matches are added to the lazy DFA.
    ///
    /// Typically, `All` is used when one wants to execute an overlapping
    /// search and `LeftmostFirst` otherwise. In particular, it rarely makes
    /// sense to use `All` with the various "leftmost" find routines, since the
    /// leftmost routines depend on the `LeftmostFirst` automata construction
    /// strategy. Specifically, `LeftmostFirst` adds dead states to the
    /// lazy DFA as a way to terminate the search and report a match.
    /// `LeftmostFirst` also supports non-greedy matches using this strategy
    /// whereas `All` does not.
    ///
    /// # Example: overlapping search
    ///
    /// This example shows the typical use of `MatchKind::All`, which is to
    /// report overlapping matches.
    ///
    /// ```
    /// use regex_automata::{
    ///     hybrid::{dfa::DFA, OverlappingState},
    ///     HalfMatch, MatchKind,
    /// };
    ///
    /// let dfa = DFA::builder()
    ///     .configure(DFA::config().match_kind(MatchKind::All))
    ///     .build_many(&[r"\w+$", r"\S+$"])?;
    /// let mut cache = dfa.create_cache();
    /// let haystack = "@foo".as_bytes();
    /// let mut state = OverlappingState::start();
    ///
    /// let expected = Some(HalfMatch::must(1, 4));
    /// let got = dfa.find_overlapping_fwd(&mut cache, haystack, &mut state)?;
    /// assert_eq!(expected, got);
    ///
    /// // The first pattern also matches at the same position, so re-running
    /// // the search will yield another match. Notice also that the first
    /// // pattern is returned after the second. This is because the second
    /// // pattern begins its match before the first, is therefore an earlier
    /// // match and is thus reported first.
    /// let expected = Some(HalfMatch::must(0, 4));
    /// let got = dfa.find_overlapping_fwd(&mut cache, haystack, &mut state)?;
    /// assert_eq!(expected, got);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// # Example: reverse automaton to find start of match
    ///
    /// Another example for using `MatchKind::All` is for constructing a
    /// reverse automaton to find the start of a match. `All` semantics are
    /// used for this in order to find the longest possible match, which
    /// corresponds to the leftmost starting position.
    ///
    /// Note that if you need the starting position then
    /// [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) will handle this
    /// for you, so it's usually not necessary to do this yourself.
    ///
    /// ```
    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchKind};
    ///
    /// let haystack = "123foobar456".as_bytes();
    /// let pattern = r"[a-z]+";
    ///
    /// let dfa_fwd = DFA::new(pattern)?;
    /// let dfa_rev = DFA::builder()
    ///     .configure(DFA::config()
    ///         .anchored(true)
    ///         .match_kind(MatchKind::All)
    ///     )
    ///     .build(pattern)?;
    /// let mut cache_fwd = dfa_fwd.create_cache();
    /// let mut cache_rev = dfa_rev.create_cache();
    ///
    /// let expected_fwd = HalfMatch::must(0, 9);
    /// let expected_rev = HalfMatch::must(0, 3);
    /// let got_fwd = dfa_fwd.find_leftmost_fwd(
    ///     &mut cache_fwd, haystack,
    /// )?.unwrap();
    /// // Here we don't specify the pattern to search for since there's only
    /// // one pattern and we're doing a leftmost search. But if this were an
    /// // overlapping search, you'd need to specify the pattern that matched
    /// // in the forward direction. (Otherwise, you might wind up finding the
    /// // starting position of a match of some other pattern.) That in turn
    /// // requires building the reverse automaton with starts_for_each_pattern
    /// // enabled. Indeed, this is what Regex does internally.
    /// let got_rev = dfa_rev.find_leftmost_rev_at(
    ///     &mut cache_rev, None, haystack, 0, got_fwd.offset(),
    /// )?.unwrap();
    /// assert_eq!(expected_fwd, got_fwd);
    /// assert_eq!(expected_rev, got_rev);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn match_kind(mut self, kind: MatchKind) -> Config {
        self.match_kind = Some(kind);
        self
    }

    /// Whether to compile a separate start state for each pattern in the
    /// lazy DFA.
    ///
    /// When enabled, a separate **anchored** start state is added for each
    /// pattern in the lazy DFA. When this start state is used, then the DFA
    /// will only search for matches for the pattern specified, even if there
    /// are other patterns in the DFA.
    ///
    /// The main downside of this option is that it can potentially increase
    /// the size of the DFA and/or increase the time it takes to build the
    /// DFA at search time. However, since this is configuration for a lazy
    /// DFA, these states aren't actually built unless they're used. Enabling
    /// this isn't necessarily free, however, as it may result in higher cache
    /// usage.
    ///
    /// There are a few reasons one might want to enable this (it's disabled
    /// by default):
    ///
    /// 1. When looking for the start of an overlapping match (using a reverse
    /// DFA), doing it correctly requires starting the reverse search using the
    /// starting state of the pattern that matched in the forward direction.
    /// Indeed, when building a [`Regex`](crate::hybrid::regex::Regex), it
    /// will automatically enable this option when building the reverse DFA
    /// internally.
    /// 2. When you want to use a DFA with multiple patterns to both search
    /// for matches of any pattern or to search for anchored matches of one
    /// particular pattern while using the same DFA. (Otherwise, you would need
    /// to compile a new DFA for each pattern.)
    /// 3. Since the start states added for each pattern are anchored, if you
    /// compile an unanchored DFA with one pattern while also enabling this
    /// option, then you can use the same DFA to perform anchored or unanchored
    /// searches. The latter you get with the standard search APIs. The former
    /// you get from the various `_at` search methods that allow you to specify
    /// a pattern ID to search for.
    ///
    /// By default this is disabled.
    ///
    /// # Example
    ///
    /// This example shows how to use this option to permit the same lazy DFA
    /// to run both anchored and unanchored searches for a single pattern.
    ///
    /// ```
    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, PatternID};
    ///
    /// let dfa = DFA::builder()
    ///     .configure(DFA::config().starts_for_each_pattern(true))
    ///     .build(r"foo[0-9]+")?;
    /// let mut cache = dfa.create_cache();
    /// let haystack = b"quux foo123";
    ///
    /// // Here's a normal unanchored search. Notice that we use 'None' for the
    /// // pattern ID. Since the DFA was built as an unanchored machine, it
    /// // uses its default unanchored starting state.
    /// let expected = HalfMatch::must(0, 11);
    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd_at(
    ///     &mut cache, None, None, haystack, 0, haystack.len(),
    /// )?);
    /// // But now if we explicitly specify the pattern to search ('0' being
    /// // the only pattern in the DFA), then it will use the starting state
    /// // for that specific pattern which is always anchored. Since the
    /// // pattern doesn't have a match at the beginning of the haystack, we
    /// // find nothing.
    /// assert_eq!(None, dfa.find_leftmost_fwd_at(
    ///     &mut cache, None, Some(PatternID::must(0)), haystack, 0, haystack.len(),
    /// )?);
    /// // And finally, an anchored search is not the same as putting a '^' at
    /// // beginning of the pattern. An anchored search can only match at the
    /// // beginning of the *search*, which we can change:
    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd_at(
    ///     &mut cache, None, Some(PatternID::must(0)), haystack, 5, haystack.len(),
    /// )?);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn starts_for_each_pattern(mut self, yes: bool) -> Config {
        self.starts_for_each_pattern = Some(yes);
        self
    }

    /// Whether to attempt to shrink the size of the lazy DFA's alphabet or
    /// not.
    ///
    /// This option is enabled by default and should never be disabled unless
    /// one is debugging the lazy DFA.
    ///
    /// When enabled, the lazy DFA will use a map from all possible bytes
    /// to their corresponding equivalence class. Each equivalence class
    /// represents a set of bytes that does not discriminate between a match
    /// and a non-match in the DFA. For example, the pattern `[ab]+` has at
    /// least two equivalence classes: a set containing `a` and `b` and a set
    /// containing every byte except for `a` and `b`. `a` and `b` are in the
    /// same equivalence classes because they never discriminate between a
    /// match and a non-match.
    ///
    /// The advantage of this map is that the size of the transition table
    /// can be reduced drastically from `#states * 256 * sizeof(LazyStateID)`
    /// to `#states * k * sizeof(LazyStateID)` where `k` is the number of
    /// equivalence classes (rounded up to the nearest power of 2). As a
    /// result, total space usage can decrease substantially. Moreover, since a
    /// smaller alphabet is used, DFA compilation during search becomes faster
    /// as well since it will potentially be able to reuse a single transition
    /// for multiple bytes.
    ///
    /// **WARNING:** This is only useful for debugging lazy DFAs. Disabling
    /// this does not yield any speed advantages. Namely, even when this is
    /// disabled, a byte class map is still used while searching. The only
    /// difference is that every byte will be forced into its own distinct
    /// equivalence class. This is useful for debugging the actual generated
    /// transitions because it lets one see the transitions defined on actual
    /// bytes instead of the equivalence classes.
    pub fn byte_classes(mut self, yes: bool) -> Config {
        self.byte_classes = Some(yes);
        self
    }

    /// Heuristically enable Unicode word boundaries.
    ///
    /// When set, this will attempt to implement Unicode word boundaries as if
    /// they were ASCII word boundaries. This only works when the search input
    /// is ASCII only. If a non-ASCII byte is observed while searching, then a
    /// [`MatchError::Quit`](crate::MatchError::Quit) error is returned.
    ///
    /// A possible alternative to enabling this option is to simply use an
    /// ASCII word boundary, e.g., via `(?-u:\b)`. The main reason to use this
    /// option is if you absolutely need Unicode support. This option lets one
    /// use a fast search implementation (a DFA) for some potentially very
    /// common cases, while providing the option to fall back to some other
    /// regex engine to handle the general case when an error is returned.
    ///
    /// If the pattern provided has no Unicode word boundary in it, then this
    /// option has no effect. (That is, quitting on a non-ASCII byte only
    /// occurs when this option is enabled _and_ a Unicode word boundary is
    /// present in the pattern.)
    ///
    /// This is almost equivalent to setting all non-ASCII bytes to be quit
    /// bytes. The only difference is that this will cause non-ASCII bytes to
    /// be quit bytes _only_ when a Unicode word boundary is present in the
    /// pattern.
    ///
    /// When enabling this option, callers _must_ be prepared to handle
    /// a [`MatchError`](crate::MatchError) error during search.
    /// When using a [`Regex`](crate::hybrid::regex::Regex), this
    /// corresponds to using the `try_` suite of methods. Alternatively,
    /// if callers can guarantee that their input is ASCII only, then a
    /// [`MatchError::Quit`](crate::MatchError::Quit) error will never be
    /// returned while searching.
    ///
    /// This is disabled by default.
    ///
    /// # Example
    ///
    /// This example shows how to heuristically enable Unicode word boundaries
    /// in a pattern. It also shows what happens when a search comes across a
    /// non-ASCII byte.
    ///
    /// ```
    /// use regex_automata::{
    ///     hybrid::dfa::DFA,
    ///     HalfMatch, MatchError, MatchKind,
    /// };
    ///
    /// let dfa = DFA::builder()
    ///     .configure(DFA::config().unicode_word_boundary(true))
    ///     .build(r"\b[0-9]+\b")?;
    /// let mut cache = dfa.create_cache();
    ///
    /// // The match occurs before the search ever observes the snowman
    /// // character, so no error occurs.
    /// let haystack = "foo 123 ☃".as_bytes();
    /// let expected = Some(HalfMatch::must(0, 7));
    /// let got = dfa.find_leftmost_fwd(&mut cache, haystack)?;
    /// assert_eq!(expected, got);
    ///
    /// // Notice that this search fails, even though the snowman character
    /// // occurs after the ending match offset. This is because search
    /// // routines read one byte past the end of the search to account for
    /// // look-around, and indeed, this is required here to determine whether
    /// // the trailing \b matches.
    /// let haystack = "foo 123☃".as_bytes();
    /// let expected = MatchError::Quit { byte: 0xE2, offset: 7 };
    /// let got = dfa.find_leftmost_fwd(&mut cache, haystack);
    /// assert_eq!(Err(expected), got);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn unicode_word_boundary(mut self, yes: bool) -> Config {
        // We have a separate option for this instead of just setting the
        // appropriate quit bytes here because we don't want to set quit bytes
        // for every regex. We only want to set them when the regex contains a
        // Unicode word boundary.
        self.unicode_word_boundary = Some(yes);
        self
    }

    /// Add a "quit" byte to the lazy DFA.
    ///
    /// When a quit byte is seen during search time, then search will return
    /// a [`MatchError::Quit`](crate::MatchError::Quit) error indicating the
    /// offset at which the search stopped.
    ///
    /// A quit byte will always overrule any other aspects of a regex. For
    /// example, if the `x` byte is added as a quit byte and the regex `\w` is
    /// used, then observing `x` will cause the search to quit immediately
    /// despite the fact that `x` is in the `\w` class.
    ///
    /// This mechanism is primarily useful for heuristically enabling certain
    /// features like Unicode word boundaries in a DFA. Namely, if the input
    /// to search is ASCII, then a Unicode word boundary can be implemented
    /// via an ASCII word boundary with no change in semantics. Thus, a DFA
    /// can attempt to match a Unicode word boundary but give up as soon as it
    /// observes a non-ASCII byte. Indeed, if callers set all non-ASCII bytes
    /// to be quit bytes, then Unicode word boundaries will be permitted when
    /// building lazy DFAs. Of course, callers should enable
    /// [`Config::unicode_word_boundary`] if they want this behavior instead.
    /// (The advantage being that non-ASCII quit bytes will only be added if a
    /// Unicode word boundary is in the pattern.)
    ///
    /// When enabling this option, callers _must_ be prepared to handle a
    /// [`MatchError`](crate::MatchError) error during search. When using a
    /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the
    /// `try_` suite of methods.
    ///
    /// By default, there are no quit bytes set.
    ///
    /// # Panics
    ///
    /// This panics if heuristic Unicode word boundaries are enabled and any
    /// non-ASCII byte is removed from the set of quit bytes. Namely, enabling
    /// Unicode word boundaries requires setting every non-ASCII byte to a quit
    /// byte. So if the caller attempts to undo any of that, then this will
    /// panic.
    ///
    /// # Example
    ///
    /// This example shows how to cause a search to terminate if it sees a
    /// `\n` byte. This could be useful if, for example, you wanted to prevent
    /// a user supplied pattern from matching across a line boundary.
    ///
    /// ```
    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchError};
    ///
    /// let dfa = DFA::builder()
    ///     .configure(DFA::config().quit(b'\n', true))
    ///     .build(r"foo\p{any}+bar")?;
    /// let mut cache = dfa.create_cache();
    ///
    /// let haystack = "foo\nbar".as_bytes();
    /// // Normally this would produce a match, since \p{any} contains '\n'.
    /// // But since we instructed the automaton to enter a quit state if a
    /// // '\n' is observed, this produces a match error instead.
    /// let expected = MatchError::Quit { byte: 0x0A, offset: 3 };
    /// let got = dfa.find_leftmost_fwd(&mut cache, haystack).unwrap_err();
    /// assert_eq!(expected, got);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn quit(mut self, byte: u8, yes: bool) -> Config {
        // Removing a non-ASCII quit byte would silently break the heuristic
        // Unicode word boundary support, so panic instead (see # Panics).
        if self.get_unicode_word_boundary() && !byte.is_ascii() && !yes {
            panic!(
                "cannot set non-ASCII byte to be non-quit when \
                 Unicode word boundaries are enabled"
            );
        }
        if self.quitset.is_none() {
            self.quitset = Some(ByteSet::empty());
        }
        if yes {
            self.quitset.as_mut().unwrap().add(byte);
        } else {
            self.quitset.as_mut().unwrap().remove(byte);
        }
        self
    }

    /// Sets the maximum amount of heap memory, in bytes, to allocate to the
    /// cache for use during a lazy DFA search. If the lazy DFA would otherwise
    /// use more heap memory, then, depending on other configuration knobs,
    /// either stop the search and return an error or clear the cache and
    /// continue the search.
    ///
    /// The default cache capacity is some "reasonable" number that will
    /// accommodate most regular expressions. You may find that if you need
    /// to build a large DFA then it may be necessary to increase the cache
    /// capacity.
    ///
    /// Note that while building a lazy DFA will do a "minimum" check to ensure
    /// the capacity is big enough, this is more or less about correctness.
    /// If the cache is bigger than the minimum but still too small, then the
    /// lazy DFA could wind up spending a lot of time clearing the cache and
    /// recomputing transitions, thus negating the performance benefits of a
    /// lazy DFA. Thus, setting the cache capacity is mostly an experimental
    /// endeavor. For most common patterns, however, the default should be
    /// sufficient.
    ///
    /// For more details on how the lazy DFA's cache is used, see the
    /// documentation for [`Cache`].
    ///
    /// # Example
    ///
    /// This example shows what happens if the configured cache capacity is
    /// too small. In such cases, one can override the cache capacity to make
    /// it bigger. Alternatively, one might want to use less memory by setting
    /// a smaller cache capacity.
    ///
    /// ```
    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchError};
    ///
    /// let pattern = r"\p{L}{1000}";
    ///
    /// // The default cache capacity is likely too small to deal with regexes
    /// // that are very large. Large repetitions of large Unicode character
    /// // classes are a common way to make very large regexes.
    /// let _ = DFA::new(pattern).unwrap_err();
    /// // Bump up the capacity to something bigger.
    /// let dfa = DFA::builder()
    ///     .configure(DFA::config().cache_capacity(100 * (1<<20))) // 100 MB
    ///     .build(pattern)?;
    /// let mut cache = dfa.create_cache();
    ///
    /// let haystack = "ͰͲͶͿΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙ".repeat(50);
    /// let expected = Some(HalfMatch::must(0, 2000));
    /// let got = dfa.find_leftmost_fwd(&mut cache, haystack.as_bytes())?;
    /// assert_eq!(expected, got);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn cache_capacity(mut self, bytes: usize) -> Config {
        self.cache_capacity = Some(bytes);
        self
    }

    /// Configures construction of a lazy DFA to use the minimum cache capacity
    /// if the configured capacity is otherwise too small for the provided NFA.
    ///
    /// This is useful if you never want lazy DFA construction to fail because
    /// of a capacity that is too small.
    ///
    /// In general, this option is typically not a good idea. In particular,
    /// while a minimum cache capacity does permit the lazy DFA to function
    /// where it otherwise couldn't, it's plausible that it may not function
    /// well if it's constantly running out of room. In that case, the speed
    /// advantages of the lazy DFA may be negated.
    ///
    /// This is disabled by default.
    ///
    /// # Example
    ///
    /// This example shows what happens if the configured cache capacity is
    /// too small. In such cases, one could override the capacity explicitly.
    /// An alternative, demonstrated here, lets us force construction to use
    /// the minimum cache capacity if the configured capacity is otherwise
    /// too small.
    ///
    /// ```
    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchError};
    ///
    /// let pattern = r"\p{L}{1000}";
    ///
    /// // The default cache capacity is likely too small to deal with regexes
    /// // that are very large. Large repetitions of large Unicode character
    /// // classes are a common way to make very large regexes.
+ /// let _ = DFA::new(pattern).unwrap_err(); + /// // Configure construction such it automatically selects the minimum + /// // cache capacity if it would otherwise be too small. + /// let dfa = DFA::builder() + /// .configure(DFA::config().skip_cache_capacity_check(true)) + /// .build(pattern)?; + /// let mut cache = dfa.create_cache(); + /// + /// let haystack = "ͰͲͶͿΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙ".repeat(50); + /// let expected = Some(HalfMatch::must(0, 2000)); + /// let got = dfa.find_leftmost_fwd(&mut cache, haystack.as_bytes())?; + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn skip_cache_capacity_check(mut self, yes: bool) -> Config { + self.skip_cache_capacity_check = Some(yes); + self + } + + /// Configure a lazy DFA search to quit after a certain number of cache + /// clearings. + /// + /// When a minimum is set, then a lazy DFA search will "give up" after + /// the minimum number of cache clearings has occurred. This is typically + /// useful in scenarios where callers want to detect whether the lazy DFA + /// search is "efficient" or not. If the cache is cleared too many times, + /// this is a good indicator that it is not efficient, and thus, the caller + /// may wish to use some other regex engine. + /// + /// Note that the number of times a cache is cleared is a property of + /// the cache itself. Thus, if a cache is used in a subsequent search + /// with a similarly configured lazy DFA, then it would cause the + /// search to "give up" if the cache needed to be cleared. The cache + /// clear count can only be reset to `0` via [`DFA::reset_cache`] (or + /// [`Regex::reset_cache`](crate::hybrid::regex::Regex::reset_cache) if + /// you're using the `Regex` API). + /// + /// By default, no minimum is configured. Thus, a lazy DFA search will + /// never give up due to cache clearings. 
+ /// + /// # Example + /// + /// This example uses a somewhat pathological configuration to demonstrate + /// the _possible_ behavior of cache clearing and how it might result + /// in a search that returns an error. + /// + /// It is important to note that the precise mechanics of how and when + /// a cache gets cleared is an implementation detail. Thus, the asserts + /// in the tests below with respect to the particular offsets at which a + /// search gave up should be viewed strictly as a demonstration. They are + /// not part of any API guarantees offered by this crate. + /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, MatchError}; + /// + /// // This is a carefully chosen regex. The idea is to pick one + /// // that requires some decent number of states (hence the bounded + /// // repetition). But we specifically choose to create a class with an + /// // ASCII letter and a non-ASCII letter so that we can check that no new + /// // states are created once the cache is full. Namely, if we fill up the + /// // cache on a haystack of 'a's, then in order to match one 'β', a new + /// // state will need to be created since a 'β' is encoded with multiple + /// // bytes. Since there's no room for this state, the search should quit + /// // at the very first position. + /// let pattern = r"[aβ]{100}"; + /// let dfa = DFA::builder() + /// .configure( + /// // Configure it so that we have the minimum cache capacity + /// // possible. And that if any clearings occur, the search quits. 
+ /// DFA::config() + /// .skip_cache_capacity_check(true) + /// .cache_capacity(0) + /// .minimum_cache_clear_count(Some(0)), + /// ) + /// .build(pattern)?; + /// let mut cache = dfa.create_cache(); + /// + /// let haystack = "a".repeat(101).into_bytes(); + /// assert_eq!( + /// dfa.find_leftmost_fwd(&mut cache, &haystack), + /// Err(MatchError::GaveUp { offset: 25 }), + /// ); + /// + /// // Now that we know the cache is full, if we search a haystack that we + /// // know will require creating at least one new state, it should not + /// // be able to make any progress. + /// let haystack = "β".repeat(101).into_bytes(); + /// assert_eq!( + /// dfa.find_leftmost_fwd(&mut cache, &haystack), + /// Err(MatchError::GaveUp { offset: 0 }), + /// ); + /// + /// // If we reset the cache, then we should be able to create more states + /// // and make more progress with searching for betas. + /// cache.reset(&dfa); + /// let haystack = "β".repeat(101).into_bytes(); + /// assert_eq!( + /// dfa.find_earliest_fwd(&mut cache, &haystack), + /// Err(MatchError::GaveUp { offset: 26 }), + /// ); + /// + /// // ... switching back to ASCII still makes progress since it just needs + /// // to set transitions on existing states! + /// let haystack = "a".repeat(101).into_bytes(); + /// assert_eq!( + /// dfa.find_earliest_fwd(&mut cache, &haystack), + /// Err(MatchError::GaveUp { offset: 13 }), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn minimum_cache_clear_count(mut self, min: Option<usize>) -> Config { + self.minimum_cache_clear_count = Some(min); + self + } + + /// Returns whether this configuration has enabled anchored searches. + pub fn get_anchored(&self) -> bool { + self.anchored.unwrap_or(false) + } + + /// Returns the match semantics set in this configuration. 
+ pub fn get_match_kind(&self) -> MatchKind { + self.match_kind.unwrap_or(MatchKind::LeftmostFirst) + } + + /// Returns whether this configuration has enabled anchored starting states + /// for every pattern in the DFA. + pub fn get_starts_for_each_pattern(&self) -> bool { + self.starts_for_each_pattern.unwrap_or(false) + } + + /// Returns whether this configuration has enabled byte classes or not. + /// This is typically a debugging oriented option, as disabling it confers + /// no speed benefit. + pub fn get_byte_classes(&self) -> bool { + self.byte_classes.unwrap_or(true) + } + + /// Returns whether this configuration has enabled heuristic Unicode word + /// boundary support. When enabled, it is possible for a search to return + /// an error. + pub fn get_unicode_word_boundary(&self) -> bool { + self.unicode_word_boundary.unwrap_or(false) + } + + /// Returns whether this configuration will instruct the DFA to enter a + /// quit state whenever the given byte is seen during a search. When at + /// least one byte has this enabled, it is possible for a search to return + /// an error. + pub fn get_quit(&self, byte: u8) -> bool { + self.quitset.map_or(false, |q| q.contains(byte)) + } + + /// Returns the cache capacity set on this configuration. + pub fn get_cache_capacity(&self) -> usize { + self.cache_capacity.unwrap_or(2 * (1 << 20)) + } + + /// Returns whether the cache capacity check should be skipped. + pub fn get_skip_cache_capacity_check(&self) -> bool { + self.skip_cache_capacity_check.unwrap_or(false) + } + + /// Returns, if set, the minimum number of times the cache must be cleared + /// before a lazy DFA search can give up. When no minimum is set, then a + /// search will never quit and will always clear the cache whenever it + /// fills up. + pub fn get_minimum_cache_clear_count(&self) -> Option<usize> { + self.minimum_cache_clear_count.unwrap_or(None) + } + + /// Returns the minimum lazy DFA cache capacity required for the given NFA. 
+ /// + /// The cache capacity required for a particular NFA may change without + /// notice. Callers should not rely on it being stable. + /// + /// This is useful for informational purposes, but can also be useful for + /// other reasons. For example, if one wants to check the minimum cache + /// capacity themselves or if one wants to set the capacity based on the + /// minimum. + /// + /// This may return an error if this configuration does not support all of + /// the instructions used in the given NFA. For example, if the NFA has a + /// Unicode word boundary but this configuration does not enable heuristic + /// support for Unicode word boundaries. + pub fn get_minimum_cache_capacity( + &self, + nfa: &thompson::NFA, + ) -> Result<usize, BuildError> { + let quitset = self.quit_set_from_nfa(nfa)?; + let classes = self.byte_classes_from_nfa(nfa, &quitset); + let starts = self.get_starts_for_each_pattern(); + Ok(minimum_cache_capacity(nfa, &classes, starts)) + } + + /// Returns the byte class map used during search from the given NFA. + /// + /// If byte classes are disabled on this configuration, then a map is + /// returned that puts each byte in its own equivalent class. + fn byte_classes_from_nfa( + &self, + nfa: &thompson::NFA, + quit: &ByteSet, + ) -> ByteClasses { + if !self.get_byte_classes() { + // The lazy DFA will always use the equivalence class map, but + // enabling this option is useful for debugging. Namely, this will + // cause all transitions to be defined over their actual bytes + // instead of an opaque equivalence class identifier. The former is + // much easier to grok as a human. + ByteClasses::singletons() + } else { + let mut set = nfa.byte_class_set().clone(); + // It is important to distinguish any "quit" bytes from all other + // bytes. Otherwise, a non-quit byte may end up in the same class + // as a quit byte, and thus cause the DFA stop when it shouldn't. 
+ if !quit.is_empty() { + set.add_set(&quit); + } + set.byte_classes() + } + } + + /// Return the quit set for this configuration and the given NFA. + /// + /// This may return an error if the NFA is incompatible with this + /// configuration's quit set. For example, if the NFA has a Unicode word + /// boundary and the quit set doesn't include non-ASCII bytes. + fn quit_set_from_nfa( + &self, + nfa: &thompson::NFA, + ) -> Result<ByteSet, BuildError> { + let mut quit = self.quitset.unwrap_or(ByteSet::empty()); + if nfa.has_word_boundary_unicode() { + if self.get_unicode_word_boundary() { + for b in 0x80..=0xFF { + quit.add(b); + } + } else { + // If heuristic support for Unicode word boundaries wasn't + // enabled, then we can still check if our quit set is correct. + // If the caller set their quit bytes in a way that causes the + // DFA to quit on at least all non-ASCII bytes, then that's all + // we need for heuristic support to work. + if !quit.contains_range(0x80, 0xFF) { + return Err( + BuildError::unsupported_dfa_word_boundary_unicode(), + ); + } + } + } + Ok(quit) + } + + /// Overwrite the default configuration such that the options in `o` are + /// always used. If an option in `o` is not set, then the corresponding + /// option in `self` is used. If it's not set in `self` either, then it + /// remains not set. 
+ fn overwrite(self, o: Config) -> Config { + Config { + anchored: o.anchored.or(self.anchored), + match_kind: o.match_kind.or(self.match_kind), + starts_for_each_pattern: o + .starts_for_each_pattern + .or(self.starts_for_each_pattern), + byte_classes: o.byte_classes.or(self.byte_classes), + unicode_word_boundary: o + .unicode_word_boundary + .or(self.unicode_word_boundary), + quitset: o.quitset.or(self.quitset), + cache_capacity: o.cache_capacity.or(self.cache_capacity), + skip_cache_capacity_check: o + .skip_cache_capacity_check + .or(self.skip_cache_capacity_check), + minimum_cache_clear_count: o + .minimum_cache_clear_count + .or(self.minimum_cache_clear_count), + } + } +} + +/// A builder for constructing a lazy deterministic finite automaton from +/// regular expressions. +/// +/// As a convenience, [`DFA::builder`] is an alias for [`Builder::new`]. The +/// advantage of the former is that it often lets you avoid importing the +/// `Builder` type directly. +/// +/// This builder provides two main things: +/// +/// 1. It provides a few different `build` routines for actually constructing +/// a DFA from different kinds of inputs. The most convenient is +/// [`Builder::build`], which builds a DFA directly from a pattern string. The +/// most flexible is [`Builder::build_from_nfa`], which builds a DFA straight +/// from an NFA. +/// 2. The builder permits configuring a number of things. +/// [`Builder::configure`] is used with [`Config`] to configure aspects of +/// the DFA and the construction process itself. [`Builder::syntax`] and +/// [`Builder::thompson`] permit configuring the regex parser and Thompson NFA +/// construction, respectively. The syntax and thompson configurations only +/// apply when building from a pattern string. +/// +/// This builder always constructs a *single* lazy DFA. As such, this builder +/// can only be used to construct regexes that either detect the presence +/// of a match or find the end location of a match. 
A single DFA cannot +/// produce both the start and end of a match. For that information, use a +/// [`Regex`](crate::hybrid::regex::Regex), which can be similarly configured +/// using [`regex::Builder`](crate::hybrid::regex::Builder). The main reason +/// to use a DFA directly is if the end location of a match is enough for your +/// use case. Namely, a `Regex` will construct two lazy DFAs instead of one, +/// since a second reverse DFA is needed to find the start of a match. +/// +/// # Example +/// +/// This example shows how to build a lazy DFA that uses a tiny cache capacity +/// and completely disables Unicode. That is: +/// +/// * Things such as `\w`, `.` and `\b` are no longer Unicode-aware. `\w` +/// and `\b` are ASCII-only while `.` matches any byte except for `\n` +/// (instead of any UTF-8 encoding of a Unicode scalar value except for +/// `\n`). Things that are Unicode only, such as `\pL`, are not allowed. +/// * The pattern itself is permitted to match invalid UTF-8. For example, +/// things like `[^a]` that match any byte except for `a` are permitted. +/// * Unanchored patterns can search through invalid UTF-8. That is, for +/// unanchored patterns, the implicit prefix is `(?s-u:.)*?` instead of +/// `(?s:.)*?`. 
+/// +/// ``` +/// use regex_automata::{ +/// hybrid::dfa::DFA, +/// nfa::thompson, +/// HalfMatch, SyntaxConfig, +/// }; +/// +/// let dfa = DFA::builder() +/// .configure(DFA::config().cache_capacity(5_000)) +/// .syntax(SyntaxConfig::new().unicode(false).utf8(false)) +/// .thompson(thompson::Config::new().utf8(false)) +/// .build(r"foo[^b]ar.*")?; +/// let mut cache = dfa.create_cache(); +/// +/// let haystack = b"\xFEfoo\xFFar\xE2\x98\xFF\n"; +/// let expected = Some(HalfMatch::must(0, 10)); +/// let got = dfa.find_leftmost_fwd(&mut cache, haystack)?; +/// assert_eq!(expected, got); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct Builder { + config: Config, + thompson: thompson::Builder, +} + +impl Builder { + /// Create a new lazy DFA builder with the default configuration. + pub fn new() -> Builder { + Builder { + config: Config::default(), + thompson: thompson::Builder::new(), + } + } + + /// Build a lazy DFA from the given pattern. + /// + /// If there was a problem parsing or compiling the pattern, then an error + /// is returned. + pub fn build(&self, pattern: &str) -> Result<DFA, BuildError> { + self.build_many(&[pattern]) + } + + /// Build a lazy DFA from the given patterns. + /// + /// When matches are returned, the pattern ID corresponds to the index of + /// the pattern in the slice given. + pub fn build_many<P: AsRef<str>>( + &self, + patterns: &[P], + ) -> Result<DFA, BuildError> { + let nfa = + self.thompson.build_many(patterns).map_err(BuildError::nfa)?; + self.build_from_nfa(Arc::new(nfa)) + } + + /// Build a DFA from the given NFA. + /// + /// Note that this requires an `Arc<thompson::NFA>` instead of a + /// `&thompson::NFA` because the lazy DFA builds itself from the NFA at + /// search time. This means that the lazy DFA must hold on to its source + /// NFA for the entirety of its lifetime. An `Arc` is used so that callers + /// aren't forced to clone the NFA if it is needed elsewhere. 
+ /// + /// # Example + /// + /// This example shows how to build a lazy DFA if you already have an NFA + /// in hand. + /// + /// ``` + /// use std::sync::Arc; + /// use regex_automata::{hybrid::dfa::DFA, nfa::thompson, HalfMatch}; + /// + /// let haystack = "foo123bar".as_bytes(); + /// + /// // This shows how to set non-default options for building an NFA. + /// let nfa = thompson::Builder::new() + /// .configure(thompson::Config::new().shrink(false)) + /// .build(r"[0-9]+")?; + /// let dfa = DFA::builder().build_from_nfa(Arc::new(nfa))?; + /// let mut cache = dfa.create_cache(); + /// let expected = Some(HalfMatch::must(0, 6)); + /// let got = dfa.find_leftmost_fwd(&mut cache, haystack)?; + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn build_from_nfa( + &self, + nfa: Arc<thompson::NFA>, + ) -> Result<DFA, BuildError> { + let quitset = self.config.quit_set_from_nfa(&nfa)?; + let classes = self.config.byte_classes_from_nfa(&nfa, &quitset); + // Check that we can fit at least a few states into our cache, + // otherwise it's pretty senseless to use the lazy DFA. This does have + // a possible failure mode though. This assumes the maximum size of a + // state in powerset space (so, the total number of NFA states), which + // may never actually materialize, and could be quite a bit larger + // than the actual biggest state. If this turns out to be a problem, + // we could expose a knob that disables this check. But if so, we have + // to be careful not to panic in other areas of the code (the cache + // clearing and init code) that tend to assume some minimum useful + // cache capacity. 
+ let min_cache = minimum_cache_capacity( + &nfa, + &classes, + self.config.get_starts_for_each_pattern(), + ); + let mut cache_capacity = self.config.get_cache_capacity(); + if cache_capacity < min_cache { + // When the caller has asked us to skip the cache capacity check, + // then we simply force the cache capacity to its minimum amount + // and mush on. + if self.config.get_skip_cache_capacity_check() { + trace!( + "given capacity ({}) is too small, \ + since skip_cache_capacity_check is enabled, \ + setting cache capacity to minimum ({})", + cache_capacity, + min_cache, + ); + cache_capacity = min_cache; + } else { + return Err(BuildError::insufficient_cache_capacity( + min_cache, + cache_capacity, + )); + } + } + // We also need to check that we can fit at least some small number + // of states in our state ID space. This is unlikely to trigger in + // >=32-bit systems, but 16-bit systems have a pretty small state ID + // space since a number of bits are used up as sentinels. + if let Err(err) = minimum_lazy_state_id(&nfa, &classes) { + return Err(BuildError::insufficient_state_id_capacity(err)); + } + let stride2 = classes.stride2(); + Ok(DFA { + nfa, + stride2, + classes, + quitset, + anchored: self.config.get_anchored(), + match_kind: self.config.get_match_kind(), + starts_for_each_pattern: self.config.get_starts_for_each_pattern(), + cache_capacity, + minimum_cache_clear_count: self + .config + .get_minimum_cache_clear_count(), + }) + } + + /// Apply the given lazy DFA configuration options to this builder. + pub fn configure(&mut self, config: Config) -> &mut Builder { + self.config = self.config.overwrite(config); + self + } + + /// Set the syntax configuration for this builder using + /// [`SyntaxConfig`](crate::SyntaxConfig). + /// + /// This permits setting things like case insensitivity, Unicode and multi + /// line mode. + /// + /// These settings only apply when constructing a lazy DFA directly from a + /// pattern. 
+ pub fn syntax( + &mut self, + config: crate::util::syntax::SyntaxConfig, + ) -> &mut Builder { + self.thompson.syntax(config); + self + } + + /// Set the Thompson NFA configuration for this builder using + /// [`nfa::thompson::Config`](crate::nfa::thompson::Config). + /// + /// This permits setting things like whether the DFA should match the regex + /// in reverse or if additional time should be spent shrinking the size of + /// the NFA. + /// + /// These settings only apply when constructing a DFA directly from a + /// pattern. + pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { + self.thompson.configure(config); + self + } +} + +/// Based on the minimum number of states required for a useful lazy DFA cache, +/// this returns the minimum lazy state ID that must be representable. +/// +/// It's likely not plausible for this to impose constraints on 32-bit systems +/// (or higher), but on 16-bit systems, the lazy state ID space is quite +/// constrained and thus may be insufficient for bigger regexes. +fn minimum_lazy_state_id( + nfa: &thompson::NFA, + classes: &ByteClasses, +) -> Result<LazyStateID, LazyStateIDError> { + let stride = 1 << classes.stride2(); + let min_state_index = MIN_STATES.checked_sub(1).unwrap(); + LazyStateID::new(min_state_index * stride) +} + +/// Based on the minimum number of states required for a useful lazy DFA cache, +/// this returns a heuristic minimum number of bytes of heap space required. +/// +/// This is a "heuristic" because the minimum it returns is likely bigger than +/// the true minimum. Namely, it assumes that each powerset NFA/DFA state uses +/// the maximum number of NFA states (all of them). This is likely bigger +/// than what is required in practice. Computing the true minimum effectively +/// requires determinization, which is probably too much work to do for a +/// simple check like this. 
+fn minimum_cache_capacity( + nfa: &thompson::NFA, + classes: &ByteClasses, + starts_for_each_pattern: bool, +) -> usize { + const ID_SIZE: usize = size_of::<LazyStateID>(); + let stride = 1 << classes.stride2(); + + let sparses = 2 * nfa.len() * NFAStateID::SIZE; + let trans = MIN_STATES * stride * ID_SIZE; + + let mut starts = Start::count() * ID_SIZE; + if starts_for_each_pattern { + starts += (Start::count() * nfa.pattern_len()) * ID_SIZE; + } + + // Every `State` has three bytes for flags, 4 bytes (max) for the number + // of patterns, followed by 32-bit encodings of patterns and then delta + // varint encodings of NFA state IDs. We use the worst case (which isn't + // technically possible) of 5 bytes for each NFA state ID. + // + // HOWEVER, three of the states needed by a lazy DFA are just the sentinel + // unknown, dead and quit states. Those states have a known size and it is + // small. + assert!(MIN_STATES >= 3, "minimum number of states has to be at least 3"); + let dead_state_size = State::dead().memory_usage(); + let max_state_size = 3 + 4 + (nfa.pattern_len() * 4) + (nfa.len() * 5); + let states = (3 * (size_of::<State>() + dead_state_size)) + + ((MIN_STATES - 3) * (size_of::<State>() + max_state_size)); + let states_to_sid = states + (MIN_STATES * ID_SIZE); + let stack = nfa.len() * NFAStateID::SIZE; + let scratch_state_builder = max_state_size; + + trans + + starts + + states + + states_to_sid + + sparses + + stack + + scratch_state_builder +} diff --git a/vendor/regex-automata/src/hybrid/error.rs b/vendor/regex-automata/src/hybrid/error.rs new file mode 100644 index 000000000..715da39bd --- /dev/null +++ b/vendor/regex-automata/src/hybrid/error.rs @@ -0,0 +1,130 @@ +use crate::{hybrid::id::LazyStateIDError, nfa}; + +/// An error that occurs when initial construction of a lazy DFA fails. +/// +/// A build error can occur when insufficient cache capacity is configured or +/// if something about the NFA is unsupported. 
(For example, if one attempts +/// to build a lazy DFA without heuristic Unicode support but with an NFA that +/// contains a Unicode word boundary.) +/// +/// When the `std` feature is enabled, this implements the `std::error::Error` +/// trait. +#[derive(Clone, Debug)] +pub struct BuildError { + kind: BuildErrorKind, +} + +#[derive(Clone, Debug)] +enum BuildErrorKind { + NFA(nfa::thompson::Error), + InsufficientCacheCapacity { minimum: usize, given: usize }, + InsufficientStateIDCapacity { err: LazyStateIDError }, + Unsupported(&'static str), +} + +impl BuildError { + fn kind(&self) -> &BuildErrorKind { + &self.kind + } + + pub(crate) fn nfa(err: nfa::thompson::Error) -> BuildError { + BuildError { kind: BuildErrorKind::NFA(err) } + } + + pub(crate) fn insufficient_cache_capacity( + minimum: usize, + given: usize, + ) -> BuildError { + BuildError { + kind: BuildErrorKind::InsufficientCacheCapacity { minimum, given }, + } + } + + pub(crate) fn insufficient_state_id_capacity( + err: LazyStateIDError, + ) -> BuildError { + BuildError { + kind: BuildErrorKind::InsufficientStateIDCapacity { err }, + } + } + + pub(crate) fn unsupported_dfa_word_boundary_unicode() -> BuildError { + let msg = "cannot build lazy DFAs for regexes with Unicode word \ + boundaries; switch to ASCII word boundaries, or \ + heuristically enable Unicode word boundaries or use a \ + different regex engine"; + BuildError { kind: BuildErrorKind::Unsupported(msg) } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for BuildError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self.kind() { + BuildErrorKind::NFA(ref err) => Some(err), + BuildErrorKind::InsufficientCacheCapacity { .. } => None, + // LazyStateIDError is an implementation detail, don't expose it. + BuildErrorKind::InsufficientStateIDCapacity { .. 
} => None, + BuildErrorKind::Unsupported(_) => None, + } + } +} + +impl core::fmt::Display for BuildError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match self.kind() { + BuildErrorKind::NFA(_) => write!(f, "error building NFA"), + BuildErrorKind::InsufficientCacheCapacity { minimum, given } => { + write!( + f, + "given cache capacity ({}) is smaller than \ + minimum required ({})", + given, minimum, + ) + } + BuildErrorKind::InsufficientStateIDCapacity { ref err } => { + err.fmt(f) + } + BuildErrorKind::Unsupported(ref msg) => { + write!(f, "unsupported regex feature for DFAs: {}", msg) + } + } + } +} + +/// An error that occurs when cache usage has become inefficient. +/// +/// One of the weaknesses of a lazy DFA is that it may need to clear its +/// cache repeatedly if it's not big enough. If this happens too much, then it +/// can slow searching down significantly. A mitigation to this is to use +/// heuristics to detect whether the cache is being used efficiently or not. +/// If not, then a lazy DFA can return a `CacheError`. +/// +/// The default configuration of a lazy DFA in this crate is +/// set such that a `CacheError` will never occur. Instead, +/// callers must opt into this behavior with settings like +/// [`dfa::Config::minimum_cache_clear_count`](crate::hybrid::dfa::Config::minimum_cache_clear_count). +/// +/// When the `std` feature is enabled, this implements the `std::error::Error` +/// trait. 
+#[derive(Clone, Debug)] +pub struct CacheError(()); + +impl CacheError { + pub(crate) fn too_many_cache_clears() -> CacheError { + CacheError(()) + } +} + +#[cfg(feature = "std")] +impl std::error::Error for CacheError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + None + } +} + +impl core::fmt::Display for CacheError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!(f, "lazy DFA cache has been cleared too many times") + } +} diff --git a/vendor/regex-automata/src/hybrid/id.rs b/vendor/regex-automata/src/hybrid/id.rs new file mode 100644 index 000000000..a6fcde52e --- /dev/null +++ b/vendor/regex-automata/src/hybrid/id.rs @@ -0,0 +1,415 @@ +/// A state identifier especially tailored for lazy DFAs. +/// +/// A lazy state ID logically represents a pointer to a DFA state. In practice, +/// by limiting the number of DFA states it can address, it reserves some +/// bits of its representation to encode some additional information. That +/// additional information is called a "tag." That tag is used to record +/// whether the state it points to is an unknown, dead, quit, start or match +/// state. +/// +/// When implementing a low level search routine with a lazy DFA, it is +/// necessary to query the type of the current state to know what to do: +/// +/// * **Unknown** - The state has not yet been computed. The +/// parameters used to get this state ID must be re-passed to +/// [`DFA::next_state`](crate::hybrid::dfa::DFA), which will never return an +/// unknown state ID. +/// * **Dead** - A dead state only has transitions to itself. It indicates that +/// the search cannot do anything else and should stop with whatever result it +/// has. +/// * **Quit** - A quit state indicates that the automaton could not answer +/// whether a match exists or not. Correct search implementations must return a +/// [`MatchError::Quit`](crate::MatchError::Quit). 
+/// * **Start** - A start state indicates that the automaton will begin +/// searching at a starting state. Branching on this isn't required for +/// correctness, but a common optimization is to use this to more quickly look +/// for a prefix. +/// * **Match** - A match state indicates that a match has been found. +/// Depending on the semantics of your search implementation, it may either +/// continue until the end of the haystack or a dead state, or it might quit +/// and return the match immediately. +/// +/// As an optimization, the [`is_tagged`](LazyStateID::is_tagged) predicate +/// can be used to determine if a tag exists at all. This is useful to avoid +/// branching on all of the above types for every byte searched. +/// +/// # Example +/// +/// This example shows how `LazyStateID` can be used to implement a correct +/// search routine with minimal branching. In particular, this search routine +/// implements "leftmost" matching, which means that it doesn't immediately +/// stop once a match is found. Instead, it continues until it reaches a dead +/// state. +/// +/// Notice also how a correct search implementation deals with +/// [`CacheError`](crate::hybrid::CacheError)s returned by some of +/// the lazy DFA routines. When a `CacheError` occurs, it returns +/// [`MatchError::GaveUp`](crate::MatchError::GaveUp). +/// +/// ``` +/// use regex_automata::{ +/// hybrid::dfa::{Cache, DFA}, +/// HalfMatch, MatchError, PatternID, +/// }; +/// +/// fn find_leftmost_first( +/// dfa: &DFA, +/// cache: &mut Cache, +/// haystack: &[u8], +/// ) -> Result<Option<HalfMatch>, MatchError> { +/// // The start state is determined by inspecting the position and the +/// // initial bytes of the haystack. Note that start states can never +/// // be match states (since DFAs in this crate delay matches by 1 +/// // byte), so we don't need to check if the start state is a match. 
+/// let mut sid = dfa.start_state_forward( +/// cache, None, haystack, 0, haystack.len(), +/// ).map_err(|_| MatchError::GaveUp { offset: 0 })?; +/// let mut last_match = None; +/// // Walk all the bytes in the haystack. We can quit early if we see +/// // a dead or a quit state. The former means the automaton will +/// // never transition to any other state. The latter means that the +/// // automaton entered a condition in which its search failed. +/// for (i, &b) in haystack.iter().enumerate() { +/// sid = dfa +/// .next_state(cache, sid, b) +/// .map_err(|_| MatchError::GaveUp { offset: i })?; +/// if sid.is_tagged() { +/// if sid.is_match() { +/// last_match = Some(HalfMatch::new( +/// dfa.match_pattern(cache, sid, 0), +/// i, +/// )); +/// } else if sid.is_dead() { +/// return Ok(last_match); +/// } else if sid.is_quit() { +/// // It is possible to enter into a quit state after +/// // observing a match has occurred. In that case, we +/// // should return the match instead of an error. +/// if last_match.is_some() { +/// return Ok(last_match); +/// } +/// return Err(MatchError::Quit { byte: b, offset: i }); +/// } +/// // Implementors may also want to check for start states and +/// // handle them differently for performance reasons. But it is +/// // not necessary for correctness. +/// } +/// } +/// // Matches are always delayed by 1 byte, so we must explicitly walk +/// // the special "EOI" transition at the end of the search. +/// sid = dfa +/// .next_eoi_state(cache, sid) +/// .map_err(|_| MatchError::GaveUp { offset: haystack.len() })?; +/// if sid.is_match() { +/// last_match = Some(HalfMatch::new( +/// dfa.match_pattern(cache, sid, 0), +/// haystack.len(), +/// )); +/// } +/// Ok(last_match) +/// } +/// +/// // We use a greedy '+' operator to show how the search doesn't just stop +/// // once a match is detected. It continues extending the match. Using +/// // '[a-z]+?' would also work as expected and stop the search early. 
+/// // Greediness is built into the automaton. +/// let dfa = DFA::new(r"[a-z]+")?; +/// let mut cache = dfa.create_cache(); +/// let haystack = "123 foobar 4567".as_bytes(); +/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap(); +/// assert_eq!(mat.pattern().as_usize(), 0); +/// assert_eq!(mat.offset(), 10); +/// +/// // Here's another example that tests our handling of the special +/// // EOI transition. This will fail to find a match if we don't call +/// // 'next_eoi_state' at the end of the search since the match isn't found +/// // until the final byte in the haystack. +/// let dfa = DFA::new(r"[0-9]{4}")?; +/// let mut cache = dfa.create_cache(); +/// let haystack = "123 foobar 4567".as_bytes(); +/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap(); +/// assert_eq!(mat.pattern().as_usize(), 0); +/// assert_eq!(mat.offset(), 15); +/// +/// // And note that our search implementation above automatically works +/// // with multi-DFAs. Namely, `dfa.match_pattern(match_state, 0)` selects +/// // the appropriate pattern ID for us. 
+/// let dfa = DFA::new_many(&[r"[a-z]+", r"[0-9]+"])?; +/// let mut cache = dfa.create_cache(); +/// let haystack = "123 foobar 4567".as_bytes(); +/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap(); +/// assert_eq!(mat.pattern().as_usize(), 1); +/// assert_eq!(mat.offset(), 3); +/// let mat = find_leftmost_first(&dfa, &mut cache, &haystack[3..])?.unwrap(); +/// assert_eq!(mat.pattern().as_usize(), 0); +/// assert_eq!(mat.offset(), 7); +/// let mat = find_leftmost_first(&dfa, &mut cache, &haystack[10..])?.unwrap(); +/// assert_eq!(mat.pattern().as_usize(), 1); +/// assert_eq!(mat.offset(), 5); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive( + Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord, +)] +pub struct LazyStateID(u32); + +impl LazyStateID { + #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] + const MAX_BIT: usize = 31; + + #[cfg(target_pointer_width = "16")] + const MAX_BIT: usize = 15; + + const MASK_UNKNOWN: usize = 1 << (LazyStateID::MAX_BIT); + const MASK_DEAD: usize = 1 << (LazyStateID::MAX_BIT - 1); + const MASK_QUIT: usize = 1 << (LazyStateID::MAX_BIT - 2); + const MASK_START: usize = 1 << (LazyStateID::MAX_BIT - 3); + const MASK_MATCH: usize = 1 << (LazyStateID::MAX_BIT - 4); + const MAX: usize = LazyStateID::MASK_MATCH - 1; + + /// Create a new lazy state ID. + /// + /// If the given identifier exceeds [`LazyStateID::MAX`], then this returns + /// an error. + #[inline] + pub(crate) fn new(id: usize) -> Result<LazyStateID, LazyStateIDError> { + if id > LazyStateID::MAX { + return Err(LazyStateIDError { attempted: id as u64 }); + } + Ok(LazyStateID::new_unchecked(id)) + } + + /// Create a new lazy state ID without checking whether the given value + /// exceeds [`LazyStateID::MAX`]. + /// + /// While this is unchecked, providing an incorrect value must never + /// sacrifice memory safety. 
+ #[inline] + const fn new_unchecked(id: usize) -> LazyStateID { + LazyStateID(id as u32) + } + + /// Return this lazy state ID as its raw value if and only if it is not + /// tagged (and thus not an unknown, dead, quit, start or match state ID). + #[inline] + pub(crate) fn as_usize(&self) -> Option<usize> { + if self.is_tagged() { + None + } else { + Some(self.as_usize_unchecked()) + } + } + + /// Return this lazy state ID as an untagged `usize`. + /// + /// If this lazy state ID is tagged, then the usize returned is the state + /// ID without the tag. If the ID was not tagged, then the usize returned + /// is equivalent to the state ID. + #[inline] + pub(crate) fn as_usize_untagged(&self) -> usize { + self.as_usize_unchecked() & LazyStateID::MAX + } + + /// Return this lazy state ID as its raw internal `usize` value, which may + /// be tagged (and thus greater than LazyStateID::MAX). + #[inline] + pub(crate) const fn as_usize_unchecked(&self) -> usize { + self.0 as usize + } + + #[inline] + pub(crate) const fn to_unknown(&self) -> LazyStateID { + LazyStateID::new_unchecked( + self.as_usize_unchecked() | LazyStateID::MASK_UNKNOWN, + ) + } + + #[inline] + pub(crate) const fn to_dead(&self) -> LazyStateID { + LazyStateID::new_unchecked( + self.as_usize_unchecked() | LazyStateID::MASK_DEAD, + ) + } + + #[inline] + pub(crate) const fn to_quit(&self) -> LazyStateID { + LazyStateID::new_unchecked( + self.as_usize_unchecked() | LazyStateID::MASK_QUIT, + ) + } + + /// Return this lazy state ID as a state ID that is tagged as a start + /// state. + #[inline] + pub(crate) const fn to_start(&self) -> LazyStateID { + LazyStateID::new_unchecked( + self.as_usize_unchecked() | LazyStateID::MASK_START, + ) + } + + /// Return this lazy state ID as a lazy state ID that is tagged as a match + /// state. 
+ #[inline] + pub(crate) const fn to_match(&self) -> LazyStateID { + LazyStateID::new_unchecked( + self.as_usize_unchecked() | LazyStateID::MASK_MATCH, + ) + } + + /// Return true if and only if this lazy state ID is tagged. + /// + /// When a lazy state ID is tagged, then one can conclude that it is one + /// of a match, start, dead, quit or unknown state. + #[inline] + pub const fn is_tagged(&self) -> bool { + self.as_usize_unchecked() > LazyStateID::MAX + } + + /// Return true if and only if this represents a lazy state ID that is + /// "unknown." That is, the state has not yet been created. When a caller + /// sees this state ID, it generally means that a state has to be computed + /// in order to proceed. + #[inline] + pub const fn is_unknown(&self) -> bool { + self.as_usize_unchecked() & LazyStateID::MASK_UNKNOWN > 0 + } + + /// Return true if and only if this represents a dead state. A dead state + /// is a state that can never transition to any other state except the + /// dead state. When a dead state is seen, it generally indicates that a + /// search should stop. + #[inline] + pub const fn is_dead(&self) -> bool { + self.as_usize_unchecked() & LazyStateID::MASK_DEAD > 0 + } + + /// Return true if and only if this represents a quit state. A quit state + /// is a state that is representationally equivalent to a dead state, + /// except it indicates the automaton has reached a point at which it can + /// no longer determine whether a match exists or not. In general, this + /// indicates an error during search and the caller must either pass this + /// error up or use a different search technique. + #[inline] + pub const fn is_quit(&self) -> bool { + self.as_usize_unchecked() & LazyStateID::MASK_QUIT > 0 + } + + /// Return true if and only if this lazy state ID has been tagged as a + /// start state. 
+ #[inline] + pub const fn is_start(&self) -> bool { + self.as_usize_unchecked() & LazyStateID::MASK_START > 0 + } + + /// Return true if and only if this lazy state ID has been tagged as a + /// match state. + #[inline] + pub const fn is_match(&self) -> bool { + self.as_usize_unchecked() & LazyStateID::MASK_MATCH > 0 + } +} + +/// This error occurs when a lazy state ID could not be constructed. +/// +/// This occurs when given an integer exceeding the maximum lazy state ID +/// value. +/// +/// When the `std` feature is enabled, this implements the `Error` trait. +#[derive(Clone, Debug, Eq, PartialEq)] +pub(crate) struct LazyStateIDError { + attempted: u64, +} + +impl LazyStateIDError { + /// Returns the value that failed to constructed a lazy state ID. + pub(crate) fn attempted(&self) -> u64 { + self.attempted + } +} + +#[cfg(feature = "std")] +impl std::error::Error for LazyStateIDError {} + +impl core::fmt::Display for LazyStateIDError { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!( + f, + "failed to create LazyStateID from {:?}, which exceeds {:?}", + self.attempted(), + LazyStateID::MAX, + ) + } +} + +/// Represents the current state of an overlapping search. +/// +/// This is used for overlapping searches since they need to know something +/// about the previous search. For example, when multiple patterns match at the +/// same position, this state tracks the last reported pattern so that the next +/// search knows whether to report another matching pattern or continue with +/// the search at the next position. Additionally, it also tracks which state +/// the last search call terminated in. +/// +/// This type provides no introspection capabilities. The only thing a caller +/// can do is construct it and pass it around to permit search routines to use +/// it to track state. +/// +/// Callers should always provide a fresh state constructed via +/// [`OverlappingState::start`] when starting a new search. 
Reusing state from +/// a previous search may result in incorrect results. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct OverlappingState { + /// The state ID of the state at which the search was in when the call + /// terminated. When this is a match state, `last_match` must be set to a + /// non-None value. + /// + /// A `None` value indicates the start state of the corresponding + /// automaton. We cannot use the actual ID, since any one automaton may + /// have many start states, and which one is in use depends on several + /// search-time factors. + id: Option<LazyStateID>, + /// Information associated with a match when `id` corresponds to a match + /// state. + last_match: Option<StateMatch>, +} + +/// Internal state about the last match that occurred. This records both the +/// offset of the match and the match index. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub(crate) struct StateMatch { + /// The index into the matching patterns for the current match state. + pub(crate) match_index: usize, + /// The offset in the haystack at which the match occurred. This is used + /// when reporting multiple matches at the same offset. That is, when + /// an overlapping search runs, the first thing it checks is whether it's + /// already in a match state, and if so, whether there are more patterns + /// to report as matches in that state. If so, it increments `match_index` + /// and returns the pattern and this offset. Once `match_index` exceeds the + /// number of matching patterns in the current state, the search continues. + pub(crate) offset: usize, +} + +impl OverlappingState { + /// Create a new overlapping state that begins at the start state of any + /// automaton. 
+ pub fn start() -> OverlappingState { + OverlappingState { id: None, last_match: None } + } + + pub(crate) fn id(&self) -> Option<LazyStateID> { + self.id + } + + pub(crate) fn set_id(&mut self, id: LazyStateID) { + self.id = Some(id); + } + + pub(crate) fn last_match(&mut self) -> Option<&mut StateMatch> { + self.last_match.as_mut() + } + + pub(crate) fn set_last_match(&mut self, last_match: StateMatch) { + self.last_match = Some(last_match); + } +} diff --git a/vendor/regex-automata/src/hybrid/mod.rs b/vendor/regex-automata/src/hybrid/mod.rs new file mode 100644 index 000000000..4c8ca7ebe --- /dev/null +++ b/vendor/regex-automata/src/hybrid/mod.rs @@ -0,0 +1,179 @@ +/*! +A module for building and searching with lazy determinstic finite automata +(DFAs). + +Like other modules in this crate, lazy DFAs support a rich regex syntax with +Unicode features. The key feature of a lazy DFA is that it builds itself +incrementally during search, and never uses more than a configured capacity of +memory. Thus, when searching with a lazy DFA, one must supply a mutable "cache" +in which the actual DFA's transition table is stored. + +If you're looking for fully compiled DFAs, then please see the top-level +[`dfa` module](crate::dfa). + +# Overview + +This section gives a brief overview of the primary types in this module: + +* A [`regex::Regex`] provides a way to search for matches of a regular +expression using lazy DFAs. This includes iterating over matches with both the +start and end positions of each match. +* A [`dfa::DFA`] provides direct low level access to a lazy DFA. 
+ +# Example: basic regex searching + +This example shows how to compile a regex using the default configuration +and then use it to find matches in a byte string: + +``` +use regex_automata::{hybrid::regex::Regex, MultiMatch}; + +let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; +let mut cache = re.create_cache(); + +let text = b"2018-12-24 2016-10-08"; +let matches: Vec<MultiMatch> = + re.find_leftmost_iter(&mut cache, text).collect(); +assert_eq!(matches, vec![ + MultiMatch::must(0, 0, 10), + MultiMatch::must(0, 11, 21), +]); +# Ok::<(), Box<dyn std::error::Error>>(()) +``` + +# Example: searching with regex sets + +The lazy DFAs in this module all fully support searching with multiple regexes +simultaneously. You can use this support with standard leftmost-first style +searching to find non-overlapping matches: + +``` +use regex_automata::{hybrid::regex::Regex, MultiMatch}; + +let re = Regex::new_many(&[r"\w+", r"\S+"])?; +let mut cache = re.create_cache(); + +let text = b"@foo bar"; +let matches: Vec<MultiMatch> = + re.find_leftmost_iter(&mut cache, text).collect(); +assert_eq!(matches, vec![ + MultiMatch::must(1, 0, 4), + MultiMatch::must(0, 5, 8), +]); +# Ok::<(), Box<dyn std::error::Error>>(()) +``` + +Or use overlapping style searches to find all possible occurrences: + +``` +use regex_automata::{hybrid::{dfa, regex::Regex}, MatchKind, MultiMatch}; + +// N.B. For overlapping searches, we need the underlying lazy DFA to report all +// possible matches. 
+let re = Regex::builder() + .dfa(dfa::Config::new().match_kind(MatchKind::All)) + .build_many(&[r"\w{3}", r"\S{3}"])?; +let mut cache = re.create_cache(); + +let text = b"@foo bar"; +let matches: Vec<MultiMatch> = + re.find_overlapping_iter(&mut cache, text).collect(); +assert_eq!(matches, vec![ + MultiMatch::must(1, 0, 3), + MultiMatch::must(0, 1, 4), + MultiMatch::must(1, 1, 4), + MultiMatch::must(0, 5, 8), + MultiMatch::must(1, 5, 8), +]); +# Ok::<(), Box<dyn std::error::Error>>(()) +``` + +# When should I use this? + +Generally speaking, if you can abide the use of mutable state during search, +and you don't need things like capturing groups or Unicode word boundary +support in non-ASCII text, then a lazy DFA is likely a robust choice with +respect to both search speed and memory usage. Note however that its speed +may be worse than a general purpose regex engine if you don't select a good +[prefilter](crate::util::prefilter). + +If you know ahead of time that your pattern would result in a very large DFA +if it was fully compiled, it may be better to use an NFA simulation instead +of a lazy DFA. Either that, or increase the cache capacity of your lazy DFA +to something that is big enough to hold the state machine (likely through +experimentation). The issue here is that if the cache is too small, then it +could wind up being reset too frequently and this might decrease searching +speed significantly. + +# Differences with fully compiled DFAs + +A [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) and a +[`dfa::regex::Regex`](crate::dfa::regex::Regex) both have the same capabilities +(and similarly for their underlying DFAs), but they achieve them through +different means. The main difference is that a hybrid or "lazy" regex builds +its DFA lazily during search, where as a fully compiled regex will build its +DFA at construction time. 
While building a DFA at search time might sound like +it's slow, it tends to work out where most bytes seen during a search will +reuse pre-built parts of the DFA and thus can be almost as fast as a fully +compiled DFA. The main downside is that searching requires mutable space to +store the DFA, and, in the worst case, a search can result in a new state being +created for each byte seen, which would make searching quite a bit slower. + +A fully compiled DFA never has to worry about searches being slower once +it's built. (Aside from, say, the transition table being so large that it +is subject to harsh CPU cache effects.) However, of course, building a full +DFA can be quite time consuming and memory hungry. Particularly when it's +so easy to build large DFAs when Unicode mode is enabled. + +A lazy DFA strikes a nice balance _in practice_, particularly in the +presence of Unicode mode, by only building what is needed. It avoids the +worst case exponential time complexity of DFA compilation by guaranteeing that +it will only build at most one state per byte searched. While the worst +case here can lead to a very high constant, it will never be exponential. + +# Syntax + +This module supports the same syntax as the `regex` crate, since they share the +same parser. You can find an exhaustive list of supported syntax in the +[documentation for the `regex` crate](https://docs.rs/regex/1/regex/#syntax). + +There are two things that are not supported by the lazy DFAs in this module: + +* Capturing groups. The DFAs (and [`Regex`](regex::Regex)es built on top +of them) can only find the offsets of an entire match, but cannot resolve +the offsets of each capturing group. This is because DFAs do not have the +expressive power necessary. +* Unicode word boundaries. These present particularly difficult challenges for +DFA construction and would result in an explosion in the number of states. 
+One can enable [`dfa::Config::unicode_word_boundary`] though, which provides +heuristic support for Unicode word boundaries that only works on ASCII text. +Otherwise, one can use `(?-u:\b)` for an ASCII word boundary, which will work +on any input. + +There are no plans to lift either of these limitations. + +Note that these restrictions are identical to the restrictions on fully +compiled DFAs. + +# Support for `alloc`-only + +This crate comes with `alloc` and `std` features that are enabled by default. +One can disable the `std` feature and still use the full API of a lazy DFA. +(You should use `std` when possible, since it permits providing implementations +of the `std::error::Error` trait, and does enable some minor internal +optimizations.) + +This module does require at least the `alloc` feature though. It is not +available in any capacity without `alloc`. +*/ + +pub use self::{ + error::{BuildError, CacheError}, + id::{LazyStateID, OverlappingState}, +}; + +pub mod dfa; +mod error; +mod id; +pub mod regex; +mod search; diff --git a/vendor/regex-automata/src/hybrid/regex.rs b/vendor/regex-automata/src/hybrid/regex.rs new file mode 100644 index 000000000..7cc6b9064 --- /dev/null +++ b/vendor/regex-automata/src/hybrid/regex.rs @@ -0,0 +1,2124 @@ +/*! +A lazy DFA backed `Regex`. + +This module provides [`Regex`] using lazy DFA. A `Regex` implements convenience +routines you might have come to expect, such as finding a match and iterating +over all non-overlapping matches. This `Regex` type is limited in its +capabilities to what a lazy DFA can provide. Therefore, APIs involving +capturing groups, for example, are not provided. + +Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that +finds the end offset of a match, where as the other is a "reverse" DFA that +find the start offset of a match. + +See the [parent module](crate::hybrid) for examples. 
+*/ + +use core::borrow::Borrow; + +use alloc::boxed::Box; + +use crate::{ + hybrid::{ + dfa::{self, DFA}, + error::BuildError, + OverlappingState, + }, + nfa::thompson, + util::{ + matchtypes::{MatchError, MatchKind, MultiMatch}, + prefilter::{self, Prefilter}, + }, +}; + +/// A regular expression that uses hybrid NFA/DFAs (also called "lazy DFAs") +/// for searching. +/// +/// A regular expression is comprised of two lazy DFAs, a "forward" DFA and a +/// "reverse" DFA. The forward DFA is responsible for detecting the end of +/// a match while the reverse DFA is responsible for detecting the start +/// of a match. Thus, in order to find the bounds of any given match, a +/// forward search must first be run followed by a reverse search. A match +/// found by the forward DFA guarantees that the reverse DFA will also find +/// a match. +/// +/// A `Regex` can also have a prefilter set via the +/// [`set_prefilter`](Regex::set_prefilter) method. By default, no prefilter is +/// enabled. +/// +/// # Earliest vs Leftmost vs Overlapping +/// +/// The search routines exposed on a `Regex` reflect three different ways +/// of searching: +/// +/// * "earliest" means to stop as soon as a match has been detected. +/// * "leftmost" means to continue matching until the underlying +/// automaton cannot advance. This reflects "standard" searching you +/// might be used to in other regex engines. e.g., This permits +/// non-greedy and greedy searching to work as you would expect. +/// * "overlapping" means to find all possible matches, even if they +/// overlap. +/// +/// Generally speaking, when doing an overlapping search, you'll want to +/// build your regex lazy DFAs with [`MatchKind::All`] semantics. Using +/// [`MatchKind::LeftmostFirst`] semantics with overlapping searches is +/// likely to lead to odd behavior since `LeftmostFirst` specifically omits +/// some matches that can never be reported due to its semantics. 
+/// +/// The following example shows the differences between how these different +/// types of searches impact looking for matches of `[a-z]+` in the +/// haystack `abc`. +/// +/// ``` +/// use regex_automata::{hybrid::{dfa, regex}, MatchKind, MultiMatch}; +/// +/// let pattern = r"[a-z]+"; +/// let haystack = "abc".as_bytes(); +/// +/// // With leftmost-first semantics, we test "earliest" and "leftmost". +/// let re = regex::Builder::new() +/// .dfa(dfa::Config::new().match_kind(MatchKind::LeftmostFirst)) +/// .build(pattern)?; +/// let mut cache = re.create_cache(); +/// +/// // "earliest" searching isn't impacted by greediness +/// let mut it = re.find_earliest_iter(&mut cache, haystack); +/// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); +/// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next()); +/// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next()); +/// assert_eq!(None, it.next()); +/// +/// // "leftmost" searching supports greediness (and non-greediness) +/// let mut it = re.find_leftmost_iter(&mut cache, haystack); +/// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); +/// assert_eq!(None, it.next()); +/// +/// // For overlapping, we want "all" match kind semantics. +/// let re = regex::Builder::new() +/// .dfa(dfa::Config::new().match_kind(MatchKind::All)) +/// .build(pattern)?; +/// let mut cache = re.create_cache(); +/// +/// // In the overlapping search, we find all three possible matches +/// // starting at the beginning of the haystack. +/// let mut it = re.find_overlapping_iter(&mut cache, haystack); +/// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); +/// assert_eq!(Some(MultiMatch::must(0, 0, 2)), it.next()); +/// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); +/// assert_eq!(None, it.next()); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// # Fallibility +/// +/// In non-default configurations, the lazy DFAs generated in this module may +/// return an error during a search. 
(Currently, the only way this happens is +/// if quit bytes are added, Unicode word boundaries are heuristically enabled, +/// or if the cache is configured to "give up" on a search if it has been +/// cleared too many times. All of these are turned off by default, which means +/// a search can never fail in the default configuration.) For convenience, +/// the main search routines, like [`find_leftmost`](Regex::find_leftmost), +/// will panic if an error occurs. However, if you need to use DFAs which may +/// produce an error at search time, then there are fallible equivalents of +/// all search routines. For example, for `find_leftmost`, its fallible analog +/// is [`try_find_leftmost`](Regex::try_find_leftmost). The routines prefixed +/// with `try_` return `Result<Option<MultiMatch>, MatchError>`, where as the +/// infallible routines simply return `Option<MultiMatch>`. +/// +/// # Example +/// +/// This example shows how to cause a search to terminate if it sees a +/// `\n` byte, and handle the error returned. This could be useful if, for +/// example, you wanted to prevent a user supplied pattern from matching +/// across a line boundary. +/// +/// ``` +/// use regex_automata::{hybrid::{dfa, regex::Regex}, MatchError}; +/// +/// let re = Regex::builder() +/// .dfa(dfa::Config::new().quit(b'\n', true)) +/// .build(r"foo\p{any}+bar")?; +/// let mut cache = re.create_cache(); +/// +/// let haystack = "foo\nbar".as_bytes(); +/// // Normally this would produce a match, since \p{any} contains '\n'. +/// // But since we instructed the automaton to enter a quit state if a +/// // '\n' is observed, this produces a match error instead. 
+/// let expected = MatchError::Quit { byte: 0x0A, offset: 3 }; +/// let got = re.try_find_leftmost(&mut cache, haystack).unwrap_err(); +/// assert_eq!(expected, got); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Debug)] +pub struct Regex { + /// An optional prefilter that is passed down to the lazy DFA search + /// routines when present. By default, no prefilter is set. + pre: Option<Box<dyn Prefilter>>, + /// The forward lazy DFA. This can only find the end of a match. + forward: DFA, + /// The reverse lazy DFA. This can only find the start of a match. + /// + /// This is built with 'all' match semantics (instead of leftmost-first) + /// so that it always finds the longest possible match (which corresponds + /// to the leftmost starting position). It is also compiled as an anchored + /// matcher and has 'starts_for_each_pattern' enabled. Including starting + /// states for each pattern is necessary to ensure that we only look for + /// matches of a pattern that matched in the forward direction. Otherwise, + /// we might wind up finding the "leftmost" starting position of a totally + /// different pattern! + reverse: DFA, + /// Whether iterators on this type should advance by one codepoint or one + /// byte when an empty match is seen. + utf8: bool, +} + +/// Convenience routines for regex and cache construction. +impl Regex { + /// Parse the given regular expression using the default configuration and + /// return the corresponding regex. + /// + /// If you want a non-default configuration, then use the [`Builder`] to + /// set your own configuration. 
+ /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, hybrid::regex::Regex}; + /// + /// let re = Regex::new("foo[0-9]+bar")?; + /// let mut cache = re.create_cache(); + /// assert_eq!( + /// Some(MultiMatch::must(0, 3, 14)), + /// re.find_leftmost(&mut cache, b"zzzfoo12345barzzz"), + /// ); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn new(pattern: &str) -> Result<Regex, BuildError> { + Regex::builder().build(pattern) + } + + /// Like `new`, but parses multiple patterns into a single "regex set." + /// This similarly uses the default regex configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, hybrid::regex::Regex}; + /// + /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?; + /// let mut cache = re.create_cache(); + /// + /// let mut it = re.find_leftmost_iter( + /// &mut cache, + /// b"abc 1 foo 4567 0 quux", + /// ); + /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); + /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next()); + /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next()); + /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next()); + /// assert_eq!(None, it.next()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn new_many<P: AsRef<str>>( + patterns: &[P], + ) -> Result<Regex, BuildError> { + Regex::builder().build_many(patterns) + } + + /// Return a default configuration for a `Regex`. + /// + /// This is a convenience routine to avoid needing to import the `Config` + /// type when customizing the construction of a regex. + /// + /// # Example + /// + /// This example shows how to disable UTF-8 mode for `Regex` iteration. 
+ /// When UTF-8 mode is disabled, the position immediately following an + /// empty match is where the next search begins, instead of the next + /// position of a UTF-8 encoded codepoint. + /// + /// ``` + /// use regex_automata::{hybrid::regex::Regex, MultiMatch}; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().utf8(false)) + /// .build(r"")?; + /// let mut cache = re.create_cache(); + /// + /// let haystack = "a☃z".as_bytes(); + /// let mut it = re.find_leftmost_iter(&mut cache, haystack); + /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn config() -> Config { + Config::new() + } + + /// Return a builder for configuring the construction of a `Regex`. + /// + /// This is a convenience routine to avoid needing to import the + /// [`Builder`] type in common cases. + /// + /// # Example + /// + /// This example shows how to use the builder to disable UTF-8 mode + /// everywhere. 
+ /// + /// ``` + /// use regex_automata::{ + /// hybrid::regex::Regex, + /// nfa::thompson, + /// MultiMatch, SyntaxConfig, + /// }; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().utf8(false)) + /// .syntax(SyntaxConfig::new().utf8(false)) + /// .thompson(thompson::Config::new().utf8(false)) + /// .build(r"foo(?-u:[^b])ar.*")?; + /// let mut cache = re.create_cache(); + /// + /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; + /// let expected = Some(MultiMatch::must(0, 1, 9)); + /// let got = re.find_leftmost(&mut cache, haystack); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn builder() -> Builder { + Builder::new() + } + + /// Create a new cache for this `Regex`. + /// + /// The cache returned should only be used for searches for this + /// `Regex`. If you want to reuse the cache for another `Regex`, then + /// you must call [`Cache::reset`] with that `Regex` (or, equivalently, + /// [`Regex::reset_cache`]). + pub fn create_cache(&self) -> Cache { + Cache::new(self) + } + + /// Reset the given cache such that it can be used for searching with the + /// this `Regex` (and only this `Regex`). + /// + /// A cache reset permits reusing memory already allocated in this cache + /// with a different `Regex`. + /// + /// Resetting a cache sets its "clear count" to 0. This is relevant if the + /// `Regex` has been configured to "give up" after it has cleared the cache + /// a certain number of times. + /// + /// # Example + /// + /// This shows how to re-purpose a cache for use with a different `Regex`. + /// + /// ``` + /// use regex_automata::{hybrid::regex::Regex, MultiMatch}; + /// + /// let re1 = Regex::new(r"\w")?; + /// let re2 = Regex::new(r"\W")?; + /// + /// let mut cache = re1.create_cache(); + /// assert_eq!( + /// Some(MultiMatch::must(0, 0, 2)), + /// re1.find_leftmost(&mut cache, "Δ".as_bytes()), + /// ); + /// + /// // Using 'cache' with re2 is not allowed. 
It may result in panics or + /// // incorrect results. In order to re-purpose the cache, we must reset + /// // it with the Regex we'd like to use it with. + /// // + /// // Similarly, after this reset, using the cache with 're1' is also not + /// // allowed. + /// re2.reset_cache(&mut cache); + /// assert_eq!( + /// Some(MultiMatch::must(0, 0, 3)), + /// re2.find_leftmost(&mut cache, "☃".as_bytes()), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn reset_cache(&self, cache: &mut Cache) { + self.forward().reset_cache(&mut cache.forward); + self.reverse().reset_cache(&mut cache.reverse); + } +} + +/// Standard infallible search routines for finding and iterating over matches. +impl Regex { + /// Returns true if and only if this regex matches the given haystack. + /// + /// This routine may short circuit if it knows that scanning future input + /// will never lead to a different result. In particular, if the underlying + /// DFA enters a match state or a dead state, then this routine will return + /// `true` or `false`, respectively, without inspecting any future input. + /// + /// # Panics + /// + /// If the underlying lazy DFAs return an error, then this routine panics. + /// This only occurs in non-default configurations where quit bytes are + /// used, Unicode word boundaries are heuristically enabled or limits are + /// set on the number of times the lazy DFA's cache may be cleared. + /// + /// The fallible version of this routine is + /// [`try_is_match`](Regex::try_is_match). 
+ /// + /// # Example + /// + /// ``` + /// use regex_automata::hybrid::regex::Regex; + /// + /// let re = Regex::new("foo[0-9]+bar")?; + /// let mut cache = re.create_cache(); + /// + /// assert_eq!(true, re.is_match(&mut cache, b"foo12345bar")); + /// assert_eq!(false, re.is_match(&mut cache, b"foobar")); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn is_match(&self, cache: &mut Cache, haystack: &[u8]) -> bool { + self.try_is_match(cache, haystack).unwrap() + } + + /// Returns the first position at which a match is found. + /// + /// This routine stops scanning input in precisely the same circumstances + /// as `is_match`. The key difference is that this routine returns the + /// position at which it stopped scanning input if and only if a match + /// was found. If no match is found, then `None` is returned. + /// + /// # Panics + /// + /// If the underlying lazy DFAs return an error, then this routine panics. + /// This only occurs in non-default configurations where quit bytes are + /// used, Unicode word boundaries are heuristically enabled or limits are + /// set on the number of times the lazy DFA's cache may be cleared. + /// + /// The fallible version of this routine is + /// [`try_find_earliest`](Regex::try_find_earliest). + /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, hybrid::regex::Regex}; + /// + /// // Normally, the leftmost first match would greedily consume as many + /// // decimal digits as it could. But a match is detected as soon as one + /// // digit is seen. + /// let re = Regex::new("foo[0-9]+")?; + /// let mut cache = re.create_cache(); + /// assert_eq!( + /// Some(MultiMatch::must(0, 0, 4)), + /// re.find_earliest(&mut cache, b"foo12345"), + /// ); + /// + /// // Normally, the end of the leftmost first match here would be 3, + /// // but the "earliest" match semantics detect a match earlier. 
+ /// let re = Regex::new("abc|a")?; + /// let mut cache = re.create_cache(); + /// assert_eq!( + /// Some(MultiMatch::must(0, 0, 1)), + /// re.find_earliest(&mut cache, b"abc"), + /// ); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn find_earliest( + &self, + cache: &mut Cache, + haystack: &[u8], + ) -> Option<MultiMatch> { + self.try_find_earliest(cache, haystack).unwrap() + } + + /// Returns the start and end offset of the leftmost match. If no match + /// exists, then `None` is returned. + /// + /// # Panics + /// + /// If the underlying lazy DFAs return an error, then this routine panics. + /// This only occurs in non-default configurations where quit bytes are + /// used, Unicode word boundaries are heuristically enabled or limits are + /// set on the number of times the lazy DFA's cache may be cleared. + /// + /// The fallible version of this routine is + /// [`try_find_leftmost`](Regex::try_find_leftmost). + /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, hybrid::regex::Regex}; + /// + /// // Greediness is applied appropriately when compared to find_earliest. + /// let re = Regex::new("foo[0-9]+")?; + /// let mut cache = re.create_cache(); + /// assert_eq!( + /// Some(MultiMatch::must(0, 3, 11)), + /// re.find_leftmost(&mut cache, b"zzzfoo12345zzz"), + /// ); + /// + /// // Even though a match is found after reading the first byte (`a`), + /// // the default leftmost-first match semantics demand that we find the + /// // earliest match that prefers earlier parts of the pattern over latter + /// // parts. 
+ /// let re = Regex::new("abc|a")?; + /// let mut cache = re.create_cache(); + /// assert_eq!( + /// Some(MultiMatch::must(0, 0, 3)), + /// re.find_leftmost(&mut cache, b"abc"), + /// ); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn find_leftmost( + &self, + cache: &mut Cache, + haystack: &[u8], + ) -> Option<MultiMatch> { + self.try_find_leftmost(cache, haystack).unwrap() + } + + /// Search for the first overlapping match in `haystack`. + /// + /// This routine is principally useful when searching for multiple patterns + /// on inputs where multiple patterns may match the same regions of text. + /// In particular, callers must preserve the automaton's search state from + /// prior calls so that the implementation knows where the last match + /// occurred and which pattern was reported. + /// + /// # Panics + /// + /// If the underlying lazy DFAs return an error, then this routine panics. + /// This only occurs in non-default configurations where quit bytes are + /// used, Unicode word boundaries are heuristically enabled or limits are + /// set on the number of times the lazy DFA's cache may be cleared. + /// + /// The fallible version of this routine is + /// [`try_find_overlapping`](Regex::try_find_overlapping). + /// + /// # Example + /// + /// This example shows how to run an overlapping search with multiple + /// regexes. 
+ /// + /// ``` + /// use regex_automata::{ + /// hybrid::{dfa::DFA, regex::Regex, OverlappingState}, + /// MatchKind, + /// MultiMatch, + /// }; + /// + /// let re = Regex::builder() + /// .dfa(DFA::config().match_kind(MatchKind::All)) + /// .build_many(&[r"\w+$", r"\S+$"])?; + /// let mut cache = re.create_cache(); + /// + /// let haystack = "@foo".as_bytes(); + /// let mut state = OverlappingState::start(); + /// + /// let expected = Some(MultiMatch::must(1, 0, 4)); + /// let got = re.find_overlapping(&mut cache, haystack, &mut state); + /// assert_eq!(expected, got); + /// + /// // The first pattern also matches at the same position, so re-running + /// // the search will yield another match. Notice also that the first + /// // pattern is returned after the second. This is because the second + /// // pattern begins its match before the first, is therefore an earlier + /// // match and is thus reported first. + /// let expected = Some(MultiMatch::must(0, 1, 4)); + /// let got = re.find_overlapping(&mut cache, haystack, &mut state); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn find_overlapping( + &self, + cache: &mut Cache, + haystack: &[u8], + state: &mut OverlappingState, + ) -> Option<MultiMatch> { + self.try_find_overlapping(cache, haystack, state).unwrap() + } + + /// Returns an iterator over all non-overlapping "earliest" matches. + /// + /// Match positions are reported as soon as a match is known to occur, even + /// if the standard leftmost match would be longer. + /// + /// # Panics + /// + /// If the underlying lazy DFAs return an error, then this routine panics. + /// This only occurs in non-default configurations where quit bytes are + /// used, Unicode word boundaries are heuristically enabled or limits are + /// set on the number of times the lazy DFA's cache may be cleared. 
+ /// + /// The fallible version of this routine is + /// [`try_find_earliest_iter`](Regex::try_find_earliest_iter). + /// + /// # Example + /// + /// This example shows how to run an "earliest" iterator. + /// + /// ``` + /// use regex_automata::{hybrid::regex::Regex, MultiMatch}; + /// + /// let re = Regex::new("[0-9]+")?; + /// let mut cache = re.create_cache(); + /// let haystack = "123".as_bytes(); + /// + /// // Normally, a standard leftmost iterator would return a single + /// // match, but since "earliest" detects matches earlier, we get + /// // three matches. + /// let mut it = re.find_earliest_iter(&mut cache, haystack); + /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn find_earliest_iter<'r, 'c, 't>( + &'r self, + cache: &'c mut Cache, + haystack: &'t [u8], + ) -> FindEarliestMatches<'r, 'c, 't> { + FindEarliestMatches::new(self, cache, haystack) + } + + /// Returns an iterator over all non-overlapping leftmost matches in the + /// given bytes. If no match exists, then the iterator yields no elements. + /// + /// This corresponds to the "standard" regex search iterator. + /// + /// # Panics + /// + /// If the underlying lazy DFAs return an error, then this routine panics. + /// This only occurs in non-default configurations where quit bytes are + /// used, Unicode word boundaries are heuristically enabled or limits are + /// set on the number of times the lazy DFA's cache may be cleared. + /// + /// The fallible version of this routine is + /// [`try_find_leftmost_iter`](Regex::try_find_leftmost_iter). 
+ /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, hybrid::regex::Regex}; + /// + /// let re = Regex::new("foo[0-9]+")?; + /// let mut cache = re.create_cache(); + /// + /// let text = b"foo1 foo12 foo123"; + /// let matches: Vec<MultiMatch> = re + /// .find_leftmost_iter(&mut cache, text) + /// .collect(); + /// assert_eq!(matches, vec![ + /// MultiMatch::must(0, 0, 4), + /// MultiMatch::must(0, 5, 10), + /// MultiMatch::must(0, 11, 17), + /// ]); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn find_leftmost_iter<'r, 'c, 't>( + &'r self, + cache: &'c mut Cache, + haystack: &'t [u8], + ) -> FindLeftmostMatches<'r, 'c, 't> { + FindLeftmostMatches::new(self, cache, haystack) + } + + /// Returns an iterator over all overlapping matches in the given haystack. + /// + /// This routine is principally useful when searching for multiple patterns + /// on inputs where multiple patterns may match the same regions of text. + /// The iterator takes care of handling the overlapping state that must be + /// threaded through every search. + /// + /// # Panics + /// + /// If the underlying lazy DFAs return an error, then this routine panics. + /// This only occurs in non-default configurations where quit bytes are + /// used, Unicode word boundaries are heuristically enabled or limits are + /// set on the number of times the lazy DFA's cache may be cleared. + /// + /// The fallible version of this routine is + /// [`try_find_overlapping_iter`](Regex::try_find_overlapping_iter). + /// + /// # Example + /// + /// This example shows how to run an overlapping search with multiple + /// regexes. 
+ /// + /// ``` + /// use regex_automata::{ + /// hybrid::{dfa::DFA, regex::Regex}, + /// MatchKind, + /// MultiMatch, + /// }; + /// + /// let re = Regex::builder() + /// .dfa(DFA::config().match_kind(MatchKind::All)) + /// .build_many(&[r"\w+$", r"\S+$"])?; + /// let mut cache = re.create_cache(); + /// let haystack = "@foo".as_bytes(); + /// + /// let mut it = re.find_overlapping_iter(&mut cache, haystack); + /// assert_eq!(Some(MultiMatch::must(1, 0, 4)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 1, 4)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn find_overlapping_iter<'r, 'c, 't>( + &'r self, + cache: &'c mut Cache, + haystack: &'t [u8], + ) -> FindOverlappingMatches<'r, 'c, 't> { + FindOverlappingMatches::new(self, cache, haystack) + } +} + +/// Lower level infallible search routines that permit controlling where +/// the search starts and ends in a particular sequence. This is useful for +/// executing searches that need to take surrounding context into account. This +/// is required for correctly implementing iteration because of look-around +/// operators (`^`, `$`, `\b`). +impl Regex { + /// Returns true if and only if this regex matches the given haystack. + /// + /// This routine may short circuit if it knows that scanning future input + /// will never lead to a different result. In particular, if the underlying + /// DFA enters a match state or a dead state, then this routine will return + /// `true` or `false`, respectively, without inspecting any future input. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? 
The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// # Panics + /// + /// If the underlying lazy DFAs return an error, then this routine panics. + /// This only occurs in non-default configurations where quit bytes are + /// used, Unicode word boundaries are heuristically enabled or limits are + /// set on the number of times the lazy DFA's cache may be cleared. + /// + /// The fallible version of this routine is + /// [`try_is_match_at`](Regex::try_is_match_at). + pub fn is_match_at( + &self, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + ) -> bool { + self.try_is_match_at(cache, haystack, start, end).unwrap() + } + + /// Returns the first position at which a match is found. + /// + /// This routine stops scanning input in precisely the same circumstances + /// as `is_match`. The key difference is that this routine returns the + /// position at which it stopped scanning input if and only if a match + /// was found. If no match is found, then `None` is returned. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// This is useful when implementing an iterator over matches + /// within the same haystack, which cannot be done correctly by simply + /// providing a subslice of `haystack`. + /// + /// # Panics + /// + /// If the underlying lazy DFAs return an error, then this routine panics. 
+ /// This only occurs in non-default configurations where quit bytes are + /// used, Unicode word boundaries are heuristically enabled or limits are + /// set on the number of times the lazy DFA's cache may be cleared. + /// + /// The fallible version of this routine is + /// [`try_find_earliest_at`](Regex::try_find_earliest_at). + pub fn find_earliest_at( + &self, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + ) -> Option<MultiMatch> { + self.try_find_earliest_at(cache, haystack, start, end).unwrap() + } + + /// Returns the same as `find_leftmost`, but starts the search at the given + /// offset. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// This is useful when implementing an iterator over matches within the + /// same haystack, which cannot be done correctly by simply providing a + /// subslice of `haystack`. + /// + /// # Panics + /// + /// If the underlying lazy DFAs return an error, then this routine panics. + /// This only occurs in non-default configurations where quit bytes are + /// used, Unicode word boundaries are heuristically enabled or limits are + /// set on the number of times the lazy DFA's cache may be cleared. + /// + /// The fallible version of this routine is + /// [`try_find_leftmost_at`](Regex::try_find_leftmost_at). + pub fn find_leftmost_at( + &self, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + ) -> Option<MultiMatch> { + self.try_find_leftmost_at(cache, haystack, start, end).unwrap() + } + + /// Search for the first overlapping match within a given range of + /// `haystack`. 
+ /// + /// This routine is principally useful when searching for multiple patterns + /// on inputs where multiple patterns may match the same regions of text. + /// In particular, callers must preserve the automaton's search state from + /// prior calls so that the implementation knows where the last match + /// occurred and which pattern was reported. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// This is useful when implementing an iterator over matches + /// within the same haystack, which cannot be done correctly by simply + /// providing a subslice of `haystack`. + /// + /// # Panics + /// + /// If the underlying lazy DFAs return an error, then this routine panics. + /// This only occurs in non-default configurations where quit bytes are + /// used, Unicode word boundaries are heuristically enabled or limits are + /// set on the number of times the lazy DFA's cache may be cleared. + /// + /// The fallible version of this routine is + /// [`try_find_overlapping_at`](Regex::try_find_overlapping_at). + pub fn find_overlapping_at( + &self, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + state: &mut OverlappingState, + ) -> Option<MultiMatch> { + self.try_find_overlapping_at(cache, haystack, start, end, state) + .unwrap() + } +} + +/// Fallible search routines. These may return an error when the underlying +/// lazy DFAs have been configured in a way that permits them to fail during a +/// search. 
+/// +/// Errors during search only occur when the lazy DFA has been explicitly +/// configured to do so, usually by specifying one or more "quit" bytes or by +/// heuristically enabling Unicode word boundaries. +/// +/// Errors will never be returned using the default configuration. So these +/// fallible routines are only needed for particular configurations. +impl Regex { + /// Returns true if and only if this regex matches the given haystack. + /// + /// This routine may short circuit if it knows that scanning future input + /// will never lead to a different result. In particular, if the underlying + /// DFA enters a match state or a dead state, then this routine will return + /// `true` or `false`, respectively, without inspecting any future input. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`is_match`](Regex::is_match). + pub fn try_is_match( + &self, + cache: &mut Cache, + haystack: &[u8], + ) -> Result<bool, MatchError> { + self.try_is_match_at(cache, haystack, 0, haystack.len()) + } + + /// Returns the first position at which a match is found. + /// + /// This routine stops scanning input in precisely the same circumstances + /// as `is_match`. The key difference is that this routine returns the + /// position at which it stopped scanning input if and only if a match + /// was found. If no match is found, then `None` is returned. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. 
For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_earliest`](Regex::find_earliest). + pub fn try_find_earliest( + &self, + cache: &mut Cache, + haystack: &[u8], + ) -> Result<Option<MultiMatch>, MatchError> { + self.try_find_earliest_at(cache, haystack, 0, haystack.len()) + } + + /// Returns the start and end offset of the leftmost match. If no match + /// exists, then `None` is returned. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_leftmost`](Regex::find_leftmost). + pub fn try_find_leftmost( + &self, + cache: &mut Cache, + haystack: &[u8], + ) -> Result<Option<MultiMatch>, MatchError> { + self.try_find_leftmost_at(cache, haystack, 0, haystack.len()) + } + + /// Search for the first overlapping match in `haystack`. + /// + /// This routine is principally useful when searching for multiple patterns + /// on inputs where multiple patterns may match the same regions of text. + /// In particular, callers must preserve the automaton's search state from + /// prior calls so that the implementation knows where the last match + /// occurred and which pattern was reported. 
+ /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_overlapping`](Regex::find_overlapping). + pub fn try_find_overlapping( + &self, + cache: &mut Cache, + haystack: &[u8], + state: &mut OverlappingState, + ) -> Result<Option<MultiMatch>, MatchError> { + self.try_find_overlapping_at(cache, haystack, 0, haystack.len(), state) + } + + /// Returns an iterator over all non-overlapping "earliest" matches. + /// + /// Match positions are reported as soon as a match is known to occur, even + /// if the standard leftmost match would be longer. + /// + /// # Errors + /// + /// This iterator only yields errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_earliest_iter`](Regex::find_earliest_iter). + pub fn try_find_earliest_iter<'r, 'c, 't>( + &'r self, + cache: &'c mut Cache, + haystack: &'t [u8], + ) -> TryFindEarliestMatches<'r, 'c, 't> { + TryFindEarliestMatches::new(self, cache, haystack) + } + + /// Returns an iterator over all non-overlapping leftmost matches in the + /// given bytes. If no match exists, then the iterator yields no elements. 
+ /// + /// This corresponds to the "standard" regex search iterator. + /// + /// # Errors + /// + /// This iterator only yields errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_leftmost_iter`](Regex::find_leftmost_iter). + pub fn try_find_leftmost_iter<'r, 'c, 't>( + &'r self, + cache: &'c mut Cache, + haystack: &'t [u8], + ) -> TryFindLeftmostMatches<'r, 'c, 't> { + TryFindLeftmostMatches::new(self, cache, haystack) + } + + /// Returns an iterator over all overlapping matches in the given haystack. + /// + /// This routine is principally useful when searching for multiple patterns + /// on inputs where multiple patterns may match the same regions of text. + /// The iterator takes care of handling the overlapping state that must be + /// threaded through every search. + /// + /// # Errors + /// + /// This iterator only yields errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_overlapping_iter`](Regex::find_overlapping_iter). 
+ pub fn try_find_overlapping_iter<'r, 'c, 't>( + &'r self, + cache: &'c mut Cache, + haystack: &'t [u8], + ) -> TryFindOverlappingMatches<'r, 'c, 't> { + TryFindOverlappingMatches::new(self, cache, haystack) + } +} + +/// Lower level fallible search routines that permit controlling where the +/// search starts and ends in a particular sequence. +impl Regex { + /// Returns true if and only if this regex matches the given haystack. + /// + /// This routine may short circuit if it knows that scanning future input + /// will never lead to a different result. In particular, if the underlying + /// DFA enters a match state or a dead state, then this routine will return + /// `true` or `false`, respectively, without inspecting any future input. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`is_match_at`](Regex::is_match_at). 
+ pub fn try_is_match_at( + &self, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + ) -> Result<bool, MatchError> { + self.forward() + .find_leftmost_fwd_at( + &mut cache.forward, + self.scanner().as_mut(), + None, + haystack, + start, + end, + ) + .map(|x| x.is_some()) + } + + /// Returns the first position at which a match is found. + /// + /// This routine stops scanning input in precisely the same circumstances + /// as `is_match`. The key difference is that this routine returns the + /// position at which it stopped scanning input if and only if a match + /// was found. If no match is found, then `None` is returned. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// This is useful when implementing an iterator over matches + /// within the same haystack, which cannot be done correctly by simply + /// providing a subslice of `haystack`. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_earliest_at`](Regex::find_earliest_at). 
+ pub fn try_find_earliest_at( + &self, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + ) -> Result<Option<MultiMatch>, MatchError> { + self.try_find_earliest_at_imp( + self.scanner().as_mut(), + cache, + haystack, + start, + end, + ) + } + + /// Returns the start and end offset of the leftmost match. If no match + /// exists, then `None` is returned. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// This is useful when implementing an iterator over matches + /// within the same haystack, which cannot be done correctly by simply + /// providing a subslice of `haystack`. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_leftmost_at`](Regex::find_leftmost_at). + pub fn try_find_leftmost_at( + &self, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + ) -> Result<Option<MultiMatch>, MatchError> { + self.try_find_leftmost_at_imp( + self.scanner().as_mut(), + cache, + haystack, + start, + end, + ) + } + + /// Search for the first overlapping match within a given range of + /// `haystack`. 
+ /// + /// This routine is principally useful when searching for multiple patterns + /// on inputs where multiple patterns may match the same regions of text. + /// In particular, callers must preserve the automaton's search state from + /// prior calls so that the implementation knows where the last match + /// occurred and which pattern was reported. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// This is useful when implementing an iterator over matches + /// within the same haystack, which cannot be done correctly by simply + /// providing a subslice of `haystack`. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_overlapping_at`](Regex::find_overlapping_at). 
+ pub fn try_find_overlapping_at( + &self, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + state: &mut OverlappingState, + ) -> Result<Option<MultiMatch>, MatchError> { + self.try_find_overlapping_at_imp( + self.scanner().as_mut(), + cache, + haystack, + start, + end, + state, + ) + } +} + +impl Regex { + #[inline(always)] + fn try_find_earliest_at_imp( + &self, + pre: Option<&mut prefilter::Scanner>, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + ) -> Result<Option<MultiMatch>, MatchError> { + let (fdfa, rdfa) = (self.forward(), self.reverse()); + let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse); + let end = match fdfa + .find_earliest_fwd_at(fcache, pre, None, haystack, start, end)? + { + None => return Ok(None), + Some(end) => end, + }; + // N.B. The only time we need to tell the reverse searcher the pattern + // to match is in the overlapping case, since it's ambiguous. In the + // earliest case, I have tentatively convinced myself that it isn't + // necessary and the reverse search will always find the same pattern + // to match as the forward search. But I lack a rigorous proof. Why not + // just provide the pattern anyway? Well, if it is needed, then leaving + // it out gives us a chance to find a witness. + let start = rdfa + .find_earliest_rev_at(rcache, None, haystack, start, end.offset())? 
+ .expect("reverse search must match if forward search does"); + assert_eq!( + start.pattern(), + end.pattern(), + "forward and reverse search must match same pattern", + ); + assert!(start.offset() <= end.offset()); + Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) + } + + #[inline(always)] + fn try_find_leftmost_at_imp( + &self, + pre: Option<&mut prefilter::Scanner>, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + ) -> Result<Option<MultiMatch>, MatchError> { + let (fdfa, rdfa) = (self.forward(), self.reverse()); + let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse); + let end = match fdfa + .find_leftmost_fwd_at(fcache, pre, None, haystack, start, end)? + { + None => return Ok(None), + Some(end) => end, + }; + // N.B. The only time we need to tell the reverse searcher the pattern + // to match is in the overlapping case, since it's ambiguous. In the + // leftmost case, I have tentatively convinced myself that it isn't + // necessary and the reverse search will always find the same pattern + // to match as the forward search. But I lack a rigorous proof. Why not + // just provide the pattern anyway? Well, if it is needed, then leaving + // it out gives us a chance to find a witness. + let start = rdfa + .find_leftmost_rev_at(rcache, None, haystack, start, end.offset())? 
+ .expect("reverse search must match if forward search does"); + assert_eq!( + start.pattern(), + end.pattern(), + "forward and reverse search must match same pattern", + ); + assert!(start.offset() <= end.offset()); + Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) + } + + #[inline(always)] + fn try_find_overlapping_at_imp( + &self, + pre: Option<&mut prefilter::Scanner>, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + state: &mut OverlappingState, + ) -> Result<Option<MultiMatch>, MatchError> { + let (fdfa, rdfa) = (self.forward(), self.reverse()); + let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse); + let end = match fdfa.find_overlapping_fwd_at( + fcache, pre, None, haystack, start, end, state, + )? { + None => return Ok(None), + Some(end) => end, + }; + // Unlike the leftmost cases, the reverse overlapping search may match + // a different pattern than the forward search. See test failures when + // using `None` instead of `Some(end.pattern())` below. Thus, we must + // run our reverse search using the pattern that matched in the forward + // direction. + let start = rdfa + .find_leftmost_rev_at( + rcache, + Some(end.pattern()), + haystack, + 0, + end.offset(), + )? + .expect("reverse search must match if forward search does"); + assert_eq!( + start.pattern(), + end.pattern(), + "forward and reverse search must match same pattern", + ); + assert!(start.offset() <= end.offset()); + Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) + } +} + +/// Non-search APIs for querying information about the regex and setting a +/// prefilter. +impl Regex { + /// Return the underlying lazy DFA responsible for forward matching. + /// + /// This is useful for accessing the underlying lazy DFA and using it + /// directly if the situation calls for it. + pub fn forward(&self) -> &DFA { + &self.forward + } + + /// Return the underlying lazy DFA responsible for reverse matching. 
+ /// + /// This is useful for accessing the underlying lazy DFA and using it + /// directly if the situation calls for it. + pub fn reverse(&self) -> &DFA { + &self.reverse + } + + /// Returns the total number of patterns matched by this regex. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, hybrid::regex::Regex}; + /// + /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?; + /// assert_eq!(3, re.pattern_count()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn pattern_count(&self) -> usize { + assert_eq!( + self.forward().pattern_count(), + self.reverse().pattern_count() + ); + self.forward().pattern_count() + } + + /// Convenience function for returning this regex's prefilter as a trait + /// object. + /// + /// If this regex doesn't have a prefilter, then `None` is returned. + pub fn prefilter(&self) -> Option<&dyn Prefilter> { + self.pre.as_ref().map(|x| &**x) + } + + /// Attach the given prefilter to this regex. + pub fn set_prefilter(&mut self, pre: Option<Box<dyn Prefilter>>) { + self.pre = pre; + } + + /// Convenience function for returning a prefilter scanner. + fn scanner(&self) -> Option<prefilter::Scanner> { + self.prefilter().map(prefilter::Scanner::new) + } +} + +/// An iterator over all non-overlapping earliest matches for a particular +/// infallible search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. If the underlying search returns an error, then this panics. +/// +/// The lifetime variables are as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'c` is the lifetime of the mutable cache used during search. +/// * `'t` is the lifetime of the text being searched. 
+#[derive(Debug)] +pub struct FindEarliestMatches<'r, 'c, 't>(TryFindEarliestMatches<'r, 'c, 't>); + +impl<'r, 'c, 't> FindEarliestMatches<'r, 'c, 't> { + fn new( + re: &'r Regex, + cache: &'c mut Cache, + text: &'t [u8], + ) -> FindEarliestMatches<'r, 'c, 't> { + FindEarliestMatches(TryFindEarliestMatches::new(re, cache, text)) + } +} + +impl<'r, 'c, 't> Iterator for FindEarliestMatches<'r, 'c, 't> { + type Item = MultiMatch; + + fn next(&mut self) -> Option<MultiMatch> { + next_unwrap(self.0.next()) + } +} + +/// An iterator over all non-overlapping leftmost matches for a particular +/// infallible search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. If the underlying search returns an error, then this panics. +/// +/// The lifetime variables are as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'c` is the lifetime of the mutable cache used during search. +/// * `'t` is the lifetime of the text being searched. +#[derive(Debug)] +pub struct FindLeftmostMatches<'r, 'c, 't>(TryFindLeftmostMatches<'r, 'c, 't>); + +impl<'r, 'c, 't> FindLeftmostMatches<'r, 'c, 't> { + fn new( + re: &'r Regex, + cache: &'c mut Cache, + text: &'t [u8], + ) -> FindLeftmostMatches<'r, 'c, 't> { + FindLeftmostMatches(TryFindLeftmostMatches::new(re, cache, text)) + } +} + +impl<'r, 'c, 't> Iterator for FindLeftmostMatches<'r, 'c, 't> { + type Item = MultiMatch; + + fn next(&mut self) -> Option<MultiMatch> { + next_unwrap(self.0.next()) + } +} + +/// An iterator over all overlapping matches for a particular infallible +/// search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. If the underlying search returns an error, then this panics. +/// +/// The lifetime variables are as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'c` is the lifetime of the mutable cache used during search. 
+/// * `'t` is the lifetime of the text being searched. +#[derive(Debug)] +pub struct FindOverlappingMatches<'r, 'c, 't>( + TryFindOverlappingMatches<'r, 'c, 't>, +); + +impl<'r, 'c, 't> FindOverlappingMatches<'r, 'c, 't> { + fn new( + re: &'r Regex, + cache: &'c mut Cache, + text: &'t [u8], + ) -> FindOverlappingMatches<'r, 'c, 't> { + FindOverlappingMatches(TryFindOverlappingMatches::new(re, cache, text)) + } +} + +impl<'r, 'c, 't> Iterator for FindOverlappingMatches<'r, 'c, 't> { + type Item = MultiMatch; + + fn next(&mut self) -> Option<MultiMatch> { + next_unwrap(self.0.next()) + } +} + +/// An iterator over all non-overlapping earliest matches for a particular +/// fallible search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. +/// +/// The lifetime variables are as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'c` is the lifetime of the mutable cache used during search. +/// * `'t` is the lifetime of the text being searched. 
+#[derive(Debug)] +pub struct TryFindEarliestMatches<'r, 'c, 't> { + re: &'r Regex, + cache: &'c mut Cache, + scanner: Option<prefilter::Scanner<'r>>, + text: &'t [u8], + last_end: usize, + last_match: Option<usize>, +} + +impl<'r, 'c, 't> TryFindEarliestMatches<'r, 'c, 't> { + fn new( + re: &'r Regex, + cache: &'c mut Cache, + text: &'t [u8], + ) -> TryFindEarliestMatches<'r, 'c, 't> { + let scanner = re.scanner(); + TryFindEarliestMatches { + re, + cache, + scanner, + text, + last_end: 0, + last_match: None, + } + } +} + +impl<'r, 'c, 't> Iterator for TryFindEarliestMatches<'r, 'c, 't> { + type Item = Result<MultiMatch, MatchError>; + + fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> { + if self.last_end > self.text.len() { + return None; + } + let result = self.re.try_find_earliest_at_imp( + self.scanner.as_mut(), + self.cache, + self.text, + self.last_end, + self.text.len(), + ); + let m = match result { + Err(err) => return Some(Err(err)), + Ok(None) => return None, + Ok(Some(m)) => m, + }; + if m.is_empty() { + // This is an empty match. To ensure we make progress, start + // the next search at the smallest possible starting position + // of the next match following this one. + self.last_end = if self.re.utf8 { + crate::util::next_utf8(self.text, m.end()) + } else { + m.end() + 1 + }; + // Don't accept empty matches immediately following a match. + // Just move on to the next match. + if Some(m.end()) == self.last_match { + return self.next(); + } + } else { + self.last_end = m.end(); + } + self.last_match = Some(m.end()); + Some(Ok(m)) + } +} + +/// An iterator over all non-overlapping leftmost matches for a particular +/// fallible search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. +/// +/// The lifetime variables are as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'c` is the lifetime of the mutable cache used during search. 
+/// * `'t` is the lifetime of the text being searched. +#[derive(Debug)] +pub struct TryFindLeftmostMatches<'r, 'c, 't> { + re: &'r Regex, + cache: &'c mut Cache, + scanner: Option<prefilter::Scanner<'r>>, + text: &'t [u8], + last_end: usize, + last_match: Option<usize>, +} + +impl<'r, 'c, 't> TryFindLeftmostMatches<'r, 'c, 't> { + fn new( + re: &'r Regex, + cache: &'c mut Cache, + text: &'t [u8], + ) -> TryFindLeftmostMatches<'r, 'c, 't> { + let scanner = re.scanner(); + TryFindLeftmostMatches { + re, + cache, + scanner, + text, + last_end: 0, + last_match: None, + } + } +} + +impl<'r, 'c, 't> Iterator for TryFindLeftmostMatches<'r, 'c, 't> { + type Item = Result<MultiMatch, MatchError>; + + fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> { + if self.last_end > self.text.len() { + return None; + } + let result = self.re.try_find_leftmost_at_imp( + self.scanner.as_mut(), + self.cache, + self.text, + self.last_end, + self.text.len(), + ); + let m = match result { + Err(err) => return Some(Err(err)), + Ok(None) => return None, + Ok(Some(m)) => m, + }; + if m.is_empty() { + // This is an empty match. To ensure we make progress, start + // the next search at the smallest possible starting position + // of the next match following this one. + self.last_end = if self.re.utf8 { + crate::util::next_utf8(self.text, m.end()) + } else { + m.end() + 1 + }; + // Don't accept empty matches immediately following a match. + // Just move on to the next match. + if Some(m.end()) == self.last_match { + return self.next(); + } + } else { + self.last_end = m.end(); + } + self.last_match = Some(m.end()); + Some(Ok(m)) + } +} + +/// An iterator over all overlapping matches for a particular fallible search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. +/// +/// The lifetime variables are as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. 
+/// * `'c` is the lifetime of the mutable cache used during search. +/// * `'t` is the lifetime of the text being searched. +#[derive(Debug)] +pub struct TryFindOverlappingMatches<'r, 'c, 't> { + re: &'r Regex, + cache: &'c mut Cache, + scanner: Option<prefilter::Scanner<'r>>, + text: &'t [u8], + last_end: usize, + state: OverlappingState, +} + +impl<'r, 'c, 't> TryFindOverlappingMatches<'r, 'c, 't> { + fn new( + re: &'r Regex, + cache: &'c mut Cache, + text: &'t [u8], + ) -> TryFindOverlappingMatches<'r, 'c, 't> { + let scanner = re.scanner(); + TryFindOverlappingMatches { + re, + cache, + scanner, + text, + last_end: 0, + state: OverlappingState::start(), + } + } +} + +impl<'r, 'c, 't> Iterator for TryFindOverlappingMatches<'r, 'c, 't> { + type Item = Result<MultiMatch, MatchError>; + + fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> { + if self.last_end > self.text.len() { + return None; + } + let result = self.re.try_find_overlapping_at_imp( + self.scanner.as_mut(), + self.cache, + self.text, + self.last_end, + self.text.len(), + &mut self.state, + ); + let m = match result { + Err(err) => return Some(Err(err)), + Ok(None) => return None, + Ok(Some(m)) => m, + }; + // Unlike the non-overlapping case, we're OK with empty matches at this + // level. In particular, the overlapping search algorithm is itself + // responsible for ensuring that progress is always made. + self.last_end = m.end(); + Some(Ok(m)) + } +} + +/// A cache represents a partially computed forward and reverse DFA. +/// +/// A cache is the key component that differentiates a classical DFA and a +/// hybrid NFA/DFA (also called a "lazy DFA"). Where a classical DFA builds a +/// complete transition table that can handle all possible inputs, a hybrid +/// NFA/DFA starts with an empty transition table and builds only the parts +/// required during search. The parts that are built are stored in a cache. 
For +/// this reason, a cache is a required parameter for nearly every operation on +/// a [`Regex`]. +/// +/// Caches can be created from their corresponding `Regex` via +/// [`Regex::create_cache`]. A cache can only be used with either the `Regex` +/// that created it, or the `Regex` that was most recently used to reset it +/// with [`Cache::reset`]. Using a cache with any other `Regex` may result in +/// panics or incorrect results. +#[derive(Debug, Clone)] +pub struct Cache { + forward: dfa::Cache, + reverse: dfa::Cache, +} + +impl Cache { + /// Create a new cache for the given `Regex`. + /// + /// The cache returned should only be used for searches for the given + /// `Regex`. If you want to reuse the cache for another `Regex`, then you + /// must call [`Cache::reset`] with that `Regex`. + pub fn new(re: &Regex) -> Cache { + let forward = dfa::Cache::new(re.forward()); + let reverse = dfa::Cache::new(re.reverse()); + Cache { forward, reverse } + } + + /// Reset this cache such that it can be used for searching with the given + /// `Regex` (and only that `Regex`). + /// + /// A cache reset permits reusing memory already allocated in this cache + /// with a different `Regex`. + /// + /// Resetting a cache sets its "clear count" to 0. This is relevant if the + /// `Regex` has been configured to "give up" after it has cleared the cache + /// a certain number of times. + /// + /// # Example + /// + /// This shows how to re-purpose a cache for use with a different `Regex`. + /// + /// ``` + /// use regex_automata::{hybrid::regex::Regex, MultiMatch}; + /// + /// let re1 = Regex::new(r"\w")?; + /// let re2 = Regex::new(r"\W")?; + /// + /// let mut cache = re1.create_cache(); + /// assert_eq!( + /// Some(MultiMatch::must(0, 0, 2)), + /// re1.find_leftmost(&mut cache, "Δ".as_bytes()), + /// ); + /// + /// // Using 'cache' with re2 is not allowed. It may result in panics or + /// // incorrect results. 
In order to re-purpose the cache, we must reset + /// // it with the Regex we'd like to use it with. + /// // + /// // Similarly, after this reset, using the cache with 're1' is also not + /// // allowed. + /// cache.reset(&re2); + /// assert_eq!( + /// Some(MultiMatch::must(0, 0, 3)), + /// re2.find_leftmost(&mut cache, "☃".as_bytes()), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn reset(&mut self, re: &Regex) { + self.forward.reset(re.forward()); + self.reverse.reset(re.reverse()); + } + + /// Returns the heap memory usage, in bytes, as a sum of the forward and + /// reverse lazy DFA caches. + /// + /// This does **not** include the stack size used up by this cache. To + /// compute that, use `std::mem::size_of::<Cache>()`. + pub fn memory_usage(&self) -> usize { + self.forward.memory_usage() + self.reverse.memory_usage() + } + + /// Return references to the forward and reverse caches, respectively. + pub fn as_parts(&self) -> (&dfa::Cache, &dfa::Cache) { + (&self.forward, &self.reverse) + } + + /// Return mutable references to the forward and reverse caches, + /// respectively. + pub fn as_parts_mut(&mut self) -> (&mut dfa::Cache, &mut dfa::Cache) { + (&mut self.forward, &mut self.reverse) + } +} + +/// The configuration used for compiling a hybrid NFA/DFA regex. +/// +/// A regex configuration is a simple data object that is typically used with +/// [`Builder::configure`]. +#[derive(Clone, Copy, Debug, Default)] +pub struct Config { + utf8: Option<bool>, +} + +impl Config { + /// Return a new default regex compiler configuration. + pub fn new() -> Config { + Config::default() + } + + /// Whether to enable UTF-8 mode or not. + /// + /// When UTF-8 mode is enabled (the default) and an empty match is seen, + /// the iterators on [`Regex`] will always start the next search at the + /// next UTF-8 encoded codepoint when searching valid UTF-8. When UTF-8 + /// mode is disabled, such searches are begun at the next byte offset. 
+ /// + /// If this mode is enabled and invalid UTF-8 is given to search, then + /// behavior is unspecified. + /// + /// Generally speaking, one should enable this when + /// [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) + /// and + /// [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) + /// are enabled, and disable it otherwise. + /// + /// # Example + /// + /// This example demonstrates the differences between when this option is + /// enabled and disabled. The differences only arise when the regex can + /// return matches of length zero. + /// + /// In this first snippet, we show the results when UTF-8 mode is disabled. + /// + /// ``` + /// use regex_automata::{hybrid::regex::Regex, MultiMatch}; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().utf8(false)) + /// .build(r"")?; + /// let mut cache = re.create_cache(); + /// + /// let haystack = "a☃z".as_bytes(); + /// let mut it = re.find_leftmost_iter(&mut cache, haystack); + /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// And in this snippet, we execute the same search on the same haystack, + /// but with UTF-8 mode enabled. Notice that byte offsets that would + /// otherwise split the encoding of `☃` are not returned. 
+ /// + /// ``` + /// use regex_automata::{hybrid::regex::Regex, MultiMatch}; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().utf8(true)) + /// .build(r"")?; + /// let mut cache = re.create_cache(); + /// + /// let haystack = "a☃z".as_bytes(); + /// let mut it = re.find_leftmost_iter(&mut cache, haystack); + /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn utf8(mut self, yes: bool) -> Config { + self.utf8 = Some(yes); + self + } + + /// Returns true if and only if this configuration has UTF-8 mode enabled. + /// + /// When UTF-8 mode is enabled and an empty match is seen, the iterators on + /// [`Regex`] will always start the next search at the next UTF-8 encoded + /// codepoint. When UTF-8 mode is disabled, such searches are begun at the + /// next byte offset. + pub fn get_utf8(&self) -> bool { + self.utf8.unwrap_or(true) + } + + /// Overwrite the default configuration such that the options in `o` are + /// always used. If an option in `o` is not set, then the corresponding + /// option in `self` is used. If it's not set in `self` either, then it + /// remains not set. + pub(crate) fn overwrite(self, o: Config) -> Config { + Config { utf8: o.utf8.or(self.utf8) } + } +} + +/// A builder for a regex based on a hybrid NFA/DFA. +/// +/// This builder permits configuring options for the syntax of a pattern, the +/// NFA construction, the lazy DFA construction and finally the regex searching +/// itself. This builder is different from a general purpose regex builder +/// in that it permits fine grain configuration of the construction process. 
+/// The trade off for this is complexity, and the possibility of setting a +/// configuration that might not make sense. For example, there are three +/// different UTF-8 modes: +/// +/// * [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) controls whether the +/// pattern itself can contain sub-expressions that match invalid UTF-8. +/// * [`nfa::thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) +/// controls whether the implicit unanchored prefix added to the NFA can +/// match through invalid UTF-8 or not. +/// * [`Config::utf8`] controls how the regex iterators themselves advance +/// the starting position of the next search when a match with zero length is +/// found. +/// +/// Generally speaking, callers will want to either enable all of these or +/// disable all of these. +/// +/// Internally, building a regex requires building two hybrid NFA/DFAs, +/// where one is responsible for finding the end of a match and the other is +/// responsible for finding the start of a match. If you only need to detect +/// whether something matched, or only the end of a match, then you should use +/// a [`dfa::Builder`] to construct a single hybrid NFA/DFA, which is cheaper +/// than building two of them. +/// +/// # Example +/// +/// This example shows how to disable UTF-8 mode in the syntax, the NFA and +/// the regex itself. This is generally what you want for matching on +/// arbitrary bytes. 
+/// +/// ``` +/// use regex_automata::{ +/// hybrid::regex::Regex, nfa::thompson, MultiMatch, SyntaxConfig +/// }; +/// +/// let re = Regex::builder() +/// .configure(Regex::config().utf8(false)) +/// .syntax(SyntaxConfig::new().utf8(false)) +/// .thompson(thompson::Config::new().utf8(false)) +/// .build(r"foo(?-u:[^b])ar.*")?; +/// let mut cache = re.create_cache(); +/// +/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; +/// let expected = Some(MultiMatch::must(0, 1, 9)); +/// let got = re.find_leftmost(&mut cache, haystack); +/// assert_eq!(expected, got); +/// // Notice that `(?-u:[^b])` matches invalid UTF-8, +/// // but the subsequent `.*` does not! Disabling UTF-8 +/// // on the syntax permits this. Notice also that the +/// // search was unanchored and skipped over invalid UTF-8. +/// // Disabling UTF-8 on the Thompson NFA permits this. +/// // +/// // N.B. This example does not show the impact of +/// // disabling UTF-8 mode on Config, since that +/// // only impacts regexes that can produce matches of +/// // length 0. +/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct Builder { + config: Config, + dfa: dfa::Builder, +} + +impl Builder { + /// Create a new regex builder with the default configuration. + pub fn new() -> Builder { + Builder { config: Config::default(), dfa: DFA::builder() } + } + + /// Build a regex from the given pattern. + /// + /// If there was a problem parsing or compiling the pattern, then an error + /// is returned. + pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> { + self.build_many(&[pattern]) + } + + /// Build a regex from the given patterns. 
+ pub fn build_many<P: AsRef<str>>( + &self, + patterns: &[P], + ) -> Result<Regex, BuildError> { + let forward = self.dfa.build_many(patterns)?; + let reverse = self + .dfa + .clone() + .configure( + DFA::config() + .anchored(true) + .match_kind(MatchKind::All) + .starts_for_each_pattern(true), + ) + .thompson(thompson::Config::new().reverse(true)) + .build_many(patterns)?; + Ok(self.build_from_dfas(forward, reverse)) + } + + /// Build a regex from its component forward and reverse hybrid NFA/DFAs. + fn build_from_dfas(&self, forward: DFA, reverse: DFA) -> Regex { + // The congruous method on DFA-backed regexes is exposed, but it's + // not clear this builder is useful here since lazy DFAs can't be + // serialized and there is only one type of them. + let utf8 = self.config.get_utf8(); + Regex { pre: None, forward, reverse, utf8 } + } + + /// Apply the given regex configuration options to this builder. + pub fn configure(&mut self, config: Config) -> &mut Builder { + self.config = self.config.overwrite(config); + self + } + + /// Set the syntax configuration for this builder using + /// [`SyntaxConfig`](crate::SyntaxConfig). + /// + /// This permits setting things like case insensitivity, Unicode and multi + /// line mode. + pub fn syntax( + &mut self, + config: crate::util::syntax::SyntaxConfig, + ) -> &mut Builder { + self.dfa.syntax(config); + self + } + + /// Set the Thompson NFA configuration for this builder using + /// [`nfa::thompson::Config`](thompson::Config). + /// + /// This permits setting things like whether additional time should be + /// spent shrinking the size of the NFA. + pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { + self.dfa.thompson(config); + self + } + + /// Set the lazy DFA compilation configuration for this builder using + /// [`dfa::Config`](dfa::Config). 
+ /// + /// This permits setting things like whether Unicode word boundaries should + /// be heuristically supported or settings how the behavior of the cache. + pub fn dfa(&mut self, config: dfa::Config) -> &mut Builder { + self.dfa.configure(config); + self + } +} + +impl Default for Builder { + fn default() -> Builder { + Builder::new() + } +} + +#[inline(always)] +fn next_unwrap( + item: Option<Result<MultiMatch, MatchError>>, +) -> Option<MultiMatch> { + match item { + None => None, + Some(Ok(m)) => Some(m), + Some(Err(err)) => panic!( + "unexpected regex search error: {}\n\ + to handle search errors, use try_ methods", + err, + ), + } +} diff --git a/vendor/regex-automata/src/hybrid/search.rs b/vendor/regex-automata/src/hybrid/search.rs new file mode 100644 index 000000000..92760cee2 --- /dev/null +++ b/vendor/regex-automata/src/hybrid/search.rs @@ -0,0 +1,663 @@ +use crate::{ + hybrid::{ + dfa::{Cache, DFA}, + id::{LazyStateID, OverlappingState, StateMatch}, + }, + nfa::thompson, + util::{ + id::PatternID, + matchtypes::{HalfMatch, MatchError}, + prefilter, MATCH_OFFSET, + }, +}; + +#[inline(never)] +pub(crate) fn find_earliest_fwd( + pre: Option<&mut prefilter::Scanner>, + dfa: &DFA, + cache: &mut Cache, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, +) -> Result<Option<HalfMatch>, MatchError> { + // Searching with a pattern ID is always anchored, so we should never use + // a prefilter. + if pre.is_some() && pattern_id.is_none() { + find_fwd(pre, true, dfa, cache, pattern_id, bytes, start, end) + } else { + find_fwd(None, true, dfa, cache, pattern_id, bytes, start, end) + } +} + +#[inline(never)] +pub(crate) fn find_leftmost_fwd( + pre: Option<&mut prefilter::Scanner>, + dfa: &DFA, + cache: &mut Cache, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, +) -> Result<Option<HalfMatch>, MatchError> { + // Searching with a pattern ID is always anchored, so we should never use + // a prefilter. 
+ if pre.is_some() && pattern_id.is_none() { + find_fwd(pre, false, dfa, cache, pattern_id, bytes, start, end) + } else { + find_fwd(None, false, dfa, cache, pattern_id, bytes, start, end) + } +} + +#[inline(always)] +fn find_fwd( + mut pre: Option<&mut prefilter::Scanner>, + earliest: bool, + dfa: &DFA, + cache: &mut Cache, + pattern_id: Option<PatternID>, + haystack: &[u8], + start: usize, + end: usize, +) -> Result<Option<HalfMatch>, MatchError> { + assert!(start <= end); + assert!(start <= haystack.len()); + assert!(end <= haystack.len()); + + // Why do this? This lets 'bytes[at]' work without bounds checks below. + // It seems the assert on 'end <= haystack.len()' above is otherwise + // not enough. Why not just make 'bytes' scoped this way anyway? Well, + // 'eoi_fwd' (below) might actually want to try to access the byte at 'end' + // for resolving look-ahead. + let bytes = &haystack[..end]; + + let mut sid = init_fwd(dfa, cache, pattern_id, haystack, start, end)?; + let mut last_match = None; + let mut at = start; + if let Some(ref mut pre) = pre { + // If a prefilter doesn't report false positives, then we don't need to + // touch the DFA at all. However, since all matches include the pattern + // ID, and the prefilter infrastructure doesn't report pattern IDs, we + // limit this optimization to cases where there is exactly one pattern. + // In that case, any match must be the 0th pattern. 
+ if dfa.pattern_count() == 1 && !pre.reports_false_positives() { + return Ok(pre.next_candidate(bytes, at).into_option().map( + |offset| HalfMatch { pattern: PatternID::ZERO, offset }, + )); + } else if pre.is_effective(at) { + match pre.next_candidate(bytes, at).into_option() { + None => return Ok(None), + Some(i) => { + at = i; + } + } + } + } + while at < end { + if sid.is_tagged() { + sid = dfa + .next_state(cache, sid, bytes[at]) + .map_err(|_| gave_up(at))?; + at += 1; + } else { + // SAFETY: There are two safety invariants we need to uphold + // here in the loop below: that 'sid' is a valid state ID for + // this DFA, and that 'at' is a valid index into 'bytes'. For + // the former, we rely on the invariant that next_state* and + // start_state_forward always returns a valid state ID (given a + // valid state ID in the former case), and that we are only at this + // place in the code if 'sid' is untagged. Moreover, every call to + // next_state_untagged_unchecked below is guarded by a check that + // sid is untagged. For the latter safety invariant, we always + // guard unchecked access with a check that 'at' is less than + // 'end', where 'end == bytes.len()'. + // + // For justification, this gives us a ~10% bump in search time. + // This was used for a benchmark: + // + // regex-cli find hybrid regex @/some/big/file '(?m)^.+$' -UBb + // + // With bounds checked: ~881.4ms. Without: ~775ms. For input, I + // used OpenSubtitles2018.raw.sample.medium.en. + let mut prev_sid = sid; + while at < end { + prev_sid = sid; + sid = unsafe { + dfa.next_state_untagged_unchecked( + cache, + sid, + *bytes.get_unchecked(at), + ) + }; + at += 1; + if sid.is_tagged() { + break; + } + // SAFETY: we make four unguarded accesses to 'bytes[at]' + // below, and each are safe because we know that 'at + 4' is + // in bounds. Moreover, while we don't check whether 'sid' is + // untagged directly, we know it is because of the check above. 
+ // And the unrolled loop below quits when the next state is not + // equal to the previous state. + // + // PERF: For justification for eliminating bounds checks, + // see above. For justification for the unrolling, we use + // two tests. The one above with regex '(?m)^.+$', and also + // '(?m)^.{40}$'. The former is kinda the best case for + // unrolling, and gives a 1.67 boost primarily because the DFA + // spends most of its time munching through the input in the + // same state. But the latter pattern rarely spends time in the + // same state through subsequent transitions, so unrolling is + // pretty much always ineffective in that it craps out on the + // first 'sid != next' check below. However, without unrolling, + // search is only 1.03 times faster than with unrolling on the + // latter pattern, which we deem to be an acceptable loss in + // favor of optimizing the more common case of having a "hot" + // state somewhere in the DFA. + while at + 4 < end { + let next = unsafe { + dfa.next_state_untagged_unchecked( + cache, + sid, + *bytes.get_unchecked(at), + ) + }; + if sid != next { + break; + } + at += 1; + let next = unsafe { + dfa.next_state_untagged_unchecked( + cache, + sid, + *bytes.get_unchecked(at), + ) + }; + if sid != next { + break; + } + at += 1; + let next = unsafe { + dfa.next_state_untagged_unchecked( + cache, + sid, + *bytes.get_unchecked(at), + ) + }; + if sid != next { + break; + } + at += 1; + let next = unsafe { + dfa.next_state_untagged_unchecked( + cache, + sid, + *bytes.get_unchecked(at), + ) + }; + if sid != next { + break; + } + at += 1; + } + } + if sid.is_unknown() { + sid = dfa + .next_state(cache, prev_sid, bytes[at - 1]) + .map_err(|_| gave_up(at - 1))?; + } + } + if sid.is_tagged() { + if sid.is_start() { + if let Some(ref mut pre) = pre { + if pre.is_effective(at) { + match pre.next_candidate(bytes, at).into_option() { + None => return Ok(None), + Some(i) => { + at = i; + } + } + } + } + } else if sid.is_match() { + 
last_match = Some(HalfMatch { + pattern: dfa.match_pattern(cache, sid, 0), + offset: at - MATCH_OFFSET, + }); + if earliest { + return Ok(last_match); + } + } else if sid.is_dead() { + return Ok(last_match); + } else if sid.is_quit() { + if last_match.is_some() { + return Ok(last_match); + } + let offset = at - 1; + return Err(MatchError::Quit { byte: bytes[offset], offset }); + } else { + debug_assert!(sid.is_unknown()); + unreachable!("sid being unknown is a bug"); + } + } + } + // We are careful to use 'haystack' here, which contains the full context + // that we might want to inspect. + Ok(eoi_fwd(dfa, cache, haystack, end, &mut sid)?.or(last_match)) +} + +#[inline(never)] +pub(crate) fn find_earliest_rev( + dfa: &DFA, + cache: &mut Cache, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, +) -> Result<Option<HalfMatch>, MatchError> { + find_rev(true, dfa, cache, pattern_id, bytes, start, end) +} + +#[inline(never)] +pub(crate) fn find_leftmost_rev( + dfa: &DFA, + cache: &mut Cache, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, +) -> Result<Option<HalfMatch>, MatchError> { + find_rev(false, dfa, cache, pattern_id, bytes, start, end) +} + +#[inline(always)] +fn find_rev( + earliest: bool, + dfa: &DFA, + cache: &mut Cache, + pattern_id: Option<PatternID>, + haystack: &[u8], + start: usize, + end: usize, +) -> Result<Option<HalfMatch>, MatchError> { + assert!(start <= end); + assert!(start <= haystack.len()); + assert!(end <= haystack.len()); + + // Why do this? This lets 'bytes[at]' work without bounds checks below. + // It seems the assert on 'end <= haystack.len()' above is otherwise + // not enough. Why not just make 'bytes' scoped this way anyway? Well, + // 'eoi_fwd' (below) might actually want to try to access the byte at 'end' + // for resolving look-ahead. 
+ let bytes = &haystack[start..]; + + let mut sid = init_rev(dfa, cache, pattern_id, haystack, start, end)?; + let mut last_match = None; + let mut at = end - start; + while at > 0 { + if sid.is_tagged() { + at -= 1; + sid = dfa + .next_state(cache, sid, bytes[at]) + .map_err(|_| gave_up(at))?; + } else { + // SAFETY: See comments in 'find_fwd' for both a safety argument + // and a justification from a performance perspective as to 1) why + // we elide bounds checks and 2) why we do a specialized version of + // unrolling below. + let mut prev_sid = sid; + while at > 0 && !sid.is_tagged() { + prev_sid = sid; + at -= 1; + while at > 3 { + let next = unsafe { + dfa.next_state_untagged_unchecked( + cache, + sid, + *bytes.get_unchecked(at), + ) + }; + if sid != next { + break; + } + at -= 1; + let next = unsafe { + dfa.next_state_untagged_unchecked( + cache, + sid, + *bytes.get_unchecked(at), + ) + }; + if sid != next { + break; + } + at -= 1; + let next = unsafe { + dfa.next_state_untagged_unchecked( + cache, + sid, + *bytes.get_unchecked(at), + ) + }; + if sid != next { + break; + } + at -= 1; + let next = unsafe { + dfa.next_state_untagged_unchecked( + cache, + sid, + *bytes.get_unchecked(at), + ) + }; + if sid != next { + break; + } + at -= 1; + } + sid = unsafe { + dfa.next_state_untagged_unchecked( + cache, + sid, + *bytes.get_unchecked(at), + ) + }; + } + if sid.is_unknown() { + sid = dfa + .next_state(cache, prev_sid, bytes[at]) + .map_err(|_| gave_up(at))?; + } + } + if sid.is_tagged() { + if sid.is_start() { + continue; + } else if sid.is_match() { + last_match = Some(HalfMatch { + pattern: dfa.match_pattern(cache, sid, 0), + offset: start + at + MATCH_OFFSET, + }); + if earliest { + return Ok(last_match); + } + } else if sid.is_dead() { + return Ok(last_match); + } else { + debug_assert!(sid.is_quit()); + if last_match.is_some() { + return Ok(last_match); + } + return Err(MatchError::Quit { byte: bytes[at], offset: at }); + } + } + } + Ok(eoi_rev(dfa, cache, 
haystack, start, sid)?.or(last_match)) +} + +#[inline(never)] +pub(crate) fn find_overlapping_fwd( + pre: Option<&mut prefilter::Scanner>, + dfa: &DFA, + cache: &mut Cache, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, + caller_state: &mut OverlappingState, +) -> Result<Option<HalfMatch>, MatchError> { + // Searching with a pattern ID is always anchored, so we should only ever + // use a prefilter when no pattern ID is given. + if pre.is_some() && pattern_id.is_none() { + find_overlapping_fwd_imp( + pre, + dfa, + cache, + pattern_id, + bytes, + start, + end, + caller_state, + ) + } else { + find_overlapping_fwd_imp( + None, + dfa, + cache, + pattern_id, + bytes, + start, + end, + caller_state, + ) + } +} + +#[inline(always)] +fn find_overlapping_fwd_imp( + mut pre: Option<&mut prefilter::Scanner>, + dfa: &DFA, + cache: &mut Cache, + pattern_id: Option<PatternID>, + bytes: &[u8], + mut start: usize, + end: usize, + caller_state: &mut OverlappingState, +) -> Result<Option<HalfMatch>, MatchError> { + assert!(start <= end); + assert!(start <= bytes.len()); + assert!(end <= bytes.len()); + + let mut sid = match caller_state.id() { + None => init_fwd(dfa, cache, pattern_id, bytes, start, end)?, + Some(sid) => { + if let Some(last) = caller_state.last_match() { + let match_count = dfa.match_count(cache, sid); + if last.match_index < match_count { + let m = HalfMatch { + pattern: dfa.match_pattern( + cache, + sid, + last.match_index, + ), + offset: last.offset, + }; + last.match_index += 1; + return Ok(Some(m)); + } + } + + // This is a subtle but critical detail. If the caller provides a + // non-None state ID, then it must be the case that the state ID + // corresponds to one set by this function. The state ID therefore + // corresponds to a match state, a dead state or some other state. 
+ // However, "some other" state _only_ occurs when the input has + // been exhausted because the only way to stop before then is to + // see a match or a dead/quit state. + // + // If the input is exhausted or if it's a dead state, then + // incrementing the starting position has no relevance on + // correctness, since the loop below will either not execute + // at all or will immediately stop due to being in a dead state. + // (Once in a dead state it is impossible to leave it.) + // + // Therefore, the only case we need to consider is when + // caller_state is a match state. In this case, since our machines + // support the ability to delay a match by a certain number of + // bytes (to support look-around), it follows that we actually + // consumed that many additional bytes on our previous search. When + // the caller resumes their search to find subsequent matches, they + // will use the ending location from the previous match as the next + // starting point, which is `match_offset` bytes PRIOR to where + // we scanned to on the previous search. Therefore, we need to + // compensate by bumping `start` up by `MATCH_OFFSET` bytes. + // + // Incidentally, since MATCH_OFFSET is non-zero, this also makes + // dealing with empty matches convenient. Namely, callers needn't + // special case them when implementing an iterator. Instead, this + // ensures that forward progress is always made. 
+ start += MATCH_OFFSET; + sid + } + }; + + let mut at = start; + while at < end { + let byte = bytes[at]; + sid = dfa.next_state(cache, sid, byte).map_err(|_| gave_up(at))?; + at += 1; + if sid.is_tagged() { + caller_state.set_id(sid); + if sid.is_start() { + if let Some(ref mut pre) = pre { + if pre.is_effective(at) { + match pre.next_candidate(bytes, at).into_option() { + None => return Ok(None), + Some(i) => { + at = i; + } + } + } + } + } else if sid.is_match() { + let offset = at - MATCH_OFFSET; + caller_state + .set_last_match(StateMatch { match_index: 1, offset }); + return Ok(Some(HalfMatch { + pattern: dfa.match_pattern(cache, sid, 0), + offset, + })); + } else if sid.is_dead() { + return Ok(None); + } else { + debug_assert!(sid.is_quit()); + return Err(MatchError::Quit { byte, offset: at - 1 }); + } + } + } + + let result = eoi_fwd(dfa, cache, bytes, end, &mut sid); + caller_state.set_id(sid); + if let Ok(Some(ref last_match)) = result { + caller_state.set_last_match(StateMatch { + // '1' is always correct here since if we get to this point, this + // always corresponds to the first (index '0') match discovered at + // this position. So the next match to report at this position (if + // it exists) is at index '1'. + match_index: 1, + offset: last_match.offset(), + }); + } + result +} + +#[inline(always)] +fn init_fwd( + dfa: &DFA, + cache: &mut Cache, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, +) -> Result<LazyStateID, MatchError> { + let sid = dfa + .start_state_forward(cache, pattern_id, bytes, start, end) + .map_err(|_| gave_up(start))?; + // Start states can never be match states, since all matches are delayed + // by 1 byte. 
+ assert!(!sid.is_match()); + Ok(sid) +} + +#[inline(always)] +fn init_rev( + dfa: &DFA, + cache: &mut Cache, + pattern_id: Option<PatternID>, + bytes: &[u8], + start: usize, + end: usize, +) -> Result<LazyStateID, MatchError> { + let sid = dfa + .start_state_reverse(cache, pattern_id, bytes, start, end) + .map_err(|_| gave_up(end))?; + // Start states can never be match states, since all matches are delayed + // by 1 byte. + assert!(!sid.is_match()); + Ok(sid) +} + +#[inline(always)] +fn eoi_fwd( + dfa: &DFA, + cache: &mut Cache, + bytes: &[u8], + end: usize, + sid: &mut LazyStateID, +) -> Result<Option<HalfMatch>, MatchError> { + match bytes.get(end) { + Some(&b) => { + *sid = dfa.next_state(cache, *sid, b).map_err(|_| gave_up(end))?; + if sid.is_match() { + Ok(Some(HalfMatch { + pattern: dfa.match_pattern(cache, *sid, 0), + offset: end, + })) + } else { + Ok(None) + } + } + None => { + *sid = dfa + .next_eoi_state(cache, *sid) + .map_err(|_| gave_up(bytes.len()))?; + if sid.is_match() { + Ok(Some(HalfMatch { + pattern: dfa.match_pattern(cache, *sid, 0), + offset: bytes.len(), + })) + } else { + Ok(None) + } + } + } +} + +#[inline(always)] +fn eoi_rev( + dfa: &DFA, + cache: &mut Cache, + bytes: &[u8], + start: usize, + state: LazyStateID, +) -> Result<Option<HalfMatch>, MatchError> { + if start > 0 { + let sid = dfa + .next_state(cache, state, bytes[start - 1]) + .map_err(|_| gave_up(start))?; + if sid.is_match() { + Ok(Some(HalfMatch { + pattern: dfa.match_pattern(cache, sid, 0), + offset: start, + })) + } else { + Ok(None) + } + } else { + let sid = + dfa.next_eoi_state(cache, state).map_err(|_| gave_up(start))?; + if sid.is_match() { + Ok(Some(HalfMatch { + pattern: dfa.match_pattern(cache, sid, 0), + offset: 0, + })) + } else { + Ok(None) + } + } +} + +/// A convenience routine for constructing a "gave up" match error. +#[inline(always)] +fn gave_up(offset: usize) -> MatchError { + MatchError::GaveUp { offset } +} |