diff options
Diffstat (limited to 'vendor/regex-automata/src/hybrid/dfa.rs')
-rw-r--r-- | vendor/regex-automata/src/hybrid/dfa.rs | 2664 |
1 files changed, 1614 insertions, 1050 deletions
diff --git a/vendor/regex-automata/src/hybrid/dfa.rs b/vendor/regex-automata/src/hybrid/dfa.rs index 1fbce5f5f..67261c1a3 100644 --- a/vendor/regex-automata/src/hybrid/dfa.rs +++ b/vendor/regex-automata/src/hybrid/dfa.rs @@ -7,36 +7,47 @@ This module also contains a [`hybrid::dfa::Builder`](Builder) and a [`hybrid::dfa::Config`](Config) for configuring and building a lazy DFA. */ -use core::{borrow::Borrow, iter, mem::size_of}; +use core::{iter, mem::size_of}; -use alloc::{sync::Arc, vec::Vec}; +use alloc::vec::Vec; use crate::{ hybrid::{ error::{BuildError, CacheError}, - id::{LazyStateID, LazyStateIDError, OverlappingState}, + id::{LazyStateID, LazyStateIDError}, search, }, nfa::thompson, util::{ alphabet::{self, ByteClasses, ByteSet}, determinize::{self, State, StateBuilderEmpty, StateBuilderNFA}, - id::{PatternID, StateID as NFAStateID}, - matchtypes::{HalfMatch, MatchError, MatchKind}, - prefilter, + empty, + prefilter::Prefilter, + primitives::{PatternID, StateID as NFAStateID}, + search::{ + Anchored, HalfMatch, Input, MatchError, MatchKind, PatternSet, + }, sparse_set::SparseSets, - start::Start, + start::{Start, StartByteMap}, }, }; -/// The mininum number of states that a lazy DFA's cache size must support. +/// The minimum number of states that a lazy DFA's cache size must support. /// /// This is checked at time of construction to ensure that at least some small /// number of states can fit in the given capacity allotment. If we can't fit /// at least this number of states, then the thinking is that it's pretty /// senseless to use the lazy DFA. More to the point, parts of the code do /// assume that the cache can fit at least some small number of states. -const MIN_STATES: usize = 5; +const MIN_STATES: usize = SENTINEL_STATES + 2; + +/// The number of "sentinel" states that get added to every lazy DFA. +/// +/// These are special states indicating status conditions of a search: unknown, +/// dead and quit. These states in particular also use zero NFA states, so +/// their memory usage is quite small. This is relevant for computing the +/// minimum memory needed for a lazy DFA cache. +const SENTINEL_STATES: usize = 3; /// A hybrid NFA/DFA (also called a "lazy DFA") for regex searching. /// @@ -92,26 +103,26 @@ const MIN_STATES: usize = 5; /// a cache and pass it to our search routine. /// /// ``` -/// use regex_automata::{hybrid::dfa::DFA, HalfMatch}; +/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; /// /// let dfa = DFA::new("foo[0-9]+")?; /// let mut cache = dfa.create_cache(); /// /// let expected = Some(HalfMatch::must(0, 8)); -/// assert_eq!(expected, dfa.find_leftmost_fwd(&mut cache, b"foo12345")?); +/// assert_eq!(expected, dfa.try_search_fwd( +/// &mut cache, &Input::new("foo12345"))?, +/// ); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` #[derive(Clone, Debug)] pub struct DFA { - nfa: Arc<thompson::NFA>, + config: Config, + nfa: thompson::NFA, stride2: usize, + start_map: StartByteMap, classes: ByteClasses, quitset: ByteSet, - anchored: bool, - match_kind: MatchKind, - starts_for_each_pattern: bool, cache_capacity: usize, - minimum_cache_clear_count: Option<usize>, } impl DFA { @@ -124,7 +135,7 @@ impl DFA { /// # Example /// /// ``` - /// use regex_automata::{hybrid::dfa::DFA, HalfMatch}; + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; /// /// let dfa = DFA::new("foo[0-9]+bar")?; /// let mut cache = dfa.create_cache(); @@ -132,10 +143,11 @@ impl DFA { /// let expected = HalfMatch::must(0, 11); /// assert_eq!( /// Some(expected), - /// dfa.find_leftmost_fwd(&mut cache, b"foo12345bar")?, + /// dfa.try_search_fwd(&mut cache, &Input::new("foo12345bar"))?, /// ); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` + #[cfg(feature = "syntax")] pub fn new(pattern: &str) -> Result<DFA, BuildError> { DFA::builder().build(pattern) } @@ -149,7 +161,7 @@ impl DFA { /// # Example /// /// ``` - /// use regex_automata::{hybrid::dfa::DFA, HalfMatch}; + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; /// /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+"])?; /// let mut cache = dfa.create_cache(); @@ -157,10 +169,11 @@ impl DFA { /// let expected = HalfMatch::must(1, 3); /// assert_eq!( /// Some(expected), - /// dfa.find_leftmost_fwd(&mut cache, b"foo12345bar")?, + /// dfa.try_search_fwd(&mut cache, &Input::new("foo12345bar"))?, /// ); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` + #[cfg(feature = "syntax")] pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<DFA, BuildError> { DFA::builder().build_many(patterns) } @@ -170,19 +183,23 @@ impl DFA { /// # Example /// /// ``` - /// use regex_automata::{hybrid::dfa::DFA, HalfMatch}; + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; /// /// let dfa = DFA::always_match()?; /// let mut cache = dfa.create_cache(); /// /// let expected = HalfMatch::must(0, 0); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(&mut cache, b"")?); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(&mut cache, b"foo")?); + /// assert_eq!(Some(expected), dfa.try_search_fwd( + /// &mut cache, &Input::new(""))?, + /// ); + /// assert_eq!(Some(expected), dfa.try_search_fwd( + /// &mut cache, &Input::new("foo"))?, + /// ); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` pub fn always_match() -> Result<DFA, BuildError> { let nfa = thompson::NFA::always_match(); - Builder::new().build_from_nfa(Arc::new(nfa)) + Builder::new().build_from_nfa(nfa) } /// Create a new lazy DFA that never matches any input. @@ -190,44 +207,51 @@ impl DFA { /// # Example /// /// ``` - /// use regex_automata::hybrid::dfa::DFA; + /// use regex_automata::{hybrid::dfa::DFA, Input}; /// /// let dfa = DFA::never_match()?; /// let mut cache = dfa.create_cache(); /// - /// assert_eq!(None, dfa.find_leftmost_fwd(&mut cache, b"")?); - /// assert_eq!(None, dfa.find_leftmost_fwd(&mut cache, b"foo")?); + /// assert_eq!(None, dfa.try_search_fwd(&mut cache, &Input::new(""))?); + /// assert_eq!(None, dfa.try_search_fwd(&mut cache, &Input::new("foo"))?); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` pub fn never_match() -> Result<DFA, BuildError> { let nfa = thompson::NFA::never_match(); - Builder::new().build_from_nfa(Arc::new(nfa)) + Builder::new().build_from_nfa(nfa) } /// Return a default configuration for a `DFA`. /// - /// This is a convenience routine to avoid needing to import the `Config` + /// This is a convenience routine to avoid needing to import the [`Config`] /// type when customizing the construction of a lazy DFA. /// /// # Example /// - /// This example shows how to build a lazy DFA that only executes searches - /// in anchored mode. + /// This example shows how to build a lazy DFA that heuristically supports + /// Unicode word boundaries. /// /// ``` - /// use regex_automata::{hybrid::dfa::DFA, HalfMatch}; + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchError, Input}; /// /// let re = DFA::builder() - /// .configure(DFA::config().anchored(true)) - /// .build(r"[0-9]+")?; + /// .configure(DFA::config().unicode_word_boundary(true)) + /// .build(r"\b\w+\b")?; /// let mut cache = re.create_cache(); /// - /// let haystack = "abc123xyz".as_bytes(); - /// assert_eq!(None, re.find_leftmost_fwd(&mut cache, haystack)?); - /// assert_eq!( - /// Some(HalfMatch::must(0, 3)), - /// re.find_leftmost_fwd(&mut cache, &haystack[3..6])?, - /// ); + /// // Since our haystack is all ASCII, the DFA search sees then and knows + /// // it is legal to interpret Unicode word boundaries as ASCII word + /// // boundaries. + /// let input = Input::new("!!foo!!"); + /// let expected = HalfMatch::must(0, 5); + /// assert_eq!(Some(expected), re.try_search_fwd(&mut cache, &input)?); + /// + /// // But if our haystack contains non-ASCII, then the search will fail + /// // with an error. + /// let input = Input::new("!!βββ!!"); + /// let expected = MatchError::quit(b'\xCE', 2); + /// assert_eq!(Err(expected), re.try_search_fwd(&mut cache, &input)); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` @@ -243,29 +267,20 @@ impl DFA { /// # Example /// /// This example shows how to use the builder to disable UTF-8 mode - /// everywhere for lazy DFAs. This includes disabling it for both the - /// concrete syntax (e.g., `.` matches any byte and Unicode character - /// classes like `\p{Letter}` are not allowed) and for the unanchored - /// search prefix. The latter enables the regex to match anywhere in a - /// sequence of arbitrary bytes. (Typically, the unanchored search prefix - /// will only permit matching valid UTF-8.) + /// everywhere for lazy DFAs. /// /// ``` - /// use regex_automata::{ - /// hybrid::dfa::DFA, - /// nfa::thompson, - /// HalfMatch, SyntaxConfig, - /// }; + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{hybrid::dfa::DFA, util::syntax, HalfMatch, Input}; /// /// let re = DFA::builder() - /// .syntax(SyntaxConfig::new().utf8(false)) - /// .thompson(thompson::Config::new().utf8(false)) + /// .syntax(syntax::Config::new().utf8(false)) /// .build(r"foo(?-u:[^b])ar.*")?; /// let mut cache = re.create_cache(); /// - /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; + /// let input = Input::new(b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"); /// let expected = Some(HalfMatch::must(0, 9)); - /// let got = re.find_leftmost_fwd(&mut cache, haystack)?; + /// let got = re.try_search_fwd(&mut cache, &input)?; /// assert_eq!(expected, got); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) @@ -302,7 +317,8 @@ impl DFA { /// This shows how to re-purpose a cache for use with a different DFA. /// /// ``` - /// use regex_automata::{hybrid::dfa::DFA, HalfMatch}; + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; /// /// let dfa1 = DFA::new(r"\w")?; /// let dfa2 = DFA::new(r"\W")?; @@ -310,7 +326,7 @@ impl DFA { /// let mut cache = dfa1.create_cache(); /// assert_eq!( /// Some(HalfMatch::must(0, 2)), - /// dfa1.find_leftmost_fwd(&mut cache, "Δ".as_bytes())?, + /// dfa1.try_search_fwd(&mut cache, &Input::new("Δ"))?, /// ); /// /// // Using 'cache' with dfa2 is not allowed. It may result in panics or @@ -322,7 +338,7 @@ impl DFA { /// dfa2.reset_cache(&mut cache); /// assert_eq!( /// Some(HalfMatch::must(0, 3)), - /// dfa2.find_leftmost_fwd(&mut cache, "☃".as_bytes())?, + /// dfa2.try_search_fwd(&mut cache, &Input::new("☃"))?, /// ); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) @@ -337,13 +353,13 @@ impl DFA { /// /// # Example /// - /// This example shows the pattern count for a DFA that never matches: + /// This example shows the pattern length for a DFA that never matches: /// /// ``` /// use regex_automata::hybrid::dfa::DFA; /// /// let dfa = DFA::never_match()?; - /// assert_eq!(dfa.pattern_count(), 0); + /// assert_eq!(dfa.pattern_len(), 0); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` /// @@ -353,7 +369,7 @@ impl DFA { /// use regex_automata::hybrid::dfa::DFA; /// /// let dfa = DFA::always_match()?; - /// assert_eq!(dfa.pattern_count(), 1); + /// assert_eq!(dfa.pattern_len(), 1); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` /// @@ -363,15 +379,37 @@ impl DFA { /// use regex_automata::hybrid::dfa::DFA; /// /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?; - /// assert_eq!(dfa.pattern_count(), 3); + /// assert_eq!(dfa.pattern_len(), 3); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` - pub fn pattern_count(&self) -> usize { + pub fn pattern_len(&self) -> usize { self.nfa.pattern_len() } + /// Returns the equivalence classes that make up the alphabet for this DFA. + /// + /// Unless [`Config::byte_classes`] was disabled, it is possible that + /// multiple distinct bytes are grouped into the same equivalence class + /// if it is impossible for them to discriminate between a match and a + /// non-match. This has the effect of reducing the overall alphabet size + /// and in turn potentially substantially reducing the size of the DFA's + /// transition table. + /// + /// The downside of using equivalence classes like this is that every state + /// transition will automatically use this map to convert an arbitrary + /// byte to its corresponding equivalence class. In practice this has a + /// negligible impact on performance. + pub fn byte_classes(&self) -> &ByteClasses { + &self.classes + } + + /// Returns this lazy DFA's configuration. + pub fn get_config(&self) -> &Config { + &self.config + } + /// Returns a reference to the underlying NFA. - pub fn nfa(&self) -> &Arc<thompson::NFA> { + pub fn get_nfa(&self) -> &thompson::NFA { &self.nfa } @@ -393,253 +431,231 @@ impl DFA { 1 << self.stride2() } - /// Returns the total number of elements in the alphabet for this - /// transition table. This is always less than or equal to `self.stride()`. - /// It is only equal when the alphabet length is a power of 2. Otherwise, - /// it is always strictly less. - fn alphabet_len(&self) -> usize { - self.classes.alphabet_len() - } - /// Returns the memory usage, in bytes, of this lazy DFA. /// /// This does **not** include the stack size used up by this lazy DFA. To - /// compute that, use `std::mem::size_of::<DFA>()`. This also does - /// not include the size of the `Cache` used. + /// compute that, use `std::mem::size_of::<DFA>()`. This also does not + /// include the size of the `Cache` used. + /// + /// This also does not include any heap memory used by the NFA inside of + /// this hybrid NFA/DFA. This is because the NFA's ownership is shared, and + /// thus not owned by this hybrid NFA/DFA. More practically, several regex + /// engines in this crate embed an NFA, and reporting the NFA's memory + /// usage in all of them would likely result in reporting higher heap + /// memory than is actually used. pub fn memory_usage(&self) -> usize { - // Everything else is on the stack. - self.nfa.memory_usage() + // The only thing that uses heap memory in a DFA is the NFA. But the + // NFA has shared ownership, so reporting its memory as part of the + // hybrid DFA is likely to lead to double-counting the NFA memory + // somehow. In particular, this DFA does not really own an NFA, so + // including it in the DFA's memory usage doesn't seem semantically + // correct. + 0 } } impl DFA { - /// Executes a forward search and returns the end position of the first - /// match that is found as early as possible. If no match exists, then - /// `None` is returned. - /// - /// This routine stops scanning input as soon as the search observes a - /// match state. This is useful for implementing boolean `is_match`-like - /// routines, where as little work is done as possible. + /// Executes a forward search and returns the end position of the leftmost + /// match that is found. If no match exists, then `None` is returned. /// - /// See [`DFA::find_earliest_fwd_at`] for additional functionality, such as - /// providing a prefilter, a specific pattern to match and the bounds of - /// the search within the haystack. This routine is meant as a convenience - /// for common cases where the additional functionality is not needed. + /// In particular, this method continues searching even after it enters + /// a match state. The search only terminates once it has reached the + /// end of the input or when it has entered a dead or quit state. Upon + /// termination, the position of the last byte seen while still in a match + /// state is returned. /// /// # Errors /// - /// This routine only errors if the search could not complete. For - /// lazy DFAs generated by this crate, this only occurs in non-default - /// configurations where quit bytes are used, Unicode word boundaries are - /// heuristically enabled or limits are set on the number of times the lazy - /// DFA's cache may be cleared. - /// - /// When a search cannot complete, callers cannot know whether a match + /// This routine errors if the search could not complete. This can occur + /// in a number of circumstances: + /// + /// * The configuration of the lazy DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the lazy DFA quitting. + /// * The configuration of the lazy DFA may also permit it to "give up" + /// on a search if it makes ineffective use of its transition table + /// cache. The default configuration does not enable this by default, + /// although it is typically a good idea to. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. + /// + /// When a search returns an error, callers cannot know whether a match /// exists or not. /// /// # Example /// - /// This example demonstrates how the position returned might differ from - /// what one might expect when executing a traditional leftmost search. + /// This example shows how to run a basic search. /// /// ``` - /// use regex_automata::{hybrid::dfa::DFA, HalfMatch}; + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; /// /// let dfa = DFA::new("foo[0-9]+")?; /// let mut cache = dfa.create_cache(); - /// // Normally, the end of the leftmost first match here would be 8, - /// // corresponding to the end of the input. But the "earliest" semantics - /// // this routine cause it to stop as soon as a match is known, which - /// // occurs once 'foo[0-9]' has matched. - /// let expected = HalfMatch::must(0, 4); - /// assert_eq!( - /// Some(expected), - /// dfa.find_earliest_fwd(&mut cache, b"foo12345")?, + /// let expected = HalfMatch::must(0, 8); + /// assert_eq!(Some(expected), dfa.try_search_fwd( + /// &mut cache, &Input::new("foo12345"))?, /// ); /// + /// // Even though a match is found after reading the first byte (`a`), + /// // the leftmost first match semantics demand that we find the earliest + /// // match that prefers earlier parts of the pattern over later parts. /// let dfa = DFA::new("abc|a")?; /// let mut cache = dfa.create_cache(); - /// // Normally, the end of the leftmost first match here would be 3, - /// // but the shortest match semantics detect a match earlier. - /// let expected = HalfMatch::must(0, 1); - /// assert_eq!(Some(expected), dfa.find_earliest_fwd(&mut cache, b"abc")?); + /// let expected = HalfMatch::must(0, 3); + /// assert_eq!(Some(expected), dfa.try_search_fwd( + /// &mut cache, &Input::new("abc"))?, + /// ); + /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` - #[inline] - pub fn find_earliest_fwd( - &self, - cache: &mut Cache, - bytes: &[u8], - ) -> Result<Option<HalfMatch>, MatchError> { - self.find_earliest_fwd_at(cache, None, None, bytes, 0, bytes.len()) - } - - /// Executes a reverse search and returns the start position of the first - /// match that is found as early as possible. If no match exists, then - /// `None` is returned. - /// - /// This routine stops scanning input as soon as the search observes a - /// match state. - /// - /// Note that while it is not technically necessary to build a reverse - /// automaton to use a reverse search, it is likely that you'll want to do - /// so. Namely, the typical use of a reverse search is to find the starting - /// location of a match once its end is discovered from a forward search. A - /// reverse DFA automaton can be built by configuring the intermediate NFA - /// to be reversed via - /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse). - /// - /// # Errors - /// - /// This routine only errors if the search could not complete. For - /// lazy DFAs generated by this crate, this only occurs in non-default - /// configurations where quit bytes are used, Unicode word boundaries are - /// heuristically enabled or limits are set on the number of times the lazy - /// DFA's cache may be cleared. /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// # Example + /// # Example: specific pattern search /// - /// This example demonstrates how the position returned might differ from - /// what one might expect when executing a traditional leftmost reverse - /// search. + /// This example shows how to build a lazy multi-DFA that permits searching + /// for specific patterns. /// /// ``` - /// use regex_automata::{hybrid::dfa::DFA, nfa::thompson, HalfMatch}; - /// - /// let dfa = DFA::builder() - /// .thompson(thompson::Config::new().reverse(true)) - /// .build("[a-z]+[0-9]+")?; - /// let mut cache = dfa.create_cache(); - /// // Normally, the end of the leftmost first match here would be 0, - /// // corresponding to the beginning of the input. But the "earliest" - /// // semantics of this routine cause it to stop as soon as a match is - /// // known, which occurs once '[a-z][0-9]+' has matched. - /// let expected = HalfMatch::must(0, 2); - /// assert_eq!( - /// Some(expected), - /// dfa.find_earliest_rev(&mut cache, b"foo12345")?, - /// ); + /// use regex_automata::{ + /// hybrid::dfa::DFA, + /// Anchored, HalfMatch, PatternID, Input, + /// }; /// /// let dfa = DFA::builder() - /// .thompson(thompson::Config::new().reverse(true)) - /// .build("abc|c")?; + /// .configure(DFA::config().starts_for_each_pattern(true)) + /// .build_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?; /// let mut cache = dfa.create_cache(); - /// // Normally, the end of the leftmost first match here would be 0, - /// // but the shortest match semantics detect a match earlier. - /// let expected = HalfMatch::must(0, 2); - /// assert_eq!(Some(expected), dfa.find_earliest_rev(&mut cache, b"abc")?); - /// # Ok::<(), Box<dyn std::error::Error>>(()) - /// ``` - #[inline] - pub fn find_earliest_rev( - &self, - cache: &mut Cache, - bytes: &[u8], - ) -> Result<Option<HalfMatch>, MatchError> { - self.find_earliest_rev_at(cache, None, bytes, 0, bytes.len()) - } - - /// Executes a forward search and returns the end position of the leftmost - /// match that is found. If no match exists, then `None` is returned. + /// let haystack = "foo123"; /// - /// In particular, this method continues searching even after it enters - /// a match state. The search only terminates once it has reached the - /// end of the input or when it has entered a dead or quit state. Upon - /// termination, the position of the last byte seen while still in a match - /// state is returned. - /// - /// # Errors - /// - /// This routine only errors if the search could not complete. For - /// lazy DFAs generated by this crate, this only occurs in non-default - /// configurations where quit bytes are used, Unicode word boundaries are - /// heuristically enabled or limits are set on the number of times the lazy - /// DFA's cache may be cleared. + /// // Since we are using the default leftmost-first match and both + /// // patterns match at the same starting position, only the first pattern + /// // will be returned in this case when doing a search for any of the + /// // patterns. + /// let expected = Some(HalfMatch::must(0, 6)); + /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?; + /// assert_eq!(expected, got); /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. + /// // But if we want to check whether some other pattern matches, then we + /// // can provide its pattern ID. + /// let expected = Some(HalfMatch::must(1, 6)); + /// let input = Input::new(haystack) + /// .anchored(Anchored::Pattern(PatternID::must(1))); + /// let got = dfa.try_search_fwd(&mut cache, &input)?; + /// assert_eq!(expected, got); /// - /// # Example + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` /// - /// Leftmost first match semantics corresponds to the match with the - /// smallest starting offset, but where the end offset is determined by - /// preferring earlier branches in the original regular expression. For - /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` - /// will match `Samwise` in `Samwise`. + /// # Example: specifying the bounds of a search /// - /// Generally speaking, the "leftmost first" match is how most backtracking - /// regular expressions tend to work. This is in contrast to POSIX-style - /// regular expressions that yield "leftmost longest" matches. Namely, - /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using - /// leftmost longest semantics. (This crate does not currently support - /// leftmost longest semantics.) + /// This example shows how providing the bounds of a search can produce + /// different results than simply sub-slicing the haystack. /// /// ``` - /// use regex_automata::{hybrid::dfa::DFA, HalfMatch}; + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; /// - /// let dfa = DFA::new("foo[0-9]+")?; + /// // N.B. We disable Unicode here so that we use a simple ASCII word + /// // boundary. Alternatively, we could enable heuristic support for + /// // Unicode word boundaries since our haystack is pure ASCII. + /// let dfa = DFA::new(r"(?-u)\b[0-9]{3}\b")?; /// let mut cache = dfa.create_cache(); - /// let expected = HalfMatch::must(0, 8); - /// assert_eq!( - /// Some(expected), - /// dfa.find_leftmost_fwd(&mut cache, b"foo12345")?, - /// ); + /// let haystack = "foo123bar"; /// - /// // Even though a match is found after reading the first byte (`a`), - /// // the leftmost first match semantics demand that we find the earliest - /// // match that prefers earlier parts of the pattern over latter parts. - /// let dfa = DFA::new("abc|a")?; - /// let mut cache = dfa.create_cache(); - /// let expected = HalfMatch::must(0, 3); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(&mut cache, b"abc")?); + /// // Since we sub-slice the haystack, the search doesn't know about the + /// // larger context and assumes that `123` is surrounded by word + /// // boundaries. And of course, the match position is reported relative + /// // to the sub-slice as well, which means we get `3` instead of `6`. + /// let expected = Some(HalfMatch::must(0, 3)); + /// let got = dfa.try_search_fwd( + /// &mut cache, + /// &Input::new(&haystack[3..6]), + /// )?; + /// assert_eq!(expected, got); + /// + /// // But if we provide the bounds of the search within the context of the + /// // entire haystack, then the search can take the surrounding context + /// // into account. (And if we did find a match, it would be reported + /// // as a valid offset into `haystack` instead of its sub-slice.) + /// let expected = None; + /// let got = dfa.try_search_fwd( + /// &mut cache, + /// &Input::new(haystack).range(3..6), + /// )?; + /// assert_eq!(expected, got); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` #[inline] - pub fn find_leftmost_fwd( + pub fn try_search_fwd( &self, cache: &mut Cache, - bytes: &[u8], + input: &Input<'_>, ) -> Result<Option<HalfMatch>, MatchError> { - self.find_leftmost_fwd_at(cache, None, None, bytes, 0, bytes.len()) + let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); + let hm = match search::find_fwd(self, cache, input)? { + None => return Ok(None), + Some(hm) if !utf8empty => return Ok(Some(hm)), + Some(hm) => hm, + }; + // We get to this point when we know our DFA can match the empty string + // AND when UTF-8 mode is enabled. In this case, we skip any matches + // whose offset splits a codepoint. Such a match is necessarily a + // zero-width match, because UTF-8 mode requires the underlying NFA + // to be built such that all non-empty matches span valid UTF-8. + // Therefore, any match that ends in the middle of a codepoint cannot + // be part of a span of valid UTF-8 and thus must be an empty match. + // In such cases, we skip it, so as not to report matches that split a + // codepoint. + // + // Note that this is not a checked assumption. Callers *can* provide an + // NFA with UTF-8 mode enabled but produces non-empty matches that span + // invalid UTF-8. But doing so is documented to result in unspecified + // behavior. + empty::skip_splits_fwd(input, hm, hm.offset(), |input| { + let got = search::find_fwd(self, cache, input)?; + Ok(got.map(|hm| (hm, hm.offset()))) + }) } /// Executes a reverse search and returns the start of the position of the /// leftmost match that is found. If no match exists, then `None` is /// returned. /// - /// In particular, this method continues searching even after it enters - /// a match state. The search only terminates once it has reached the - /// end of the input or when it has entered a dead or quit state. Upon - /// termination, the position of the last byte seen while still in a match - /// state is returned. - /// /// # Errors /// - /// This routine only errors if the search could not complete. For - /// lazy DFAs generated by this crate, this only occurs in non-default - /// configurations where quit bytes are used, Unicode word boundaries are - /// heuristically enabled or limits are set on the number of times the lazy - /// DFA's cache may be cleared. - /// - /// When a search cannot complete, callers cannot know whether a match + /// This routine errors if the search could not complete. This can occur + /// in a number of circumstances: + /// + /// * The configuration of the lazy DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the lazy DFA quitting. + /// * The configuration of the lazy DFA may also permit it to "give up" + /// on a search if it makes ineffective use of its transition table + /// cache. The default configuration does not enable this by default, + /// although it is typically a good idea to. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. + /// + /// When a search returns an error, callers cannot know whether a match /// exists or not. /// /// # Example /// - /// In particular, this routine is principally - /// useful when used in conjunction with the - /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::revers - /// e) configuration. In general, it's unlikely to be correct to use both - /// `find_leftmost_fwd` and `find_leftmost_rev` with the same DFA since - /// any particular DFA will only support searching in one direction with + /// This routine is principally useful when used in + /// conjunction with the + /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse) + /// configuration. In general, it's unlikely to be correct to use both + /// `try_search_fwd` and `try_search_rev` with the same DFA since any + /// particular DFA will only support searching in one direction with /// respect to the pattern. /// /// ``` - /// use regex_automata::{nfa::thompson, hybrid::dfa::DFA, HalfMatch}; + /// use regex_automata::{ + /// nfa::thompson, + /// hybrid::dfa::DFA, + /// HalfMatch, Input, + /// }; /// /// let dfa = DFA::builder() /// .thompson(thompson::Config::new().reverse(true)) @@ -648,7 +664,7 @@ impl DFA { /// let expected = HalfMatch::must(0, 0); /// assert_eq!( /// Some(expected), - /// dfa.find_leftmost_rev(&mut cache, b"foo12345")?, + /// dfa.try_search_rev(&mut cache, &Input::new("foo12345"))?, /// ); /// /// // Even though a match is found after reading the last byte (`c`), @@ -659,17 +675,133 @@ impl DFA { /// .build("abc|c")?; /// let mut cache = dfa.create_cache(); /// let expected = HalfMatch::must(0, 0); - /// assert_eq!(Some(expected), dfa.find_leftmost_rev(&mut cache, b"abc")?); + /// assert_eq!(Some(expected), dfa.try_search_rev( + /// &mut cache, &Input::new("abc"))?, + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: UTF-8 mode + /// + /// This examples demonstrates that UTF-8 mode applies to reverse + /// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all + /// matches reported must correspond to valid UTF-8 spans. This includes + /// prohibiting zero-width matches that split a codepoint. + /// + /// UTF-8 mode is enabled by default. Notice below how the only zero-width + /// matches reported are those at UTF-8 boundaries: + /// + /// ``` + /// use regex_automata::{ + /// hybrid::dfa::DFA, + /// nfa::thompson, + /// HalfMatch, Input, MatchKind, + /// }; + /// + /// let dfa = DFA::builder() + /// .thompson(thompson::Config::new().reverse(true)) + /// .build(r"")?; + /// let mut cache = dfa.create_cache(); + /// + /// // Run the reverse DFA to collect all matches. + /// let mut input = Input::new("☃"); + /// let mut matches = vec![]; + /// loop { + /// match dfa.try_search_rev(&mut cache, &input)? { + /// None => break, + /// Some(hm) => { + /// matches.push(hm); + /// if hm.offset() == 0 || input.end() == 0 { + /// break; + /// } else if hm.offset() < input.end() { + /// input.set_end(hm.offset()); + /// } else { + /// // This is only necessary to handle zero-width + /// // matches, which of course occur in this example. + /// // Without this, the search would never advance + /// // backwards beyond the initial match. + /// input.set_end(input.end() - 1); + /// } + /// } + /// } + /// } + /// + /// // No matches split a codepoint. + /// let expected = vec![ + /// HalfMatch::must(0, 3), + /// HalfMatch::must(0, 0), + /// ]; + /// assert_eq!(expected, matches); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Now let's look at the same example, but with UTF-8 mode on the + /// underlying NFA disabled: + /// + /// ``` + /// use regex_automata::{ + /// hybrid::dfa::DFA, + /// nfa::thompson, + /// HalfMatch, Input, MatchKind, + /// }; + /// + /// let dfa = DFA::builder() + /// .thompson(thompson::Config::new().reverse(true).utf8(false)) + /// .build(r"")?; + /// let mut cache = dfa.create_cache(); + /// + /// // Run the reverse DFA to collect all matches. + /// let mut input = Input::new("☃"); + /// let mut matches = vec![]; + /// loop { + /// match dfa.try_search_rev(&mut cache, &input)? { + /// None => break, + /// Some(hm) => { + /// matches.push(hm); + /// if hm.offset() == 0 || input.end() == 0 { + /// break; + /// } else if hm.offset() < input.end() { + /// input.set_end(hm.offset()); + /// } else { + /// // This is only necessary to handle zero-width + /// // matches, which of course occur in this example. + /// // Without this, the search would never advance + /// // backwards beyond the initial match. + /// input.set_end(input.end() - 1); + /// } + /// } + /// } + /// } + /// + /// // No matches split a codepoint. + /// let expected = vec![ + /// HalfMatch::must(0, 3), + /// HalfMatch::must(0, 2), + /// HalfMatch::must(0, 1), + /// HalfMatch::must(0, 0), + /// ]; + /// assert_eq!(expected, matches); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` #[inline] - pub fn find_leftmost_rev( + pub fn try_search_rev( &self, cache: &mut Cache, - bytes: &[u8], + input: &Input<'_>, ) -> Result<Option<HalfMatch>, MatchError> { - self.find_leftmost_rev_at(cache, None, bytes, 0, bytes.len()) + let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); + let hm = match search::find_rev(self, cache, input)? { + None => return Ok(None), + Some(hm) if !utf8empty => return Ok(Some(hm)), + Some(hm) => hm, + }; + empty::skip_splits_rev(input, hm, hm.offset(), |input| { + let got = search::find_rev(self, cache, input)?; + Ok(got.map(|hm| (hm, hm.offset()))) + }) } /// Executes an overlapping forward search and returns the end position of @@ -681,15 +813,34 @@ impl DFA { /// state from prior calls so that the implementation knows where the last /// match occurred. /// - /// # Errors + /// When using this routine to implement an iterator of overlapping + /// matches, the `start` of the search should remain invariant throughout + /// iteration. The `OverlappingState` given to the search will keep track + /// of the current position of the search. (This is because multiple + /// matches may be reported at the same position, so only the search + /// implementation itself knows when to advance the position.) + /// + /// If for some reason you want the search to forget about its previous + /// state and restart the search at a particular position, then setting the + /// state to [`OverlappingState::start`] will accomplish that. /// - /// This routine only errors if the search could not complete. For - /// lazy DFAs generated by this crate, this only occurs in non-default - /// configurations where quit bytes are used, Unicode word boundaries are - /// heuristically enabled or limits are set on the number of times the lazy - /// DFA's cache may be cleared. + /// # Errors /// - /// When a search cannot complete, callers cannot know whether a match + /// This routine errors if the search could not complete. This can occur + /// in a number of circumstances: + /// + /// * The configuration of the lazy DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the lazy DFA quitting. + /// * The configuration of the lazy DFA may also permit it to "give up" + /// on a search if it makes ineffective use of its transition table + /// cache. The default configuration does not enable this by default, + /// although it is typically a good idea to. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. + /// + /// When a search returns an error, callers cannot know whether a match /// exists or not. /// /// # Example @@ -708,10 +859,10 @@ impl DFA { /// the search to find totally new matches (potentially of other patterns). /// /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{ - /// hybrid::{dfa::DFA, OverlappingState}, - /// HalfMatch, - /// MatchKind, + /// hybrid::dfa::{DFA, OverlappingState}, + /// HalfMatch, Input, MatchKind, /// }; /// /// let dfa = DFA::builder() @@ -719,12 +870,14 @@ impl DFA { /// .build_many(&[r"\w+$", r"\S+$"])?; /// let mut cache = dfa.create_cache(); /// - /// let haystack = "@foo".as_bytes(); + /// let haystack = "@foo"; /// let mut state = OverlappingState::start(); /// /// let expected = Some(HalfMatch::must(1, 4)); - /// let got = dfa.find_overlapping_fwd(&mut cache, haystack, &mut state)?; - /// assert_eq!(expected, got); + /// dfa.try_search_overlapping_fwd( + /// &mut cache, &Input::new(haystack), &mut state, + /// )?; + /// assert_eq!(expected, state.get_match()); /// /// // The first pattern also matches at the same position, so re-running /// // the search will yield another match. Notice also that the first @@ -732,418 +885,265 @@ impl DFA { /// // pattern begins its match before the first, is therefore an earlier /// // match and is thus reported first. /// let expected = Some(HalfMatch::must(0, 4)); - /// let got = dfa.find_overlapping_fwd(&mut cache, haystack, &mut state)?; - /// assert_eq!(expected, got); + /// dfa.try_search_overlapping_fwd( + /// &mut cache, &Input::new(haystack), &mut state, + /// )?; + /// assert_eq!(expected, state.get_match()); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` #[inline] - pub fn find_overlapping_fwd( + pub fn try_search_overlapping_fwd( &self, cache: &mut Cache, - bytes: &[u8], + input: &Input<'_>, state: &mut OverlappingState, - ) -> Result<Option<HalfMatch>, MatchError> { - self.find_overlapping_fwd_at( - cache, - None, - None, - bytes, - 0, - bytes.len(), - state, - ) + ) -> Result<(), MatchError> { + let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); + search::find_overlapping_fwd(self, cache, input, state)?; + match state.get_match() { + None => Ok(()), + Some(_) if !utf8empty => Ok(()), + Some(_) => skip_empty_utf8_splits_overlapping( + input, + state, + |input, state| { + search::find_overlapping_fwd(self, cache, input, state) + }, + ), + } } - /// Executes a forward search and returns the end position of the first - /// match that is found as early as possible. If no match exists, then + /// Executes a reverse overlapping search and returns the start of the + /// position of the leftmost match that is found. If no match exists, then /// `None` is returned. /// - /// This routine stops scanning input as soon as the search observes a - /// match state. This is useful for implementing boolean `is_match`-like - /// routines, where as little work is done as possible. - /// - /// This is like [`DFA::find_earliest_fwd`], except it provides some - /// additional control over how the search is executed: - /// - /// * `pre` is a prefilter scanner that, when given, is used whenever the - /// DFA enters its starting state. This is meant to speed up searches where - /// one or a small number of literal prefixes are known. - /// * `pattern_id` specifies a specific pattern in the DFA to run an - /// anchored search for. If not given, then a search for any pattern is - /// performed. For lazy DFAs, [`Config::starts_for_each_pattern`] must be - /// enabled to use this functionality. - /// * `start` and `end` permit searching a specific region of the haystack - /// `bytes`. This is useful when implementing an iterator over matches - /// within the same haystack, which cannot be done correctly by simply - /// providing a subslice of `bytes`. (Because the existence of look-around - /// operations such as `\b`, `^` and `$` need to take the surrounding - /// context into account. This cannot be done if the haystack doesn't - /// contain it.) - /// - /// The examples below demonstrate each of these additional parameters. + /// When using this routine to implement an iterator of overlapping + /// matches, the `start` of the search should remain invariant throughout + /// iteration. The `OverlappingState` given to the search will keep track + /// of the current position of the search. (This is because multiple + /// matches may be reported at the same position, so only the search + /// implementation itself knows when to advance the position.) /// - /// # Errors + /// If for some reason you want the search to forget about its previous + /// state and restart the search at a particular position, then setting the + /// state to [`OverlappingState::start`] will accomplish that. /// - /// This routine only errors if the search could not complete. For - /// lazy DFAs generated by this crate, this only occurs in non-default - /// configurations where quit bytes are used, Unicode word boundaries are - /// heuristically enabled or limits are set on the number of times the lazy - /// DFA's cache may be cleared. + /// # Errors /// - /// When a search cannot complete, callers cannot know whether a match + /// This routine errors if the search could not complete. This can occur + /// in a number of circumstances: + /// + /// * The configuration of the lazy DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the lazy DFA quitting. + /// * The configuration of the lazy DFA may also permit it to "give up" + /// on a search if it makes ineffective use of its transition table + /// cache. The default configuration does not enable this by default, + /// although it is typically a good idea to. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. + /// + /// When a search returns an error, callers cannot know whether a match /// exists or not. /// - /// # Panics - /// - /// This routine panics if a `pattern_id` is given and this lazy DFA does - /// not support specific pattern searches. + /// # Example: UTF-8 mode /// - /// It also panics if the given haystack range is not valid. + /// This examples demonstrates that UTF-8 mode applies to reverse + /// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all + /// matches reported must correspond to valid UTF-8 spans. This includes + /// prohibiting zero-width matches that split a codepoint. /// - /// # Example: prefilter - /// - /// This example shows how to provide a prefilter for a pattern where all - /// matches start with a `z` byte. + /// UTF-8 mode is enabled by default. Notice below how the only zero-width + /// matches reported are those at UTF-8 boundaries: /// /// ``` /// use regex_automata::{ - /// hybrid::dfa::DFA, - /// util::prefilter::{Candidate, Prefilter, Scanner, State}, - /// HalfMatch, + /// hybrid::dfa::{DFA, OverlappingState}, + /// nfa::thompson, + /// HalfMatch, Input, MatchKind, /// }; /// - /// #[derive(Debug)] - /// pub struct ZPrefilter; - /// - /// impl Prefilter for ZPrefilter { - /// fn next_candidate( - /// &self, - /// _: &mut State, - /// haystack: &[u8], - /// at: usize, - /// ) -> Candidate { - /// // Try changing b'z' to b'q' and observe this test fail since - /// // the prefilter will skip right over the match. - /// match haystack.iter().position(|&b| b == b'z') { - /// None => Candidate::None, - /// Some(i) => Candidate::PossibleStartOfMatch(at + i), - /// } - /// } + /// let dfa = DFA::builder() + /// .configure(DFA::config().match_kind(MatchKind::All)) + /// .thompson(thompson::Config::new().reverse(true)) + /// .build_many(&[r"", r"☃"])?; + /// let mut cache = dfa.create_cache(); /// - /// fn heap_bytes(&self) -> usize { - /// 0 + /// // Run the reverse DFA to collect all matches. + /// let input = Input::new("☃"); + /// let mut state = OverlappingState::start(); + /// let mut matches = vec![]; + /// loop { + /// dfa.try_search_overlapping_rev(&mut cache, &input, &mut state)?; + /// match state.get_match() { + /// None => break, + /// Some(hm) => matches.push(hm), /// } /// } /// - /// let dfa = DFA::new("z[0-9]{3}")?; - /// let mut cache = dfa.create_cache(); - /// - /// let haystack = "foobar z123 q123".as_bytes(); - /// // A scanner executes a prefilter while tracking some state that helps - /// // determine whether a prefilter is still "effective" or not. - /// let mut scanner = Scanner::new(&ZPrefilter); - /// - /// let expected = Some(HalfMatch::must(0, 11)); - /// let got = dfa.find_earliest_fwd_at( - /// &mut cache, - /// Some(&mut scanner), - /// None, - /// haystack, - /// 0, - /// haystack.len(), - /// )?; - /// assert_eq!(expected, got); + /// // No matches split a codepoint. + /// let expected = vec![ + /// HalfMatch::must(0, 3), + /// HalfMatch::must(1, 0), + /// HalfMatch::must(0, 0), + /// ]; + /// assert_eq!(expected, matches); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` /// - /// # Example: specific pattern search - /// - /// This example shows how to build a lazy multi-DFA that permits searching - /// for specific patterns. + /// Now let's look at the same example, but with UTF-8 mode on the + /// underlying NFA disabled: /// /// ``` /// use regex_automata::{ - /// hybrid::dfa::DFA, - /// HalfMatch, - /// PatternID, + /// hybrid::dfa::{DFA, OverlappingState}, + /// nfa::thompson, + /// HalfMatch, Input, MatchKind, /// }; /// /// let dfa = DFA::builder() - /// .configure(DFA::config().starts_for_each_pattern(true)) - /// .build_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?; - /// let mut cache = dfa.create_cache(); - /// let haystack = "foo123".as_bytes(); - /// - /// // Since we are using the default leftmost-first match and both - /// // patterns match at the same starting position, only the first pattern - /// // will be returned in this case when doing a search for any of the - /// // patterns. - /// let expected = Some(HalfMatch::must(0, 6)); - /// let got = dfa.find_earliest_fwd_at( - /// &mut cache, - /// None, - /// None, - /// haystack, - /// 0, - /// haystack.len(), - /// )?; - /// assert_eq!(expected, got); - /// - /// // But if we want to check whether some other pattern matches, then we - /// // can provide its pattern ID. - /// let expected = Some(HalfMatch::must(1, 6)); - /// let got = dfa.find_earliest_fwd_at( - /// &mut cache, - /// None, - /// Some(PatternID::must(1)), - /// haystack, - /// 0, - /// haystack.len(), - /// )?; - /// assert_eq!(expected, got); - /// - /// # Ok::<(), Box<dyn std::error::Error>>(()) - /// ``` - /// - /// # Example: specifying the bounds of a search - /// - /// This example shows how providing the bounds of a search can produce - /// different results than simply sub-slicing the haystack. - /// - /// ``` - /// use regex_automata::{hybrid::dfa::DFA, HalfMatch}; - /// - /// // N.B. We disable Unicode here so that we use a simple ASCII word - /// // boundary. Alternatively, we could enable heuristic support for - /// // Unicode word boundaries since our haystack is pure ASCII. - /// let dfa = DFA::new(r"(?-u)\b[0-9]{3}\b")?; + /// .configure(DFA::config().match_kind(MatchKind::All)) + /// .thompson(thompson::Config::new().reverse(true).utf8(false)) + /// .build_many(&[r"", r"☃"])?; /// let mut cache = dfa.create_cache(); - /// let haystack = "foo123bar".as_bytes(); /// - /// // Since we sub-slice the haystack, the search doesn't know about the - /// // larger context and assumes that `123` is surrounded by word - /// // boundaries. And of course, the match position is reported relative - /// // to the sub-slice as well, which means we get `3` instead of `6`. - /// let expected = Some(HalfMatch::must(0, 3)); - /// let got = dfa.find_earliest_fwd_at( - /// &mut cache, - /// None, - /// None, - /// &haystack[3..6], - /// 0, - /// haystack[3..6].len(), - /// )?; - /// assert_eq!(expected, got); + /// // Run the reverse DFA to collect all matches. + /// let input = Input::new("☃"); + /// let mut state = OverlappingState::start(); + /// let mut matches = vec![]; + /// loop { + /// dfa.try_search_overlapping_rev(&mut cache, &input, &mut state)?; + /// match state.get_match() { + /// None => break, + /// Some(hm) => matches.push(hm), + /// } + /// } /// - /// // But if we provide the bounds of the search within the context of the - /// // entire haystack, then the search can take the surrounding context - /// // into account. (And if we did find a match, it would be reported - /// // as a valid offset into `haystack` instead of its sub-slice.) - /// let expected = None; - /// let got = dfa.find_earliest_fwd_at( - /// &mut cache, - /// None, - /// None, - /// haystack, - /// 3, - /// 6, - /// )?; - /// assert_eq!(expected, got); + /// // Now *all* positions match, even within a codepoint, + /// // because we lifted the requirement that matches + /// // correspond to valid UTF-8 spans. + /// let expected = vec![ + /// HalfMatch::must(0, 3), + /// HalfMatch::must(0, 2), + /// HalfMatch::must(0, 1), + /// HalfMatch::must(1, 0), + /// HalfMatch::must(0, 0), + /// ]; + /// assert_eq!(expected, matches); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` #[inline] - pub fn find_earliest_fwd_at( + pub fn try_search_overlapping_rev( &self, cache: &mut Cache, - pre: Option<&mut prefilter::Scanner>, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, - ) -> Result<Option<HalfMatch>, MatchError> { - search::find_earliest_fwd( - pre, self, cache, pattern_id, bytes, start, end, - ) - } - - /// Executes a reverse search and returns the start position of the first - /// match that is found as early as possible. If no match exists, then - /// `None` is returned. - /// - /// This routine stops scanning input as soon as the search observes a - /// match state. - /// - /// This is like [`DFA::find_earliest_rev`], except it provides some - /// additional control over how the search is executed. See the - /// documentation of [`DFA::find_earliest_fwd_at`] for more details - /// on the additional parameters along with examples of their usage. - /// - /// # Errors - /// - /// This routine only errors if the search could not complete. For - /// lazy DFAs generated by this crate, this only occurs in non-default - /// configurations where quit bytes are used, Unicode word boundaries are - /// heuristically enabled or limits are set on the number of times the lazy - /// DFA's cache may be cleared. - /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// # Panics - /// - /// This routine panics if a `pattern_id` is given and the underlying - /// DFA does not support specific pattern searches. - /// - /// It also panics if the given haystack range is not valid. - #[inline] - pub fn find_earliest_rev_at( - &self, - cache: &mut Cache, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, - ) -> Result<Option<HalfMatch>, MatchError> { - search::find_earliest_rev(self, cache, pattern_id, bytes, start, end) + input: &Input<'_>, + state: &mut OverlappingState, + ) -> Result<(), MatchError> { + let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); + search::find_overlapping_rev(self, cache, input, state)?; + match state.get_match() { + None => Ok(()), + Some(_) if !utf8empty => Ok(()), + Some(_) => skip_empty_utf8_splits_overlapping( + input, + state, + |input, state| { + search::find_overlapping_rev(self, cache, input, state) + }, + ), + } } - /// Executes a forward search and returns the end position of the leftmost - /// match that is found. If no match exists, then `None` is returned. - /// - /// This is like [`DFA::find_leftmost_fwd`], except it provides some - /// additional control over how the search is executed. See the - /// documentation of [`DFA::find_earliest_fwd_at`] for more details on the - /// additional parameters along with examples of their usage. - /// - /// # Errors + /// Writes the set of patterns that match anywhere in the given search + /// configuration to `patset`. If multiple patterns match at the same + /// position and the underlying DFA supports overlapping matches, then all + /// matching patterns are written to the given set. /// - /// This routine only errors if the search could not complete. For - /// lazy DFAs generated by this crate, this only occurs in non-default - /// configurations where quit bytes are used, Unicode word boundaries are - /// heuristically enabled or limits are set on the number of times the lazy - /// DFA's cache may be cleared. + /// Unless all of the patterns in this DFA are anchored, then generally + /// speaking, this will visit every byte in the haystack. /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. + /// This search routine *does not* clear the pattern set. This gives some + /// flexibility to the caller (e.g., running multiple searches with the + /// same pattern set), but does make the API bug-prone if you're reusing + /// the same pattern set for multiple searches but intended them to be + /// independent. /// - /// # Panics - /// - /// This routine panics if a `pattern_id` is given and the underlying - /// DFA does not support specific pattern searches. - /// - /// It also panics if the given haystack range is not valid. - #[inline] - pub fn find_leftmost_fwd_at( - &self, - cache: &mut Cache, - pre: Option<&mut prefilter::Scanner>, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, - ) -> Result<Option<HalfMatch>, MatchError> { - search::find_leftmost_fwd( - pre, self, cache, pattern_id, bytes, start, end, - ) - } - - /// Executes a reverse search and returns the start of the position of the - /// leftmost match that is found. If no match exists, then `None` is - /// returned. - /// - /// This is like [`DFA::find_leftmost_rev`], except it provides some - /// additional control over how the search is executed. See the - /// documentation of [`DFA::find_earliest_fwd_at`] for more details on the - /// additional parameters along with examples of their usage. + /// If a pattern ID matched but the given `PatternSet` does not have + /// sufficient capacity to store it, then it is not inserted and silently + /// dropped. /// /// # Errors /// - /// This routine only errors if the search could not complete. For - /// lazy DFAs generated by this crate, this only occurs in non-default - /// configurations where quit bytes are used, Unicode word boundaries are - /// heuristically enabled or limits are set on the number of times the lazy - /// DFA's cache may be cleared. - /// - /// When a search cannot complete, callers cannot know whether a match + /// This routine errors if the search could not complete. This can occur + /// in a number of circumstances: + /// + /// * The configuration of the lazy DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the lazy DFA quitting. + /// * The configuration of the lazy DFA may also permit it to "give up" + /// on a search if it makes ineffective use of its transition table + /// cache. The default configuration does not enable this by default, + /// although it is typically a good idea to. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. + /// + /// When a search returns an error, callers cannot know whether a match /// exists or not. /// - /// # Panics - /// - /// This routine panics if a `pattern_id` is given and the underlying - /// DFA does not support specific pattern searches. - /// - /// It also panics if the given haystack range is not valid. - #[inline] - pub fn find_leftmost_rev_at( - &self, - cache: &mut Cache, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, - ) -> Result<Option<HalfMatch>, MatchError> { - search::find_leftmost_rev(self, cache, pattern_id, bytes, start, end) - } - - /// Executes an overlapping forward search and returns the end position of - /// matches as they are found. If no match exists, then `None` is returned. - /// - /// This routine is principally only useful when searching for multiple - /// patterns on inputs where multiple patterns may match the same regions - /// of text. In particular, callers must preserve the automaton's search - /// state from prior calls so that the implementation knows where the last - /// match occurred. - /// - /// This is like [`DFA::find_overlapping_fwd`], except it provides - /// some additional control over how the search is executed. See the - /// documentation of [`DFA::find_earliest_fwd_at`] for more details - /// on the additional parameters along with examples of their usage. - /// - /// When using this routine to implement an iterator of overlapping - /// matches, the `start` of the search should always be set to the end - /// of the last match. If more patterns match at the previous location, - /// then they will be immediately returned. (This is tracked by the given - /// overlapping state.) Otherwise, the search continues at the starting - /// position given. - /// - /// If for some reason you want the search to forget about its previous - /// state and restart the search at a particular position, then setting the - /// state to [`OverlappingState::start`] will accomplish that. - /// - /// # Errors + /// # Example /// - /// This routine only errors if the search could not complete. For - /// lazy DFAs generated by this crate, this only occurs in non-default - /// configurations where quit bytes are used, Unicode word boundaries are - /// heuristically enabled or limits are set on the number of times the lazy - /// DFA's cache may be cleared. + /// This example shows how to find all matching patterns in a haystack, + /// even when some patterns match at the same position as other patterns. /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{ + /// hybrid::dfa::DFA, + /// Input, MatchKind, PatternSet, + /// }; /// - /// # Panics + /// let patterns = &[ + /// r"\w+", r"\d+", r"\pL+", r"foo", r"bar", r"barfoo", r"foobar", + /// ]; + /// let dfa = DFA::builder() + /// .configure(DFA::config().match_kind(MatchKind::All)) + /// .build_many(patterns)?; + /// let mut cache = dfa.create_cache(); /// - /// This routine panics if a `pattern_id` is given and the underlying - /// DFA does not support specific pattern searches. + /// let input = Input::new("foobar"); + /// let mut patset = PatternSet::new(dfa.pattern_len()); + /// dfa.try_which_overlapping_matches(&mut cache, &input, &mut patset)?; + /// let expected = vec![0, 2, 3, 4, 6]; + /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect(); + /// assert_eq!(expected, got); /// - /// It also panics if the given haystack range is not valid. + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` #[inline] - pub fn find_overlapping_fwd_at( + pub fn try_which_overlapping_matches( &self, cache: &mut Cache, - pre: Option<&mut prefilter::Scanner>, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, - state: &mut OverlappingState, - ) -> Result<Option<HalfMatch>, MatchError> { - search::find_overlapping_fwd( - pre, self, cache, pattern_id, bytes, start, end, state, - ) + input: &Input<'_>, + patset: &mut PatternSet, + ) -> Result<(), MatchError> { + let mut state = OverlappingState::start(); + while let Some(m) = { + self.try_search_overlapping_fwd(cache, input, &mut state)?; + state.get_match() + } { + let _ = patset.try_insert(m.pattern()); + // There's nothing left to find, so we can stop. Or the caller + // asked us to. + if patset.is_full() || input.get_earliest() { + break; + } + } + Ok(()) } } @@ -1189,7 +1189,7 @@ impl DFA { /// haystack by using the `next_state` method. /// /// ``` - /// use regex_automata::hybrid::dfa::DFA; + /// use regex_automata::{hybrid::dfa::DFA, Input}; /// /// let dfa = DFA::new(r"[a-z]+r")?; /// let mut cache = dfa.create_cache(); @@ -1198,7 +1198,7 @@ impl DFA { /// // The start state is determined by inspecting the position and the /// // initial bytes of the haystack. /// let mut sid = dfa.start_state_forward( - /// &mut cache, None, haystack, 0, haystack.len(), + /// &mut cache, &Input::new(haystack), /// )?; /// // Walk all the bytes in the haystack. /// for &b in haystack { @@ -1289,7 +1289,7 @@ impl DFA { /// haystack by using the `next_state_untagged` method where possible. /// /// ``` - /// use regex_automata::hybrid::dfa::DFA; + /// use regex_automata::{hybrid::dfa::DFA, Input}; /// /// let dfa = DFA::new(r"[a-z]+r")?; /// let mut cache = dfa.create_cache(); @@ -1298,7 +1298,7 @@ impl DFA { /// // The start state is determined by inspecting the position and the /// // initial bytes of the haystack. /// let mut sid = dfa.start_state_forward( - /// &mut cache, None, haystack, 0, haystack.len(), + /// &mut cache, &Input::new(haystack), /// )?; /// // Walk all the bytes in the haystack. /// let mut at = 0; @@ -1478,7 +1478,7 @@ impl DFA { /// and then finishing the search with the final EOI transition. /// /// ``` - /// use regex_automata::hybrid::dfa::DFA; + /// use regex_automata::{hybrid::dfa::DFA, Input}; /// /// let dfa = DFA::new(r"[a-z]+r")?; /// let mut cache = dfa.create_cache(); @@ -1487,7 +1487,7 @@ impl DFA { /// // The start state is determined by inspecting the position and the /// // initial bytes of the haystack. /// let mut sid = dfa.start_state_forward( - /// &mut cache, None, haystack, 0, haystack.len(), + /// &mut cache, &Input::new(haystack), /// )?; /// // Walk all the bytes in the haystack. /// for &b in haystack { @@ -1524,44 +1524,43 @@ impl DFA { /// Unlike typical DFA implementations, the start state for DFAs in this /// crate is dependent on a few different factors: /// - /// * The pattern ID, if present. When the underlying DFA has been - /// configured with multiple patterns _and_ the DFA has been configured to - /// build an anchored start state for each pattern, then a pattern ID may - /// be specified to execute an anchored search for that specific pattern. - /// If `pattern_id` is invalid or if the DFA isn't configured to build - /// start states for each pattern, then implementations must panic. DFAs in - /// this crate can be configured to build start states for each pattern via - /// [`Config::starts_for_each_pattern`]. - /// * When `start > 0`, the byte at index `start - 1` may influence the - /// start state if the regex uses `^` or `\b`. - /// * Similarly, when `start == 0`, it may influence the start state when - /// the regex uses `^` or `\A`. - /// * Currently, `end` is unused. + /// * The [`Anchored`] mode of the search. Unanchored, anchored and + /// anchored searches for a specific [`PatternID`] all use different start + /// states. + /// * The position at which the search begins, via [`Input::start`]. This + /// and the byte immediately preceding the start of the search (if one + /// exists) influence which look-behind assertions are true at the start + /// of the search. This in turn influences which start state is selected. /// * Whether the search is a forward or reverse search. This routine can /// only be used for forward searches. /// - /// # Panics + /// # Errors /// - /// This panics if `start..end` is not a valid sub-slice of `bytes`. This - /// also panics if `pattern_id` is non-None and does not refer to a valid - /// pattern, or if the DFA was not configured to build anchored start - /// states for each pattern. - #[inline] + /// This may return a [`MatchError`] (not a [`CacheError`]!) if the search + /// needs to give up when determining the start state (for example, if + /// it sees a "quit" byte or if the cache has been cleared too many + /// times). This can also return an error if the given `Input` contains an + /// unsupported [`Anchored`] configuration. + #[cfg_attr(feature = "perf-inline", inline(always))] pub fn start_state_forward( &self, cache: &mut Cache, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, - ) -> Result<LazyStateID, CacheError> { - let mut lazy = Lazy::new(self, cache); - let start_type = Start::from_position_fwd(bytes, start, end); - let sid = lazy.as_ref().get_cached_start_id(pattern_id, start_type); - if !sid.is_unknown() { - return Ok(sid); + input: &Input<'_>, + ) -> Result<LazyStateID, MatchError> { + if !self.quitset.is_empty() && input.start() > 0 { + let offset = input.start() - 1; + let byte = input.haystack()[offset]; + if self.quitset.contains(byte) { + return Err(MatchError::quit(byte, offset)); + } + } + let start_type = self.start_map.fwd(input); + let start = LazyRef::new(self, cache) + .get_cached_start_id(input, start_type)?; + if !start.is_unknown() { + return Ok(start); } - lazy.cache_start_group(pattern_id, start_type) + Lazy::new(self, cache).cache_start_group(input, start_type) } /// Return the ID of the start state for this lazy DFA when executing a @@ -1570,44 +1569,43 @@ impl DFA { /// Unlike typical DFA implementations, the start state for DFAs in this /// crate is dependent on a few different factors: /// - /// * The pattern ID, if present. When the underlying DFA has been - /// configured with multiple patterns _and_ the DFA has been configured to - /// build an anchored start state for each pattern, then a pattern ID may - /// be specified to execute an anchored search for that specific pattern. - /// If `pattern_id` is invalid or if the DFA isn't configured to build - /// start states for each pattern, then implementations must panic. DFAs in - /// this crate can be configured to build start states for each pattern via - /// [`Config::starts_for_each_pattern`]. - /// * When `end < bytes.len()`, the byte at index `end` may influence the - /// start state if the regex uses `$` or `\b`. - /// * Similarly, when `end == bytes.len()`, it may influence the start - /// state when the regex uses `$` or `\z`. - /// * Currently, `start` is unused. + /// * The [`Anchored`] mode of the search. Unanchored, anchored and + /// anchored searches for a specific [`PatternID`] all use different start + /// states. + /// * The position at which the search begins, via [`Input::start`]. This + /// and the byte immediately preceding the start of the search (if one + /// exists) influence which look-behind assertions are true at the start + /// of the search. This in turn influences which start state is selected. /// * Whether the search is a forward or reverse search. This routine can /// only be used for reverse searches. /// - /// # Panics + /// # Errors /// - /// This panics if `start..end` is not a valid sub-slice of `bytes`. This - /// also panics if `pattern_id` is non-None and does not refer to a valid - /// pattern, or if the DFA was not configured to build anchored start - /// states for each pattern. - #[inline] + /// This may return a [`MatchError`] (not a [`CacheError`]!) if the search + /// needs to give up when determining the start state (for example, if + /// it sees a "quit" byte or if the cache has been cleared too many + /// times). This can also return an error if the given `Input` contains an + /// unsupported [`Anchored`] configuration. + #[cfg_attr(feature = "perf-inline", inline(always))] pub fn start_state_reverse( &self, cache: &mut Cache, - pattern_id: Option<PatternID>, - bytes: &[u8], - start: usize, - end: usize, - ) -> Result<LazyStateID, CacheError> { - let mut lazy = Lazy::new(self, cache); - let start_type = Start::from_position_rev(bytes, start, end); - let sid = lazy.as_ref().get_cached_start_id(pattern_id, start_type); - if !sid.is_unknown() { - return Ok(sid); + input: &Input<'_>, + ) -> Result<LazyStateID, MatchError> { + if !self.quitset.is_empty() && input.end() < input.haystack().len() { + let offset = input.end(); + let byte = input.haystack()[offset]; + if self.quitset.contains(byte) { + return Err(MatchError::quit(byte, offset)); + } } - lazy.cache_start_group(pattern_id, start_type) + let start_type = self.start_map.rev(input); + let start = LazyRef::new(self, cache) + .get_cached_start_id(input, start_type)?; + if !start.is_unknown() { + return Ok(start); + } + Lazy::new(self, cache).cache_start_group(input, start_type) } /// Returns the total number of patterns that match in this state. @@ -1616,9 +1614,11 @@ impl DFA { /// necessarily always return `1` for all match states. /// /// A lazy DFA guarantees that [`DFA::match_pattern`] can be called with - /// indices up to (but not including) the count returned by this routine + /// indices up to (but not including) the length returned by this routine /// without panicking. /// + /// # Panics + /// /// If the given state is not a match state, then this may either panic /// or return an incorrect result. /// @@ -1629,12 +1629,10 @@ impl DFA { /// patterns have matched in a particular state, but also how to access /// which specific patterns have matched. /// - /// Notice that we must use [`MatchKind::All`](crate::MatchKind::All) - /// when building the DFA. If we used - /// [`MatchKind::LeftmostFirst`](crate::MatchKind::LeftmostFirst) - /// instead, then the DFA would not be constructed in a way that supports - /// overlapping matches. (It would only report a single pattern that - /// matches at any particular point in time.) + /// Notice that we must use [`MatchKind::All`] when building the DFA. If we + /// used [`MatchKind::LeftmostFirst`] instead, then the DFA would not be + /// constructed in a way that supports overlapping matches. (It would only + /// report a single pattern that matches at any particular point in time.) /// /// Another thing to take note of is the patterns used and the order in /// which the pattern IDs are reported. In the example below, pattern `3` @@ -1645,7 +1643,8 @@ impl DFA { /// other. /// /// ``` - /// use regex_automata::{hybrid::dfa::DFA, MatchKind}; + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{hybrid::dfa::DFA, Input, MatchKind}; /// /// let dfa = DFA::builder() /// .configure(DFA::config().match_kind(MatchKind::All)) @@ -1658,7 +1657,7 @@ impl DFA { /// // The start state is determined by inspecting the position and the /// // initial bytes of the haystack. /// let mut sid = dfa.start_state_forward( - /// &mut cache, None, haystack, 0, haystack.len(), + /// &mut cache, &Input::new(haystack), /// )?; /// // Walk all the bytes in the haystack. /// for &b in haystack { @@ -1667,8 +1666,8 @@ impl DFA { /// sid = dfa.next_eoi_state(&mut cache, sid)?; /// /// assert!(sid.is_match()); - /// assert_eq!(dfa.match_count(&mut cache, sid), 3); - /// // The following calls are guaranteed to not panic since `match_count` + /// assert_eq!(dfa.match_len(&mut cache, sid), 3); + /// // The following calls are guaranteed to not panic since `match_len` /// // returned `3` above. /// assert_eq!(dfa.match_pattern(&mut cache, sid, 0).as_usize(), 3); /// assert_eq!(dfa.match_pattern(&mut cache, sid, 1).as_usize(), 0); @@ -1677,22 +1676,22 @@ impl DFA { /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` #[inline] - pub fn match_count(&self, cache: &Cache, id: LazyStateID) -> usize { + pub fn match_len(&self, cache: &Cache, id: LazyStateID) -> usize { assert!(id.is_match()); - LazyRef::new(self, cache).get_cached_state(id).match_count() + LazyRef::new(self, cache).get_cached_state(id).match_len() } /// Returns the pattern ID corresponding to the given match index in the /// given state. /// - /// See [`DFA::match_count`] for an example of how to use this method + /// See [`DFA::match_len`] for an example of how to use this method /// correctly. Note that if you know your lazy DFA is configured with a /// single pattern, then this routine is never necessary since it will /// always return a pattern ID of `0` for an index of `0` when `id` /// corresponds to a match state. /// /// Typically, this routine is used when implementing an overlapping - /// search, as the example for `DFA::match_count` does. + /// search, as the example for `DFA::match_len` does. /// /// # Panics /// @@ -1713,7 +1712,7 @@ impl DFA { // that finds the pattern ID from the corresponding `State`, which // requires a bit of slicing/pointer-chasing. This optimization tends // to only matter when matches are frequent. - if self.pattern_count() == 1 { + if self.pattern_len() == 1 { return PatternID::ZERO; } LazyRef::new(self, cache) @@ -1809,6 +1808,25 @@ pub struct Cache { /// clear count is set, then the cache will return an error instead of /// clearing the cache if the count has been exceeded. clear_count: usize, + /// The total number of bytes searched since the last time this cache was + /// cleared, not including the current search. + /// + /// This can be added to the length of the current search to get the true + /// total number of bytes searched. + /// + /// This is generally only non-zero when the + /// `Cache::search_{start,update,finish}` APIs are used to track search + /// progress. + bytes_searched: usize, + /// The progress of the current search. + /// + /// This is only non-`None` when callers utlize the `Cache::search_start`, + /// `Cache::search_update` and `Cache::search_finish` APIs. + /// + /// The purpose of recording search progress is to be able to make a + /// determination about the efficiency of the cache. Namely, by keeping + /// track of the + progress: Option<SearchProgress>, } impl Cache { @@ -1823,14 +1841,18 @@ impl Cache { starts: alloc::vec![], states: alloc::vec![], states_to_id: StateMap::new(), - sparses: SparseSets::new(dfa.nfa.len()), + sparses: SparseSets::new(dfa.get_nfa().states().len()), stack: alloc::vec![], scratch_state_builder: StateBuilderEmpty::new(), state_saver: StateSaver::none(), memory_usage_state: 0, clear_count: 0, + bytes_searched: 0, + progress: None, }; + debug!("pre-init lazy DFA cache size: {}", cache.memory_usage()); Lazy { dfa, cache: &mut cache }.init_cache(); + debug!("post-init lazy DFA cache size: {}", cache.memory_usage()); cache } @@ -1852,7 +1874,8 @@ impl Cache { /// This shows how to re-purpose a cache for use with a different DFA. /// /// ``` - /// use regex_automata::{hybrid::dfa::DFA, HalfMatch}; + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; /// /// let dfa1 = DFA::new(r"\w")?; /// let dfa2 = DFA::new(r"\W")?; @@ -1860,7 +1883,7 @@ impl Cache { /// let mut cache = dfa1.create_cache(); /// assert_eq!( /// Some(HalfMatch::must(0, 2)), - /// dfa1.find_leftmost_fwd(&mut cache, "Δ".as_bytes())?, + /// dfa1.try_search_fwd(&mut cache, &Input::new("Δ"))?, /// ); /// /// // Using 'cache' with dfa2 is not allowed. It may result in panics or @@ -1872,7 +1895,7 @@ impl Cache { /// cache.reset(&dfa2); /// assert_eq!( /// Some(HalfMatch::must(0, 3)), - /// dfa2.find_leftmost_fwd(&mut cache, "☃".as_bytes())?, + /// dfa2.try_search_fwd(&mut cache, &Input::new("☃"))?, /// ); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) @@ -1881,6 +1904,69 @@ impl Cache { Lazy::new(dfa, self).reset_cache() } + /// Initializes a new search starting at the given position. + /// + /// If a previous search was unfinished, then it is finished automatically + /// and a new search is begun. + /// + /// Note that keeping track of search progress is _not necessary_ + /// for correct implementations of search using a lazy DFA. Keeping + /// track of search progress is only necessary if you want the + /// [`Config::minimum_bytes_per_state`] configuration knob to work. + #[inline] + pub fn search_start(&mut self, at: usize) { + // If a previous search wasn't marked as finished, then finish it + // now automatically. + if let Some(p) = self.progress.take() { + self.bytes_searched += p.len(); + } + self.progress = Some(SearchProgress { start: at, at }); + } + + /// Updates the current search to indicate that it has search to the + /// current position. + /// + /// No special care needs to be taken for reverse searches. Namely, the + /// position given may be _less than_ the starting position of the search. + /// + /// # Panics + /// + /// This panics if no search has been started by [`Cache::search_start`]. + #[inline] + pub fn search_update(&mut self, at: usize) { + let p = + self.progress.as_mut().expect("no in-progress search to update"); + p.at = at; + } + + /// Indicates that a search has finished at the given position. + /// + /// # Panics + /// + /// This panics if no search has been started by [`Cache::search_start`]. + #[inline] + pub fn search_finish(&mut self, at: usize) { + let mut p = + self.progress.take().expect("no in-progress search to finish"); + p.at = at; + self.bytes_searched += p.len(); + } + + /// Returns the total number of bytes that have been searched since this + /// cache was last cleared. + /// + /// This is useful for determining the efficiency of the cache. For + /// example, the lazy DFA uses this value in conjunction with the + /// [`Config::minimum_bytes_per_state`] knob to help determine whether it + /// should quit searching. + /// + /// This always returns `0` if search progress isn't being tracked. Note + /// that the lazy DFA search routines in this crate always track search + /// progress. + pub fn search_total_len(&self) -> usize { + self.bytes_searched + self.progress.as_ref().map_or(0, |p| p.len()) + } + /// Returns the total number of times this cache has been cleared since it /// was either created or last reset. /// @@ -1899,6 +1985,9 @@ impl Cache { const ID_SIZE: usize = size_of::<LazyStateID>(); const STATE_SIZE: usize = size_of::<State>(); + // NOTE: If you make changes to the below, then + // 'minimum_cache_capacity' should be updated correspondingly. + self.trans.len() * ID_SIZE + self.starts.len() * ID_SIZE + self.states.len() * STATE_SIZE @@ -1912,6 +2001,32 @@ impl Cache { } } +/// Keeps track of the progress of the current search. +/// +/// This is updated via the `Cache::search_{start,update,finish}` APIs to +/// record how many bytes have been searched. This permits computing a +/// heuristic that represents the efficiency of a cache, and thus helps inform +/// whether the lazy DFA should give up or not. +#[derive(Clone, Debug)] +struct SearchProgress { + start: usize, + at: usize, +} + +impl SearchProgress { + /// Returns the length, in bytes, of this search so far. + /// + /// This automatically handles the case of a reverse search, where `at` + /// is likely to be less than `start`. + fn len(&self) -> usize { + if self.start <= self.at { + self.at - self.start + } else { + self.start - self.at + } + } +} + /// A map from states to state identifiers. When using std, we use a standard /// hashmap, since it's a bit faster for this use case. (Other maps, like /// one's based on FNV, have not yet been benchmarked.) @@ -1960,6 +2075,7 @@ impl<'i, 'c> Lazy<'i, 'c> { /// /// With 'inline(never)' hyperfine reports 1.1s per run. With /// 'inline(always)', hyperfine reports 1.23s. So that's a 10% improvement. + #[cold] #[inline(never)] fn cache_next_state( &mut self, @@ -1969,8 +2085,8 @@ impl<'i, 'c> Lazy<'i, 'c> { let stride2 = self.dfa.stride2(); let empty_builder = self.get_state_builder(); let builder = determinize::next( - &self.dfa.nfa, - self.dfa.match_kind, + self.dfa.get_nfa(), + self.dfa.get_config().get_match_kind(), &mut self.cache.sparses, &mut self.cache.stack, &self.cache.states[current.as_usize_untagged() >> stride2], @@ -2002,26 +2118,32 @@ impl<'i, 'c> Lazy<'i, 'c> { /// /// If caching this state would otherwise result in a cache that has been /// cleared too many times, then an error is returned. + #[cold] + #[inline(never)] fn cache_start_group( &mut self, - pattern_id: Option<PatternID>, + input: &Input<'_>, start: Start, - ) -> Result<LazyStateID, CacheError> { - let nfa_start_id = match pattern_id { - Some(pid) => { - assert!( - self.dfa.starts_for_each_pattern, - "attempted to search for a specific pattern \ - without enabling starts_for_each_pattern", - ); - self.dfa.nfa.start_pattern(pid) + ) -> Result<LazyStateID, MatchError> { + let mode = input.get_anchored(); + let nfa_start_id = match mode { + Anchored::No => self.dfa.get_nfa().start_unanchored(), + Anchored::Yes => self.dfa.get_nfa().start_anchored(), + Anchored::Pattern(pid) => { + if !self.dfa.get_config().get_starts_for_each_pattern() { + return Err(MatchError::unsupported_anchored(mode)); + } + match self.dfa.get_nfa().start_pattern(pid) { + None => return Ok(self.as_ref().dead_id()), + Some(sid) => sid, + } } - None if self.dfa.anchored => self.dfa.nfa.start_anchored(), - None => self.dfa.nfa.start_unanchored(), }; - let id = self.cache_start_one(nfa_start_id, start)?; - self.set_start_state(pattern_id, start, id); + let id = self + .cache_start_one(nfa_start_id, start) + .map_err(|_| MatchError::gave_up(input.start()))?; + self.set_start_state(input, start, id); Ok(id) } @@ -2042,22 +2164,33 @@ impl<'i, 'c> Lazy<'i, 'c> { start: Start, ) -> Result<LazyStateID, CacheError> { let mut builder_matches = self.get_state_builder().into_matches(); - determinize::set_lookbehind_from_start(&start, &mut builder_matches); + determinize::set_lookbehind_from_start( + self.dfa.get_nfa(), + &start, + &mut builder_matches, + ); self.cache.sparses.set1.clear(); determinize::epsilon_closure( - self.dfa.nfa.borrow(), + self.dfa.get_nfa(), nfa_start_id, - *builder_matches.look_have(), + builder_matches.look_have(), &mut self.cache.stack, &mut self.cache.sparses.set1, ); let mut builder = builder_matches.into_nfa(); determinize::add_nfa_states( - self.dfa.nfa.borrow(), + &self.dfa.get_nfa(), &self.cache.sparses.set1, &mut builder, ); - self.add_builder_state(builder, |id| id.to_start()) + let tag_starts = self.dfa.get_config().get_specialize_start_states(); + self.add_builder_state(builder, |id| { + if tag_starts { + id.to_start() + } else { + id + } + }) } /// Either add the given builder state to this cache, or return an ID to an @@ -2164,7 +2297,7 @@ impl<'i, 'c> Lazy<'i, 'c> { /// clearings, then this will return a cache error. In this case, /// callers should bubble this up as the cache can't be used until it is /// reset. Implementations of search should convert this error into a - /// `MatchError::GaveUp`. + /// [`MatchError::gave_up`]. /// /// If 'self.state_saver' is set to save a state, then this state is /// persisted through cache clearing. Otherwise, the cache is returned to @@ -2175,21 +2308,68 @@ impl<'i, 'c> Lazy<'i, 'c> { /// Otherwise, any lazy state ID generated by the cache prior to resetting /// it is invalid after the reset. fn try_clear_cache(&mut self) -> Result<(), CacheError> { - // Currently, the only heuristic we use is the minimum cache clear - // count. If we pass that minimum, then we give up. - // - // It would be good to also add a heuristic based on "bytes searched - // per generated state," but this requires API design work. Namely, - // we really do not want to add a counter increment to the transition - // function, which implies we need to expose APIs to update the number - // of bytes searched by implementers of the search routines. And that - // doesn't seem great... But we should do it if this heuristic isn't - // enough. (The original lazy DFA implementation in the 'regex' crate - // had this heuristic, since the lazy DFA was coupled with the search - // routines.) - if let Some(min_count) = self.dfa.minimum_cache_clear_count { + let c = self.dfa.get_config(); + if let Some(min_count) = c.get_minimum_cache_clear_count() { if self.cache.clear_count >= min_count { - return Err(CacheError::too_many_cache_clears()); + if let Some(min_bytes_per) = c.get_minimum_bytes_per_state() { + let len = self.cache.search_total_len(); + let min_bytes = + min_bytes_per.saturating_mul(self.cache.states.len()); + // If we've searched 0 bytes then probably something has + // gone wrong and the lazy DFA search implementation isn't + // correctly updating the search progress state. + if len == 0 { + trace!( + "number of bytes searched is 0, but \ + a minimum bytes per state searched ({}) is \ + enabled, maybe Cache::search_update \ + is not being used?", + min_bytes_per, + ); + } + if len < min_bytes { + trace!( + "lazy DFA cache has been cleared {} times, \ + which exceeds the limit of {}, \ + AND its bytes searched per state is less \ + than the configured minimum of {}, \ + therefore lazy DFA is giving up \ + (bytes searched since cache clear = {}, \ + number of states = {})", + self.cache.clear_count, + min_count, + min_bytes_per, + len, + self.cache.states.len(), + ); + return Err(CacheError::bad_efficiency()); + } else { + trace!( + "lazy DFA cache has been cleared {} times, \ + which exceeds the limit of {}, \ + AND its bytes searched per state is greater \ + than the configured minimum of {}, \ + therefore lazy DFA is continuing! \ + (bytes searched since cache clear = {}, \ + number of states = {})", + self.cache.clear_count, + min_count, + min_bytes_per, + len, + self.cache.states.len(), + ); + } + } else { + trace!( + "lazy DFA cache has been cleared {} times, \ + which exceeds the limit of {}, \ + since there is no configured bytes per state \ + minimum, lazy DFA is giving up", + self.cache.clear_count, + min_count, + ); + return Err(CacheError::too_many_cache_clears()); + } } } self.clear_cache(); @@ -2209,18 +2389,13 @@ impl<'i, 'c> Lazy<'i, 'c> { // If a new DFA is used, it might have a different number of NFA // states, so we need to make sure our sparse sets have the appropriate // size. - self.cache.sparses.resize(self.dfa.nfa.len()); + self.cache.sparses.resize(self.dfa.get_nfa().states().len()); self.cache.clear_count = 0; + self.cache.progress = None; } /// Clear the cache used by this lazy DFA. /// - /// If clearing the cache exceeds the minimum number of required cache - /// clearings, then this will return a cache error. In this case, - /// callers should bubble this up as the cache can't be used until it is - /// reset. Implementations of search should convert this error into a - /// `MatchError::GaveUp`. - /// /// If 'self.state_saver' is set to save a state, then this state is /// persisted through cache clearing. Otherwise, the cache is returned to /// its state after initialization with two exceptions: its clear count @@ -2236,6 +2411,10 @@ impl<'i, 'c> Lazy<'i, 'c> { self.cache.states_to_id.clear(); self.cache.memory_usage_state = 0; self.cache.clear_count += 1; + self.cache.bytes_searched = 0; + if let Some(ref mut progress) = self.cache.progress { + progress.start = progress.at; + } trace!( "lazy DFA cache has been cleared (count: {})", self.cache.clear_count @@ -2260,6 +2439,10 @@ impl<'i, 'c> Lazy<'i, 'c> { let new_id = self .add_state(state, |id| { if old_id.is_start() { + // We don't need to consult the + // 'specialize_start_states' config knob here, because + // if it's disabled, old_id.is_start() will never + // return true. id.to_start() } else { id @@ -2282,9 +2465,13 @@ impl<'i, 'c> Lazy<'i, 'c> { /// Primarily, this adds the three sentinel states and allocates some /// initial memory. fn init_cache(&mut self) { - let mut starts_len = Start::count(); - if self.dfa.starts_for_each_pattern { - starts_len += Start::count() * self.dfa.pattern_count(); + // Why multiply by 2 here? Because we make room for both the unanchored + // and anchored start states. Unanchored is first and then anchored. + let mut starts_len = Start::len().checked_mul(2).unwrap(); + // ... but if we also want start states for every pattern, we make room + // for that too. + if self.dfa.get_config().get_starts_for_each_pattern() { + starts_len += Start::len() * self.dfa.pattern_len(); } self.cache .starts @@ -2357,7 +2544,7 @@ impl<'i, 'c> Lazy<'i, 'c> { /// Set all transitions on the state 'from' to 'to'. fn set_all_transitions(&mut self, from: LazyStateID, to: LazyStateID) { - for unit in self.dfa.classes.representatives() { + for unit in self.dfa.classes.representatives(..) { self.set_transition(from, unit, to); } } @@ -2387,22 +2574,23 @@ impl<'i, 'c> Lazy<'i, 'c> { /// 'starts_for_each_pattern' is not enabled. fn set_start_state( &mut self, - pattern_id: Option<PatternID>, + input: &Input<'_>, start: Start, id: LazyStateID, ) { assert!(self.as_ref().is_valid(id)); let start_index = start.as_usize(); - let index = match pattern_id { - None => start_index, - Some(pid) => { + let index = match input.get_anchored() { + Anchored::No => start_index, + Anchored::Yes => Start::len() + start_index, + Anchored::Pattern(pid) => { assert!( - self.dfa.starts_for_each_pattern, + self.dfa.get_config().get_starts_for_each_pattern(), "attempted to search for a specific pattern \ without enabling starts_for_each_pattern", ); let pid = pid.as_usize(); - Start::count() + (Start::count() * pid) + start_index + (2 * Start::len()) + (Start::len() * pid) + start_index } }; self.cache.starts[index] = id; @@ -2451,25 +2639,30 @@ impl<'i, 'c> LazyRef<'i, 'c> { /// /// If the start state has not yet been computed, then this returns an /// unknown lazy state ID. + #[cfg_attr(feature = "perf-inline", inline(always))] fn get_cached_start_id( &self, - pattern_id: Option<PatternID>, + input: &Input<'_>, start: Start, - ) -> LazyStateID { + ) -> Result<LazyStateID, MatchError> { let start_index = start.as_usize(); - let index = match pattern_id { - None => start_index, - Some(pid) => { - let pid = pid.as_usize(); - assert!( - pid < self.dfa.pattern_count(), - "invalid pattern ID: {:?}", - pid - ); - Start::count() + (Start::count() * pid) + start_index + let mode = input.get_anchored(); + let index = match mode { + Anchored::No => start_index, + Anchored::Yes => Start::len() + start_index, + Anchored::Pattern(pid) => { + if !self.dfa.get_config().get_starts_for_each_pattern() { + return Err(MatchError::unsupported_anchored(mode)); + } + if pid.as_usize() >= self.dfa.pattern_len() { + return Ok(self.dead_id()); + } + (2 * Start::len()) + + (Start::len() * pid.as_usize()) + + start_index } }; - self.cache.starts[index] + Ok(self.cache.starts[index]) } /// Return the cached NFA/DFA powerset state for the given ID. @@ -2530,6 +2723,11 @@ impl<'i, 'c> LazyRef<'i, 'c> { fn state_fits_in_cache(&self, state: &State) -> bool { let needed = self.cache.memory_usage() + self.memory_usage_for_one_more_state(state.memory_usage()); + trace!( + "lazy DFA cache capacity check: {:?} ?<=? {:?}", + needed, + self.dfa.cache_capacity + ); needed <= self.dfa.cache_capacity } @@ -2573,7 +2771,7 @@ enum StateSaver { /// is stored in 'Saved' since it may have changed. ToSave { id: LazyStateID, state: State }, /// An ID that of a state that has been persisted through a lazy DFA - /// cache clearing. The ID recorded here corresonds to an ID that was + /// cache clearing. The ID recorded here corresponds to an ID that was /// once marked as ToSave. The IDs are likely not equivalent even though /// the states they point to are. Saved(LazyStateID), @@ -2620,14 +2818,11 @@ impl StateSaver { /// A lazy DFA configuration is a simple data object that is typically used /// with [`Builder::configure`]. /// -/// The default configuration guarantees that a search will _never_ return -/// a [`MatchError`] for any haystack or pattern. Setting a quit byte with -/// [`Config::quit`], enabling heuristic support for Unicode word boundaries -/// with [`Config::unicode_word_boundary`], or setting a minimum cache clear -/// count with [`Config::minimum_cache_clear_count`] can in turn cause a search -/// to return an error. See the corresponding configuration options for more -/// details on when those error conditions arise. -#[derive(Clone, Copy, Debug, Default)] +/// The default configuration guarantees that a search will never return a +/// "gave up" or "quit" error, although it is possible for a search to fail +/// if [`Config::starts_for_each_pattern`] wasn't enabled (which it is not by +/// default) and an [`Anchored::Pattern`] mode is requested via [`Input`]. +#[derive(Clone, Debug, Default)] pub struct Config { // As with other configuration types in this crate, we put all our knobs // in options so that we can distinguish between "default" and "not set." @@ -2636,15 +2831,17 @@ pub struct Config { // 'overwrite' method. // // For docs on the fields below, see the corresponding method setters. - anchored: Option<bool>, match_kind: Option<MatchKind>, + pre: Option<Option<Prefilter>>, starts_for_each_pattern: Option<bool>, byte_classes: Option<bool>, unicode_word_boundary: Option<bool>, quitset: Option<ByteSet>, + specialize_start_states: Option<bool>, cache_capacity: Option<usize>, skip_cache_capacity_check: Option<bool>, minimum_cache_clear_count: Option<Option<usize>>, + minimum_bytes_per_state: Option<Option<usize>>, } impl Config { @@ -2653,116 +2850,6 @@ impl Config { Config::default() } - /// Set whether matching must be anchored at the beginning of the input. - /// - /// When enabled, a match must begin at the start of a search. When - /// disabled (the default), the lazy DFA will act as if the pattern started - /// with a `(?s:.)*?`, which enables a match to appear anywhere. - /// - /// Note that if you want to run both anchored and unanchored - /// searches without building multiple automatons, you can enable the - /// [`Config::starts_for_each_pattern`] configuration instead. This will - /// permit unanchored any-pattern searches and pattern-specific anchored - /// searches. See the documentation for that configuration for an example. - /// - /// By default this is disabled. - /// - /// **WARNING:** this is subtly different than using a `^` at the start of - /// your regex. A `^` forces a regex to match exclusively at the start of - /// input, regardless of where you begin your search. In contrast, enabling - /// this option will allow your regex to match anywhere in your input, - /// but the match must start at the beginning of a search. (Most of the - /// higher level convenience search routines make "start of input" and - /// "start of search" equivalent, but some routines allow treating these as - /// orthogonal.) - /// - /// For example, consider the haystack `aba` and the following searches: - /// - /// 1. The regex `^a` is compiled with `anchored=false` and searches - /// `aba` starting at position `2`. Since `^` requires the match to - /// start at the beginning of the input and `2 > 0`, no match is found. - /// 2. The regex `a` is compiled with `anchored=true` and searches `aba` - /// starting at position `2`. This reports a match at `[2, 3]` since - /// the match starts where the search started. Since there is no `^`, - /// there is no requirement for the match to start at the beginning of - /// the input. - /// 3. The regex `a` is compiled with `anchored=true` and searches `aba` - /// starting at position `1`. Since `b` corresponds to position `1` and - /// since the regex is anchored, it finds no match. - /// 4. The regex `a` is compiled with `anchored=false` and searches `aba` - /// startting at position `1`. Since the regex is neither anchored nor - /// starts with `^`, the regex is compiled with an implicit `(?s:.)*?` - /// prefix that permits it to match anywhere. Thus, it reports a match - /// at `[2, 3]`. - /// - /// # Example - /// - /// This demonstrates the differences between an anchored search and - /// a pattern that begins with `^` (as described in the above warning - /// message). - /// - /// ``` - /// use regex_automata::{hybrid::dfa::DFA, HalfMatch}; - /// - /// let haystack = "aba".as_bytes(); - /// - /// let dfa = DFA::builder() - /// .configure(DFA::config().anchored(false)) // default - /// .build(r"^a")?; - /// let mut cache = dfa.create_cache(); - /// let got = dfa.find_leftmost_fwd_at( - /// &mut cache, None, None, haystack, 2, 3, - /// )?; - /// // No match is found because 2 is not the beginning of the haystack, - /// // which is what ^ requires. - /// let expected = None; - /// assert_eq!(expected, got); - /// - /// let dfa = DFA::builder() - /// .configure(DFA::config().anchored(true)) - /// .build(r"a")?; - /// let mut cache = dfa.create_cache(); - /// let got = dfa.find_leftmost_fwd_at( - /// &mut cache, None, None, haystack, 2, 3, - /// )?; - /// // An anchored search can still match anywhere in the haystack, it just - /// // must begin at the start of the search which is '2' in this case. - /// let expected = Some(HalfMatch::must(0, 3)); - /// assert_eq!(expected, got); - /// - /// let dfa = DFA::builder() - /// .configure(DFA::config().anchored(true)) - /// .build(r"a")?; - /// let mut cache = dfa.create_cache(); - /// let got = dfa.find_leftmost_fwd_at( - /// &mut cache, None, None, haystack, 1, 3, - /// )?; - /// // No match is found since we start searching at offset 1 which - /// // corresponds to 'b'. Since there is no '(?s:.)*?' prefix, no match - /// // is found. - /// let expected = None; - /// assert_eq!(expected, got); - /// - /// let dfa = DFA::builder() - /// .configure(DFA::config().anchored(false)) - /// .build(r"a")?; - /// let mut cache = dfa.create_cache(); - /// let got = dfa.find_leftmost_fwd_at( - /// &mut cache, None, None, haystack, 1, 3, - /// )?; - /// // Since anchored=false, an implicit '(?s:.)*?' prefix was added to the - /// // pattern. Even though the search starts at 'b', the 'match anything' - /// // prefix allows the search to match 'a'. - /// let expected = Some(HalfMatch::must(0, 3)); - /// assert_eq!(expected, got); - /// - /// # Ok::<(), Box<dyn std::error::Error>>(()) - /// ``` - pub fn anchored(mut self, yes: bool) -> Config { - self.anchored = Some(yes); - self - } - /// Set the desired match semantics. /// /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the @@ -2789,21 +2876,24 @@ impl Config { /// report overlapping matches. /// /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{ - /// hybrid::{dfa::DFA, OverlappingState}, - /// HalfMatch, MatchKind, + /// hybrid::dfa::{DFA, OverlappingState}, + /// HalfMatch, Input, MatchKind, /// }; /// /// let dfa = DFA::builder() /// .configure(DFA::config().match_kind(MatchKind::All)) /// .build_many(&[r"\w+$", r"\S+$"])?; /// let mut cache = dfa.create_cache(); - /// let haystack = "@foo".as_bytes(); + /// let haystack = "@foo"; /// let mut state = OverlappingState::start(); /// /// let expected = Some(HalfMatch::must(1, 4)); - /// let got = dfa.find_overlapping_fwd(&mut cache, haystack, &mut state)?; - /// assert_eq!(expected, got); + /// dfa.try_search_overlapping_fwd( + /// &mut cache, &Input::new(haystack), &mut state, + /// )?; + /// assert_eq!(expected, state.get_match()); /// /// // The first pattern also matches at the same position, so re-running /// // the search will yield another match. Notice also that the first @@ -2811,8 +2901,10 @@ impl Config { /// // pattern begins its match before the first, is therefore an earlier /// // match and is thus reported first. /// let expected = Some(HalfMatch::must(0, 4)); - /// let got = dfa.find_overlapping_fwd(&mut cache, haystack, &mut state)?; - /// assert_eq!(expected, got); + /// dfa.try_search_overlapping_fwd( + /// &mut cache, &Input::new(haystack), &mut state, + /// )?; + /// assert_eq!(expected, state.get_match()); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` @@ -2829,36 +2921,38 @@ impl Config { /// for you, so it's usually not necessary to do this yourself. /// /// ``` - /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchKind}; + /// use regex_automata::{ + /// hybrid::dfa::DFA, + /// nfa::thompson::NFA, + /// Anchored, HalfMatch, Input, MatchKind, + /// }; /// - /// let haystack = "123foobar456".as_bytes(); - /// let pattern = r"[a-z]+"; + /// let input = Input::new("123foobar456"); + /// let pattern = r"[a-z]+r"; /// /// let dfa_fwd = DFA::new(pattern)?; /// let dfa_rev = DFA::builder() - /// .configure(DFA::config() - /// .anchored(true) - /// .match_kind(MatchKind::All) - /// ) + /// .thompson(NFA::config().reverse(true)) + /// .configure(DFA::config().match_kind(MatchKind::All)) /// .build(pattern)?; /// let mut cache_fwd = dfa_fwd.create_cache(); /// let mut cache_rev = dfa_rev.create_cache(); /// /// let expected_fwd = HalfMatch::must(0, 9); /// let expected_rev = HalfMatch::must(0, 3); - /// let got_fwd = dfa_fwd.find_leftmost_fwd( - /// &mut cache_fwd, haystack, - /// )?.unwrap(); + /// let got_fwd = dfa_fwd.try_search_fwd(&mut cache_fwd, &input)?.unwrap(); /// // Here we don't specify the pattern to search for since there's only /// // one pattern and we're doing a leftmost search. But if this were an /// // overlapping search, you'd need to specify the pattern that matched /// // in the forward direction. (Otherwise, you might wind up finding the /// // starting position of a match of some other pattern.) That in turn /// // requires building the reverse automaton with starts_for_each_pattern - /// // enabled. Indeed, this is what Regex does internally. - /// let got_rev = dfa_rev.find_leftmost_rev_at( - /// &mut cache_rev, None, haystack, 0, got_fwd.offset(), - /// )?.unwrap(); + /// // enabled. + /// let input = input + /// .clone() + /// .range(..got_fwd.offset()) + /// .anchored(Anchored::Yes); + /// let got_rev = dfa_rev.try_search_rev(&mut cache_rev, &input)?.unwrap(); /// assert_eq!(expected_fwd, got_fwd); /// assert_eq!(expected_rev, got_rev); /// @@ -2869,6 +2963,86 @@ impl Config { self } + /// Set a prefilter to be used whenever a start state is entered. + /// + /// A [`Prefilter`] in this context is meant to accelerate searches by + /// looking for literal prefixes that every match for the corresponding + /// pattern (or patterns) must start with. Once a prefilter produces a + /// match, the underlying search routine continues on to try and confirm + /// the match. + /// + /// Be warned that setting a prefilter does not guarantee that the search + /// will be faster. While it's usually a good bet, if the prefilter + /// produces a lot of false positive candidates (i.e., positions matched + /// by the prefilter but not by the regex), then the overall result can + /// be slower than if you had just executed the regex engine without any + /// prefilters. + /// + /// Note that unless [`Config::specialize_start_states`] has been + /// explicitly set, then setting this will also enable (when `pre` is + /// `Some`) or disable (when `pre` is `None`) start state specialization. + /// This occurs because without start state specialization, a prefilter + /// is likely to be less effective. And without a prefilter, start state + /// specialization is usually pointless. + /// + /// By default no prefilter is set. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// hybrid::dfa::DFA, + /// util::prefilter::Prefilter, + /// Input, HalfMatch, MatchKind, + /// }; + /// + /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]); + /// let re = DFA::builder() + /// .configure(DFA::config().prefilter(pre)) + /// .build(r"(foo|bar)[a-z]+")?; + /// let mut cache = re.create_cache(); + /// let input = Input::new("foo1 barfox bar"); + /// assert_eq!( + /// Some(HalfMatch::must(0, 11)), + /// re.try_search_fwd(&mut cache, &input)?, + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Be warned though that an incorrect prefilter can lead to incorrect + /// results! + /// + /// ``` + /// use regex_automata::{ + /// hybrid::dfa::DFA, + /// util::prefilter::Prefilter, + /// Input, HalfMatch, MatchKind, + /// }; + /// + /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]); + /// let re = DFA::builder() + /// .configure(DFA::config().prefilter(pre)) + /// .build(r"(foo|bar)[a-z]+")?; + /// let mut cache = re.create_cache(); + /// let input = Input::new("foo1 barfox bar"); + /// assert_eq!( + /// // No match reported even though there clearly is one! + /// None, + /// re.try_search_fwd(&mut cache, &input)?, + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config { + self.pre = Some(pre); + if self.specialize_start_states.is_none() { + self.specialize_start_states = + Some(self.get_prefilter().is_some()); + } + self + } + /// Whether to compile a separate start state for each pattern in the /// lazy DFA. /// @@ -2897,50 +3071,45 @@ impl Config { /// for matches of any pattern or to search for anchored matches of one /// particular pattern while using the same DFA. (Otherwise, you would need /// to compile a new DFA for each pattern.) - /// 3. Since the start states added for each pattern are anchored, if you - /// compile an unanchored DFA with one pattern while also enabling this - /// option, then you can use the same DFA to perform anchored or unanchored - /// searches. The latter you get with the standard search APIs. The former - /// you get from the various `_at` search methods that allow you specify a - /// pattern ID to search for. /// /// By default this is disabled. /// /// # Example /// /// This example shows how to use this option to permit the same lazy DFA - /// to run both anchored and unanchored searches for a single pattern. + /// to run both general searches for any pattern and anchored searches for + /// a specific pattern. /// /// ``` - /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, PatternID}; + /// use regex_automata::{ + /// hybrid::dfa::DFA, + /// Anchored, HalfMatch, Input, PatternID, + /// }; /// /// let dfa = DFA::builder() /// .configure(DFA::config().starts_for_each_pattern(true)) - /// .build(r"foo[0-9]+")?; + /// .build_many(&[r"[a-z0-9]{6}", r"[a-z][a-z0-9]{5}"])?; /// let mut cache = dfa.create_cache(); - /// let haystack = b"quux foo123"; - /// - /// // Here's a normal unanchored search. Notice that we use 'None' for the - /// // pattern ID. Since the DFA was built as an unanchored machine, it - /// // uses its default unanchored starting state. - /// let expected = HalfMatch::must(0, 11); - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd_at( - /// &mut cache, None, None, haystack, 0, haystack.len(), - /// )?); - /// // But now if we explicitly specify the pattern to search ('0' being - /// // the only pattern in the DFA), then it will use the starting state - /// // for that specific pattern which is always anchored. Since the - /// // pattern doesn't have a match at the beginning of the haystack, we - /// // find nothing. - /// assert_eq!(None, dfa.find_leftmost_fwd_at( - /// &mut cache, None, Some(PatternID::must(0)), haystack, 0, haystack.len(), - /// )?); - /// // And finally, an anchored search is not the same as putting a '^' at - /// // beginning of the pattern. An anchored search can only match at the - /// // beginning of the *search*, which we can change: - /// assert_eq!(Some(expected), dfa.find_leftmost_fwd_at( - /// &mut cache, None, Some(PatternID::must(0)), haystack, 5, haystack.len(), - /// )?); + /// let haystack = "bar foo123"; + /// + /// // Here's a normal unanchored search that looks for any pattern. + /// let expected = HalfMatch::must(0, 10); + /// let input = Input::new(haystack); + /// assert_eq!(Some(expected), dfa.try_search_fwd(&mut cache, &input)?); + /// // We can also do a normal anchored search for any pattern. Since it's + /// // an anchored search, we position the start of the search where we + /// // know the match will begin. + /// let expected = HalfMatch::must(0, 10); + /// let input = Input::new(haystack).range(4..); + /// assert_eq!(Some(expected), dfa.try_search_fwd(&mut cache, &input)?); + /// // Since we compiled anchored start states for each pattern, we can + /// // also look for matches of other patterns explicitly, even if a + /// // different pattern would have normally matched. + /// let expected = HalfMatch::must(1, 10); + /// let input = Input::new(haystack) + /// .range(4..) + /// .anchored(Anchored::Pattern(PatternID::must(1))); + /// assert_eq!(Some(expected), dfa.try_search_fwd(&mut cache, &input)?); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` @@ -2990,7 +3159,7 @@ impl Config { /// When set, this will attempt to implement Unicode word boundaries as if /// they were ASCII word boundaries. This only works when the search input /// is ASCII only. If a non-ASCII byte is observed while searching, then a - /// [`MatchError::Quit`](crate::MatchError::Quit) error is returned. + /// [`MatchError::quit`] error is returned. /// /// A possible alternative to enabling this option is to simply use an /// ASCII word boundary, e.g., via `(?-u:\b)`. The main reason to use this @@ -3014,8 +3183,7 @@ impl Config { /// When using a [`Regex`](crate::hybrid::regex::Regex), this /// corresponds to using the `try_` suite of methods. Alternatively, /// if callers can guarantee that their input is ASCII only, then a - /// [`MatchError::Quit`](crate::MatchError::Quit) error will never be - /// returned while searching. + /// [`MatchError::quit`] error will never be returned while searching. /// /// This is disabled by default. /// @@ -3028,7 +3196,7 @@ impl Config { /// ``` /// use regex_automata::{ /// hybrid::dfa::DFA, - /// HalfMatch, MatchError, MatchKind, + /// HalfMatch, Input, MatchError, /// }; /// /// let dfa = DFA::builder() @@ -3038,9 +3206,9 @@ impl Config { /// /// // The match occurs before the search ever observes the snowman /// // character, so no error occurs. - /// let haystack = "foo 123 ☃".as_bytes(); + /// let haystack = "foo 123 ☃"; /// let expected = Some(HalfMatch::must(0, 7)); - /// let got = dfa.find_leftmost_fwd(&mut cache, haystack)?; + /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?; /// assert_eq!(expected, got); /// /// // Notice that this search fails, even though the snowman character @@ -3048,9 +3216,23 @@ impl Config { /// // routines read one byte past the end of the search to account for /// // look-around, and indeed, this is required here to determine whether /// // the trailing \b matches. - /// let haystack = "foo 123☃".as_bytes(); - /// let expected = MatchError::Quit { byte: 0xE2, offset: 7 }; - /// let got = dfa.find_leftmost_fwd(&mut cache, haystack); + /// let haystack = "foo 123 ☃"; + /// let expected = MatchError::quit(0xE2, 8); + /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack)); + /// assert_eq!(Err(expected), got); + /// + /// // Another example is executing a search where the span of the haystack + /// // we specify is all ASCII, but there is non-ASCII just before it. This + /// // correctly also reports an error. + /// let input = Input::new("β123").range(2..); + /// let expected = MatchError::quit(0xB2, 1); + /// let got = dfa.try_search_fwd(&mut cache, &input); + /// assert_eq!(Err(expected), got); + /// + /// // And similarly for the trailing word boundary. + /// let input = Input::new("123β").range(..3); + /// let expected = MatchError::quit(0xCE, 3); + /// let got = dfa.try_search_fwd(&mut cache, &input); /// assert_eq!(Err(expected), got); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) @@ -3066,9 +3248,9 @@ impl Config { /// Add a "quit" byte to the lazy DFA. /// - /// When a quit byte is seen during search time, then search will return - /// a [`MatchError::Quit`](crate::MatchError::Quit) error indicating the - /// offset at which the search stopped. + /// When a quit byte is seen during search time, then search will return a + /// [`MatchError::quit`] error indicating the offset at which the search + /// stopped. /// /// A quit byte will always overrule any other aspects of a regex. For /// example, if the `x` byte is added as a quit byte and the regex `\w` is @@ -3109,19 +3291,23 @@ impl Config { /// a user supplied pattern from matching across a line boundary. /// /// ``` - /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchError}; + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{hybrid::dfa::DFA, MatchError, Input}; /// /// let dfa = DFA::builder() /// .configure(DFA::config().quit(b'\n', true)) /// .build(r"foo\p{any}+bar")?; /// let mut cache = dfa.create_cache(); /// - /// let haystack = "foo\nbar".as_bytes(); + /// let haystack = "foo\nbar"; /// // Normally this would produce a match, since \p{any} contains '\n'. /// // But since we instructed the automaton to enter a quit state if a /// // '\n' is observed, this produces a match error instead. - /// let expected = MatchError::Quit { byte: 0x0A, offset: 3 }; - /// let got = dfa.find_leftmost_fwd(&mut cache, haystack).unwrap_err(); + /// let expected = MatchError::quit(b'\n', 3); + /// let got = dfa.try_search_fwd( + /// &mut cache, + /// &Input::new(haystack), + /// ).unwrap_err(); /// assert_eq!(expected, got); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) @@ -3144,6 +3330,89 @@ impl Config { self } + /// Enable specializing start states in the lazy DFA. + /// + /// When start states are specialized, an implementor of a search routine + /// using a lazy DFA can tell when the search has entered a starting state. + /// When start states aren't specialized, then it is impossible to know + /// whether the search has entered a start state. + /// + /// Ideally, this option wouldn't need to exist and we could always + /// specialize start states. The problem is that start states can be quite + /// active. This in turn means that an efficient search routine is likely + /// to ping-pong between a heavily optimized hot loop that handles most + /// states and to a less optimized specialized handling of start states. + /// This causes branches to get heavily mispredicted and overall can + /// materially decrease throughput. Therefore, specializing start states + /// should only be enabled when it is needed. + /// + /// Knowing whether a search is in a start state is typically useful when a + /// prefilter is active for the search. A prefilter is typically only run + /// when in a start state and a prefilter can greatly accelerate a search. + /// Therefore, the possible cost of specializing start states is worth it + /// in this case. Otherwise, if you have no prefilter, there is likely no + /// reason to specialize start states. + /// + /// This is disabled by default, but note that it is automatically + /// enabled (or disabled) if [`Config::prefilter`] is set. Namely, unless + /// `specialize_start_states` has already been set, [`Config::prefilter`] + /// will automatically enable or disable it based on whether a prefilter + /// is present or not, respectively. This is done because a prefilter's + /// effectiveness is rooted in being executed whenever the DFA is in a + /// start state, and that's only possible to do when they are specialized. + /// + /// Note that it is plausibly reasonable to _disable_ this option + /// explicitly while _enabling_ a prefilter. In that case, a prefilter + /// will still be run at the beginning of a search, but never again. This + /// in theory could strike a good balance if you're in a situation where a + /// prefilter is likely to produce many false positive candidates. + /// + /// # Example + /// + /// This example shows how to enable start state specialization and then + /// shows how to check whether a state is a start state or not. + /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, MatchError, Input}; + /// + /// let dfa = DFA::builder() + /// .configure(DFA::config().specialize_start_states(true)) + /// .build(r"[a-z]+")?; + /// let mut cache = dfa.create_cache(); + /// + /// let haystack = "123 foobar 4567".as_bytes(); + /// let sid = dfa.start_state_forward(&mut cache, &Input::new(haystack))?; + /// // The ID returned by 'start_state_forward' will always be tagged as + /// // a start state when start state specialization is enabled. + /// assert!(sid.is_tagged()); + /// assert!(sid.is_start()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Compare the above with the default lazy DFA configuration where + /// start states are _not_ specialized. In this case, the start state + /// is not tagged and `sid.is_start()` returns false. + /// + /// ``` + /// use regex_automata::{hybrid::dfa::DFA, MatchError, Input}; + /// + /// let dfa = DFA::new(r"[a-z]+")?; + /// let mut cache = dfa.create_cache(); + /// + /// let haystack = "123 foobar 4567".as_bytes(); + /// let sid = dfa.start_state_forward(&mut cache, &Input::new(haystack))?; + /// // Start states are not tagged in the default configuration! + /// assert!(!sid.is_tagged()); + /// assert!(!sid.is_start()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn specialize_start_states(mut self, yes: bool) -> Config { + self.specialize_start_states = Some(yes); + self + } + /// Sets the maximum amount of heap memory, in bytes, to allocate to the /// cache for use during a lazy DFA search. If the lazy DFA would otherwise /// use more heap memory, then, depending on other configuration knobs, @@ -3157,7 +3426,7 @@ impl Config { /// /// Note that while building a lazy DFA will do a "minimum" check to ensure /// the capacity is big enough, this is more or less about correctness. - /// If the cache is bigger than the minimum but still too small, then the + /// If the cache is bigger than the minimum but still "too small," then the /// lazy DFA could wind up spending a lot of time clearing the cache and /// recomputing transitions, thus negating the performance benefits of a /// lazy DFA. Thus, setting the cache capacity is mostly an experimental @@ -3175,7 +3444,8 @@ impl Config { /// a smaller cache capacity. /// /// ``` - /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchError}; + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; /// /// let pattern = r"\p{L}{1000}"; /// @@ -3191,7 +3461,7 @@ impl Config { /// /// let haystack = "ͰͲͶͿΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙ".repeat(50); /// let expected = Some(HalfMatch::must(0, 2000)); - /// let got = dfa.find_leftmost_fwd(&mut cache, haystack.as_bytes())?; + /// let got = dfa.try_search_fwd(&mut cache, &Input::new(&haystack))?; /// assert_eq!(expected, got); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) @@ -3211,7 +3481,11 @@ impl Config { /// while a minimum cache capacity does permit the lazy DFA to function /// where it otherwise couldn't, it's plausible that it may not function /// well if it's constantly running out of room. In that case, the speed - /// advantages of the lazy DFA may be negated. + /// advantages of the lazy DFA may be negated. On the other hand, the + /// "minimum" cache capacity computed may not be completely accurate and + /// could actually be bigger than what is really necessary. Therefore, it + /// is plausible that using the minimum cache capacity could still result + /// in very good performance. /// /// This is disabled by default. /// @@ -3224,7 +3498,8 @@ impl Config { /// too small. /// /// ``` - /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchError}; + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; /// /// let pattern = r"\p{L}{1000}"; /// @@ -3241,7 +3516,7 @@ impl Config { /// /// let haystack = "ͰͲͶͿΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙ".repeat(50); /// let expected = Some(HalfMatch::must(0, 2000)); - /// let got = dfa.find_leftmost_fwd(&mut cache, haystack.as_bytes())?; + /// let got = dfa.try_search_fwd(&mut cache, &Input::new(&haystack))?; /// assert_eq!(expected, got); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) @@ -3254,23 +3529,27 @@ impl Config { /// Configure a lazy DFA search to quit after a certain number of cache /// clearings. /// - /// When a minimum is set, then a lazy DFA search will "give up" after - /// the minimum number of cache clearings has occurred. This is typically - /// useful in scenarios where callers want to detect whether the lazy DFA - /// search is "efficient" or not. If the cache is cleared too many times, - /// this is a good indicator that it is not efficient, and thus, the caller - /// may wish to use some other regex engine. + /// When a minimum is set, then a lazy DFA search will *possibly* "give + /// up" after the minimum number of cache clearings has occurred. This is + /// typically useful in scenarios where callers want to detect whether the + /// lazy DFA search is "efficient" or not. If the cache is cleared too many + /// times, this is a good indicator that it is not efficient, and thus, the + /// caller may wish to use some other regex engine. /// /// Note that the number of times a cache is cleared is a property of /// the cache itself. Thus, if a cache is used in a subsequent search - /// with a similarly configured lazy DFA, then it would cause the - /// search to "give up" if the cache needed to be cleared. The cache - /// clear count can only be reset to `0` via [`DFA::reset_cache`] (or + /// with a similarly configured lazy DFA, then it could cause the + /// search to "give up" if the cache needed to be cleared, depending + /// on its internal count and configured minimum. The cache clear + /// count can only be reset to `0` via [`DFA::reset_cache`] (or /// [`Regex::reset_cache`](crate::hybrid::regex::Regex::reset_cache) if /// you're using the `Regex` API). /// /// By default, no minimum is configured. Thus, a lazy DFA search will - /// never give up due to cache clearings. + /// never give up due to cache clearings. If you do set this option, you + /// might consider also setting [`Config::minimum_bytes_per_state`] in + /// order for the lazy DFA to take efficiency into account before giving + /// up. /// /// # Example /// @@ -3279,13 +3558,11 @@ impl Config { /// in a search that returns an error. /// /// It is important to note that the precise mechanics of how and when - /// a cache gets cleared is an implementation detail. Thus, the asserts - /// in the tests below with respect to the particular offsets at which a - /// search gave up should be viewed strictly as a demonstration. They are - /// not part of any API guarantees offered by this crate. + /// a cache gets cleared is an implementation detail. /// /// ``` - /// use regex_automata::{hybrid::dfa::DFA, MatchError}; + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{hybrid::dfa::DFA, Input, MatchError, MatchErrorKind}; /// /// // This is a carefully chosen regex. The idea is to pick one /// // that requires some decent number of states (hence the bounded @@ -3309,37 +3586,42 @@ impl Config { /// .build(pattern)?; /// let mut cache = dfa.create_cache(); /// + /// // Our search will give up before reaching the end! /// let haystack = "a".repeat(101).into_bytes(); - /// assert_eq!( - /// dfa.find_leftmost_fwd(&mut cache, &haystack), - /// Err(MatchError::GaveUp { offset: 25 }), - /// ); + /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack)); + /// assert!(matches!( + /// *result.unwrap_err().kind(), + /// MatchErrorKind::GaveUp { .. }, + /// )); /// /// // Now that we know the cache is full, if we search a haystack that we /// // know will require creating at least one new state, it should not - /// // be able to make any progress. + /// // be able to make much progress. /// let haystack = "β".repeat(101).into_bytes(); - /// assert_eq!( - /// dfa.find_leftmost_fwd(&mut cache, &haystack), - /// Err(MatchError::GaveUp { offset: 0 }), - /// ); + /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack)); + /// assert!(matches!( + /// *result.unwrap_err().kind(), + /// MatchErrorKind::GaveUp { .. }, + /// )); /// /// // If we reset the cache, then we should be able to create more states /// // and make more progress with searching for betas. /// cache.reset(&dfa); /// let haystack = "β".repeat(101).into_bytes(); - /// assert_eq!( - /// dfa.find_earliest_fwd(&mut cache, &haystack), - /// Err(MatchError::GaveUp { offset: 26 }), - /// ); + /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack)); + /// assert!(matches!( + /// *result.unwrap_err().kind(), + /// MatchErrorKind::GaveUp { .. }, + /// )); /// /// // ... switching back to ASCII still makes progress since it just needs /// // to set transitions on existing states! /// let haystack = "a".repeat(101).into_bytes(); - /// assert_eq!( - /// dfa.find_earliest_fwd(&mut cache, &haystack), - /// Err(MatchError::GaveUp { offset: 13 }), - /// ); + /// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack)); + /// assert!(matches!( + /// *result.unwrap_err().kind(), + /// MatchErrorKind::GaveUp { .. }, + /// )); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` @@ -3348,9 +3630,48 @@ impl Config { self } - /// Returns whether this configuration has enabled anchored searches. - pub fn get_anchored(&self) -> bool { - self.anchored.unwrap_or(false) + /// Configure a lazy DFA search to quit only when its efficiency drops + /// below the given minimum. + /// + /// The efficiency of the cache is determined by the number of DFA states + /// compiled per byte of haystack searched. For example, if the efficiency + /// is 2, then it means the lazy DFA is creating a new DFA state after + /// searching approximately 2 bytes in a haystack. Generally speaking, 2 + /// is quite bad and it's likely that even a slower regex engine like the + /// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) would be faster. + /// + /// This has no effect if [`Config::minimum_cache_clear_count`] is not set. + /// Namely, this option only kicks in when the cache has been cleared more + /// than the minimum number. If no minimum is set, then the cache is simply + /// cleared whenever it fills up and it is impossible for the lazy DFA to + /// quit due to ineffective use of the cache. + /// + /// In general, if one is setting [`Config::minimum_cache_clear_count`], + /// then one should probably also set this knob as well. The reason is + /// that the absolute number of times the cache is cleared is generally + /// not a great predictor of efficiency. For example, if a new DFA state + /// is created for every 1,000 bytes searched, then it wouldn't be hard + /// for the cache to get cleared more than `N` times and then cause the + /// lazy DFA to quit. But a new DFA state every 1,000 bytes is likely quite + /// good from a performance perspective, and it's likely that the lazy + /// DFA should continue searching, even if it requires clearing the cache + /// occasionally. + /// + /// Finally, note that if you're implementing your own lazy DFA search + /// routine and also want this efficiency check to work correctly, then + /// you'll need to use the following routines to record search progress: + /// + /// * Call [`Cache::search_start`] at the beginning of every search. + /// * Call [`Cache::search_update`] whenever [`DFA::next_state`] is + /// called. + /// * Call [`Cache::search_finish`] before completing a search. (It is + /// not strictly necessary to call this when an error is returned, as + /// `Cache::search_start` will automatically finish the previous search + /// for you. But calling it where possible before returning helps improve + /// the accuracy of how many bytes have actually been searched.) + pub fn minimum_bytes_per_state(mut self, min: Option<usize>) -> Config { + self.minimum_bytes_per_state = Some(min); + self } /// Returns the match semantics set in this configuration. @@ -3358,6 +3679,11 @@ impl Config { self.match_kind.unwrap_or(MatchKind::LeftmostFirst) } + /// Returns the prefilter set in this configuration, if one at all. + pub fn get_prefilter(&self) -> Option<&Prefilter> { + self.pre.as_ref().unwrap_or(&None).as_ref() + } + /// Returns whether this configuration has enabled anchored starting states /// for every pattern in the DFA. pub fn get_starts_for_each_pattern(&self) -> bool { @@ -3378,14 +3704,23 @@ impl Config { self.unicode_word_boundary.unwrap_or(false) } - /// Returns whether this configuration will instruct the DFA to enter a - /// quit state whenever the given byte is seen during a search. When at + /// Returns whether this configuration will instruct the lazy DFA to enter + /// a quit state whenever the given byte is seen during a search. When at /// least one byte has this enabled, it is possible for a search to return /// an error. pub fn get_quit(&self, byte: u8) -> bool { self.quitset.map_or(false, |q| q.contains(byte)) } + /// Returns whether this configuration will instruct the lazy DFA to + /// "specialize" start states. When enabled, the lazy DFA will tag start + /// states so that search routines using the lazy DFA can detect when + /// it's in a start state and do some kind of optimization (like run a + /// prefilter). + pub fn get_specialize_start_states(&self) -> bool { + self.specialize_start_states.unwrap_or(false) + } + /// Returns the cache capacity set on this configuration. pub fn get_cache_capacity(&self) -> usize { self.cache_capacity.unwrap_or(2 * (1 << 20)) @@ -3404,6 +3739,14 @@ impl Config { self.minimum_cache_clear_count.unwrap_or(None) } + /// Returns, if set, the minimum number of bytes per state that need to be + /// processed in order for the lazy DFA to keep going. If the minimum falls + /// below this number (and the cache has been cleared a minimum number of + /// times), then the lazy DFA will return a "gave up" error. + pub fn get_minimum_bytes_per_state(&self) -> Option<usize> { + self.minimum_bytes_per_state.unwrap_or(None) + } + /// Returns the minimum lazy DFA cache capacity required for the given NFA. /// /// The cache capacity required for a particular NFA may change without @@ -3449,6 +3792,11 @@ impl Config { // It is important to distinguish any "quit" bytes from all other // bytes. Otherwise, a non-quit byte may end up in the same class // as a quit byte, and thus cause the DFA stop when it shouldn't. + // + // Test case: + // + // regex-cli find hybrid regex -w @conn.json.1000x.log \ + // '^#' '\b10\.55\.182\.100\b' if !quit.is_empty() { set.add_set(&quit); } @@ -3466,7 +3814,7 @@ impl Config { nfa: &thompson::NFA, ) -> Result<ByteSet, BuildError> { let mut quit = self.quitset.unwrap_or(ByteSet::empty()); - if nfa.has_word_boundary_unicode() { + if nfa.look_set_any().contains_word_unicode() { if self.get_unicode_word_boundary() { for b in 0x80..=0xFF { quit.add(b); @@ -3491,10 +3839,10 @@ impl Config { /// always used. If an option in `o` is not set, then the corresponding /// option in `self` is used. If it's not set in `self` either, then it /// remains not set. - fn overwrite(self, o: Config) -> Config { + fn overwrite(&self, o: Config) -> Config { Config { - anchored: o.anchored.or(self.anchored), match_kind: o.match_kind.or(self.match_kind), + pre: o.pre.or_else(|| self.pre.clone()), starts_for_each_pattern: o .starts_for_each_pattern .or(self.starts_for_each_pattern), @@ -3503,6 +3851,9 @@ impl Config { .unicode_word_boundary .or(self.unicode_word_boundary), quitset: o.quitset.or(self.quitset), + specialize_start_states: o + .specialize_start_states + .or(self.specialize_start_states), cache_capacity: o.cache_capacity.or(self.cache_capacity), skip_cache_capacity_check: o .skip_cache_capacity_check @@ -3510,6 +3861,9 @@ impl Config { minimum_cache_clear_count: o .minimum_cache_clear_count .or(self.minimum_cache_clear_count), + minimum_bytes_per_state: o + .minimum_bytes_per_state + .or(self.minimum_bytes_per_state), } } } @@ -3556,27 +3910,25 @@ impl Config { /// `\n`). Things that are Unicode only, such as `\pL`, are not allowed. /// * The pattern itself is permitted to match invalid UTF-8. For example, /// things like `[^a]` that match any byte except for `a` are permitted. -/// * Unanchored patterns can search through invalid UTF-8. That is, for -/// unanchored patterns, the implicit prefix is `(?s-u:.)*?` instead of -/// `(?s:.)*?`. /// /// ``` /// use regex_automata::{ /// hybrid::dfa::DFA, /// nfa::thompson, -/// HalfMatch, SyntaxConfig, +/// util::syntax, +/// HalfMatch, Input, /// }; /// /// let dfa = DFA::builder() /// .configure(DFA::config().cache_capacity(5_000)) -/// .syntax(SyntaxConfig::new().unicode(false).utf8(false)) /// .thompson(thompson::Config::new().utf8(false)) +/// .syntax(syntax::Config::new().unicode(false).utf8(false)) /// .build(r"foo[^b]ar.*")?; /// let mut cache = dfa.create_cache(); /// /// let haystack = b"\xFEfoo\xFFar\xE2\x98\xFF\n"; /// let expected = Some(HalfMatch::must(0, 10)); -/// let got = dfa.find_leftmost_fwd(&mut cache, haystack)?; +/// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?; /// assert_eq!(expected, got); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) @@ -3584,7 +3936,8 @@ impl Config { #[derive(Clone, Debug)] pub struct Builder { config: Config, - thompson: thompson::Builder, + #[cfg(feature = "syntax")] + thompson: thompson::Compiler, } impl Builder { @@ -3592,7 +3945,8 @@ impl Builder { pub fn new() -> Builder { Builder { config: Config::default(), - thompson: thompson::Builder::new(), + #[cfg(feature = "syntax")] + thompson: thompson::Compiler::new(), } } @@ -3600,6 +3954,7 @@ impl Builder { /// /// If there was a problem parsing or compiling the pattern, then an error /// is returned. + #[cfg(feature = "syntax")] pub fn build(&self, pattern: &str) -> Result<DFA, BuildError> { self.build_many(&[pattern]) } @@ -3608,22 +3963,31 @@ impl Builder { /// /// When matches are returned, the pattern ID corresponds to the index of /// the pattern in the slice given. + #[cfg(feature = "syntax")] pub fn build_many<P: AsRef<str>>( &self, patterns: &[P], ) -> Result<DFA, BuildError> { - let nfa = - self.thompson.build_many(patterns).map_err(BuildError::nfa)?; - self.build_from_nfa(Arc::new(nfa)) + let nfa = self + .thompson + .clone() + // We can always forcefully disable captures because DFAs do not + // support them. + .configure( + thompson::Config::new() + .which_captures(thompson::WhichCaptures::None), + ) + .build_many(patterns) + .map_err(BuildError::nfa)?; + self.build_from_nfa(nfa) } /// Build a DFA from the given NFA. /// - /// Note that this requires an `Arc<thompson::NFA>` instead of a - /// `&thompson::NFA` because the lazy DFA builds itself from the NFA at - /// search time. This means that the lazy DFA must hold on to its source - /// NFA for the entirety of its lifetime. An `Arc` is used so that callers - /// aren't forced to clone the NFA if it is needed elsewhere. + /// Note that this requires owning a `thompson::NFA`. While this may force + /// you to clone the NFA, such a clone is not a deep clone. Namely, NFAs + /// are defined internally to support shared ownership such that cloning is + /// very cheap. /// /// # Example /// @@ -3631,26 +3995,29 @@ impl Builder { /// in hand. /// /// ``` - /// use std::sync::Arc; - /// use regex_automata::{hybrid::dfa::DFA, nfa::thompson, HalfMatch}; + /// use regex_automata::{ + /// hybrid::dfa::DFA, + /// nfa::thompson, + /// HalfMatch, Input, + /// }; /// - /// let haystack = "foo123bar".as_bytes(); + /// let haystack = "foo123bar"; /// /// // This shows how to set non-default options for building an NFA. - /// let nfa = thompson::Builder::new() - /// .configure(thompson::Config::new().shrink(false)) + /// let nfa = thompson::Compiler::new() + /// .configure(thompson::Config::new().shrink(true)) /// .build(r"[0-9]+")?; - /// let dfa = DFA::builder().build_from_nfa(Arc::new(nfa))?; + /// let dfa = DFA::builder().build_from_nfa(nfa)?; /// let mut cache = dfa.create_cache(); /// let expected = Some(HalfMatch::must(0, 6)); - /// let got = dfa.find_leftmost_fwd(&mut cache, haystack)?; + /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?; /// assert_eq!(expected, got); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` pub fn build_from_nfa( &self, - nfa: Arc<thompson::NFA>, + nfa: thompson::NFA, ) -> Result<DFA, BuildError> { let quitset = self.config.quit_set_from_nfa(&nfa)?; let classes = self.config.byte_classes_from_nfa(&nfa, &quitset); @@ -3675,12 +4042,11 @@ impl Builder { // then we simply force the cache capacity to its minimum amount // and mush on. if self.config.get_skip_cache_capacity_check() { - trace!( + debug!( "given capacity ({}) is too small, \ since skip_cache_capacity_check is enabled, \ setting cache capacity to minimum ({})", - cache_capacity, - min_cache, + cache_capacity, min_cache, ); cache_capacity = min_cache; } else { @@ -3694,22 +4060,19 @@ impl Builder { // of states in our state ID space. This is unlikely to trigger in // >=32-bit systems, but 16-bit systems have a pretty small state ID // space since a number of bits are used up as sentinels. - if let Err(err) = minimum_lazy_state_id(&nfa, &classes) { + if let Err(err) = minimum_lazy_state_id(&classes) { return Err(BuildError::insufficient_state_id_capacity(err)); } let stride2 = classes.stride2(); + let start_map = StartByteMap::new(nfa.look_matcher()); Ok(DFA { + config: self.config.clone(), nfa, stride2, + start_map, classes, quitset, - anchored: self.config.get_anchored(), - match_kind: self.config.get_match_kind(), - starts_for_each_pattern: self.config.get_starts_for_each_pattern(), cache_capacity, - minimum_cache_clear_count: self - .config - .get_minimum_cache_clear_count(), }) } @@ -3720,16 +4083,17 @@ impl Builder { } /// Set the syntax configuration for this builder using - /// [`SyntaxConfig`](crate::SyntaxConfig). + /// [`syntax::Config`](crate::util::syntax::Config). /// /// This permits setting things like case insensitivity, Unicode and multi /// line mode. /// /// These settings only apply when constructing a lazy DFA directly from a /// pattern. + #[cfg(feature = "syntax")] pub fn syntax( &mut self, - config: crate::util::syntax::SyntaxConfig, + config: crate::util::syntax::Config, ) -> &mut Builder { self.thompson.syntax(config); self @@ -3744,20 +4108,144 @@ impl Builder { /// /// These settings only apply when constructing a DFA directly from a /// pattern. + #[cfg(feature = "syntax")] pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { self.thompson.configure(config); self } } +/// Represents the current state of an overlapping search. +/// +/// This is used for overlapping searches since they need to know something +/// about the previous search. For example, when multiple patterns match at the +/// same position, this state tracks the last reported pattern so that the next +/// search knows whether to report another matching pattern or continue with +/// the search at the next position. Additionally, it also tracks which state +/// the last search call terminated in. +/// +/// This type provides little introspection capabilities. The only thing a +/// caller can do is construct it and pass it around to permit search routines +/// to use it to track state, and also ask whether a match has been found. +/// +/// Callers should always provide a fresh state constructed via +/// [`OverlappingState::start`] when starting a new search. Reusing state from +/// a previous search may result in incorrect results. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct OverlappingState { + /// The match reported by the most recent overlapping search to use this + /// state. + /// + /// If a search does not find any matches, then it is expected to clear + /// this value. + pub(crate) mat: Option<HalfMatch>, + /// The state ID of the state at which the search was in when the call + /// terminated. When this is a match state, `last_match` must be set to a + /// non-None value. + /// + /// A `None` value indicates the start state of the corresponding + /// automaton. We cannot use the actual ID, since any one automaton may + /// have many start states, and which one is in use depends on several + /// search-time factors. + pub(crate) id: Option<LazyStateID>, + /// The position of the search. + /// + /// When `id` is None (i.e., we are starting a search), this is set to + /// the beginning of the search as given by the caller regardless of its + /// current value. Subsequent calls to an overlapping search pick up at + /// this offset. + pub(crate) at: usize, + /// The index into the matching patterns of the next match to report if the + /// current state is a match state. Note that this may be 1 greater than + /// the total number of matches to report for the current match state. (In + /// which case, no more matches should be reported at the current position + /// and the search should advance to the next position.) + pub(crate) next_match_index: Option<usize>, + /// This is set to true when a reverse overlapping search has entered its + /// EOI transitions. + /// + /// This isn't used in a forward search because it knows to stop once the + /// position exceeds the end of the search range. In a reverse search, + /// since we use unsigned offsets, we don't "know" once we've gone past + /// `0`. So the only way to detect it is with this extra flag. The reverse + /// overlapping search knows to terminate specifically after it has + /// reported all matches after following the EOI transition. + pub(crate) rev_eoi: bool, +} + +impl OverlappingState { + /// Create a new overlapping state that begins at the start state of any + /// automaton. + pub fn start() -> OverlappingState { + OverlappingState { + mat: None, + id: None, + at: 0, + next_match_index: None, + rev_eoi: false, + } + } + + /// Return the match result of the most recent search to execute with this + /// state. + /// + /// A searches will clear this result automatically, such that if no + /// match is found, this will correctly report `None`. + pub fn get_match(&self) -> Option<HalfMatch> { + self.mat + } +} + +/// Runs the given overlapping `search` function (forwards or backwards) until +/// a match is found whose offset does not split a codepoint. +/// +/// This is *not* always correct to call. It should only be called when the +/// underlying NFA has UTF-8 mode enabled *and* it can produce zero-width +/// matches. Calling this when both of those things aren't true might result +/// in legitimate matches getting skipped. +#[cold] +#[inline(never)] +fn skip_empty_utf8_splits_overlapping<F>( + input: &Input<'_>, + state: &mut OverlappingState, + mut search: F, +) -> Result<(), MatchError> +where + F: FnMut(&Input<'_>, &mut OverlappingState) -> Result<(), MatchError>, +{ + // Note that this routine works for forwards and reverse searches + // even though there's no code here to handle those cases. That's + // because overlapping searches drive themselves to completion via + // `OverlappingState`. So all we have to do is push it until no matches are + // found. + + let mut hm = match state.get_match() { + None => return Ok(()), + Some(hm) => hm, + }; + if input.get_anchored().is_anchored() { + if !input.is_char_boundary(hm.offset()) { + state.mat = None; + } + return Ok(()); + } + while !input.is_char_boundary(hm.offset()) { + search(input, state)?; + hm = match state.get_match() { + None => return Ok(()), + Some(hm) => hm, + }; + } + Ok(()) +} + /// Based on the minimum number of states required for a useful lazy DFA cache, /// this returns the minimum lazy state ID that must be representable. /// -/// It's likely not plausible for this to impose constraints on 32-bit systems -/// (or higher), but on 16-bit systems, the lazy state ID space is quite -/// constrained and thus may be insufficient for bigger regexes. +/// It's not likely for this to have any impact 32-bit systems (or higher), but +/// on 16-bit systems, the lazy state ID space is quite constrained and thus +/// may be insufficient if our MIN_STATES value is (for some reason) too high. fn minimum_lazy_state_id( - nfa: &thompson::NFA, classes: &ByteClasses, ) -> Result<LazyStateID, LazyStateIDError> { let stride = 1 << classes.stride2(); @@ -3774,37 +4262,71 @@ fn minimum_lazy_state_id( /// than what is required in practice. Computing the true minimum effectively /// requires determinization, which is probably too much work to do for a /// simple check like this. +/// +/// One of the issues with this approach IMO is that it requires that this +/// be in sync with the calculation above for computing how much heap memory +/// the DFA cache uses. If we get it wrong, it's possible for example for the +/// minimum to be smaller than the computed heap memory, and thus, it may be +/// the case that we can't add the required minimum number of states. That in +/// turn will make lazy DFA panic because we assume that we can add at least a +/// minimum number of states. +/// +/// Another approach would be to always allow the minimum number of states to +/// be added to the lazy DFA cache, even if it exceeds the configured cache +/// limit. This does mean that the limit isn't really a limit in all cases, +/// which is unfortunate. But it does at least guarantee that the lazy DFA can +/// always make progress, even if it is slow. (This approach is very similar to +/// enabling the 'skip_cache_capacity_check' config knob, except it wouldn't +/// rely on cache size calculation. Instead, it would just always permit a +/// minimum number of states to be added.) fn minimum_cache_capacity( nfa: &thompson::NFA, classes: &ByteClasses, starts_for_each_pattern: bool, ) -> usize { const ID_SIZE: usize = size_of::<LazyStateID>(); - let stride = 1 << classes.stride2(); + const STATE_SIZE: usize = size_of::<State>(); - let sparses = 2 * nfa.len() * NFAStateID::SIZE; + let stride = 1 << classes.stride2(); + let states_len = nfa.states().len(); + let sparses = 2 * states_len * NFAStateID::SIZE; let trans = MIN_STATES * stride * ID_SIZE; - let mut starts = Start::count() * ID_SIZE; + let mut starts = Start::len() * ID_SIZE; if starts_for_each_pattern { - starts += (Start::count() * nfa.pattern_len()) * ID_SIZE; + starts += (Start::len() * nfa.pattern_len()) * ID_SIZE; } - // Every `State` has three bytes for flags, 4 bytes (max) for the number - // of patterns, followed by 32-bit encodings of patterns and then delta + // The min number of states HAS to be at least 4: we have 3 sentinel states + // and then we need space for one more when we save a state after clearing + // the cache. We also need space for one more, otherwise we get stuck in a + // loop where we try to add a 5th state, which gets rejected, which clears + // the cache, which adds back a saved state (4th total state) which then + // tries to add the 5th state again. + assert!(MIN_STATES >= 5, "minimum number of states has to be at least 5"); + // The minimum number of non-sentinel states. We consider this separately + // because sentinel states are much smaller in that they contain no NFA + // states. Given our aggressive calculation here, it's worth being more + // precise with the number of states we need. + let non_sentinel = MIN_STATES.checked_sub(SENTINEL_STATES).unwrap(); + + // Every `State` has 5 bytes for flags, 4 bytes (max) for the number of + // patterns, followed by 32-bit encodings of patterns and then delta // varint encodings of NFA state IDs. We use the worst case (which isn't // technically possible) of 5 bytes for each NFA state ID. // // HOWEVER, three of the states needed by a lazy DFA are just the sentinel // unknown, dead and quit states. Those states have a known size and it is // small. - assert!(MIN_STATES >= 3, "minimum number of states has to be at least 3"); let dead_state_size = State::dead().memory_usage(); - let max_state_size = 3 + 4 + (nfa.pattern_len() * 4) + (nfa.len() * 5); - let states = (3 * (size_of::<State>() + dead_state_size)) - + ((MIN_STATES - 3) * (size_of::<State>() + max_state_size)); - let states_to_sid = states + (MIN_STATES * ID_SIZE); - let stack = nfa.len() * NFAStateID::SIZE; + let max_state_size = 5 + 4 + (nfa.pattern_len() * 4) + (states_len * 5); + let states = (SENTINEL_STATES * (STATE_SIZE + dead_state_size)) + + (non_sentinel * (STATE_SIZE + max_state_size)); + // NOTE: We don't double count heap memory used by State for this map since + // we use reference counting to avoid doubling memory usage. (This tends to + // be where most memory is allocated in the cache.) + let states_to_sid = (MIN_STATES * STATE_SIZE) + (MIN_STATES * ID_SIZE); + let stack = states_len * NFAStateID::SIZE; let scratch_state_builder = max_state_size; trans @@ -3815,3 +4337,45 @@ fn minimum_cache_capacity( + stack + scratch_state_builder } + +#[cfg(all(test, feature = "syntax"))] +mod tests { + use super::*; + + // Tests that we handle heuristic Unicode word boundary support in reverse + // DFAs in the specific case of contextual searches. + // + // I wrote this test when I discovered a bug in how heuristic word + // boundaries were handled. Namely, that the starting state selection + // didn't consider the DFA's quit byte set when looking at the byte + // immediately before the start of the search (or immediately after the + // end of the search in the case of a reverse search). As a result, it was + // possible for '\bfoo\b' to match 'β123' because the trailing \xB2 byte + // in the 'β' codepoint would be treated as a non-word character. But of + // course, this search should trigger the DFA to quit, since there is a + // non-ASCII byte in consideration. + // + // Thus, I fixed 'start_state_{forward,reverse}' to check the quit byte set + // if it wasn't empty. The forward case is tested in the doc test for the + // Config::unicode_word_boundary API. We test the reverse case here, which + // is sufficiently niche that it doesn't really belong in a doc test. + #[test] + fn heuristic_unicode_reverse() { + let dfa = DFA::builder() + .configure(DFA::config().unicode_word_boundary(true)) + .thompson(thompson::Config::new().reverse(true)) + .build(r"\b[0-9]+\b") + .unwrap(); + let mut cache = dfa.create_cache(); + + let input = Input::new("β123").range(2..); + let expected = MatchError::quit(0xB2, 1); + let got = dfa.try_search_rev(&mut cache, &input); + assert_eq!(Err(expected), got); + + let input = Input::new("123β").range(..3); + let expected = MatchError::quit(0xCE, 3); + let got = dfa.try_search_rev(&mut cache, &input); + assert_eq!(Err(expected), got); + } +} |