diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-30 18:31:44 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-30 18:31:44 +0000 |
commit | c23a457e72abe608715ac76f076f47dc42af07a5 (patch) | |
tree | 2772049aaf84b5c9d0ed12ec8d86812f7a7904b6 /vendor/regex-automata-0.2.0/src/hybrid/regex.rs | |
parent | Releasing progress-linux version 1.73.0+dfsg1-1~progress7.99u1. (diff) | |
download | rustc-c23a457e72abe608715ac76f076f47dc42af07a5.tar.xz rustc-c23a457e72abe608715ac76f076f47dc42af07a5.zip |
Merging upstream version 1.74.1+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/regex-automata-0.2.0/src/hybrid/regex.rs')
-rw-r--r-- | vendor/regex-automata-0.2.0/src/hybrid/regex.rs | 2124 |
1 files changed, 2124 insertions, 0 deletions
diff --git a/vendor/regex-automata-0.2.0/src/hybrid/regex.rs b/vendor/regex-automata-0.2.0/src/hybrid/regex.rs new file mode 100644 index 000000000..7cc6b9064 --- /dev/null +++ b/vendor/regex-automata-0.2.0/src/hybrid/regex.rs @@ -0,0 +1,2124 @@ +/*! +A lazy DFA backed `Regex`. + +This module provides [`Regex`] using lazy DFA. A `Regex` implements convenience +routines you might have come to expect, such as finding a match and iterating +over all non-overlapping matches. This `Regex` type is limited in its +capabilities to what a lazy DFA can provide. Therefore, APIs involving +capturing groups, for example, are not provided. + +Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that +finds the end offset of a match, where as the other is a "reverse" DFA that +find the start offset of a match. + +See the [parent module](crate::hybrid) for examples. +*/ + +use core::borrow::Borrow; + +use alloc::boxed::Box; + +use crate::{ + hybrid::{ + dfa::{self, DFA}, + error::BuildError, + OverlappingState, + }, + nfa::thompson, + util::{ + matchtypes::{MatchError, MatchKind, MultiMatch}, + prefilter::{self, Prefilter}, + }, +}; + +/// A regular expression that uses hybrid NFA/DFAs (also called "lazy DFAs") +/// for searching. +/// +/// A regular expression is comprised of two lazy DFAs, a "forward" DFA and a +/// "reverse" DFA. The forward DFA is responsible for detecting the end of +/// a match while the reverse DFA is responsible for detecting the start +/// of a match. Thus, in order to find the bounds of any given match, a +/// forward search must first be run followed by a reverse search. A match +/// found by the forward DFA guarantees that the reverse DFA will also find +/// a match. +/// +/// A `Regex` can also have a prefilter set via the +/// [`set_prefilter`](Regex::set_prefilter) method. By default, no prefilter is +/// enabled. +/// +/// # Earliest vs Leftmost vs Overlapping +/// +/// The search routines exposed on a `Regex` reflect three different ways +/// of searching: +/// +/// * "earliest" means to stop as soon as a match has been detected. +/// * "leftmost" means to continue matching until the underlying +/// automaton cannot advance. This reflects "standard" searching you +/// might be used to in other regex engines. e.g., This permits +/// non-greedy and greedy searching to work as you would expect. +/// * "overlapping" means to find all possible matches, even if they +/// overlap. +/// +/// Generally speaking, when doing an overlapping search, you'll want to +/// build your regex lazy DFAs with [`MatchKind::All`] semantics. Using +/// [`MatchKind::LeftmostFirst`] semantics with overlapping searches is +/// likely to lead to odd behavior since `LeftmostFirst` specifically omits +/// some matches that can never be reported due to its semantics. +/// +/// The following example shows the differences between how these different +/// types of searches impact looking for matches of `[a-z]+` in the +/// haystack `abc`. +/// +/// ``` +/// use regex_automata::{hybrid::{dfa, regex}, MatchKind, MultiMatch}; +/// +/// let pattern = r"[a-z]+"; +/// let haystack = "abc".as_bytes(); +/// +/// // With leftmost-first semantics, we test "earliest" and "leftmost". +/// let re = regex::Builder::new() +/// .dfa(dfa::Config::new().match_kind(MatchKind::LeftmostFirst)) +/// .build(pattern)?; +/// let mut cache = re.create_cache(); +/// +/// // "earliest" searching isn't impacted by greediness +/// let mut it = re.find_earliest_iter(&mut cache, haystack); +/// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); +/// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next()); +/// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next()); +/// assert_eq!(None, it.next()); +/// +/// // "leftmost" searching supports greediness (and non-greediness) +/// let mut it = re.find_leftmost_iter(&mut cache, haystack); +/// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); +/// assert_eq!(None, it.next()); +/// +/// // For overlapping, we want "all" match kind semantics. +/// let re = regex::Builder::new() +/// .dfa(dfa::Config::new().match_kind(MatchKind::All)) +/// .build(pattern)?; +/// let mut cache = re.create_cache(); +/// +/// // In the overlapping search, we find all three possible matches +/// // starting at the beginning of the haystack. +/// let mut it = re.find_overlapping_iter(&mut cache, haystack); +/// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); +/// assert_eq!(Some(MultiMatch::must(0, 0, 2)), it.next()); +/// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); +/// assert_eq!(None, it.next()); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// # Fallibility +/// +/// In non-default configurations, the lazy DFAs generated in this module may +/// return an error during a search. (Currently, the only way this happens is +/// if quit bytes are added, Unicode word boundaries are heuristically enabled, +/// or if the cache is configured to "give up" on a search if it has been +/// cleared too many times. All of these are turned off by default, which means +/// a search can never fail in the default configuration.) For convenience, +/// the main search routines, like [`find_leftmost`](Regex::find_leftmost), +/// will panic if an error occurs. However, if you need to use DFAs which may +/// produce an error at search time, then there are fallible equivalents of +/// all search routines. For example, for `find_leftmost`, its fallible analog +/// is [`try_find_leftmost`](Regex::try_find_leftmost). The routines prefixed +/// with `try_` return `Result<Option<MultiMatch>, MatchError>`, where as the +/// infallible routines simply return `Option<MultiMatch>`. +/// +/// # Example +/// +/// This example shows how to cause a search to terminate if it sees a +/// `\n` byte, and handle the error returned. This could be useful if, for +/// example, you wanted to prevent a user supplied pattern from matching +/// across a line boundary. +/// +/// ``` +/// use regex_automata::{hybrid::{dfa, regex::Regex}, MatchError}; +/// +/// let re = Regex::builder() +/// .dfa(dfa::Config::new().quit(b'\n', true)) +/// .build(r"foo\p{any}+bar")?; +/// let mut cache = re.create_cache(); +/// +/// let haystack = "foo\nbar".as_bytes(); +/// // Normally this would produce a match, since \p{any} contains '\n'. +/// // But since we instructed the automaton to enter a quit state if a +/// // '\n' is observed, this produces a match error instead. +/// let expected = MatchError::Quit { byte: 0x0A, offset: 3 }; +/// let got = re.try_find_leftmost(&mut cache, haystack).unwrap_err(); +/// assert_eq!(expected, got); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Debug)] +pub struct Regex { + /// An optional prefilter that is passed down to the lazy DFA search + /// routines when present. By default, no prefilter is set. + pre: Option<Box<dyn Prefilter>>, + /// The forward lazy DFA. This can only find the end of a match. + forward: DFA, + /// The reverse lazy DFA. This can only find the start of a match. + /// + /// This is built with 'all' match semantics (instead of leftmost-first) + /// so that it always finds the longest possible match (which corresponds + /// to the leftmost starting position). It is also compiled as an anchored + /// matcher and has 'starts_for_each_pattern' enabled. Including starting + /// states for each pattern is necessary to ensure that we only look for + /// matches of a pattern that matched in the forward direction. Otherwise, + /// we might wind up finding the "leftmost" starting position of a totally + /// different pattern! + reverse: DFA, + /// Whether iterators on this type should advance by one codepoint or one + /// byte when an empty match is seen. + utf8: bool, +} + +/// Convenience routines for regex and cache construction. +impl Regex { + /// Parse the given regular expression using the default configuration and + /// return the corresponding regex. + /// + /// If you want a non-default configuration, then use the [`Builder`] to + /// set your own configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, hybrid::regex::Regex}; + /// + /// let re = Regex::new("foo[0-9]+bar")?; + /// let mut cache = re.create_cache(); + /// assert_eq!( + /// Some(MultiMatch::must(0, 3, 14)), + /// re.find_leftmost(&mut cache, b"zzzfoo12345barzzz"), + /// ); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn new(pattern: &str) -> Result<Regex, BuildError> { + Regex::builder().build(pattern) + } + + /// Like `new`, but parses multiple patterns into a single "regex set." + /// This similarly uses the default regex configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, hybrid::regex::Regex}; + /// + /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?; + /// let mut cache = re.create_cache(); + /// + /// let mut it = re.find_leftmost_iter( + /// &mut cache, + /// b"abc 1 foo 4567 0 quux", + /// ); + /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); + /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next()); + /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next()); + /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next()); + /// assert_eq!(None, it.next()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn new_many<P: AsRef<str>>( + patterns: &[P], + ) -> Result<Regex, BuildError> { + Regex::builder().build_many(patterns) + } + + /// Return a default configuration for a `Regex`. + /// + /// This is a convenience routine to avoid needing to import the `Config` + /// type when customizing the construction of a regex. + /// + /// # Example + /// + /// This example shows how to disable UTF-8 mode for `Regex` iteration. + /// When UTF-8 mode is disabled, the position immediately following an + /// empty match is where the next search begins, instead of the next + /// position of a UTF-8 encoded codepoint. + /// + /// ``` + /// use regex_automata::{hybrid::regex::Regex, MultiMatch}; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().utf8(false)) + /// .build(r"")?; + /// let mut cache = re.create_cache(); + /// + /// let haystack = "a☃z".as_bytes(); + /// let mut it = re.find_leftmost_iter(&mut cache, haystack); + /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn config() -> Config { + Config::new() + } + + /// Return a builder for configuring the construction of a `Regex`. + /// + /// This is a convenience routine to avoid needing to import the + /// [`Builder`] type in common cases. + /// + /// # Example + /// + /// This example shows how to use the builder to disable UTF-8 mode + /// everywhere. + /// + /// ``` + /// use regex_automata::{ + /// hybrid::regex::Regex, + /// nfa::thompson, + /// MultiMatch, SyntaxConfig, + /// }; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().utf8(false)) + /// .syntax(SyntaxConfig::new().utf8(false)) + /// .thompson(thompson::Config::new().utf8(false)) + /// .build(r"foo(?-u:[^b])ar.*")?; + /// let mut cache = re.create_cache(); + /// + /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; + /// let expected = Some(MultiMatch::must(0, 1, 9)); + /// let got = re.find_leftmost(&mut cache, haystack); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn builder() -> Builder { + Builder::new() + } + + /// Create a new cache for this `Regex`. + /// + /// The cache returned should only be used for searches for this + /// `Regex`. If you want to reuse the cache for another `Regex`, then + /// you must call [`Cache::reset`] with that `Regex` (or, equivalently, + /// [`Regex::reset_cache`]). + pub fn create_cache(&self) -> Cache { + Cache::new(self) + } + + /// Reset the given cache such that it can be used for searching with the + /// this `Regex` (and only this `Regex`). + /// + /// A cache reset permits reusing memory already allocated in this cache + /// with a different `Regex`. + /// + /// Resetting a cache sets its "clear count" to 0. This is relevant if the + /// `Regex` has been configured to "give up" after it has cleared the cache + /// a certain number of times. + /// + /// # Example + /// + /// This shows how to re-purpose a cache for use with a different `Regex`. + /// + /// ``` + /// use regex_automata::{hybrid::regex::Regex, MultiMatch}; + /// + /// let re1 = Regex::new(r"\w")?; + /// let re2 = Regex::new(r"\W")?; + /// + /// let mut cache = re1.create_cache(); + /// assert_eq!( + /// Some(MultiMatch::must(0, 0, 2)), + /// re1.find_leftmost(&mut cache, "Δ".as_bytes()), + /// ); + /// + /// // Using 'cache' with re2 is not allowed. It may result in panics or + /// // incorrect results. In order to re-purpose the cache, we must reset + /// // it with the Regex we'd like to use it with. + /// // + /// // Similarly, after this reset, using the cache with 're1' is also not + /// // allowed. + /// re2.reset_cache(&mut cache); + /// assert_eq!( + /// Some(MultiMatch::must(0, 0, 3)), + /// re2.find_leftmost(&mut cache, "☃".as_bytes()), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn reset_cache(&self, cache: &mut Cache) { + self.forward().reset_cache(&mut cache.forward); + self.reverse().reset_cache(&mut cache.reverse); + } +} + +/// Standard infallible search routines for finding and iterating over matches. +impl Regex { + /// Returns true if and only if this regex matches the given haystack. + /// + /// This routine may short circuit if it knows that scanning future input + /// will never lead to a different result. In particular, if the underlying + /// DFA enters a match state or a dead state, then this routine will return + /// `true` or `false`, respectively, without inspecting any future input. + /// + /// # Panics + /// + /// If the underlying lazy DFAs return an error, then this routine panics. + /// This only occurs in non-default configurations where quit bytes are + /// used, Unicode word boundaries are heuristically enabled or limits are + /// set on the number of times the lazy DFA's cache may be cleared. + /// + /// The fallible version of this routine is + /// [`try_is_match`](Regex::try_is_match). + /// + /// # Example + /// + /// ``` + /// use regex_automata::hybrid::regex::Regex; + /// + /// let re = Regex::new("foo[0-9]+bar")?; + /// let mut cache = re.create_cache(); + /// + /// assert_eq!(true, re.is_match(&mut cache, b"foo12345bar")); + /// assert_eq!(false, re.is_match(&mut cache, b"foobar")); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn is_match(&self, cache: &mut Cache, haystack: &[u8]) -> bool { + self.try_is_match(cache, haystack).unwrap() + } + + /// Returns the first position at which a match is found. + /// + /// This routine stops scanning input in precisely the same circumstances + /// as `is_match`. The key difference is that this routine returns the + /// position at which it stopped scanning input if and only if a match + /// was found. If no match is found, then `None` is returned. + /// + /// # Panics + /// + /// If the underlying lazy DFAs return an error, then this routine panics. + /// This only occurs in non-default configurations where quit bytes are + /// used, Unicode word boundaries are heuristically enabled or limits are + /// set on the number of times the lazy DFA's cache may be cleared. + /// + /// The fallible version of this routine is + /// [`try_find_earliest`](Regex::try_find_earliest). + /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, hybrid::regex::Regex}; + /// + /// // Normally, the leftmost first match would greedily consume as many + /// // decimal digits as it could. But a match is detected as soon as one + /// // digit is seen. + /// let re = Regex::new("foo[0-9]+")?; + /// let mut cache = re.create_cache(); + /// assert_eq!( + /// Some(MultiMatch::must(0, 0, 4)), + /// re.find_earliest(&mut cache, b"foo12345"), + /// ); + /// + /// // Normally, the end of the leftmost first match here would be 3, + /// // but the "earliest" match semantics detect a match earlier. + /// let re = Regex::new("abc|a")?; + /// let mut cache = re.create_cache(); + /// assert_eq!( + /// Some(MultiMatch::must(0, 0, 1)), + /// re.find_earliest(&mut cache, b"abc"), + /// ); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn find_earliest( + &self, + cache: &mut Cache, + haystack: &[u8], + ) -> Option<MultiMatch> { + self.try_find_earliest(cache, haystack).unwrap() + } + + /// Returns the start and end offset of the leftmost match. If no match + /// exists, then `None` is returned. + /// + /// # Panics + /// + /// If the underlying lazy DFAs return an error, then this routine panics. + /// This only occurs in non-default configurations where quit bytes are + /// used, Unicode word boundaries are heuristically enabled or limits are + /// set on the number of times the lazy DFA's cache may be cleared. + /// + /// The fallible version of this routine is + /// [`try_find_leftmost`](Regex::try_find_leftmost). + /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, hybrid::regex::Regex}; + /// + /// // Greediness is applied appropriately when compared to find_earliest. + /// let re = Regex::new("foo[0-9]+")?; + /// let mut cache = re.create_cache(); + /// assert_eq!( + /// Some(MultiMatch::must(0, 3, 11)), + /// re.find_leftmost(&mut cache, b"zzzfoo12345zzz"), + /// ); + /// + /// // Even though a match is found after reading the first byte (`a`), + /// // the default leftmost-first match semantics demand that we find the + /// // earliest match that prefers earlier parts of the pattern over latter + /// // parts. + /// let re = Regex::new("abc|a")?; + /// let mut cache = re.create_cache(); + /// assert_eq!( + /// Some(MultiMatch::must(0, 0, 3)), + /// re.find_leftmost(&mut cache, b"abc"), + /// ); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn find_leftmost( + &self, + cache: &mut Cache, + haystack: &[u8], + ) -> Option<MultiMatch> { + self.try_find_leftmost(cache, haystack).unwrap() + } + + /// Search for the first overlapping match in `haystack`. + /// + /// This routine is principally useful when searching for multiple patterns + /// on inputs where multiple patterns may match the same regions of text. + /// In particular, callers must preserve the automaton's search state from + /// prior calls so that the implementation knows where the last match + /// occurred and which pattern was reported. + /// + /// # Panics + /// + /// If the underlying lazy DFAs return an error, then this routine panics. + /// This only occurs in non-default configurations where quit bytes are + /// used, Unicode word boundaries are heuristically enabled or limits are + /// set on the number of times the lazy DFA's cache may be cleared. + /// + /// The fallible version of this routine is + /// [`try_find_overlapping`](Regex::try_find_overlapping). + /// + /// # Example + /// + /// This example shows how to run an overlapping search with multiple + /// regexes. + /// + /// ``` + /// use regex_automata::{ + /// hybrid::{dfa::DFA, regex::Regex, OverlappingState}, + /// MatchKind, + /// MultiMatch, + /// }; + /// + /// let re = Regex::builder() + /// .dfa(DFA::config().match_kind(MatchKind::All)) + /// .build_many(&[r"\w+$", r"\S+$"])?; + /// let mut cache = re.create_cache(); + /// + /// let haystack = "@foo".as_bytes(); + /// let mut state = OverlappingState::start(); + /// + /// let expected = Some(MultiMatch::must(1, 0, 4)); + /// let got = re.find_overlapping(&mut cache, haystack, &mut state); + /// assert_eq!(expected, got); + /// + /// // The first pattern also matches at the same position, so re-running + /// // the search will yield another match. Notice also that the first + /// // pattern is returned after the second. This is because the second + /// // pattern begins its match before the first, is therefore an earlier + /// // match and is thus reported first. + /// let expected = Some(MultiMatch::must(0, 1, 4)); + /// let got = re.find_overlapping(&mut cache, haystack, &mut state); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn find_overlapping( + &self, + cache: &mut Cache, + haystack: &[u8], + state: &mut OverlappingState, + ) -> Option<MultiMatch> { + self.try_find_overlapping(cache, haystack, state).unwrap() + } + + /// Returns an iterator over all non-overlapping "earliest" matches. + /// + /// Match positions are reported as soon as a match is known to occur, even + /// if the standard leftmost match would be longer. + /// + /// # Panics + /// + /// If the underlying lazy DFAs return an error, then this routine panics. + /// This only occurs in non-default configurations where quit bytes are + /// used, Unicode word boundaries are heuristically enabled or limits are + /// set on the number of times the lazy DFA's cache may be cleared. + /// + /// The fallible version of this routine is + /// [`try_find_earliest_iter`](Regex::try_find_earliest_iter). + /// + /// # Example + /// + /// This example shows how to run an "earliest" iterator. + /// + /// ``` + /// use regex_automata::{hybrid::regex::Regex, MultiMatch}; + /// + /// let re = Regex::new("[0-9]+")?; + /// let mut cache = re.create_cache(); + /// let haystack = "123".as_bytes(); + /// + /// // Normally, a standard leftmost iterator would return a single + /// // match, but since "earliest" detects matches earlier, we get + /// // three matches. + /// let mut it = re.find_earliest_iter(&mut cache, haystack); + /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn find_earliest_iter<'r, 'c, 't>( + &'r self, + cache: &'c mut Cache, + haystack: &'t [u8], + ) -> FindEarliestMatches<'r, 'c, 't> { + FindEarliestMatches::new(self, cache, haystack) + } + + /// Returns an iterator over all non-overlapping leftmost matches in the + /// given bytes. If no match exists, then the iterator yields no elements. + /// + /// This corresponds to the "standard" regex search iterator. + /// + /// # Panics + /// + /// If the underlying lazy DFAs return an error, then this routine panics. + /// This only occurs in non-default configurations where quit bytes are + /// used, Unicode word boundaries are heuristically enabled or limits are + /// set on the number of times the lazy DFA's cache may be cleared. + /// + /// The fallible version of this routine is + /// [`try_find_leftmost_iter`](Regex::try_find_leftmost_iter). + /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, hybrid::regex::Regex}; + /// + /// let re = Regex::new("foo[0-9]+")?; + /// let mut cache = re.create_cache(); + /// + /// let text = b"foo1 foo12 foo123"; + /// let matches: Vec<MultiMatch> = re + /// .find_leftmost_iter(&mut cache, text) + /// .collect(); + /// assert_eq!(matches, vec![ + /// MultiMatch::must(0, 0, 4), + /// MultiMatch::must(0, 5, 10), + /// MultiMatch::must(0, 11, 17), + /// ]); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn find_leftmost_iter<'r, 'c, 't>( + &'r self, + cache: &'c mut Cache, + haystack: &'t [u8], + ) -> FindLeftmostMatches<'r, 'c, 't> { + FindLeftmostMatches::new(self, cache, haystack) + } + + /// Returns an iterator over all overlapping matches in the given haystack. + /// + /// This routine is principally useful when searching for multiple patterns + /// on inputs where multiple patterns may match the same regions of text. + /// The iterator takes care of handling the overlapping state that must be + /// threaded through every search. + /// + /// # Panics + /// + /// If the underlying lazy DFAs return an error, then this routine panics. + /// This only occurs in non-default configurations where quit bytes are + /// used, Unicode word boundaries are heuristically enabled or limits are + /// set on the number of times the lazy DFA's cache may be cleared. + /// + /// The fallible version of this routine is + /// [`try_find_overlapping_iter`](Regex::try_find_overlapping_iter). + /// + /// # Example + /// + /// This example shows how to run an overlapping search with multiple + /// regexes. + /// + /// ``` + /// use regex_automata::{ + /// hybrid::{dfa::DFA, regex::Regex}, + /// MatchKind, + /// MultiMatch, + /// }; + /// + /// let re = Regex::builder() + /// .dfa(DFA::config().match_kind(MatchKind::All)) + /// .build_many(&[r"\w+$", r"\S+$"])?; + /// let mut cache = re.create_cache(); + /// let haystack = "@foo".as_bytes(); + /// + /// let mut it = re.find_overlapping_iter(&mut cache, haystack); + /// assert_eq!(Some(MultiMatch::must(1, 0, 4)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 1, 4)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn find_overlapping_iter<'r, 'c, 't>( + &'r self, + cache: &'c mut Cache, + haystack: &'t [u8], + ) -> FindOverlappingMatches<'r, 'c, 't> { + FindOverlappingMatches::new(self, cache, haystack) + } +} + +/// Lower level infallible search routines that permit controlling where +/// the search starts and ends in a particular sequence. This is useful for +/// executing searches that need to take surrounding context into account. This +/// is required for correctly implementing iteration because of look-around +/// operators (`^`, `$`, `\b`). +impl Regex { + /// Returns true if and only if this regex matches the given haystack. + /// + /// This routine may short circuit if it knows that scanning future input + /// will never lead to a different result. In particular, if the underlying + /// DFA enters a match state or a dead state, then this routine will return + /// `true` or `false`, respectively, without inspecting any future input. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// # Panics + /// + /// If the underlying lazy DFAs return an error, then this routine panics. + /// This only occurs in non-default configurations where quit bytes are + /// used, Unicode word boundaries are heuristically enabled or limits are + /// set on the number of times the lazy DFA's cache may be cleared. + /// + /// The fallible version of this routine is + /// [`try_is_match_at`](Regex::try_is_match_at). + pub fn is_match_at( + &self, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + ) -> bool { + self.try_is_match_at(cache, haystack, start, end).unwrap() + } + + /// Returns the first position at which a match is found. + /// + /// This routine stops scanning input in precisely the same circumstances + /// as `is_match`. The key difference is that this routine returns the + /// position at which it stopped scanning input if and only if a match + /// was found. If no match is found, then `None` is returned. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// This is useful when implementing an iterator over matches + /// within the same haystack, which cannot be done correctly by simply + /// providing a subslice of `haystack`. + /// + /// # Panics + /// + /// If the underlying lazy DFAs return an error, then this routine panics. + /// This only occurs in non-default configurations where quit bytes are + /// used, Unicode word boundaries are heuristically enabled or limits are + /// set on the number of times the lazy DFA's cache may be cleared. + /// + /// The fallible version of this routine is + /// [`try_find_earliest_at`](Regex::try_find_earliest_at). + pub fn find_earliest_at( + &self, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + ) -> Option<MultiMatch> { + self.try_find_earliest_at(cache, haystack, start, end).unwrap() + } + + /// Returns the same as `find_leftmost`, but starts the search at the given + /// offset. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// This is useful when implementing an iterator over matches within the + /// same haystack, which cannot be done correctly by simply providing a + /// subslice of `haystack`. + /// + /// # Panics + /// + /// If the underlying lazy DFAs return an error, then this routine panics. + /// This only occurs in non-default configurations where quit bytes are + /// used, Unicode word boundaries are heuristically enabled or limits are + /// set on the number of times the lazy DFA's cache may be cleared. + /// + /// The fallible version of this routine is + /// [`try_find_leftmost_at`](Regex::try_find_leftmost_at). + pub fn find_leftmost_at( + &self, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + ) -> Option<MultiMatch> { + self.try_find_leftmost_at(cache, haystack, start, end).unwrap() + } + + /// Search for the first overlapping match within a given range of + /// `haystack`. + /// + /// This routine is principally useful when searching for multiple patterns + /// on inputs where multiple patterns may match the same regions of text. + /// In particular, callers must preserve the automaton's search state from + /// prior calls so that the implementation knows where the last match + /// occurred and which pattern was reported. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// This is useful when implementing an iterator over matches + /// within the same haystack, which cannot be done correctly by simply + /// providing a subslice of `haystack`. + /// + /// # Panics + /// + /// If the underlying lazy DFAs return an error, then this routine panics. + /// This only occurs in non-default configurations where quit bytes are + /// used, Unicode word boundaries are heuristically enabled or limits are + /// set on the number of times the lazy DFA's cache may be cleared. + /// + /// The fallible version of this routine is + /// [`try_find_overlapping_at`](Regex::try_find_overlapping_at). + pub fn find_overlapping_at( + &self, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + state: &mut OverlappingState, + ) -> Option<MultiMatch> { + self.try_find_overlapping_at(cache, haystack, start, end, state) + .unwrap() + } +} + +/// Fallible search routines. These may return an error when the underlying +/// lazy DFAs have been configured in a way that permits them to fail during a +/// search. +/// +/// Errors during search only occur when the lazy DFA has been explicitly +/// configured to do so, usually by specifying one or more "quit" bytes or by +/// heuristically enabling Unicode word boundaries. +/// +/// Errors will never be returned using the default configuration. So these +/// fallible routines are only needed for particular configurations. +impl Regex { + /// Returns true if and only if this regex matches the given haystack. + /// + /// This routine may short circuit if it knows that scanning future input + /// will never lead to a different result. In particular, if the underlying + /// DFA enters a match state or a dead state, then this routine will return + /// `true` or `false`, respectively, without inspecting any future input. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`is_match`](Regex::is_match). + pub fn try_is_match( + &self, + cache: &mut Cache, + haystack: &[u8], + ) -> Result<bool, MatchError> { + self.try_is_match_at(cache, haystack, 0, haystack.len()) + } + + /// Returns the first position at which a match is found. + /// + /// This routine stops scanning input in precisely the same circumstances + /// as `is_match`. The key difference is that this routine returns the + /// position at which it stopped scanning input if and only if a match + /// was found. If no match is found, then `None` is returned. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_earliest`](Regex::find_earliest). + pub fn try_find_earliest( + &self, + cache: &mut Cache, + haystack: &[u8], + ) -> Result<Option<MultiMatch>, MatchError> { + self.try_find_earliest_at(cache, haystack, 0, haystack.len()) + } + + /// Returns the start and end offset of the leftmost match. If no match + /// exists, then `None` is returned. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_leftmost`](Regex::find_leftmost). + pub fn try_find_leftmost( + &self, + cache: &mut Cache, + haystack: &[u8], + ) -> Result<Option<MultiMatch>, MatchError> { + self.try_find_leftmost_at(cache, haystack, 0, haystack.len()) + } + + /// Search for the first overlapping match in `haystack`. + /// + /// This routine is principally useful when searching for multiple patterns + /// on inputs where multiple patterns may match the same regions of text. + /// In particular, callers must preserve the automaton's search state from + /// prior calls so that the implementation knows where the last match + /// occurred and which pattern was reported. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_overlapping`](Regex::find_overlapping). + pub fn try_find_overlapping( + &self, + cache: &mut Cache, + haystack: &[u8], + state: &mut OverlappingState, + ) -> Result<Option<MultiMatch>, MatchError> { + self.try_find_overlapping_at(cache, haystack, 0, haystack.len(), state) + } + + /// Returns an iterator over all non-overlapping "earliest" matches. + /// + /// Match positions are reported as soon as a match is known to occur, even + /// if the standard leftmost match would be longer. + /// + /// # Errors + /// + /// This iterator only yields errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_earliest_iter`](Regex::find_earliest_iter). + pub fn try_find_earliest_iter<'r, 'c, 't>( + &'r self, + cache: &'c mut Cache, + haystack: &'t [u8], + ) -> TryFindEarliestMatches<'r, 'c, 't> { + TryFindEarliestMatches::new(self, cache, haystack) + } + + /// Returns an iterator over all non-overlapping leftmost matches in the + /// given bytes. If no match exists, then the iterator yields no elements. + /// + /// This corresponds to the "standard" regex search iterator. + /// + /// # Errors + /// + /// This iterator only yields errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_leftmost_iter`](Regex::find_leftmost_iter). + pub fn try_find_leftmost_iter<'r, 'c, 't>( + &'r self, + cache: &'c mut Cache, + haystack: &'t [u8], + ) -> TryFindLeftmostMatches<'r, 'c, 't> { + TryFindLeftmostMatches::new(self, cache, haystack) + } + + /// Returns an iterator over all overlapping matches in the given haystack. + /// + /// This routine is principally useful when searching for multiple patterns + /// on inputs where multiple patterns may match the same regions of text. + /// The iterator takes care of handling the overlapping state that must be + /// threaded through every search. + /// + /// # Errors + /// + /// This iterator only yields errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_overlapping_iter`](Regex::find_overlapping_iter). + pub fn try_find_overlapping_iter<'r, 'c, 't>( + &'r self, + cache: &'c mut Cache, + haystack: &'t [u8], + ) -> TryFindOverlappingMatches<'r, 'c, 't> { + TryFindOverlappingMatches::new(self, cache, haystack) + } +} + +/// Lower level fallible search routines that permit controlling where the +/// search starts and ends in a particular sequence. +impl Regex { + /// Returns true if and only if this regex matches the given haystack. + /// + /// This routine may short circuit if it knows that scanning future input + /// will never lead to a different result. In particular, if the underlying + /// DFA enters a match state or a dead state, then this routine will return + /// `true` or `false`, respectively, without inspecting any future input. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`is_match_at`](Regex::is_match_at). + pub fn try_is_match_at( + &self, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + ) -> Result<bool, MatchError> { + self.forward() + .find_leftmost_fwd_at( + &mut cache.forward, + self.scanner().as_mut(), + None, + haystack, + start, + end, + ) + .map(|x| x.is_some()) + } + + /// Returns the first position at which a match is found. + /// + /// This routine stops scanning input in precisely the same circumstances + /// as `is_match`. The key difference is that this routine returns the + /// position at which it stopped scanning input if and only if a match + /// was found. If no match is found, then `None` is returned. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// This is useful when implementing an iterator over matches + /// within the same haystack, which cannot be done correctly by simply + /// providing a subslice of `haystack`. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_earliest_at`](Regex::find_earliest_at). + pub fn try_find_earliest_at( + &self, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + ) -> Result<Option<MultiMatch>, MatchError> { + self.try_find_earliest_at_imp( + self.scanner().as_mut(), + cache, + haystack, + start, + end, + ) + } + + /// Returns the start and end offset of the leftmost match. If no match + /// exists, then `None` is returned. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// This is useful when implementing an iterator over matches + /// within the same haystack, which cannot be done correctly by simply + /// providing a subslice of `haystack`. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_leftmost_at`](Regex::find_leftmost_at). + pub fn try_find_leftmost_at( + &self, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + ) -> Result<Option<MultiMatch>, MatchError> { + self.try_find_leftmost_at_imp( + self.scanner().as_mut(), + cache, + haystack, + start, + end, + ) + } + + /// Search for the first overlapping match within a given range of + /// `haystack`. + /// + /// This routine is principally useful when searching for multiple patterns + /// on inputs where multiple patterns may match the same regions of text. + /// In particular, callers must preserve the automaton's search state from + /// prior calls so that the implementation knows where the last match + /// occurred and which pattern was reported. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// This is useful when implementing an iterator over matches + /// within the same haystack, which cannot be done correctly by simply + /// providing a subslice of `haystack`. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_overlapping_at`](Regex::find_overlapping_at). + pub fn try_find_overlapping_at( + &self, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + state: &mut OverlappingState, + ) -> Result<Option<MultiMatch>, MatchError> { + self.try_find_overlapping_at_imp( + self.scanner().as_mut(), + cache, + haystack, + start, + end, + state, + ) + } +} + +impl Regex { + #[inline(always)] + fn try_find_earliest_at_imp( + &self, + pre: Option<&mut prefilter::Scanner>, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + ) -> Result<Option<MultiMatch>, MatchError> { + let (fdfa, rdfa) = (self.forward(), self.reverse()); + let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse); + let end = match fdfa + .find_earliest_fwd_at(fcache, pre, None, haystack, start, end)? + { + None => return Ok(None), + Some(end) => end, + }; + // N.B. The only time we need to tell the reverse searcher the pattern + // to match is in the overlapping case, since it's ambiguous. In the + // earliest case, I have tentatively convinced myself that it isn't + // necessary and the reverse search will always find the same pattern + // to match as the forward search. But I lack a rigorous proof. Why not + // just provide the pattern anyway? Well, if it is needed, then leaving + // it out gives us a chance to find a witness. + let start = rdfa + .find_earliest_rev_at(rcache, None, haystack, start, end.offset())? + .expect("reverse search must match if forward search does"); + assert_eq!( + start.pattern(), + end.pattern(), + "forward and reverse search must match same pattern", + ); + assert!(start.offset() <= end.offset()); + Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) + } + + #[inline(always)] + fn try_find_leftmost_at_imp( + &self, + pre: Option<&mut prefilter::Scanner>, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + ) -> Result<Option<MultiMatch>, MatchError> { + let (fdfa, rdfa) = (self.forward(), self.reverse()); + let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse); + let end = match fdfa + .find_leftmost_fwd_at(fcache, pre, None, haystack, start, end)? + { + None => return Ok(None), + Some(end) => end, + }; + // N.B. The only time we need to tell the reverse searcher the pattern + // to match is in the overlapping case, since it's ambiguous. In the + // leftmost case, I have tentatively convinced myself that it isn't + // necessary and the reverse search will always find the same pattern + // to match as the forward search. But I lack a rigorous proof. Why not + // just provide the pattern anyway? Well, if it is needed, then leaving + // it out gives us a chance to find a witness. + let start = rdfa + .find_leftmost_rev_at(rcache, None, haystack, start, end.offset())? + .expect("reverse search must match if forward search does"); + assert_eq!( + start.pattern(), + end.pattern(), + "forward and reverse search must match same pattern", + ); + assert!(start.offset() <= end.offset()); + Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) + } + + #[inline(always)] + fn try_find_overlapping_at_imp( + &self, + pre: Option<&mut prefilter::Scanner>, + cache: &mut Cache, + haystack: &[u8], + start: usize, + end: usize, + state: &mut OverlappingState, + ) -> Result<Option<MultiMatch>, MatchError> { + let (fdfa, rdfa) = (self.forward(), self.reverse()); + let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse); + let end = match fdfa.find_overlapping_fwd_at( + fcache, pre, None, haystack, start, end, state, + )? { + None => return Ok(None), + Some(end) => end, + }; + // Unlike the leftmost cases, the reverse overlapping search may match + // a different pattern than the forward search. See test failures when + // using `None` instead of `Some(end.pattern())` below. Thus, we must + // run our reverse search using the pattern that matched in the forward + // direction. + let start = rdfa + .find_leftmost_rev_at( + rcache, + Some(end.pattern()), + haystack, + 0, + end.offset(), + )? + .expect("reverse search must match if forward search does"); + assert_eq!( + start.pattern(), + end.pattern(), + "forward and reverse search must match same pattern", + ); + assert!(start.offset() <= end.offset()); + Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) + } +} + +/// Non-search APIs for querying information about the regex and setting a +/// prefilter. +impl Regex { + /// Return the underlying lazy DFA responsible for forward matching. + /// + /// This is useful for accessing the underlying lazy DFA and using it + /// directly if the situation calls for it. + pub fn forward(&self) -> &DFA { + &self.forward + } + + /// Return the underlying lazy DFA responsible for reverse matching. + /// + /// This is useful for accessing the underlying lazy DFA and using it + /// directly if the situation calls for it. + pub fn reverse(&self) -> &DFA { + &self.reverse + } + + /// Returns the total number of patterns matched by this regex. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, hybrid::regex::Regex}; + /// + /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?; + /// assert_eq!(3, re.pattern_count()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn pattern_count(&self) -> usize { + assert_eq!( + self.forward().pattern_count(), + self.reverse().pattern_count() + ); + self.forward().pattern_count() + } + + /// Convenience function for returning this regex's prefilter as a trait + /// object. + /// + /// If this regex doesn't have a prefilter, then `None` is returned. + pub fn prefilter(&self) -> Option<&dyn Prefilter> { + self.pre.as_ref().map(|x| &**x) + } + + /// Attach the given prefilter to this regex. + pub fn set_prefilter(&mut self, pre: Option<Box<dyn Prefilter>>) { + self.pre = pre; + } + + /// Convenience function for returning a prefilter scanner. + fn scanner(&self) -> Option<prefilter::Scanner> { + self.prefilter().map(prefilter::Scanner::new) + } +} + +/// An iterator over all non-overlapping earliest matches for a particular +/// infallible search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. If the underlying search returns an error, then this panics. +/// +/// The lifetime variables are as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'c` is the lifetime of the mutable cache used during search. +/// * `'t` is the lifetime of the text being searched. +#[derive(Debug)] +pub struct FindEarliestMatches<'r, 'c, 't>(TryFindEarliestMatches<'r, 'c, 't>); + +impl<'r, 'c, 't> FindEarliestMatches<'r, 'c, 't> { + fn new( + re: &'r Regex, + cache: &'c mut Cache, + text: &'t [u8], + ) -> FindEarliestMatches<'r, 'c, 't> { + FindEarliestMatches(TryFindEarliestMatches::new(re, cache, text)) + } +} + +impl<'r, 'c, 't> Iterator for FindEarliestMatches<'r, 'c, 't> { + type Item = MultiMatch; + + fn next(&mut self) -> Option<MultiMatch> { + next_unwrap(self.0.next()) + } +} + +/// An iterator over all non-overlapping leftmost matches for a particular +/// infallible search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. If the underlying search returns an error, then this panics. +/// +/// The lifetime variables are as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'c` is the lifetime of the mutable cache used during search. +/// * `'t` is the lifetime of the text being searched. +#[derive(Debug)] +pub struct FindLeftmostMatches<'r, 'c, 't>(TryFindLeftmostMatches<'r, 'c, 't>); + +impl<'r, 'c, 't> FindLeftmostMatches<'r, 'c, 't> { + fn new( + re: &'r Regex, + cache: &'c mut Cache, + text: &'t [u8], + ) -> FindLeftmostMatches<'r, 'c, 't> { + FindLeftmostMatches(TryFindLeftmostMatches::new(re, cache, text)) + } +} + +impl<'r, 'c, 't> Iterator for FindLeftmostMatches<'r, 'c, 't> { + type Item = MultiMatch; + + fn next(&mut self) -> Option<MultiMatch> { + next_unwrap(self.0.next()) + } +} + +/// An iterator over all overlapping matches for a particular infallible +/// search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. If the underlying search returns an error, then this panics. +/// +/// The lifetime variables are as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'c` is the lifetime of the mutable cache used during search. +/// * `'t` is the lifetime of the text being searched. +#[derive(Debug)] +pub struct FindOverlappingMatches<'r, 'c, 't>( + TryFindOverlappingMatches<'r, 'c, 't>, +); + +impl<'r, 'c, 't> FindOverlappingMatches<'r, 'c, 't> { + fn new( + re: &'r Regex, + cache: &'c mut Cache, + text: &'t [u8], + ) -> FindOverlappingMatches<'r, 'c, 't> { + FindOverlappingMatches(TryFindOverlappingMatches::new(re, cache, text)) + } +} + +impl<'r, 'c, 't> Iterator for FindOverlappingMatches<'r, 'c, 't> { + type Item = MultiMatch; + + fn next(&mut self) -> Option<MultiMatch> { + next_unwrap(self.0.next()) + } +} + +/// An iterator over all non-overlapping earliest matches for a particular +/// fallible search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. +/// +/// The lifetime variables are as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'c` is the lifetime of the mutable cache used during search. +/// * `'t` is the lifetime of the text being searched. +#[derive(Debug)] +pub struct TryFindEarliestMatches<'r, 'c, 't> { + re: &'r Regex, + cache: &'c mut Cache, + scanner: Option<prefilter::Scanner<'r>>, + text: &'t [u8], + last_end: usize, + last_match: Option<usize>, +} + +impl<'r, 'c, 't> TryFindEarliestMatches<'r, 'c, 't> { + fn new( + re: &'r Regex, + cache: &'c mut Cache, + text: &'t [u8], + ) -> TryFindEarliestMatches<'r, 'c, 't> { + let scanner = re.scanner(); + TryFindEarliestMatches { + re, + cache, + scanner, + text, + last_end: 0, + last_match: None, + } + } +} + +impl<'r, 'c, 't> Iterator for TryFindEarliestMatches<'r, 'c, 't> { + type Item = Result<MultiMatch, MatchError>; + + fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> { + if self.last_end > self.text.len() { + return None; + } + let result = self.re.try_find_earliest_at_imp( + self.scanner.as_mut(), + self.cache, + self.text, + self.last_end, + self.text.len(), + ); + let m = match result { + Err(err) => return Some(Err(err)), + Ok(None) => return None, + Ok(Some(m)) => m, + }; + if m.is_empty() { + // This is an empty match. To ensure we make progress, start + // the next search at the smallest possible starting position + // of the next match following this one. + self.last_end = if self.re.utf8 { + crate::util::next_utf8(self.text, m.end()) + } else { + m.end() + 1 + }; + // Don't accept empty matches immediately following a match. + // Just move on to the next match. + if Some(m.end()) == self.last_match { + return self.next(); + } + } else { + self.last_end = m.end(); + } + self.last_match = Some(m.end()); + Some(Ok(m)) + } +} + +/// An iterator over all non-overlapping leftmost matches for a particular +/// fallible search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. +/// +/// The lifetime variables are as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'c` is the lifetime of the mutable cache used during search. +/// * `'t` is the lifetime of the text being searched. +#[derive(Debug)] +pub struct TryFindLeftmostMatches<'r, 'c, 't> { + re: &'r Regex, + cache: &'c mut Cache, + scanner: Option<prefilter::Scanner<'r>>, + text: &'t [u8], + last_end: usize, + last_match: Option<usize>, +} + +impl<'r, 'c, 't> TryFindLeftmostMatches<'r, 'c, 't> { + fn new( + re: &'r Regex, + cache: &'c mut Cache, + text: &'t [u8], + ) -> TryFindLeftmostMatches<'r, 'c, 't> { + let scanner = re.scanner(); + TryFindLeftmostMatches { + re, + cache, + scanner, + text, + last_end: 0, + last_match: None, + } + } +} + +impl<'r, 'c, 't> Iterator for TryFindLeftmostMatches<'r, 'c, 't> { + type Item = Result<MultiMatch, MatchError>; + + fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> { + if self.last_end > self.text.len() { + return None; + } + let result = self.re.try_find_leftmost_at_imp( + self.scanner.as_mut(), + self.cache, + self.text, + self.last_end, + self.text.len(), + ); + let m = match result { + Err(err) => return Some(Err(err)), + Ok(None) => return None, + Ok(Some(m)) => m, + }; + if m.is_empty() { + // This is an empty match. To ensure we make progress, start + // the next search at the smallest possible starting position + // of the next match following this one. + self.last_end = if self.re.utf8 { + crate::util::next_utf8(self.text, m.end()) + } else { + m.end() + 1 + }; + // Don't accept empty matches immediately following a match. + // Just move on to the next match. + if Some(m.end()) == self.last_match { + return self.next(); + } + } else { + self.last_end = m.end(); + } + self.last_match = Some(m.end()); + Some(Ok(m)) + } +} + +/// An iterator over all overlapping matches for a particular fallible search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. +/// +/// The lifetime variables are as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'c` is the lifetime of the mutable cache used during search. +/// * `'t` is the lifetime of the text being searched. +#[derive(Debug)] +pub struct TryFindOverlappingMatches<'r, 'c, 't> { + re: &'r Regex, + cache: &'c mut Cache, + scanner: Option<prefilter::Scanner<'r>>, + text: &'t [u8], + last_end: usize, + state: OverlappingState, +} + +impl<'r, 'c, 't> TryFindOverlappingMatches<'r, 'c, 't> { + fn new( + re: &'r Regex, + cache: &'c mut Cache, + text: &'t [u8], + ) -> TryFindOverlappingMatches<'r, 'c, 't> { + let scanner = re.scanner(); + TryFindOverlappingMatches { + re, + cache, + scanner, + text, + last_end: 0, + state: OverlappingState::start(), + } + } +} + +impl<'r, 'c, 't> Iterator for TryFindOverlappingMatches<'r, 'c, 't> { + type Item = Result<MultiMatch, MatchError>; + + fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> { + if self.last_end > self.text.len() { + return None; + } + let result = self.re.try_find_overlapping_at_imp( + self.scanner.as_mut(), + self.cache, + self.text, + self.last_end, + self.text.len(), + &mut self.state, + ); + let m = match result { + Err(err) => return Some(Err(err)), + Ok(None) => return None, + Ok(Some(m)) => m, + }; + // Unlike the non-overlapping case, we're OK with empty matches at this + // level. In particular, the overlapping search algorithm is itself + // responsible for ensuring that progress is always made. + self.last_end = m.end(); + Some(Ok(m)) + } +} + +/// A cache represents a partially computed forward and reverse DFA. +/// +/// A cache is the key component that differentiates a classical DFA and a +/// hybrid NFA/DFA (also called a "lazy DFA"). Where a classical DFA builds a +/// complete transition table that can handle all possible inputs, a hybrid +/// NFA/DFA starts with an empty transition table and builds only the parts +/// required during search. The parts that are built are stored in a cache. For +/// this reason, a cache is a required parameter for nearly every operation on +/// a [`Regex`]. +/// +/// Caches can be created from their corresponding `Regex` via +/// [`Regex::create_cache`]. A cache can only be used with either the `Regex` +/// that created it, or the `Regex` that was most recently used to reset it +/// with [`Cache::reset`]. Using a cache with any other `Regex` may result in +/// panics or incorrect results. +#[derive(Debug, Clone)] +pub struct Cache { + forward: dfa::Cache, + reverse: dfa::Cache, +} + +impl Cache { + /// Create a new cache for the given `Regex`. + /// + /// The cache returned should only be used for searches for the given + /// `Regex`. If you want to reuse the cache for another `Regex`, then you + /// must call [`Cache::reset`] with that `Regex`. + pub fn new(re: &Regex) -> Cache { + let forward = dfa::Cache::new(re.forward()); + let reverse = dfa::Cache::new(re.reverse()); + Cache { forward, reverse } + } + + /// Reset this cache such that it can be used for searching with the given + /// `Regex` (and only that `Regex`). + /// + /// A cache reset permits reusing memory already allocated in this cache + /// with a different `Regex`. + /// + /// Resetting a cache sets its "clear count" to 0. This is relevant if the + /// `Regex` has been configured to "give up" after it has cleared the cache + /// a certain number of times. + /// + /// # Example + /// + /// This shows how to re-purpose a cache for use with a different `Regex`. + /// + /// ``` + /// use regex_automata::{hybrid::regex::Regex, MultiMatch}; + /// + /// let re1 = Regex::new(r"\w")?; + /// let re2 = Regex::new(r"\W")?; + /// + /// let mut cache = re1.create_cache(); + /// assert_eq!( + /// Some(MultiMatch::must(0, 0, 2)), + /// re1.find_leftmost(&mut cache, "Δ".as_bytes()), + /// ); + /// + /// // Using 'cache' with re2 is not allowed. It may result in panics or + /// // incorrect results. In order to re-purpose the cache, we must reset + /// // it with the Regex we'd like to use it with. + /// // + /// // Similarly, after this reset, using the cache with 're1' is also not + /// // allowed. + /// cache.reset(&re2); + /// assert_eq!( + /// Some(MultiMatch::must(0, 0, 3)), + /// re2.find_leftmost(&mut cache, "☃".as_bytes()), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn reset(&mut self, re: &Regex) { + self.forward.reset(re.forward()); + self.reverse.reset(re.reverse()); + } + + /// Returns the heap memory usage, in bytes, as a sum of the forward and + /// reverse lazy DFA caches. + /// + /// This does **not** include the stack size used up by this cache. To + /// compute that, use `std::mem::size_of::<Cache>()`. + pub fn memory_usage(&self) -> usize { + self.forward.memory_usage() + self.reverse.memory_usage() + } + + /// Return references to the forward and reverse caches, respectively. + pub fn as_parts(&self) -> (&dfa::Cache, &dfa::Cache) { + (&self.forward, &self.reverse) + } + + /// Return mutable references to the forward and reverse caches, + /// respectively. + pub fn as_parts_mut(&mut self) -> (&mut dfa::Cache, &mut dfa::Cache) { + (&mut self.forward, &mut self.reverse) + } +} + +/// The configuration used for compiling a hybrid NFA/DFA regex. +/// +/// A regex configuration is a simple data object that is typically used with +/// [`Builder::configure`]. +#[derive(Clone, Copy, Debug, Default)] +pub struct Config { + utf8: Option<bool>, +} + +impl Config { + /// Return a new default regex compiler configuration. + pub fn new() -> Config { + Config::default() + } + + /// Whether to enable UTF-8 mode or not. + /// + /// When UTF-8 mode is enabled (the default) and an empty match is seen, + /// the iterators on [`Regex`] will always start the next search at the + /// next UTF-8 encoded codepoint when searching valid UTF-8. When UTF-8 + /// mode is disabled, such searches are begun at the next byte offset. + /// + /// If this mode is enabled and invalid UTF-8 is given to search, then + /// behavior is unspecified. + /// + /// Generally speaking, one should enable this when + /// [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) + /// and + /// [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) + /// are enabled, and disable it otherwise. + /// + /// # Example + /// + /// This example demonstrates the differences between when this option is + /// enabled and disabled. The differences only arise when the regex can + /// return matches of length zero. + /// + /// In this first snippet, we show the results when UTF-8 mode is disabled. + /// + /// ``` + /// use regex_automata::{hybrid::regex::Regex, MultiMatch}; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().utf8(false)) + /// .build(r"")?; + /// let mut cache = re.create_cache(); + /// + /// let haystack = "a☃z".as_bytes(); + /// let mut it = re.find_leftmost_iter(&mut cache, haystack); + /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// And in this snippet, we execute the same search on the same haystack, + /// but with UTF-8 mode enabled. Notice that byte offsets that would + /// otherwise split the encoding of `☃` are not returned. + /// + /// ``` + /// use regex_automata::{hybrid::regex::Regex, MultiMatch}; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().utf8(true)) + /// .build(r"")?; + /// let mut cache = re.create_cache(); + /// + /// let haystack = "a☃z".as_bytes(); + /// let mut it = re.find_leftmost_iter(&mut cache, haystack); + /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn utf8(mut self, yes: bool) -> Config { + self.utf8 = Some(yes); + self + } + + /// Returns true if and only if this configuration has UTF-8 mode enabled. + /// + /// When UTF-8 mode is enabled and an empty match is seen, the iterators on + /// [`Regex`] will always start the next search at the next UTF-8 encoded + /// codepoint. When UTF-8 mode is disabled, such searches are begun at the + /// next byte offset. + pub fn get_utf8(&self) -> bool { + self.utf8.unwrap_or(true) + } + + /// Overwrite the default configuration such that the options in `o` are + /// always used. If an option in `o` is not set, then the corresponding + /// option in `self` is used. If it's not set in `self` either, then it + /// remains not set. + pub(crate) fn overwrite(self, o: Config) -> Config { + Config { utf8: o.utf8.or(self.utf8) } + } +} + +/// A builder for a regex based on a hybrid NFA/DFA. +/// +/// This builder permits configuring options for the syntax of a pattern, the +/// NFA construction, the lazy DFA construction and finally the regex searching +/// itself. This builder is different from a general purpose regex builder +/// in that it permits fine grain configuration of the construction process. +/// The trade off for this is complexity, and the possibility of setting a +/// configuration that might not make sense. For example, there are three +/// different UTF-8 modes: +/// +/// * [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) controls whether the +/// pattern itself can contain sub-expressions that match invalid UTF-8. +/// * [`nfa::thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) +/// controls whether the implicit unanchored prefix added to the NFA can +/// match through invalid UTF-8 or not. +/// * [`Config::utf8`] controls how the regex iterators themselves advance +/// the starting position of the next search when a match with zero length is +/// found. +/// +/// Generally speaking, callers will want to either enable all of these or +/// disable all of these. +/// +/// Internally, building a regex requires building two hybrid NFA/DFAs, +/// where one is responsible for finding the end of a match and the other is +/// responsible for finding the start of a match. If you only need to detect +/// whether something matched, or only the end of a match, then you should use +/// a [`dfa::Builder`] to construct a single hybrid NFA/DFA, which is cheaper +/// than building two of them. +/// +/// # Example +/// +/// This example shows how to disable UTF-8 mode in the syntax, the NFA and +/// the regex itself. This is generally what you want for matching on +/// arbitrary bytes. +/// +/// ``` +/// use regex_automata::{ +/// hybrid::regex::Regex, nfa::thompson, MultiMatch, SyntaxConfig +/// }; +/// +/// let re = Regex::builder() +/// .configure(Regex::config().utf8(false)) +/// .syntax(SyntaxConfig::new().utf8(false)) +/// .thompson(thompson::Config::new().utf8(false)) +/// .build(r"foo(?-u:[^b])ar.*")?; +/// let mut cache = re.create_cache(); +/// +/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; +/// let expected = Some(MultiMatch::must(0, 1, 9)); +/// let got = re.find_leftmost(&mut cache, haystack); +/// assert_eq!(expected, got); +/// // Notice that `(?-u:[^b])` matches invalid UTF-8, +/// // but the subsequent `.*` does not! Disabling UTF-8 +/// // on the syntax permits this. Notice also that the +/// // search was unanchored and skipped over invalid UTF-8. +/// // Disabling UTF-8 on the Thompson NFA permits this. +/// // +/// // N.B. This example does not show the impact of +/// // disabling UTF-8 mode on Config, since that +/// // only impacts regexes that can produce matches of +/// // length 0. +/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct Builder { + config: Config, + dfa: dfa::Builder, +} + +impl Builder { + /// Create a new regex builder with the default configuration. + pub fn new() -> Builder { + Builder { config: Config::default(), dfa: DFA::builder() } + } + + /// Build a regex from the given pattern. + /// + /// If there was a problem parsing or compiling the pattern, then an error + /// is returned. + pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> { + self.build_many(&[pattern]) + } + + /// Build a regex from the given patterns. + pub fn build_many<P: AsRef<str>>( + &self, + patterns: &[P], + ) -> Result<Regex, BuildError> { + let forward = self.dfa.build_many(patterns)?; + let reverse = self + .dfa + .clone() + .configure( + DFA::config() + .anchored(true) + .match_kind(MatchKind::All) + .starts_for_each_pattern(true), + ) + .thompson(thompson::Config::new().reverse(true)) + .build_many(patterns)?; + Ok(self.build_from_dfas(forward, reverse)) + } + + /// Build a regex from its component forward and reverse hybrid NFA/DFAs. + fn build_from_dfas(&self, forward: DFA, reverse: DFA) -> Regex { + // The congruous method on DFA-backed regexes is exposed, but it's + // not clear this builder is useful here since lazy DFAs can't be + // serialized and there is only one type of them. + let utf8 = self.config.get_utf8(); + Regex { pre: None, forward, reverse, utf8 } + } + + /// Apply the given regex configuration options to this builder. + pub fn configure(&mut self, config: Config) -> &mut Builder { + self.config = self.config.overwrite(config); + self + } + + /// Set the syntax configuration for this builder using + /// [`SyntaxConfig`](crate::SyntaxConfig). + /// + /// This permits setting things like case insensitivity, Unicode and multi + /// line mode. + pub fn syntax( + &mut self, + config: crate::util::syntax::SyntaxConfig, + ) -> &mut Builder { + self.dfa.syntax(config); + self + } + + /// Set the Thompson NFA configuration for this builder using + /// [`nfa::thompson::Config`](thompson::Config). + /// + /// This permits setting things like whether additional time should be + /// spent shrinking the size of the NFA. + pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { + self.dfa.thompson(config); + self + } + + /// Set the lazy DFA compilation configuration for this builder using + /// [`dfa::Config`](dfa::Config). + /// + /// This permits setting things like whether Unicode word boundaries should + /// be heuristically supported or settings how the behavior of the cache. + pub fn dfa(&mut self, config: dfa::Config) -> &mut Builder { + self.dfa.configure(config); + self + } +} + +impl Default for Builder { + fn default() -> Builder { + Builder::new() + } +} + +#[inline(always)] +fn next_unwrap( + item: Option<Result<MultiMatch, MatchError>>, +) -> Option<MultiMatch> { + match item { + None => None, + Some(Ok(m)) => Some(m), + Some(Err(err)) => panic!( + "unexpected regex search error: {}\n\ + to handle search errors, use try_ methods", + err, + ), + } +} |