diff options
Diffstat (limited to 'vendor/regex-automata/src/hybrid/regex.rs')
-rw-r--r-- | vendor/regex-automata/src/hybrid/regex.rs | 1925 |
1 files changed, 348 insertions, 1577 deletions
diff --git a/vendor/regex-automata/src/hybrid/regex.rs b/vendor/regex-automata/src/hybrid/regex.rs index 7cc6b9064..75667daf9 100644 --- a/vendor/regex-automata/src/hybrid/regex.rs +++ b/vendor/regex-automata/src/hybrid/regex.rs @@ -1,10 +1,10 @@ /*! A lazy DFA backed `Regex`. -This module provides [`Regex`] using lazy DFA. A `Regex` implements convenience -routines you might have come to expect, such as finding a match and iterating -over all non-overlapping matches. This `Regex` type is limited in its -capabilities to what a lazy DFA can provide. Therefore, APIs involving +This module provides a [`Regex`] backed by a lazy DFA. A `Regex` implements +convenience routines you might have come to expect, such as finding a match +and iterating over all non-overlapping matches. This `Regex` type is limited +in its capabilities to what a lazy DFA can provide. Therefore, APIs involving capturing groups, for example, are not provided. Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that @@ -14,20 +14,15 @@ find the start offset of a match. See the [parent module](crate::hybrid) for examples. */ -use core::borrow::Borrow; - -use alloc::boxed::Box; - use crate::{ hybrid::{ dfa::{self, DFA}, error::BuildError, - OverlappingState, }, nfa::thompson, util::{ - matchtypes::{MatchError, MatchKind, MultiMatch}, - prefilter::{self, Prefilter}, + iter, + search::{Anchored, Input, Match, MatchError, MatchKind}, }, }; @@ -42,89 +37,20 @@ use crate::{ /// found by the forward DFA guarantees that the reverse DFA will also find /// a match. /// -/// A `Regex` can also have a prefilter set via the -/// [`set_prefilter`](Regex::set_prefilter) method. By default, no prefilter is -/// enabled. -/// -/// # Earliest vs Leftmost vs Overlapping -/// -/// The search routines exposed on a `Regex` reflect three different ways -/// of searching: -/// -/// * "earliest" means to stop as soon as a match has been detected. -/// * "leftmost" means to continue matching until the underlying -/// automaton cannot advance. This reflects "standard" searching you -/// might be used to in other regex engines. e.g., This permits -/// non-greedy and greedy searching to work as you would expect. -/// * "overlapping" means to find all possible matches, even if they -/// overlap. -/// -/// Generally speaking, when doing an overlapping search, you'll want to -/// build your regex lazy DFAs with [`MatchKind::All`] semantics. Using -/// [`MatchKind::LeftmostFirst`] semantics with overlapping searches is -/// likely to lead to odd behavior since `LeftmostFirst` specifically omits -/// some matches that can never be reported due to its semantics. -/// -/// The following example shows the differences between how these different -/// types of searches impact looking for matches of `[a-z]+` in the -/// haystack `abc`. -/// -/// ``` -/// use regex_automata::{hybrid::{dfa, regex}, MatchKind, MultiMatch}; -/// -/// let pattern = r"[a-z]+"; -/// let haystack = "abc".as_bytes(); -/// -/// // With leftmost-first semantics, we test "earliest" and "leftmost". -/// let re = regex::Builder::new() -/// .dfa(dfa::Config::new().match_kind(MatchKind::LeftmostFirst)) -/// .build(pattern)?; -/// let mut cache = re.create_cache(); -/// -/// // "earliest" searching isn't impacted by greediness -/// let mut it = re.find_earliest_iter(&mut cache, haystack); -/// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); -/// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next()); -/// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next()); -/// assert_eq!(None, it.next()); -/// -/// // "leftmost" searching supports greediness (and non-greediness) -/// let mut it = re.find_leftmost_iter(&mut cache, haystack); -/// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); -/// assert_eq!(None, it.next()); -/// -/// // For overlapping, we want "all" match kind semantics. -/// let re = regex::Builder::new() -/// .dfa(dfa::Config::new().match_kind(MatchKind::All)) -/// .build(pattern)?; -/// let mut cache = re.create_cache(); -/// -/// // In the overlapping search, we find all three possible matches -/// // starting at the beginning of the haystack. -/// let mut it = re.find_overlapping_iter(&mut cache, haystack); -/// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); -/// assert_eq!(Some(MultiMatch::must(0, 0, 2)), it.next()); -/// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); -/// assert_eq!(None, it.next()); -/// -/// # Ok::<(), Box<dyn std::error::Error>>(()) -/// ``` -/// /// # Fallibility /// -/// In non-default configurations, the lazy DFAs generated in this module may -/// return an error during a search. (Currently, the only way this happens is -/// if quit bytes are added, Unicode word boundaries are heuristically enabled, -/// or if the cache is configured to "give up" on a search if it has been -/// cleared too many times. All of these are turned off by default, which means -/// a search can never fail in the default configuration.) For convenience, -/// the main search routines, like [`find_leftmost`](Regex::find_leftmost), -/// will panic if an error occurs. However, if you need to use DFAs which may -/// produce an error at search time, then there are fallible equivalents of -/// all search routines. For example, for `find_leftmost`, its fallible analog -/// is [`try_find_leftmost`](Regex::try_find_leftmost). The routines prefixed -/// with `try_` return `Result<Option<MultiMatch>, MatchError>`, where as the -/// infallible routines simply return `Option<MultiMatch>`. +/// Most of the search routines defined on this type will _panic_ when the +/// underlying search fails. This might be because the DFA gave up because it +/// saw a quit byte, whether configured explicitly or via heuristic Unicode +/// word boundary support, although neither are enabled by default. It might +/// also fail if the underlying DFA determines it isn't making effective use of +/// the cache (which also never happens by default). Or it might fail because +/// an invalid `Input` configuration is given, for example, with an unsupported +/// [`Anchored`] mode. +/// +/// If you need to handle these error cases instead of allowing them to trigger +/// a panic, then the lower level [`Regex::try_search`] provides a fallible API +/// that never panics. /// /// # Example /// @@ -134,28 +60,26 @@ use crate::{ /// across a line boundary. /// /// ``` -/// use regex_automata::{hybrid::{dfa, regex::Regex}, MatchError}; +/// # if cfg!(miri) { return Ok(()); } // miri takes too long +/// use regex_automata::{hybrid::{dfa, regex::Regex}, Input, MatchError}; /// /// let re = Regex::builder() /// .dfa(dfa::Config::new().quit(b'\n', true)) /// .build(r"foo\p{any}+bar")?; /// let mut cache = re.create_cache(); /// -/// let haystack = "foo\nbar".as_bytes(); +/// let input = Input::new("foo\nbar"); /// // Normally this would produce a match, since \p{any} contains '\n'. /// // But since we instructed the automaton to enter a quit state if a /// // '\n' is observed, this produces a match error instead. -/// let expected = MatchError::Quit { byte: 0x0A, offset: 3 }; -/// let got = re.try_find_leftmost(&mut cache, haystack).unwrap_err(); +/// let expected = MatchError::quit(b'\n', 3); +/// let got = re.try_search(&mut cache, &input).unwrap_err(); /// assert_eq!(expected, got); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` #[derive(Debug)] pub struct Regex { - /// An optional prefilter that is passed down to the lazy DFA search - /// routines when present. By default, no prefilter is set. - pre: Option<Box<dyn Prefilter>>, /// The forward lazy DFA. This can only find the end of a match. forward: DFA, /// The reverse lazy DFA. This can only find the start of a match. @@ -169,9 +93,6 @@ pub struct Regex { /// we might wind up finding the "leftmost" starting position of a totally /// different pattern! reverse: DFA, - /// Whether iterators on this type should advance by one codepoint or one - /// byte when an empty match is seen. - utf8: bool, } /// Convenience routines for regex and cache construction. @@ -185,86 +106,49 @@ impl Regex { /// # Example /// /// ``` - /// use regex_automata::{MultiMatch, hybrid::regex::Regex}; + /// use regex_automata::{hybrid::regex::Regex, Match}; /// /// let re = Regex::new("foo[0-9]+bar")?; /// let mut cache = re.create_cache(); /// assert_eq!( - /// Some(MultiMatch::must(0, 3, 14)), - /// re.find_leftmost(&mut cache, b"zzzfoo12345barzzz"), + /// Some(Match::must(0, 3..14)), + /// re.find(&mut cache, "zzzfoo12345barzzz"), /// ); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` + #[cfg(feature = "syntax")] pub fn new(pattern: &str) -> Result<Regex, BuildError> { Regex::builder().build(pattern) } - /// Like `new`, but parses multiple patterns into a single "regex set." + /// Like `new`, but parses multiple patterns into a single "multi regex." /// This similarly uses the default regex configuration. /// /// # Example /// /// ``` - /// use regex_automata::{MultiMatch, hybrid::regex::Regex}; + /// use regex_automata::{hybrid::regex::Regex, Match}; /// /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?; /// let mut cache = re.create_cache(); /// - /// let mut it = re.find_leftmost_iter( - /// &mut cache, - /// b"abc 1 foo 4567 0 quux", - /// ); - /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); - /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next()); - /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next()); - /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next()); + /// let mut it = re.find_iter(&mut cache, "abc 1 foo 4567 0 quux"); + /// assert_eq!(Some(Match::must(0, 0..3)), it.next()); + /// assert_eq!(Some(Match::must(1, 4..5)), it.next()); + /// assert_eq!(Some(Match::must(0, 6..9)), it.next()); + /// assert_eq!(Some(Match::must(1, 10..14)), it.next()); + /// assert_eq!(Some(Match::must(1, 15..16)), it.next()); + /// assert_eq!(Some(Match::must(0, 17..21)), it.next()); /// assert_eq!(None, it.next()); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` + #[cfg(feature = "syntax")] pub fn new_many<P: AsRef<str>>( patterns: &[P], ) -> Result<Regex, BuildError> { Regex::builder().build_many(patterns) } - /// Return a default configuration for a `Regex`. - /// - /// This is a convenience routine to avoid needing to import the `Config` - /// type when customizing the construction of a regex. - /// - /// # Example - /// - /// This example shows how to disable UTF-8 mode for `Regex` iteration. - /// When UTF-8 mode is disabled, the position immediately following an - /// empty match is where the next search begins, instead of the next - /// position of a UTF-8 encoded codepoint. - /// - /// ``` - /// use regex_automata::{hybrid::regex::Regex, MultiMatch}; - /// - /// let re = Regex::builder() - /// .configure(Regex::config().utf8(false)) - /// .build(r"")?; - /// let mut cache = re.create_cache(); - /// - /// let haystack = "a☃z".as_bytes(); - /// let mut it = re.find_leftmost_iter(&mut cache, haystack); - /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); - /// assert_eq!(None, it.next()); - /// - /// # Ok::<(), Box<dyn std::error::Error>>(()) - /// ``` - pub fn config() -> Config { - Config::new() - } - /// Return a builder for configuring the construction of a `Regex`. /// /// This is a convenience routine to avoid needing to import the @@ -276,22 +160,20 @@ impl Regex { /// everywhere. /// /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{ - /// hybrid::regex::Regex, - /// nfa::thompson, - /// MultiMatch, SyntaxConfig, + /// hybrid::regex::Regex, nfa::thompson, util::syntax, Match, /// }; /// /// let re = Regex::builder() - /// .configure(Regex::config().utf8(false)) - /// .syntax(SyntaxConfig::new().utf8(false)) + /// .syntax(syntax::Config::new().utf8(false)) /// .thompson(thompson::Config::new().utf8(false)) /// .build(r"foo(?-u:[^b])ar.*")?; /// let mut cache = re.create_cache(); /// /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; - /// let expected = Some(MultiMatch::must(0, 1, 9)); - /// let got = re.find_leftmost(&mut cache, haystack); + /// let expected = Some(Match::must(0, 1..9)); + /// let got = re.find(&mut cache, haystack); /// assert_eq!(expected, got); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) @@ -325,15 +207,16 @@ impl Regex { /// This shows how to re-purpose a cache for use with a different `Regex`. /// /// ``` - /// use regex_automata::{hybrid::regex::Regex, MultiMatch}; + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{hybrid::regex::Regex, Match}; /// /// let re1 = Regex::new(r"\w")?; /// let re2 = Regex::new(r"\W")?; /// /// let mut cache = re1.create_cache(); /// assert_eq!( - /// Some(MultiMatch::must(0, 0, 2)), - /// re1.find_leftmost(&mut cache, "Δ".as_bytes()), + /// Some(Match::must(0, 0..2)), + /// re1.find(&mut cache, "Δ"), /// ); /// /// // Using 'cache' with re2 is not allowed. It may result in panics or @@ -344,8 +227,8 @@ impl Regex { /// // allowed. /// re2.reset_cache(&mut cache); /// assert_eq!( - /// Some(MultiMatch::must(0, 0, 3)), - /// re2.find_leftmost(&mut cache, "☃".as_bytes()), + /// Some(Match::must(0, 0..3)), + /// re2.find(&mut cache, "☃"), /// ); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) @@ -367,78 +250,49 @@ impl Regex { /// /// # Panics /// - /// If the underlying lazy DFAs return an error, then this routine panics. - /// This only occurs in non-default configurations where quit bytes are - /// used, Unicode word boundaries are heuristically enabled or limits are - /// set on the number of times the lazy DFA's cache may be cleared. - /// - /// The fallible version of this routine is - /// [`try_is_match`](Regex::try_is_match). - /// - /// # Example - /// - /// ``` - /// use regex_automata::hybrid::regex::Regex; - /// - /// let re = Regex::new("foo[0-9]+bar")?; - /// let mut cache = re.create_cache(); + /// This routine panics if the search could not complete. This can occur + /// in a number of circumstances: /// - /// assert_eq!(true, re.is_match(&mut cache, b"foo12345bar")); - /// assert_eq!(false, re.is_match(&mut cache, b"foobar")); - /// # Ok::<(), Box<dyn std::error::Error>>(()) - /// ``` - pub fn is_match(&self, cache: &mut Cache, haystack: &[u8]) -> bool { - self.try_is_match(cache, haystack).unwrap() - } - - /// Returns the first position at which a match is found. + /// * The configuration of the lazy DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the lazy DFA quitting. + /// * The configuration of the lazy DFA may also permit it to "give up" + /// on a search if it makes ineffective use of its transition table + /// cache. The default configuration does not enable this by default, + /// although it is typically a good idea to. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. /// - /// This routine stops scanning input in precisely the same circumstances - /// as `is_match`. The key difference is that this routine returns the - /// position at which it stopped scanning input if and only if a match - /// was found. If no match is found, then `None` is returned. + /// When a search panics, callers cannot know whether a match exists or + /// not. /// - /// # Panics - /// - /// If the underlying lazy DFAs return an error, then this routine panics. - /// This only occurs in non-default configurations where quit bytes are - /// used, Unicode word boundaries are heuristically enabled or limits are - /// set on the number of times the lazy DFA's cache may be cleared. - /// - /// The fallible version of this routine is - /// [`try_find_earliest`](Regex::try_find_earliest). + /// Use [`Regex::try_search`] if you want to handle these error conditions. /// /// # Example /// /// ``` - /// use regex_automata::{MultiMatch, hybrid::regex::Regex}; + /// use regex_automata::hybrid::regex::Regex; /// - /// // Normally, the leftmost first match would greedily consume as many - /// // decimal digits as it could. But a match is detected as soon as one - /// // digit is seen. - /// let re = Regex::new("foo[0-9]+")?; + /// let re = Regex::new("foo[0-9]+bar")?; /// let mut cache = re.create_cache(); - /// assert_eq!( - /// Some(MultiMatch::must(0, 0, 4)), - /// re.find_earliest(&mut cache, b"foo12345"), - /// ); /// - /// // Normally, the end of the leftmost first match here would be 3, - /// // but the "earliest" match semantics detect a match earlier. - /// let re = Regex::new("abc|a")?; - /// let mut cache = re.create_cache(); - /// assert_eq!( - /// Some(MultiMatch::must(0, 0, 1)), - /// re.find_earliest(&mut cache, b"abc"), - /// ); + /// assert!(re.is_match(&mut cache, "foo12345bar")); + /// assert!(!re.is_match(&mut cache, "foobar")); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` - pub fn find_earliest( + #[inline] + pub fn is_match<'h, I: Into<Input<'h>>>( &self, cache: &mut Cache, - haystack: &[u8], - ) -> Option<MultiMatch> { - self.try_find_earliest(cache, haystack).unwrap() + input: I, + ) -> bool { + // Not only can we do an "earliest" search, but we can avoid doing a + // reverse scan too. + self.forward() + .try_search_fwd(&mut cache.forward, &input.into().earliest(true)) + .unwrap() + .is_some() } /// Returns the start and end offset of the leftmost match. If no match @@ -446,25 +300,35 @@ impl Regex { /// /// # Panics /// - /// If the underlying lazy DFAs return an error, then this routine panics. - /// This only occurs in non-default configurations where quit bytes are - /// used, Unicode word boundaries are heuristically enabled or limits are - /// set on the number of times the lazy DFA's cache may be cleared. + /// This routine panics if the search could not complete. This can occur + /// in a number of circumstances: + /// + /// * The configuration of the lazy DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the lazy DFA quitting. + /// * The configuration of the lazy DFA may also permit it to "give up" + /// on a search if it makes ineffective use of its transition table + /// cache. The default configuration does not enable this by default, + /// although it is typically a good idea to. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. /// - /// The fallible version of this routine is - /// [`try_find_leftmost`](Regex::try_find_leftmost). + /// When a search panics, callers cannot know whether a match exists or + /// not. + /// + /// Use [`Regex::try_search`] if you want to handle these error conditions. /// /// # Example /// /// ``` - /// use regex_automata::{MultiMatch, hybrid::regex::Regex}; + /// use regex_automata::{Match, hybrid::regex::Regex}; /// - /// // Greediness is applied appropriately when compared to find_earliest. /// let re = Regex::new("foo[0-9]+")?; /// let mut cache = re.create_cache(); /// assert_eq!( - /// Some(MultiMatch::must(0, 3, 11)), - /// re.find_leftmost(&mut cache, b"zzzfoo12345zzz"), + /// Some(Match::must(0, 3..11)), + /// re.find(&mut cache, "zzzfoo12345zzz"), /// ); /// /// // Even though a match is found after reading the first byte (`a`), @@ -473,892 +337,182 @@ impl Regex { /// // parts. /// let re = Regex::new("abc|a")?; /// let mut cache = re.create_cache(); - /// assert_eq!( - /// Some(MultiMatch::must(0, 0, 3)), - /// re.find_leftmost(&mut cache, b"abc"), - /// ); + /// assert_eq!(Some(Match::must(0, 0..3)), re.find(&mut cache, "abc")); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` - pub fn find_leftmost( + #[inline] + pub fn find<'h, I: Into<Input<'h>>>( &self, cache: &mut Cache, - haystack: &[u8], - ) -> Option<MultiMatch> { - self.try_find_leftmost(cache, haystack).unwrap() + input: I, + ) -> Option<Match> { + self.try_search(cache, &input.into()).unwrap() } - /// Search for the first overlapping match in `haystack`. - /// - /// This routine is principally useful when searching for multiple patterns - /// on inputs where multiple patterns may match the same regions of text. - /// In particular, callers must preserve the automaton's search state from - /// prior calls so that the implementation knows where the last match - /// occurred and which pattern was reported. - /// - /// # Panics - /// - /// If the underlying lazy DFAs return an error, then this routine panics. - /// This only occurs in non-default configurations where quit bytes are - /// used, Unicode word boundaries are heuristically enabled or limits are - /// set on the number of times the lazy DFA's cache may be cleared. - /// - /// The fallible version of this routine is - /// [`try_find_overlapping`](Regex::try_find_overlapping). - /// - /// # Example - /// - /// This example shows how to run an overlapping search with multiple - /// regexes. - /// - /// ``` - /// use regex_automata::{ - /// hybrid::{dfa::DFA, regex::Regex, OverlappingState}, - /// MatchKind, - /// MultiMatch, - /// }; - /// - /// let re = Regex::builder() - /// .dfa(DFA::config().match_kind(MatchKind::All)) - /// .build_many(&[r"\w+$", r"\S+$"])?; - /// let mut cache = re.create_cache(); - /// - /// let haystack = "@foo".as_bytes(); - /// let mut state = OverlappingState::start(); - /// - /// let expected = Some(MultiMatch::must(1, 0, 4)); - /// let got = re.find_overlapping(&mut cache, haystack, &mut state); - /// assert_eq!(expected, got); - /// - /// // The first pattern also matches at the same position, so re-running - /// // the search will yield another match. Notice also that the first - /// // pattern is returned after the second. This is because the second - /// // pattern begins its match before the first, is therefore an earlier - /// // match and is thus reported first. - /// let expected = Some(MultiMatch::must(0, 1, 4)); - /// let got = re.find_overlapping(&mut cache, haystack, &mut state); - /// assert_eq!(expected, got); - /// - /// # Ok::<(), Box<dyn std::error::Error>>(()) - /// ``` - pub fn find_overlapping( - &self, - cache: &mut Cache, - haystack: &[u8], - state: &mut OverlappingState, - ) -> Option<MultiMatch> { - self.try_find_overlapping(cache, haystack, state).unwrap() - } - - /// Returns an iterator over all non-overlapping "earliest" matches. - /// - /// Match positions are reported as soon as a match is known to occur, even - /// if the standard leftmost match would be longer. + /// Returns an iterator over all non-overlapping leftmost matches in the + /// given bytes. If no match exists, then the iterator yields no elements. /// /// # Panics /// - /// If the underlying lazy DFAs return an error, then this routine panics. - /// This only occurs in non-default configurations where quit bytes are - /// used, Unicode word boundaries are heuristically enabled or limits are - /// set on the number of times the lazy DFA's cache may be cleared. - /// - /// The fallible version of this routine is - /// [`try_find_earliest_iter`](Regex::try_find_earliest_iter). - /// - /// # Example - /// - /// This example shows how to run an "earliest" iterator. - /// - /// ``` - /// use regex_automata::{hybrid::regex::Regex, MultiMatch}; - /// - /// let re = Regex::new("[0-9]+")?; - /// let mut cache = re.create_cache(); - /// let haystack = "123".as_bytes(); - /// - /// // Normally, a standard leftmost iterator would return a single - /// // match, but since "earliest" detects matches earlier, we get - /// // three matches. - /// let mut it = re.find_earliest_iter(&mut cache, haystack); - /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next()); - /// assert_eq!(None, it.next()); - /// - /// # Ok::<(), Box<dyn std::error::Error>>(()) - /// ``` - pub fn find_earliest_iter<'r, 'c, 't>( - &'r self, - cache: &'c mut Cache, - haystack: &'t [u8], - ) -> FindEarliestMatches<'r, 'c, 't> { - FindEarliestMatches::new(self, cache, haystack) - } - - /// Returns an iterator over all non-overlapping leftmost matches in the - /// given bytes. If no match exists, then the iterator yields no elements. + /// This routine panics if the search could not complete. This can occur + /// in a number of circumstances: /// - /// This corresponds to the "standard" regex search iterator. + /// * The configuration of the lazy DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the lazy DFA quitting. + /// * The configuration of the lazy DFA may also permit it to "give up" + /// on a search if it makes ineffective use of its transition table + /// cache. The default configuration does not enable this by default, + /// although it is typically a good idea to. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. /// - /// # Panics + /// When a search panics, callers cannot know whether a match exists or + /// not. /// - /// If the underlying lazy DFAs return an error, then this routine panics. - /// This only occurs in non-default configurations where quit bytes are - /// used, Unicode word boundaries are heuristically enabled or limits are - /// set on the number of times the lazy DFA's cache may be cleared. + /// The above conditions also apply to the iterator returned as well. For + /// example, if the lazy DFA gives up or quits during a search using this + /// method, then a panic will occur during iteration. /// - /// The fallible version of this routine is - /// [`try_find_leftmost_iter`](Regex::try_find_leftmost_iter). + /// Use [`Regex::try_search`] with [`util::iter::Searcher`](iter::Searcher) + /// if you want to handle these error conditions. /// /// # Example /// /// ``` - /// use regex_automata::{MultiMatch, hybrid::regex::Regex}; + /// use regex_automata::{hybrid::regex::Regex, Match}; /// /// let re = Regex::new("foo[0-9]+")?; /// let mut cache = re.create_cache(); /// - /// let text = b"foo1 foo12 foo123"; - /// let matches: Vec<MultiMatch> = re - /// .find_leftmost_iter(&mut cache, text) - /// .collect(); + /// let text = "foo1 foo12 foo123"; + /// let matches: Vec<Match> = re.find_iter(&mut cache, text).collect(); /// assert_eq!(matches, vec![ - /// MultiMatch::must(0, 0, 4), - /// MultiMatch::must(0, 5, 10), - /// MultiMatch::must(0, 11, 17), + /// Match::must(0, 0..4), + /// Match::must(0, 5..10), + /// Match::must(0, 11..17), /// ]); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` - pub fn find_leftmost_iter<'r, 'c, 't>( - &'r self, - cache: &'c mut Cache, - haystack: &'t [u8], - ) -> FindLeftmostMatches<'r, 'c, 't> { - FindLeftmostMatches::new(self, cache, haystack) - } - - /// Returns an iterator over all overlapping matches in the given haystack. - /// - /// This routine is principally useful when searching for multiple patterns - /// on inputs where multiple patterns may match the same regions of text. - /// The iterator takes care of handling the overlapping state that must be - /// threaded through every search. - /// - /// # Panics - /// - /// If the underlying lazy DFAs return an error, then this routine panics. - /// This only occurs in non-default configurations where quit bytes are - /// used, Unicode word boundaries are heuristically enabled or limits are - /// set on the number of times the lazy DFA's cache may be cleared. - /// - /// The fallible version of this routine is - /// [`try_find_overlapping_iter`](Regex::try_find_overlapping_iter). - /// - /// # Example - /// - /// This example shows how to run an overlapping search with multiple - /// regexes. - /// - /// ``` - /// use regex_automata::{ - /// hybrid::{dfa::DFA, regex::Regex}, - /// MatchKind, - /// MultiMatch, - /// }; - /// - /// let re = Regex::builder() - /// .dfa(DFA::config().match_kind(MatchKind::All)) - /// .build_many(&[r"\w+$", r"\S+$"])?; - /// let mut cache = re.create_cache(); - /// let haystack = "@foo".as_bytes(); - /// - /// let mut it = re.find_overlapping_iter(&mut cache, haystack); - /// assert_eq!(Some(MultiMatch::must(1, 0, 4)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 1, 4)), it.next()); - /// assert_eq!(None, it.next()); - /// - /// # Ok::<(), Box<dyn std::error::Error>>(()) - /// ``` - pub fn find_overlapping_iter<'r, 'c, 't>( - &'r self, - cache: &'c mut Cache, - haystack: &'t [u8], - ) -> FindOverlappingMatches<'r, 'c, 't> { - FindOverlappingMatches::new(self, cache, haystack) - } -} - -/// Lower level infallible search routines that permit controlling where -/// the search starts and ends in a particular sequence. This is useful for -/// executing searches that need to take surrounding context into account. This -/// is required for correctly implementing iteration because of look-around -/// operators (`^`, `$`, `\b`). -impl Regex { - /// Returns true if and only if this regex matches the given haystack. - /// - /// This routine may short circuit if it knows that scanning future input - /// will never lead to a different result. In particular, if the underlying - /// DFA enters a match state or a dead state, then this routine will return - /// `true` or `false`, respectively, without inspecting any future input. - /// - /// # Searching a substring of the haystack - /// - /// Being an "at" search routine, this permits callers to search a - /// substring of `haystack` by specifying a range in `haystack`. - /// Why expose this as an API instead of just asking callers to use - /// `&input[start..end]`? The reason is that regex matching often wants - /// to take the surrounding context into account in order to handle - /// look-around (`^`, `$` and `\b`). - /// - /// # Panics - /// - /// If the underlying lazy DFAs return an error, then this routine panics. - /// This only occurs in non-default configurations where quit bytes are - /// used, Unicode word boundaries are heuristically enabled or limits are - /// set on the number of times the lazy DFA's cache may be cleared. - /// - /// The fallible version of this routine is - /// [`try_is_match_at`](Regex::try_is_match_at). - pub fn is_match_at( - &self, - cache: &mut Cache, - haystack: &[u8], - start: usize, - end: usize, - ) -> bool { - self.try_is_match_at(cache, haystack, start, end).unwrap() - } - - /// Returns the first position at which a match is found. - /// - /// This routine stops scanning input in precisely the same circumstances - /// as `is_match`. The key difference is that this routine returns the - /// position at which it stopped scanning input if and only if a match - /// was found. If no match is found, then `None` is returned. - /// - /// # Searching a substring of the haystack - /// - /// Being an "at" search routine, this permits callers to search a - /// substring of `haystack` by specifying a range in `haystack`. - /// Why expose this as an API instead of just asking callers to use - /// `&input[start..end]`? The reason is that regex matching often wants - /// to take the surrounding context into account in order to handle - /// look-around (`^`, `$` and `\b`). - /// - /// This is useful when implementing an iterator over matches - /// within the same haystack, which cannot be done correctly by simply - /// providing a subslice of `haystack`. - /// - /// # Panics - /// - /// If the underlying lazy DFAs return an error, then this routine panics. - /// This only occurs in non-default configurations where quit bytes are - /// used, Unicode word boundaries are heuristically enabled or limits are - /// set on the number of times the lazy DFA's cache may be cleared. - /// - /// The fallible version of this routine is - /// [`try_find_earliest_at`](Regex::try_find_earliest_at). - pub fn find_earliest_at( - &self, - cache: &mut Cache, - haystack: &[u8], - start: usize, - end: usize, - ) -> Option<MultiMatch> { - self.try_find_earliest_at(cache, haystack, start, end).unwrap() - } - - /// Returns the same as `find_leftmost`, but starts the search at the given - /// offset. - /// - /// # Searching a substring of the haystack - /// - /// Being an "at" search routine, this permits callers to search a - /// substring of `haystack` by specifying a range in `haystack`. - /// Why expose this as an API instead of just asking callers to use - /// `&input[start..end]`? The reason is that regex matching often wants - /// to take the surrounding context into account in order to handle - /// look-around (`^`, `$` and `\b`). - /// - /// This is useful when implementing an iterator over matches within the - /// same haystack, which cannot be done correctly by simply providing a - /// subslice of `haystack`. - /// - /// # Panics - /// - /// If the underlying lazy DFAs return an error, then this routine panics. - /// This only occurs in non-default configurations where quit bytes are - /// used, Unicode word boundaries are heuristically enabled or limits are - /// set on the number of times the lazy DFA's cache may be cleared. - /// - /// The fallible version of this routine is - /// [`try_find_leftmost_at`](Regex::try_find_leftmost_at). - pub fn find_leftmost_at( - &self, - cache: &mut Cache, - haystack: &[u8], - start: usize, - end: usize, - ) -> Option<MultiMatch> { - self.try_find_leftmost_at(cache, haystack, start, end).unwrap() - } - - /// Search for the first overlapping match within a given range of - /// `haystack`. - /// - /// This routine is principally useful when searching for multiple patterns - /// on inputs where multiple patterns may match the same regions of text. - /// In particular, callers must preserve the automaton's search state from - /// prior calls so that the implementation knows where the last match - /// occurred and which pattern was reported. - /// - /// # Searching a substring of the haystack - /// - /// Being an "at" search routine, this permits callers to search a - /// substring of `haystack` by specifying a range in `haystack`. - /// Why expose this as an API instead of just asking callers to use - /// `&input[start..end]`? The reason is that regex matching often wants - /// to take the surrounding context into account in order to handle - /// look-around (`^`, `$` and `\b`). - /// - /// This is useful when implementing an iterator over matches - /// within the same haystack, which cannot be done correctly by simply - /// providing a subslice of `haystack`. - /// - /// # Panics - /// - /// If the underlying lazy DFAs return an error, then this routine panics. - /// This only occurs in non-default configurations where quit bytes are - /// used, Unicode word boundaries are heuristically enabled or limits are - /// set on the number of times the lazy DFA's cache may be cleared. - /// - /// The fallible version of this routine is - /// [`try_find_overlapping_at`](Regex::try_find_overlapping_at). - pub fn find_overlapping_at( - &self, - cache: &mut Cache, - haystack: &[u8], - start: usize, - end: usize, - state: &mut OverlappingState, - ) -> Option<MultiMatch> { - self.try_find_overlapping_at(cache, haystack, start, end, state) - .unwrap() - } -} - -/// Fallible search routines. These may return an error when the underlying -/// lazy DFAs have been configured in a way that permits them to fail during a -/// search. -/// -/// Errors during search only occur when the lazy DFA has been explicitly -/// configured to do so, usually by specifying one or more "quit" bytes or by -/// heuristically enabling Unicode word boundaries. -/// -/// Errors will never be returned using the default configuration. So these -/// fallible routines are only needed for particular configurations. -impl Regex { - /// Returns true if and only if this regex matches the given haystack. - /// - /// This routine may short circuit if it knows that scanning future input - /// will never lead to a different result. In particular, if the underlying - /// DFA enters a match state or a dead state, then this routine will return - /// `true` or `false`, respectively, without inspecting any future input. - /// - /// # Errors - /// - /// This routine only errors if the search could not complete. For - /// DFA-based regexes, this only occurs in a non-default configuration - /// where quit bytes are used, Unicode word boundaries are heuristically - /// enabled or limits are set on the number of times the lazy DFA's cache - /// may be cleared. - /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// The infallible (panics on error) version of this routine is - /// [`is_match`](Regex::is_match). - pub fn try_is_match( - &self, - cache: &mut Cache, - haystack: &[u8], - ) -> Result<bool, MatchError> { - self.try_is_match_at(cache, haystack, 0, haystack.len()) - } - - /// Returns the first position at which a match is found. - /// - /// This routine stops scanning input in precisely the same circumstances - /// as `is_match`. The key difference is that this routine returns the - /// position at which it stopped scanning input if and only if a match - /// was found. If no match is found, then `None` is returned. - /// - /// # Errors - /// - /// This routine only errors if the search could not complete. For - /// DFA-based regexes, this only occurs in a non-default configuration - /// where quit bytes are used, Unicode word boundaries are heuristically - /// enabled or limits are set on the number of times the lazy DFA's cache - /// may be cleared. - /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// The infallible (panics on error) version of this routine is - /// [`find_earliest`](Regex::find_earliest). - pub fn try_find_earliest( - &self, - cache: &mut Cache, - haystack: &[u8], - ) -> Result<Option<MultiMatch>, MatchError> { - self.try_find_earliest_at(cache, haystack, 0, haystack.len()) - } - - /// Returns the start and end offset of the leftmost match. If no match - /// exists, then `None` is returned. - /// - /// # Errors - /// - /// This routine only errors if the search could not complete. For - /// DFA-based regexes, this only occurs in a non-default configuration - /// where quit bytes are used, Unicode word boundaries are heuristically - /// enabled or limits are set on the number of times the lazy DFA's cache - /// may be cleared. - /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// The infallible (panics on error) version of this routine is - /// [`find_leftmost`](Regex::find_leftmost). - pub fn try_find_leftmost( - &self, - cache: &mut Cache, - haystack: &[u8], - ) -> Result<Option<MultiMatch>, MatchError> { - self.try_find_leftmost_at(cache, haystack, 0, haystack.len()) - } - - /// Search for the first overlapping match in `haystack`. - /// - /// This routine is principally useful when searching for multiple patterns - /// on inputs where multiple patterns may match the same regions of text. - /// In particular, callers must preserve the automaton's search state from - /// prior calls so that the implementation knows where the last match - /// occurred and which pattern was reported. - /// - /// # Errors - /// - /// This routine only errors if the search could not complete. For - /// DFA-based regexes, this only occurs in a non-default configuration - /// where quit bytes are used, Unicode word boundaries are heuristically - /// enabled or limits are set on the number of times the lazy DFA's cache - /// may be cleared. - /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// The infallible (panics on error) version of this routine is - /// [`find_overlapping`](Regex::find_overlapping). - pub fn try_find_overlapping( - &self, - cache: &mut Cache, - haystack: &[u8], - state: &mut OverlappingState, - ) -> Result<Option<MultiMatch>, MatchError> { - self.try_find_overlapping_at(cache, haystack, 0, haystack.len(), state) - } - - /// Returns an iterator over all non-overlapping "earliest" matches. - /// - /// Match positions are reported as soon as a match is known to occur, even - /// if the standard leftmost match would be longer. - /// - /// # Errors - /// - /// This iterator only yields errors if the search could not complete. For - /// DFA-based regexes, this only occurs in a non-default configuration - /// where quit bytes are used, Unicode word boundaries are heuristically - /// enabled or limits are set on the number of times the lazy DFA's cache - /// may be cleared. - /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// The infallible (panics on error) version of this routine is - /// [`find_earliest_iter`](Regex::find_earliest_iter). - pub fn try_find_earliest_iter<'r, 'c, 't>( - &'r self, - cache: &'c mut Cache, - haystack: &'t [u8], - ) -> TryFindEarliestMatches<'r, 'c, 't> { - TryFindEarliestMatches::new(self, cache, haystack) - } - - /// Returns an iterator over all non-overlapping leftmost matches in the - /// given bytes. If no match exists, then the iterator yields no elements. - /// - /// This corresponds to the "standard" regex search iterator. - /// - /// # Errors - /// - /// This iterator only yields errors if the search could not complete. For - /// DFA-based regexes, this only occurs in a non-default configuration - /// where quit bytes are used, Unicode word boundaries are heuristically - /// enabled or limits are set on the number of times the lazy DFA's cache - /// may be cleared. - /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// The infallible (panics on error) version of this routine is - /// [`find_leftmost_iter`](Regex::find_leftmost_iter). - pub fn try_find_leftmost_iter<'r, 'c, 't>( + #[inline] + pub fn find_iter<'r, 'c, 'h, I: Into<Input<'h>>>( &'r self, cache: &'c mut Cache, - haystack: &'t [u8], - ) -> TryFindLeftmostMatches<'r, 'c, 't> { - TryFindLeftmostMatches::new(self, cache, haystack) - } - - /// Returns an iterator over all overlapping matches in the given haystack. - /// - /// This routine is principally useful when searching for multiple patterns - /// on inputs where multiple patterns may match the same regions of text. - /// The iterator takes care of handling the overlapping state that must be - /// threaded through every search. - /// - /// # Errors - /// - /// This iterator only yields errors if the search could not complete. For - /// DFA-based regexes, this only occurs in a non-default configuration - /// where quit bytes are used, Unicode word boundaries are heuristically - /// enabled or limits are set on the number of times the lazy DFA's cache - /// may be cleared. - /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// The infallible (panics on error) version of this routine is - /// [`find_overlapping_iter`](Regex::find_overlapping_iter). - pub fn try_find_overlapping_iter<'r, 'c, 't>( - &'r self, - cache: &'c mut Cache, - haystack: &'t [u8], - ) -> TryFindOverlappingMatches<'r, 'c, 't> { - TryFindOverlappingMatches::new(self, cache, haystack) + input: I, + ) -> FindMatches<'r, 'c, 'h> { + let it = iter::Searcher::new(input.into()); + FindMatches { re: self, cache, it } } } -/// Lower level fallible search routines that permit controlling where the -/// search starts and ends in a particular sequence. +/// Lower level "search" primitives that accept a `&Input` for cheap reuse +/// and return an error if one occurs instead of panicking. impl Regex { - /// Returns true if and only if this regex matches the given haystack. - /// - /// This routine may short circuit if it knows that scanning future input - /// will never lead to a different result. In particular, if the underlying - /// DFA enters a match state or a dead state, then this routine will return - /// `true` or `false`, respectively, without inspecting any future input. - /// - /// # Searching a substring of the haystack - /// - /// Being an "at" search routine, this permits callers to search a - /// substring of `haystack` by specifying a range in `haystack`. - /// Why expose this as an API instead of just asking callers to use - /// `&input[start..end]`? The reason is that regex matching often wants - /// to take the surrounding context into account in order to handle - /// look-around (`^`, `$` and `\b`). - /// - /// # Errors - /// - /// This routine only errors if the search could not complete. For - /// DFA-based regexes, this only occurs in a non-default configuration - /// where quit bytes are used, Unicode word boundaries are heuristically - /// enabled or limits are set on the number of times the lazy DFA's cache - /// may be cleared. - /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// The infallible (panics on error) version of this routine is - /// [`is_match_at`](Regex::is_match_at). - pub fn try_is_match_at( - &self, - cache: &mut Cache, - haystack: &[u8], - start: usize, - end: usize, - ) -> Result<bool, MatchError> { - self.forward() - .find_leftmost_fwd_at( - &mut cache.forward, - self.scanner().as_mut(), - None, - haystack, - start, - end, - ) - .map(|x| x.is_some()) - } - - /// Returns the first position at which a match is found. - /// - /// This routine stops scanning input in precisely the same circumstances - /// as `is_match`. The key difference is that this routine returns the - /// position at which it stopped scanning input if and only if a match - /// was found. If no match is found, then `None` is returned. - /// - /// # Searching a substring of the haystack - /// - /// Being an "at" search routine, this permits callers to search a - /// substring of `haystack` by specifying a range in `haystack`. - /// Why expose this as an API instead of just asking callers to use - /// `&input[start..end]`? The reason is that regex matching often wants - /// to take the surrounding context into account in order to handle - /// look-around (`^`, `$` and `\b`). - /// - /// This is useful when implementing an iterator over matches - /// within the same haystack, which cannot be done correctly by simply - /// providing a subslice of `haystack`. - /// - /// # Errors - /// - /// This routine only errors if the search could not complete. For - /// DFA-based regexes, this only occurs in a non-default configuration - /// where quit bytes are used, Unicode word boundaries are heuristically - /// enabled or limits are set on the number of times the lazy DFA's cache - /// may be cleared. - /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// The infallible (panics on error) version of this routine is - /// [`find_earliest_at`](Regex::find_earliest_at). - pub fn try_find_earliest_at( - &self, - cache: &mut Cache, - haystack: &[u8], - start: usize, - end: usize, - ) -> Result<Option<MultiMatch>, MatchError> { - self.try_find_earliest_at_imp( - self.scanner().as_mut(), - cache, - haystack, - start, - end, - ) - } - /// Returns the start and end offset of the leftmost match. If no match /// exists, then `None` is returned. /// - /// # Searching a substring of the haystack + /// This is like [`Regex::find`] but with two differences: /// - /// Being an "at" search routine, this permits callers to search a - /// substring of `haystack` by specifying a range in `haystack`. - /// Why expose this as an API instead of just asking callers to use - /// `&input[start..end]`? The reason is that regex matching often wants - /// to take the surrounding context into account in order to handle - /// look-around (`^`, `$` and `\b`). - /// - /// This is useful when implementing an iterator over matches - /// within the same haystack, which cannot be done correctly by simply - /// providing a subslice of `haystack`. + /// 1. It is not generic over `Into<Input>` and instead accepts a + /// `&Input`. This permits reusing the same `Input` for multiple searches + /// without needing to create a new one. This _may_ help with latency. + /// 2. It returns an error if the search could not complete where as + /// [`Regex::find`] will panic. /// /// # Errors /// - /// This routine only errors if the search could not complete. For - /// DFA-based regexes, this only occurs in a non-default configuration - /// where quit bytes are used, Unicode word boundaries are heuristically - /// enabled or limits are set on the number of times the lazy DFA's cache - /// may be cleared. - /// - /// When a search cannot complete, callers cannot know whether a match + /// This routine errors if the search could not complete. This can occur + /// in a number of circumstances: + /// + /// * The configuration of the lazy DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the lazy DFA quitting. + /// * The configuration of the lazy DFA may also permit it to "give up" + /// on a search if it makes ineffective use of its transition table + /// cache. The default configuration does not enable this by default, + /// although it is typically a good idea to. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. + /// + /// When a search returns an error, callers cannot know whether a match /// exists or not. - /// - /// The infallible (panics on error) version of this routine is - /// [`find_leftmost_at`](Regex::find_leftmost_at). - pub fn try_find_leftmost_at( + #[inline] + pub fn try_search( &self, cache: &mut Cache, - haystack: &[u8], - start: usize, - end: usize, - ) -> Result<Option<MultiMatch>, MatchError> { - self.try_find_leftmost_at_imp( - self.scanner().as_mut(), - cache, - haystack, - start, - end, - ) - } - - /// Search for the first overlapping match within a given range of - /// `haystack`. - /// - /// This routine is principally useful when searching for multiple patterns - /// on inputs where multiple patterns may match the same regions of text. - /// In particular, callers must preserve the automaton's search state from - /// prior calls so that the implementation knows where the last match - /// occurred and which pattern was reported. - /// - /// # Searching a substring of the haystack - /// - /// Being an "at" search routine, this permits callers to search a - /// substring of `haystack` by specifying a range in `haystack`. - /// Why expose this as an API instead of just asking callers to use - /// `&input[start..end]`? The reason is that regex matching often wants - /// to take the surrounding context into account in order to handle - /// look-around (`^`, `$` and `\b`). - /// - /// This is useful when implementing an iterator over matches - /// within the same haystack, which cannot be done correctly by simply - /// providing a subslice of `haystack`. - /// - /// # Errors - /// - /// This routine only errors if the search could not complete. For - /// DFA-based regexes, this only occurs in a non-default configuration - /// where quit bytes are used, Unicode word boundaries are heuristically - /// enabled or limits are set on the number of times the lazy DFA's cache - /// may be cleared. - /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// The infallible (panics on error) version of this routine is - /// [`find_overlapping_at`](Regex::find_overlapping_at). - pub fn try_find_overlapping_at( - &self, - cache: &mut Cache, - haystack: &[u8], - start: usize, - end: usize, - state: &mut OverlappingState, - ) -> Result<Option<MultiMatch>, MatchError> { - self.try_find_overlapping_at_imp( - self.scanner().as_mut(), - cache, - haystack, - start, - end, - state, - ) - } -} - -impl Regex { - #[inline(always)] - fn try_find_earliest_at_imp( - &self, - pre: Option<&mut prefilter::Scanner>, - cache: &mut Cache, - haystack: &[u8], - start: usize, - end: usize, - ) -> Result<Option<MultiMatch>, MatchError> { - let (fdfa, rdfa) = (self.forward(), self.reverse()); + input: &Input<'_>, + ) -> Result<Option<Match>, MatchError> { let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse); - let end = match fdfa - .find_earliest_fwd_at(fcache, pre, None, haystack, start, end)? - { + let end = match self.forward().try_search_fwd(fcache, input)? { None => return Ok(None), Some(end) => end, }; - // N.B. The only time we need to tell the reverse searcher the pattern - // to match is in the overlapping case, since it's ambiguous. In the - // earliest case, I have tentatively convinced myself that it isn't - // necessary and the reverse search will always find the same pattern - // to match as the forward search. But I lack a rigorous proof. Why not - // just provide the pattern anyway? Well, if it is needed, then leaving - // it out gives us a chance to find a witness. - let start = rdfa - .find_earliest_rev_at(rcache, None, haystack, start, end.offset())? - .expect("reverse search must match if forward search does"); - assert_eq!( - start.pattern(), - end.pattern(), - "forward and reverse search must match same pattern", - ); - assert!(start.offset() <= end.offset()); - Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) - } - - #[inline(always)] - fn try_find_leftmost_at_imp( - &self, - pre: Option<&mut prefilter::Scanner>, - cache: &mut Cache, - haystack: &[u8], - start: usize, - end: usize, - ) -> Result<Option<MultiMatch>, MatchError> { - let (fdfa, rdfa) = (self.forward(), self.reverse()); - let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse); - let end = match fdfa - .find_leftmost_fwd_at(fcache, pre, None, haystack, start, end)? - { - None => return Ok(None), - Some(end) => end, - }; - // N.B. The only time we need to tell the reverse searcher the pattern - // to match is in the overlapping case, since it's ambiguous. In the - // leftmost case, I have tentatively convinced myself that it isn't - // necessary and the reverse search will always find the same pattern - // to match as the forward search. But I lack a rigorous proof. Why not - // just provide the pattern anyway? Well, if it is needed, then leaving - // it out gives us a chance to find a witness. - let start = rdfa - .find_leftmost_rev_at(rcache, None, haystack, start, end.offset())? + // This special cases an empty match at the beginning of the search. If + // our end matches our start, then since a reverse DFA can't match past + // the start, it must follow that our starting position is also our end + // position. So short circuit and skip the reverse search. + if input.start() == end.offset() { + return Ok(Some(Match::new( + end.pattern(), + end.offset()..end.offset(), + ))); + } + // We can also skip the reverse search if we know our search was + // anchored. This occurs either when the input config is anchored or + // when we know the regex itself is anchored. In this case, we know the + // start of the match, if one is found, must be the start of the + // search. + if self.is_anchored(input) { + return Ok(Some(Match::new( + end.pattern(), + input.start()..end.offset(), + ))); + } + // N.B. I have tentatively convinced myself that it isn't necessary + // to specify the specific pattern for the reverse search since the + // reverse search will always find the same pattern to match as the + // forward search. But I lack a rigorous proof. Why not just provide + // the pattern anyway? Well, if it is needed, then leaving it out + // gives us a chance to find a witness. (Also, if we don't need to + // specify the pattern, then we don't need to build the reverse DFA + // with 'starts_for_each_pattern' enabled. It doesn't matter too much + // for the lazy DFA, but does make the overall DFA bigger.) + // + // We also need to be careful to disable 'earliest' for the reverse + // search, since it could be enabled for the forward search. In the + // reverse case, to satisfy "leftmost" criteria, we need to match as + // much as we can. We also need to be careful to make the search + // anchored. We don't want the reverse search to report any matches + // other than the one beginning at the end of our forward search. + let revsearch = input + .clone() + .span(input.start()..end.offset()) + .anchored(Anchored::Yes) + .earliest(false); + let start = self + .reverse() + .try_search_rev(rcache, &revsearch)? .expect("reverse search must match if forward search does"); - assert_eq!( + debug_assert_eq!( start.pattern(), end.pattern(), "forward and reverse search must match same pattern", ); - assert!(start.offset() <= end.offset()); - Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) + debug_assert!(start.offset() <= end.offset()); + Ok(Some(Match::new(end.pattern(), start.offset()..end.offset()))) } - #[inline(always)] - fn try_find_overlapping_at_imp( - &self, - pre: Option<&mut prefilter::Scanner>, - cache: &mut Cache, - haystack: &[u8], - start: usize, - end: usize, - state: &mut OverlappingState, - ) -> Result<Option<MultiMatch>, MatchError> { - let (fdfa, rdfa) = (self.forward(), self.reverse()); - let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse); - let end = match fdfa.find_overlapping_fwd_at( - fcache, pre, None, haystack, start, end, state, - )? { - None => return Ok(None), - Some(end) => end, - }; - // Unlike the leftmost cases, the reverse overlapping search may match - // a different pattern than the forward search. See test failures when - // using `None` instead of `Some(end.pattern())` below. Thus, we must - // run our reverse search using the pattern that matched in the forward - // direction. - let start = rdfa - .find_leftmost_rev_at( - rcache, - Some(end.pattern()), - haystack, - 0, - end.offset(), - )? - .expect("reverse search must match if forward search does"); - assert_eq!( - start.pattern(), - end.pattern(), - "forward and reverse search must match same pattern", - ); - assert!(start.offset() <= end.offset()); - Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) + /// Returns true if either the given input specifies an anchored search + /// or if the underlying NFA is always anchored. + fn is_anchored(&self, input: &Input<'_>) -> bool { + match input.get_anchored() { + Anchored::No => { + self.forward().get_nfa().is_always_start_anchored() + } + Anchored::Yes | Anchored::Pattern(_) => true, + } } } @@ -1386,360 +540,45 @@ impl Regex { /// # Example /// /// ``` - /// use regex_automata::{MultiMatch, hybrid::regex::Regex}; + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::hybrid::regex::Regex; /// /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?; - /// assert_eq!(3, re.pattern_count()); + /// assert_eq!(3, re.pattern_len()); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` - pub fn pattern_count(&self) -> usize { - assert_eq!( - self.forward().pattern_count(), - self.reverse().pattern_count() - ); - self.forward().pattern_count() - } - - /// Convenience function for returning this regex's prefilter as a trait - /// object. - /// - /// If this regex doesn't have a prefilter, then `None` is returned. - pub fn prefilter(&self) -> Option<&dyn Prefilter> { - self.pre.as_ref().map(|x| &**x) - } - - /// Attach the given prefilter to this regex. - pub fn set_prefilter(&mut self, pre: Option<Box<dyn Prefilter>>) { - self.pre = pre; - } - - /// Convenience function for returning a prefilter scanner. - fn scanner(&self) -> Option<prefilter::Scanner> { - self.prefilter().map(prefilter::Scanner::new) + pub fn pattern_len(&self) -> usize { + assert_eq!(self.forward().pattern_len(), self.reverse().pattern_len()); + self.forward().pattern_len() } } -/// An iterator over all non-overlapping earliest matches for a particular -/// infallible search. +/// An iterator over all non-overlapping matches for an infallible search. /// -/// The iterator yields a [`MultiMatch`] value until no more matches could be -/// found. If the underlying search returns an error, then this panics. +/// The iterator yields a [`Match`] value until no more matches could be found. +/// If the underlying regex engine returns an error, then a panic occurs. /// -/// The lifetime variables are as follows: +/// The lifetime parameters are as follows: /// -/// * `'r` is the lifetime of the regular expression itself. -/// * `'c` is the lifetime of the mutable cache used during search. -/// * `'t` is the lifetime of the text being searched. -#[derive(Debug)] -pub struct FindEarliestMatches<'r, 'c, 't>(TryFindEarliestMatches<'r, 'c, 't>); - -impl<'r, 'c, 't> FindEarliestMatches<'r, 'c, 't> { - fn new( - re: &'r Regex, - cache: &'c mut Cache, - text: &'t [u8], - ) -> FindEarliestMatches<'r, 'c, 't> { - FindEarliestMatches(TryFindEarliestMatches::new(re, cache, text)) - } -} - -impl<'r, 'c, 't> Iterator for FindEarliestMatches<'r, 'c, 't> { - type Item = MultiMatch; - - fn next(&mut self) -> Option<MultiMatch> { - next_unwrap(self.0.next()) - } -} - -/// An iterator over all non-overlapping leftmost matches for a particular -/// infallible search. -/// -/// The iterator yields a [`MultiMatch`] value until no more matches could be -/// found. If the underlying search returns an error, then this panics. -/// -/// The lifetime variables are as follows: +/// * `'r` represents the lifetime of the regex object. +/// * `'h` represents the lifetime of the haystack being searched. +/// * `'c` represents the lifetime of the regex cache. /// -/// * `'r` is the lifetime of the regular expression itself. -/// * `'c` is the lifetime of the mutable cache used during search. -/// * `'t` is the lifetime of the text being searched. +/// This iterator can be created with the [`Regex::find_iter`] method. #[derive(Debug)] -pub struct FindLeftmostMatches<'r, 'c, 't>(TryFindLeftmostMatches<'r, 'c, 't>); - -impl<'r, 'c, 't> FindLeftmostMatches<'r, 'c, 't> { - fn new( - re: &'r Regex, - cache: &'c mut Cache, - text: &'t [u8], - ) -> FindLeftmostMatches<'r, 'c, 't> { - FindLeftmostMatches(TryFindLeftmostMatches::new(re, cache, text)) - } -} - -impl<'r, 'c, 't> Iterator for FindLeftmostMatches<'r, 'c, 't> { - type Item = MultiMatch; - - fn next(&mut self) -> Option<MultiMatch> { - next_unwrap(self.0.next()) - } -} - -/// An iterator over all overlapping matches for a particular infallible -/// search. -/// -/// The iterator yields a [`MultiMatch`] value until no more matches could be -/// found. If the underlying search returns an error, then this panics. -/// -/// The lifetime variables are as follows: -/// -/// * `'r` is the lifetime of the regular expression itself. -/// * `'c` is the lifetime of the mutable cache used during search. -/// * `'t` is the lifetime of the text being searched. -#[derive(Debug)] -pub struct FindOverlappingMatches<'r, 'c, 't>( - TryFindOverlappingMatches<'r, 'c, 't>, -); - -impl<'r, 'c, 't> FindOverlappingMatches<'r, 'c, 't> { - fn new( - re: &'r Regex, - cache: &'c mut Cache, - text: &'t [u8], - ) -> FindOverlappingMatches<'r, 'c, 't> { - FindOverlappingMatches(TryFindOverlappingMatches::new(re, cache, text)) - } -} - -impl<'r, 'c, 't> Iterator for FindOverlappingMatches<'r, 'c, 't> { - type Item = MultiMatch; - - fn next(&mut self) -> Option<MultiMatch> { - next_unwrap(self.0.next()) - } -} - -/// An iterator over all non-overlapping earliest matches for a particular -/// fallible search. -/// -/// The iterator yields a [`MultiMatch`] value until no more matches could be -/// found. -/// -/// The lifetime variables are as follows: -/// -/// * `'r` is the lifetime of the regular expression itself. -/// * `'c` is the lifetime of the mutable cache used during search. -/// * `'t` is the lifetime of the text being searched. -#[derive(Debug)] -pub struct TryFindEarliestMatches<'r, 'c, 't> { - re: &'r Regex, - cache: &'c mut Cache, - scanner: Option<prefilter::Scanner<'r>>, - text: &'t [u8], - last_end: usize, - last_match: Option<usize>, -} - -impl<'r, 'c, 't> TryFindEarliestMatches<'r, 'c, 't> { - fn new( - re: &'r Regex, - cache: &'c mut Cache, - text: &'t [u8], - ) -> TryFindEarliestMatches<'r, 'c, 't> { - let scanner = re.scanner(); - TryFindEarliestMatches { - re, - cache, - scanner, - text, - last_end: 0, - last_match: None, - } - } -} - -impl<'r, 'c, 't> Iterator for TryFindEarliestMatches<'r, 'c, 't> { - type Item = Result<MultiMatch, MatchError>; - - fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> { - if self.last_end > self.text.len() { - return None; - } - let result = self.re.try_find_earliest_at_imp( - self.scanner.as_mut(), - self.cache, - self.text, - self.last_end, - self.text.len(), - ); - let m = match result { - Err(err) => return Some(Err(err)), - Ok(None) => return None, - Ok(Some(m)) => m, - }; - if m.is_empty() { - // This is an empty match. To ensure we make progress, start - // the next search at the smallest possible starting position - // of the next match following this one. - self.last_end = if self.re.utf8 { - crate::util::next_utf8(self.text, m.end()) - } else { - m.end() + 1 - }; - // Don't accept empty matches immediately following a match. - // Just move on to the next match. - if Some(m.end()) == self.last_match { - return self.next(); - } - } else { - self.last_end = m.end(); - } - self.last_match = Some(m.end()); - Some(Ok(m)) - } -} - -/// An iterator over all non-overlapping leftmost matches for a particular -/// fallible search. -/// -/// The iterator yields a [`MultiMatch`] value until no more matches could be -/// found. -/// -/// The lifetime variables are as follows: -/// -/// * `'r` is the lifetime of the regular expression itself. -/// * `'c` is the lifetime of the mutable cache used during search. -/// * `'t` is the lifetime of the text being searched. -#[derive(Debug)] -pub struct TryFindLeftmostMatches<'r, 'c, 't> { - re: &'r Regex, - cache: &'c mut Cache, - scanner: Option<prefilter::Scanner<'r>>, - text: &'t [u8], - last_end: usize, - last_match: Option<usize>, -} - -impl<'r, 'c, 't> TryFindLeftmostMatches<'r, 'c, 't> { - fn new( - re: &'r Regex, - cache: &'c mut Cache, - text: &'t [u8], - ) -> TryFindLeftmostMatches<'r, 'c, 't> { - let scanner = re.scanner(); - TryFindLeftmostMatches { - re, - cache, - scanner, - text, - last_end: 0, - last_match: None, - } - } -} - -impl<'r, 'c, 't> Iterator for TryFindLeftmostMatches<'r, 'c, 't> { - type Item = Result<MultiMatch, MatchError>; - - fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> { - if self.last_end > self.text.len() { - return None; - } - let result = self.re.try_find_leftmost_at_imp( - self.scanner.as_mut(), - self.cache, - self.text, - self.last_end, - self.text.len(), - ); - let m = match result { - Err(err) => return Some(Err(err)), - Ok(None) => return None, - Ok(Some(m)) => m, - }; - if m.is_empty() { - // This is an empty match. To ensure we make progress, start - // the next search at the smallest possible starting position - // of the next match following this one. - self.last_end = if self.re.utf8 { - crate::util::next_utf8(self.text, m.end()) - } else { - m.end() + 1 - }; - // Don't accept empty matches immediately following a match. - // Just move on to the next match. - if Some(m.end()) == self.last_match { - return self.next(); - } - } else { - self.last_end = m.end(); - } - self.last_match = Some(m.end()); - Some(Ok(m)) - } -} - -/// An iterator over all overlapping matches for a particular fallible search. -/// -/// The iterator yields a [`MultiMatch`] value until no more matches could be -/// found. -/// -/// The lifetime variables are as follows: -/// -/// * `'r` is the lifetime of the regular expression itself. -/// * `'c` is the lifetime of the mutable cache used during search. -/// * `'t` is the lifetime of the text being searched. -#[derive(Debug)] -pub struct TryFindOverlappingMatches<'r, 'c, 't> { +pub struct FindMatches<'r, 'c, 'h> { re: &'r Regex, cache: &'c mut Cache, - scanner: Option<prefilter::Scanner<'r>>, - text: &'t [u8], - last_end: usize, - state: OverlappingState, + it: iter::Searcher<'h>, } -impl<'r, 'c, 't> TryFindOverlappingMatches<'r, 'c, 't> { - fn new( - re: &'r Regex, - cache: &'c mut Cache, - text: &'t [u8], - ) -> TryFindOverlappingMatches<'r, 'c, 't> { - let scanner = re.scanner(); - TryFindOverlappingMatches { - re, - cache, - scanner, - text, - last_end: 0, - state: OverlappingState::start(), - } - } -} +impl<'r, 'c, 'h> Iterator for FindMatches<'r, 'c, 'h> { + type Item = Match; -impl<'r, 'c, 't> Iterator for TryFindOverlappingMatches<'r, 'c, 't> { - type Item = Result<MultiMatch, MatchError>; - - fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> { - if self.last_end > self.text.len() { - return None; - } - let result = self.re.try_find_overlapping_at_imp( - self.scanner.as_mut(), - self.cache, - self.text, - self.last_end, - self.text.len(), - &mut self.state, - ); - let m = match result { - Err(err) => return Some(Err(err)), - Ok(None) => return None, - Ok(Some(m)) => m, - }; - // Unlike the non-overlapping case, we're OK with empty matches at this - // level. In particular, the overlapping search algorithm is itself - // responsible for ensuring that progress is always made. - self.last_end = m.end(); - Some(Ok(m)) + #[inline] + fn next(&mut self) -> Option<Match> { + let FindMatches { re, ref mut cache, ref mut it } = *self; + it.advance(|input| re.try_search(cache, input)) } } @@ -1791,15 +630,16 @@ impl Cache { /// This shows how to re-purpose a cache for use with a different `Regex`. /// /// ``` - /// use regex_automata::{hybrid::regex::Regex, MultiMatch}; + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{hybrid::regex::Regex, Match}; /// /// let re1 = Regex::new(r"\w")?; /// let re2 = Regex::new(r"\W")?; /// /// let mut cache = re1.create_cache(); /// assert_eq!( - /// Some(MultiMatch::must(0, 0, 2)), - /// re1.find_leftmost(&mut cache, "Δ".as_bytes()), + /// Some(Match::must(0, 0..2)), + /// re1.find(&mut cache, "Δ"), /// ); /// /// // Using 'cache' with re2 is not allowed. It may result in panics or @@ -1810,8 +650,8 @@ impl Cache { /// // allowed. /// cache.reset(&re2); /// assert_eq!( - /// Some(MultiMatch::must(0, 0, 3)), - /// re2.find_leftmost(&mut cache, "☃".as_bytes()), + /// Some(Match::must(0, 0..3)), + /// re2.find(&mut cache, "☃"), /// ); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) @@ -1821,13 +661,30 @@ impl Cache { self.reverse.reset(re.reverse()); } - /// Returns the heap memory usage, in bytes, as a sum of the forward and - /// reverse lazy DFA caches. + /// Return a reference to the forward cache. + pub fn forward(&mut self) -> &dfa::Cache { + &self.forward + } + + /// Return a reference to the reverse cache. + pub fn reverse(&mut self) -> &dfa::Cache { + &self.reverse + } + + /// Return a mutable reference to the forward cache. /// - /// This does **not** include the stack size used up by this cache. To - /// compute that, use `std::mem::size_of::<Cache>()`. - pub fn memory_usage(&self) -> usize { - self.forward.memory_usage() + self.reverse.memory_usage() + /// If you need mutable references to both the forward and reverse caches, + /// then use [`Cache::as_parts_mut`]. + pub fn forward_mut(&mut self) -> &mut dfa::Cache { + &mut self.forward + } + + /// Return a mutable reference to the reverse cache. + /// + /// If you need mutable references to both the forward and reverse caches, + /// then use [`Cache::as_parts_mut`]. + pub fn reverse_mut(&mut self) -> &mut dfa::Cache { + &mut self.reverse } /// Return references to the forward and reverse caches, respectively. @@ -1840,111 +697,14 @@ impl Cache { pub fn as_parts_mut(&mut self) -> (&mut dfa::Cache, &mut dfa::Cache) { (&mut self.forward, &mut self.reverse) } -} -/// The configuration used for compiling a hybrid NFA/DFA regex. -/// -/// A regex configuration is a simple data object that is typically used with -/// [`Builder::configure`]. -#[derive(Clone, Copy, Debug, Default)] -pub struct Config { - utf8: Option<bool>, -} - -impl Config { - /// Return a new default regex compiler configuration. - pub fn new() -> Config { - Config::default() - } - - /// Whether to enable UTF-8 mode or not. - /// - /// When UTF-8 mode is enabled (the default) and an empty match is seen, - /// the iterators on [`Regex`] will always start the next search at the - /// next UTF-8 encoded codepoint when searching valid UTF-8. When UTF-8 - /// mode is disabled, such searches are begun at the next byte offset. - /// - /// If this mode is enabled and invalid UTF-8 is given to search, then - /// behavior is unspecified. - /// - /// Generally speaking, one should enable this when - /// [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) - /// and - /// [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) - /// are enabled, and disable it otherwise. - /// - /// # Example - /// - /// This example demonstrates the differences between when this option is - /// enabled and disabled. The differences only arise when the regex can - /// return matches of length zero. - /// - /// In this first snippet, we show the results when UTF-8 mode is disabled. - /// - /// ``` - /// use regex_automata::{hybrid::regex::Regex, MultiMatch}; - /// - /// let re = Regex::builder() - /// .configure(Regex::config().utf8(false)) - /// .build(r"")?; - /// let mut cache = re.create_cache(); - /// - /// let haystack = "a☃z".as_bytes(); - /// let mut it = re.find_leftmost_iter(&mut cache, haystack); - /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); - /// assert_eq!(None, it.next()); - /// - /// # Ok::<(), Box<dyn std::error::Error>>(()) - /// ``` - /// - /// And in this snippet, we execute the same search on the same haystack, - /// but with UTF-8 mode enabled. Notice that byte offsets that would - /// otherwise split the encoding of `☃` are not returned. - /// - /// ``` - /// use regex_automata::{hybrid::regex::Regex, MultiMatch}; - /// - /// let re = Regex::builder() - /// .configure(Regex::config().utf8(true)) - /// .build(r"")?; - /// let mut cache = re.create_cache(); - /// - /// let haystack = "a☃z".as_bytes(); - /// let mut it = re.find_leftmost_iter(&mut cache, haystack); - /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); - /// assert_eq!(None, it.next()); + /// Returns the heap memory usage, in bytes, as a sum of the forward and + /// reverse lazy DFA caches. /// - /// # Ok::<(), Box<dyn std::error::Error>>(()) - /// ``` - pub fn utf8(mut self, yes: bool) -> Config { - self.utf8 = Some(yes); - self - } - - /// Returns true if and only if this configuration has UTF-8 mode enabled. - /// - /// When UTF-8 mode is enabled and an empty match is seen, the iterators on - /// [`Regex`] will always start the next search at the next UTF-8 encoded - /// codepoint. When UTF-8 mode is disabled, such searches are begun at the - /// next byte offset. - pub fn get_utf8(&self) -> bool { - self.utf8.unwrap_or(true) - } - - /// Overwrite the default configuration such that the options in `o` are - /// always used. If an option in `o` is not set, then the corresponding - /// option in `self` is used. If it's not set in `self` either, then it - /// remains not set. - pub(crate) fn overwrite(self, o: Config) -> Config { - Config { utf8: o.utf8.or(self.utf8) } + /// This does **not** include the stack size used up by this cache. To + /// compute that, use `std::mem::size_of::<Cache>()`. + pub fn memory_usage(&self) -> usize { + self.forward.memory_usage() + self.reverse.memory_usage() } } @@ -1955,17 +715,15 @@ impl Config { /// itself. This builder is different from a general purpose regex builder /// in that it permits fine grain configuration of the construction process. /// The trade off for this is complexity, and the possibility of setting a -/// configuration that might not make sense. For example, there are three +/// configuration that might not make sense. For example, there are two /// different UTF-8 modes: /// -/// * [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) controls whether the -/// pattern itself can contain sub-expressions that match invalid UTF-8. -/// * [`nfa::thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) -/// controls whether the implicit unanchored prefix added to the NFA can -/// match through invalid UTF-8 or not. -/// * [`Config::utf8`] controls how the regex iterators themselves advance -/// the starting position of the next search when a match with zero length is -/// found. +/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls +/// whether the pattern itself can contain sub-expressions that match invalid +/// UTF-8. +/// * [`thompson::Config::utf8`] controls how the regex iterators themselves +/// advance the starting position of the next search when a match with zero +/// length is found. /// /// Generally speaking, callers will want to either enable all of these or /// disable all of these. @@ -1979,61 +737,54 @@ impl Config { /// /// # Example /// -/// This example shows how to disable UTF-8 mode in the syntax, the NFA and -/// the regex itself. This is generally what you want for matching on -/// arbitrary bytes. +/// This example shows how to disable UTF-8 mode in the syntax and the regex +/// itself. This is generally what you want for matching on arbitrary bytes. /// /// ``` +/// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{ -/// hybrid::regex::Regex, nfa::thompson, MultiMatch, SyntaxConfig +/// hybrid::regex::Regex, nfa::thompson, util::syntax, Match, /// }; /// /// let re = Regex::builder() -/// .configure(Regex::config().utf8(false)) -/// .syntax(SyntaxConfig::new().utf8(false)) +/// .syntax(syntax::Config::new().utf8(false)) /// .thompson(thompson::Config::new().utf8(false)) /// .build(r"foo(?-u:[^b])ar.*")?; /// let mut cache = re.create_cache(); /// /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; -/// let expected = Some(MultiMatch::must(0, 1, 9)); -/// let got = re.find_leftmost(&mut cache, haystack); +/// let expected = Some(Match::must(0, 1..9)); +/// let got = re.find(&mut cache, haystack); /// assert_eq!(expected, got); /// // Notice that `(?-u:[^b])` matches invalid UTF-8, /// // but the subsequent `.*` does not! Disabling UTF-8 -/// // on the syntax permits this. Notice also that the -/// // search was unanchored and skipped over invalid UTF-8. -/// // Disabling UTF-8 on the Thompson NFA permits this. -/// // -/// // N.B. This example does not show the impact of -/// // disabling UTF-8 mode on Config, since that -/// // only impacts regexes that can produce matches of -/// // length 0. +/// // on the syntax permits this. /// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` #[derive(Clone, Debug)] pub struct Builder { - config: Config, dfa: dfa::Builder, } impl Builder { /// Create a new regex builder with the default configuration. pub fn new() -> Builder { - Builder { config: Config::default(), dfa: DFA::builder() } + Builder { dfa: DFA::builder() } } /// Build a regex from the given pattern. /// /// If there was a problem parsing or compiling the pattern, then an error /// is returned. + #[cfg(feature = "syntax")] pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> { self.build_many(&[pattern]) } /// Build a regex from the given patterns. + #[cfg(feature = "syntax")] pub fn build_many<P: AsRef<str>>( &self, patterns: &[P], @@ -2044,9 +795,9 @@ impl Builder { .clone() .configure( DFA::config() - .anchored(true) - .match_kind(MatchKind::All) - .starts_for_each_pattern(true), + .prefilter(None) + .specialize_start_states(false) + .match_kind(MatchKind::All), ) .thompson(thompson::Config::new().reverse(true)) .build_many(patterns)?; @@ -2054,28 +805,62 @@ impl Builder { } /// Build a regex from its component forward and reverse hybrid NFA/DFAs. - fn build_from_dfas(&self, forward: DFA, reverse: DFA) -> Regex { - // The congruous method on DFA-backed regexes is exposed, but it's - // not clear this builder is useful here since lazy DFAs can't be - // serialized and there is only one type of them. - let utf8 = self.config.get_utf8(); - Regex { pre: None, forward, reverse, utf8 } - } - - /// Apply the given regex configuration options to this builder. - pub fn configure(&mut self, config: Config) -> &mut Builder { - self.config = self.config.overwrite(config); - self + /// + /// This is useful when you've built a forward and reverse lazy DFA + /// separately, and want to combine them into a single regex. Once build, + /// the individual DFAs given can still be accessed via [`Regex::forward`] + /// and [`Regex::reverse`]. + /// + /// It is important that the reverse lazy DFA be compiled under the + /// following conditions: + /// + /// * It should use [`MatchKind::All`] semantics. + /// * It should match in reverse. + /// * Otherwise, its configuration should match the forward DFA. + /// + /// If these conditions aren't satisfied, then the behavior of searches is + /// unspecified. + /// + /// Note that when using this constructor, no configuration is applied. + /// Since this routine provides the DFAs to the builder, there is no + /// opportunity to apply other configuration options. + /// + /// # Example + /// + /// This shows how to build individual lazy forward and reverse DFAs, and + /// then combine them into a single `Regex`. + /// + /// ``` + /// use regex_automata::{ + /// hybrid::{dfa::DFA, regex::Regex}, + /// nfa::thompson, + /// MatchKind, + /// }; + /// + /// let fwd = DFA::new(r"foo[0-9]+")?; + /// let rev = DFA::builder() + /// .configure(DFA::config().match_kind(MatchKind::All)) + /// .thompson(thompson::Config::new().reverse(true)) + /// .build(r"foo[0-9]+")?; + /// + /// let re = Regex::builder().build_from_dfas(fwd, rev); + /// let mut cache = re.create_cache(); + /// assert_eq!(true, re.is_match(&mut cache, "foo123")); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn build_from_dfas(&self, forward: DFA, reverse: DFA) -> Regex { + Regex { forward, reverse } } /// Set the syntax configuration for this builder using - /// [`SyntaxConfig`](crate::SyntaxConfig). + /// [`syntax::Config`](crate::util::syntax::Config). /// /// This permits setting things like case insensitivity, Unicode and multi /// line mode. + #[cfg(feature = "syntax")] pub fn syntax( &mut self, - config: crate::util::syntax::SyntaxConfig, + config: crate::util::syntax::Config, ) -> &mut Builder { self.dfa.syntax(config); self @@ -2086,6 +871,7 @@ impl Builder { /// /// This permits setting things like whether additional time should be /// spent shrinking the size of the NFA. + #[cfg(feature = "syntax")] pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { self.dfa.thompson(config); self @@ -2107,18 +893,3 @@ impl Default for Builder { Builder::new() } } - -#[inline(always)] -fn next_unwrap( - item: Option<Result<MultiMatch, MatchError>>, -) -> Option<MultiMatch> { - match item { - None => None, - Some(Ok(m)) => Some(m), - Some(Err(err)) => panic!( - "unexpected regex search error: {}\n\ - to handle search errors, use try_ methods", - err, - ), - } -} |