From c23a457e72abe608715ac76f076f47dc42af07a5 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Thu, 30 May 2024 20:31:44 +0200 Subject: Merging upstream version 1.74.1+dfsg1. Signed-off-by: Daniel Baumann --- vendor/regex-automata/src/dfa/regex.rs | 1825 +++++--------------------------- 1 file changed, 275 insertions(+), 1550 deletions(-) (limited to 'vendor/regex-automata/src/dfa/regex.rs') diff --git a/vendor/regex-automata/src/dfa/regex.rs b/vendor/regex-automata/src/dfa/regex.rs index d0917e17d..f39c1c055 100644 --- a/vendor/regex-automata/src/dfa/regex.rs +++ b/vendor/regex-automata/src/dfa/regex.rs @@ -18,16 +18,17 @@ See the [parent module](crate::dfa) for examples. #[cfg(feature = "alloc")] use alloc::vec::Vec; +#[cfg(feature = "dfa-build")] +use crate::dfa::dense::BuildError; use crate::{ - dfa::automaton::{Automaton, OverlappingState}, - util::prefilter::{self, Prefilter}, - MatchError, MultiMatch, + dfa::{automaton::Automaton, dense}, + util::{iter, search::Input}, + Anchored, Match, MatchError, }; #[cfg(feature = "alloc")] use crate::{ - dfa::{dense, error::Error, sparse}, - nfa::thompson, - util::matchtypes::MatchKind, + dfa::{sparse, StartKind}, + util::search::MatchKind, }; // When the alloc feature is enabled, the regex type sets its A type parameter @@ -42,20 +43,16 @@ macro_rules! define_regex_type { ($(#[$doc:meta])*) => { #[cfg(feature = "alloc")] $(#[$doc])* - pub struct Regex { - prefilter: Option

, + pub struct Regex { forward: A, reverse: A, - utf8: bool, } #[cfg(not(feature = "alloc"))] $(#[$doc])* - pub struct Regex { - prefilter: Option

, + pub struct Regex { forward: A, reverse: A, - utf8: bool, } }; } @@ -79,86 +76,26 @@ define_regex_type!( /// memory but search faster, while sparse DFAs use less memory but search /// more slowly. /// + /// # Crate features + /// + /// Note that despite what the documentation auto-generates, the _only_ + /// crate feature needed to use this type is `dfa-search`. You do _not_ + /// need to enable the `alloc` feature. + /// /// By default, a regex's automaton type parameter is set to /// `dense::DFA>` when the `alloc` feature is enabled. For most /// in-memory work loads, this is the most convenient type that gives the /// best search performance. When the `alloc` feature is disabled, no /// default type is used. /// - /// A `Regex` also has a `P` type parameter, which is used to select the - /// prefilter used during search. By default, no prefilter is enabled by - /// setting the type to default to [`prefilter::None`]. A prefilter can be - /// enabled by using the [`Regex::prefilter`] method. - /// /// # When should I use this? /// /// Generally speaking, if you can afford the overhead of building a full /// DFA for your regex, and you don't need things like capturing groups, /// then this is a good choice if you're looking to optimize for matching /// speed. Note however that its speed may be worse than a general purpose - /// regex engine if you don't select a good [prefilter]. - /// - /// # Earliest vs Leftmost vs Overlapping - /// - /// The search routines exposed on a `Regex` reflect three different ways - /// of searching: - /// - /// * "earliest" means to stop as soon as a match has been detected. - /// * "leftmost" means to continue matching until the underlying - /// automaton cannot advance. This reflects "standard" searching you - /// might be used to in other regex engines. e.g., This permits - /// non-greedy and greedy searching to work as you would expect. - /// * "overlapping" means to find all possible matches, even if they - /// overlap. - /// - /// Generally speaking, when doing an overlapping search, you'll want to - /// build your regex DFAs with [`MatchKind::All`] semantics. Using - /// [`MatchKind::LeftmostFirst`] semantics with overlapping searches is - /// likely to lead to odd behavior since `LeftmostFirst` specifically omits - /// some matches that can never be reported due to its semantics. - /// - /// The following example shows the differences between how these different - /// types of searches impact looking for matches of `[a-z]+` in the - /// haystack `abc`. - /// - /// ``` - /// use regex_automata::{dfa::{self, dense}, MatchKind, MultiMatch}; - /// - /// let pattern = r"[a-z]+"; - /// let haystack = "abc".as_bytes(); - /// - /// // With leftmost-first semantics, we test "earliest" and "leftmost". - /// let re = dfa::regex::Builder::new() - /// .dense(dense::Config::new().match_kind(MatchKind::LeftmostFirst)) - /// .build(pattern)?; - /// - /// // "earliest" searching isn't impacted by greediness - /// let mut it = re.find_earliest_iter(haystack); - /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next()); - /// assert_eq!(None, it.next()); - /// - /// // "leftmost" searching supports greediness (and non-greediness) - /// let mut it = re.find_leftmost_iter(haystack); - /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); - /// assert_eq!(None, it.next()); - /// - /// // For overlapping, we want "all" match kind semantics. - /// let re = dfa::regex::Builder::new() - /// .dense(dense::Config::new().match_kind(MatchKind::All)) - /// .build(pattern)?; - /// - /// // In the overlapping search, we find all three possible matches - /// // starting at the beginning of the haystack. - /// let mut it = re.find_overlapping_iter(haystack); - /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 0, 2)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); - /// assert_eq!(None, it.next()); - /// - /// # Ok::<(), Box>(()) - /// ``` + /// regex engine if you don't provide a [`dense::Config::prefilter`] to the + /// underlying DFA. /// /// # Sparse DFAs /// @@ -203,18 +140,16 @@ define_regex_type!( /// /// # Fallibility /// - /// In non-default configurations, the DFAs generated in this module may - /// return an error during a search. (Currently, the only way this happens - /// is if quit bytes are added or Unicode word boundaries are heuristically - /// enabled, both of which are turned off by default.) For convenience, the - /// main search routines, like [`find_leftmost`](Regex::find_leftmost), - /// will panic if an error occurs. However, if you need to use DFAs - /// which may produce an error at search time, then there are fallible - /// equivalents of all search routines. For example, for `find_leftmost`, - /// its fallible analog is [`try_find_leftmost`](Regex::try_find_leftmost). - /// The routines prefixed with `try_` return `Result, - /// MatchError>`, where as the infallible routines simply return - /// `Option`. + /// Most of the search routines defined on this type will _panic_ when the + /// underlying search fails. This might be because the DFA gave up because + /// it saw a quit byte, whether configured explicitly or via heuristic + /// Unicode word boundary support, although neither are enabled by default. + /// Or it might fail because an invalid `Input` configuration is given, + /// for example, with an unsupported [`Anchored`] mode. + /// + /// If you need to handle these error cases instead of allowing them to + /// trigger a panic, then the lower level [`Regex::try_search`] provides + /// a fallible API that never panics. /// /// # Example /// @@ -224,18 +159,19 @@ define_regex_type!( /// across a line boundary. /// /// ``` - /// use regex_automata::{dfa::{self, regex::Regex}, MatchError}; + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{dfa::{self, regex::Regex}, Input, MatchError}; /// /// let re = Regex::builder() /// .dense(dfa::dense::Config::new().quit(b'\n', true)) /// .build(r"foo\p{any}+bar")?; /// - /// let haystack = "foo\nbar".as_bytes(); + /// let input = Input::new("foo\nbar"); /// // Normally this would produce a match, since \p{any} contains '\n'. /// // But since we instructed the automaton to enter a quit state if a /// // '\n' is observed, this produces a match error instead. - /// let expected = MatchError::Quit { byte: 0x0A, offset: 3 }; - /// let got = re.try_find_leftmost(haystack).unwrap_err(); + /// let expected = MatchError::quit(b'\n', 3); + /// let got = re.try_search(&input).unwrap_err(); /// assert_eq!(expected, got); /// /// # Ok::<(), Box>(()) @@ -243,7 +179,7 @@ define_regex_type!( #[derive(Clone, Debug)] ); -#[cfg(feature = "alloc")] +#[cfg(all(feature = "syntax", feature = "dfa-build"))] impl Regex { /// Parse the given regular expression using the default configuration and /// return the corresponding regex. @@ -254,16 +190,16 @@ impl Regex { /// # Example /// /// ``` - /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// use regex_automata::{Match, dfa::regex::Regex}; /// /// let re = Regex::new("foo[0-9]+bar")?; /// assert_eq!( - /// Some(MultiMatch::must(0, 3, 14)), - /// re.find_leftmost(b"zzzfoo12345barzzz"), + /// Some(Match::must(0, 3..14)), + /// re.find(b"zzzfoo12345barzzz"), /// ); /// # Ok::<(), Box>(()) /// ``` - pub fn new(pattern: &str) -> Result { + pub fn new(pattern: &str) -> Result { Builder::new().build(pattern) } @@ -273,26 +209,28 @@ impl Regex { /// # Example /// /// ``` - /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// use regex_automata::{Match, dfa::regex::Regex}; /// /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?; /// - /// let mut it = re.find_leftmost_iter(b"abc 1 foo 4567 0 quux"); - /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); - /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next()); - /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next()); - /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next()); + /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux"); + /// assert_eq!(Some(Match::must(0, 0..3)), it.next()); + /// assert_eq!(Some(Match::must(1, 4..5)), it.next()); + /// assert_eq!(Some(Match::must(0, 6..9)), it.next()); + /// assert_eq!(Some(Match::must(1, 10..14)), it.next()); + /// assert_eq!(Some(Match::must(1, 15..16)), it.next()); + /// assert_eq!(Some(Match::must(0, 17..21)), it.next()); /// assert_eq!(None, it.next()); /// # Ok::<(), Box>(()) /// ``` - pub fn new_many>(patterns: &[P]) -> Result { + pub fn new_many>( + patterns: &[P], + ) -> Result { Builder::new().build_many(patterns) } } -#[cfg(feature = "alloc")] +#[cfg(all(feature = "syntax", feature = "dfa-build"))] impl Regex>> { /// Parse the given regular expression using the default configuration, /// except using sparse DFAs, and return the corresponding regex. @@ -303,18 +241,18 @@ impl Regex>> { /// # Example /// /// ``` - /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// use regex_automata::{Match, dfa::regex::Regex}; /// /// let re = Regex::new_sparse("foo[0-9]+bar")?; /// assert_eq!( - /// Some(MultiMatch::must(0, 3, 14)), - /// re.find_leftmost(b"zzzfoo12345barzzz"), + /// Some(Match::must(0, 3..14)), + /// re.find(b"zzzfoo12345barzzz"), /// ); /// # Ok::<(), Box>(()) /// ``` pub fn new_sparse( pattern: &str, - ) -> Result>>, Error> { + ) -> Result>>, BuildError> { Builder::new().build_sparse(pattern) } @@ -325,64 +263,29 @@ impl Regex>> { /// # Example /// /// ``` - /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// use regex_automata::{Match, dfa::regex::Regex}; /// /// let re = Regex::new_many_sparse(&["[a-z]+", "[0-9]+"])?; /// - /// let mut it = re.find_leftmost_iter(b"abc 1 foo 4567 0 quux"); - /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); - /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next()); - /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next()); - /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next()); + /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux"); + /// assert_eq!(Some(Match::must(0, 0..3)), it.next()); + /// assert_eq!(Some(Match::must(1, 4..5)), it.next()); + /// assert_eq!(Some(Match::must(0, 6..9)), it.next()); + /// assert_eq!(Some(Match::must(1, 10..14)), it.next()); + /// assert_eq!(Some(Match::must(1, 15..16)), it.next()); + /// assert_eq!(Some(Match::must(0, 17..21)), it.next()); /// assert_eq!(None, it.next()); /// # Ok::<(), Box>(()) /// ``` pub fn new_many_sparse>( patterns: &[P], - ) -> Result>>, Error> { + ) -> Result>>, BuildError> { Builder::new().build_many_sparse(patterns) } } /// Convenience routines for regex construction. -#[cfg(feature = "alloc")] -impl Regex { - /// Return a default configuration for a `Regex`. - /// - /// This is a convenience routine to avoid needing to import the `Config` - /// type when customizing the construction of a regex. - /// - /// # Example - /// - /// This example shows how to disable UTF-8 mode for `Regex` iteration. - /// When UTF-8 mode is disabled, the position immediately following an - /// empty match is where the next search begins, instead of the next - /// position of a UTF-8 encoded codepoint. - /// - /// ``` - /// use regex_automata::{dfa::regex::Regex, MultiMatch}; - /// - /// let re = Regex::builder() - /// .configure(Regex::config().utf8(false)) - /// .build(r"")?; - /// let haystack = "a☃z".as_bytes(); - /// let mut it = re.find_leftmost_iter(haystack); - /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); - /// assert_eq!(None, it.next()); - /// - /// # Ok::<(), Box>(()) - /// ``` - pub fn config() -> Config { - Config::new() - } - +impl Regex> { /// Return a builder for configuring the construction of a `Regex`. /// /// This is a convenience routine to avoid needing to import the @@ -394,20 +297,18 @@ impl Regex { /// everywhere. /// /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{ - /// dfa::regex::Regex, - /// nfa::thompson, - /// MultiMatch, SyntaxConfig, + /// dfa::regex::Regex, nfa::thompson, util::syntax, Match, /// }; /// /// let re = Regex::builder() - /// .configure(Regex::config().utf8(false)) - /// .syntax(SyntaxConfig::new().utf8(false)) + /// .syntax(syntax::Config::new().utf8(false)) /// .thompson(thompson::Config::new().utf8(false)) /// .build(r"foo(?-u:[^b])ar.*")?; /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; - /// let expected = Some(MultiMatch::must(0, 1, 9)); - /// let got = re.find_leftmost(haystack); + /// let expected = Some(Match::must(0, 1..9)); + /// let got = re.find(haystack); /// assert_eq!(expected, got); /// /// # Ok::<(), Box>(()) @@ -418,7 +319,7 @@ impl Regex { } /// Standard search routines for finding and iterating over matches. -impl Regex { +impl Regex { /// Returns true if and only if this regex matches the given haystack. /// /// This routine may short circuit if it knows that scanning future input @@ -428,65 +329,37 @@ impl Regex { /// /// # Panics /// - /// If the underlying DFAs return an error, then this routine panics. This - /// only occurs in non-default configurations where quit bytes are used or - /// Unicode word boundaries are heuristically enabled. - /// - /// The fallible version of this routine is - /// [`try_is_match`](Regex::try_is_match). - /// - /// # Example - /// - /// ``` - /// use regex_automata::dfa::regex::Regex; + /// This routine panics if the search could not complete. This can occur + /// in a number of circumstances: /// - /// let re = Regex::new("foo[0-9]+bar")?; - /// assert_eq!(true, re.is_match(b"foo12345bar")); - /// assert_eq!(false, re.is_match(b"foobar")); - /// # Ok::<(), Box>(()) - /// ``` - pub fn is_match(&self, haystack: &[u8]) -> bool { - self.is_match_at(haystack, 0, haystack.len()) - } - - /// Returns the first position at which a match is found. + /// * The configuration of the DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the DFA quitting. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. /// - /// This routine stops scanning input in precisely the same circumstances - /// as `is_match`. The key difference is that this routine returns the - /// position at which it stopped scanning input if and only if a match - /// was found. If no match is found, then `None` is returned. + /// When a search panics, callers cannot know whether a match exists or + /// not. /// - /// # Panics - /// - /// If the underlying DFAs return an error, then this routine panics. This - /// only occurs in non-default configurations where quit bytes are used or - /// Unicode word boundaries are heuristically enabled. - /// - /// The fallible version of this routine is - /// [`try_find_earliest`](Regex::try_find_earliest). + /// Use [`Regex::try_search`] if you want to handle these error conditions. /// /// # Example /// /// ``` - /// use regex_automata::{MultiMatch, dfa::regex::Regex}; - /// - /// // Normally, the leftmost first match would greedily consume as many - /// // decimal digits as it could. But a match is detected as soon as one - /// // digit is seen. - /// let re = Regex::new("foo[0-9]+")?; - /// assert_eq!( - /// Some(MultiMatch::must(0, 0, 4)), - /// re.find_earliest(b"foo12345"), - /// ); + /// use regex_automata::dfa::regex::Regex; /// - /// // Normally, the end of the leftmost first match here would be 3, - /// // but the "earliest" match semantics detect a match earlier. - /// let re = Regex::new("abc|a")?; - /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), re.find_earliest(b"abc")); + /// let re = Regex::new("foo[0-9]+bar")?; + /// assert_eq!(true, re.is_match("foo12345bar")); + /// assert_eq!(false, re.is_match("foobar")); /// # Ok::<(), Box>(()) /// ``` - pub fn find_earliest(&self, haystack: &[u8]) -> Option { - self.find_earliest_at(haystack, 0, haystack.len()) + #[inline] + pub fn is_match<'h, I: Into>>(&self, input: I) -> bool { + // Not only can we do an "earliest" search, but we can avoid doing a + // reverse scan too. + let input = input.into().earliest(true); + self.forward().try_search_fwd(&input).map(|x| x.is_some()).unwrap() } /// Returns the start and end offset of the leftmost match. If no match @@ -494,131 +367,41 @@ impl Regex { /// /// # Panics /// - /// If the underlying DFAs return an error, then this routine panics. This - /// only occurs in non-default configurations where quit bytes are used or - /// Unicode word boundaries are heuristically enabled. + /// This routine panics if the search could not complete. This can occur + /// in a number of circumstances: + /// + /// * The configuration of the DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the DFA quitting. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. + /// + /// When a search panics, callers cannot know whether a match exists or + /// not. /// - /// The fallible version of this routine is - /// [`try_find_leftmost`](Regex::try_find_leftmost). + /// Use [`Regex::try_search`] if you want to handle these error conditions. /// /// # Example /// /// ``` - /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// use regex_automata::{Match, dfa::regex::Regex}; /// - /// // Greediness is applied appropriately when compared to find_earliest. + /// // Greediness is applied appropriately. /// let re = Regex::new("foo[0-9]+")?; - /// assert_eq!( - /// Some(MultiMatch::must(0, 3, 11)), - /// re.find_leftmost(b"zzzfoo12345zzz"), - /// ); + /// assert_eq!(Some(Match::must(0, 3..11)), re.find("zzzfoo12345zzz")); /// /// // Even though a match is found after reading the first byte (`a`), /// // the default leftmost-first match semantics demand that we find the /// // earliest match that prefers earlier parts of the pattern over latter /// // parts. /// let re = Regex::new("abc|a")?; - /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), re.find_leftmost(b"abc")); - /// # Ok::<(), Box>(()) - /// ``` - pub fn find_leftmost(&self, haystack: &[u8]) -> Option { - self.find_leftmost_at(haystack, 0, haystack.len()) - } - - /// Search for the first overlapping match in `haystack`. - /// - /// This routine is principally useful when searching for multiple patterns - /// on inputs where multiple patterns may match the same regions of text. - /// In particular, callers must preserve the automaton's search state from - /// prior calls so that the implementation knows where the last match - /// occurred and which pattern was reported. - /// - /// # Panics - /// - /// If the underlying DFAs return an error, then this routine panics. This - /// only occurs in non-default configurations where quit bytes are used or - /// Unicode word boundaries are heuristically enabled. - /// - /// The fallible version of this routine is - /// [`try_find_overlapping`](Regex::try_find_overlapping). - /// - /// # Example - /// - /// This example shows how to run an overlapping search with multiple - /// regexes. - /// - /// ``` - /// use regex_automata::{dfa::{self, regex::Regex}, MatchKind, MultiMatch}; - /// - /// let re = Regex::builder() - /// .dense(dfa::dense::Config::new().match_kind(MatchKind::All)) - /// .build_many(&[r"\w+$", r"\S+$"])?; - /// let haystack = "@foo".as_bytes(); - /// let mut state = dfa::OverlappingState::start(); - /// - /// let expected = Some(MultiMatch::must(1, 0, 4)); - /// let got = re.find_overlapping(haystack, &mut state); - /// assert_eq!(expected, got); - /// - /// // The first pattern also matches at the same position, so re-running - /// // the search will yield another match. Notice also that the first - /// // pattern is returned after the second. This is because the second - /// // pattern begins its match before the first, is therefore an earlier - /// // match and is thus reported first. - /// let expected = Some(MultiMatch::must(0, 1, 4)); - /// let got = re.find_overlapping(haystack, &mut state); - /// assert_eq!(expected, got); - /// + /// assert_eq!(Some(Match::must(0, 0..3)), re.find("abc")); /// # Ok::<(), Box>(()) /// ``` - pub fn find_overlapping( - &self, - haystack: &[u8], - state: &mut OverlappingState, - ) -> Option { - self.find_overlapping_at(haystack, 0, haystack.len(), state) - } - - /// Returns an iterator over all non-overlapping "earliest" matches. - /// - /// Match positions are reported as soon as a match is known to occur, even - /// if the standard leftmost match would be longer. - /// - /// # Panics - /// - /// If the underlying DFAs return an error during iteration, then iteration - /// panics. This only occurs in non-default configurations where quit bytes - /// are used or Unicode word boundaries are heuristically enabled. - /// - /// The fallible version of this routine is - /// [`try_find_earliest_iter`](Regex::try_find_earliest_iter). - /// - /// # Example - /// - /// This example shows how to run an "earliest" iterator. - /// - /// ``` - /// use regex_automata::{dfa::regex::Regex, MultiMatch}; - /// - /// let re = Regex::new("[0-9]+")?; - /// let haystack = "123".as_bytes(); - /// - /// // Normally, a standard leftmost iterator would return a single - /// // match, but since "earliest" detects matches earlier, we get - /// // three matches. - /// let mut it = re.find_earliest_iter(haystack); - /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next()); - /// assert_eq!(None, it.next()); - /// - /// # Ok::<(), Box>(()) - /// ``` - pub fn find_earliest_iter<'r, 't>( - &'r self, - haystack: &'t [u8], - ) -> FindEarliestMatches<'r, 't, A, P> { - FindEarliestMatches::new(self, haystack) + #[inline] + pub fn find<'h, I: Into>>(&self, input: I) -> Option { + self.try_search(&input.into()).unwrap() } /// Returns an iterator over all non-overlapping leftmost matches in the @@ -628,621 +411,119 @@ impl Regex { /// /// # Panics /// - /// If the underlying DFAs return an error during iteration, then iteration - /// panics. This only occurs in non-default configurations where quit bytes - /// are used or Unicode word boundaries are heuristically enabled. + /// If the search returns an error during iteration, then iteration + /// panics. See [`Regex::find`] for the panic conditions. /// - /// The fallible version of this routine is - /// [`try_find_leftmost_iter`](Regex::try_find_leftmost_iter). + /// Use [`Regex::try_search`] with + /// [`util::iter::Searcher`](crate::util::iter::Searcher) if you want to + /// handle these error conditions. /// /// # Example /// /// ``` - /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// use regex_automata::{Match, dfa::regex::Regex}; /// /// let re = Regex::new("foo[0-9]+")?; - /// let text = b"foo1 foo12 foo123"; - /// let matches: Vec = re.find_leftmost_iter(text).collect(); + /// let text = "foo1 foo12 foo123"; + /// let matches: Vec = re.find_iter(text).collect(); /// assert_eq!(matches, vec![ - /// MultiMatch::must(0, 0, 4), - /// MultiMatch::must(0, 5, 10), - /// MultiMatch::must(0, 11, 17), + /// Match::must(0, 0..4), + /// Match::must(0, 5..10), + /// Match::must(0, 11..17), /// ]); /// # Ok::<(), Box>(()) /// ``` - pub fn find_leftmost_iter<'r, 't>( - &'r self, - haystack: &'t [u8], - ) -> FindLeftmostMatches<'r, 't, A, P> { - FindLeftmostMatches::new(self, haystack) - } - - /// Returns an iterator over all overlapping matches in the given haystack. - /// - /// This routine is principally useful when searching for multiple patterns - /// on inputs where multiple patterns may match the same regions of text. - /// The iterator takes care of handling the overlapping state that must be - /// threaded through every search. - /// - /// # Panics - /// - /// If the underlying DFAs return an error during iteration, then iteration - /// panics. This only occurs in non-default configurations where quit bytes - /// are used or Unicode word boundaries are heuristically enabled. - /// - /// The fallible version of this routine is - /// [`try_find_overlapping_iter`](Regex::try_find_overlapping_iter). - /// - /// # Example - /// - /// This example shows how to run an overlapping search with multiple - /// regexes. - /// - /// ``` - /// use regex_automata::{dfa::{self, regex::Regex}, MatchKind, MultiMatch}; - /// - /// let re = Regex::builder() - /// .dense(dfa::dense::Config::new().match_kind(MatchKind::All)) - /// .build_many(&[r"\w+$", r"\S+$"])?; - /// let haystack = "@foo".as_bytes(); - /// - /// let mut it = re.find_overlapping_iter(haystack); - /// assert_eq!(Some(MultiMatch::must(1, 0, 4)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 1, 4)), it.next()); - /// assert_eq!(None, it.next()); - /// - /// # Ok::<(), Box>(()) - /// ``` - pub fn find_overlapping_iter<'r, 't>( - &'r self, - haystack: &'t [u8], - ) -> FindOverlappingMatches<'r, 't, A, P> { - FindOverlappingMatches::new(self, haystack) - } -} - -/// Lower level infallible search routines that permit controlling where -/// the search starts and ends in a particular sequence. This is useful for -/// executing searches that need to take surrounding context into account. This -/// is required for correctly implementing iteration because of look-around -/// operators (`^`, `$`, `\b`). -impl Regex { - /// Returns true if and only if this regex matches the given haystack. - /// - /// This routine may short circuit if it knows that scanning future input - /// will never lead to a different result. In particular, if the underlying - /// DFA enters a match state or a dead state, then this routine will return - /// `true` or `false`, respectively, without inspecting any future input. - /// - /// # Searching a substring of the haystack - /// - /// Being an "at" search routine, this permits callers to search a - /// substring of `haystack` by specifying a range in `haystack`. - /// Why expose this as an API instead of just asking callers to use - /// `&input[start..end]`? The reason is that regex matching often wants - /// to take the surrounding context into account in order to handle - /// look-around (`^`, `$` and `\b`). - /// - /// # Panics - /// - /// If the underlying DFAs return an error, then this routine panics. This - /// only occurs in non-default configurations where quit bytes are used or - /// Unicode word boundaries are heuristically enabled. - /// - /// The fallible version of this routine is - /// [`try_is_match_at`](Regex::try_is_match_at). - pub fn is_match_at( - &self, - haystack: &[u8], - start: usize, - end: usize, - ) -> bool { - self.try_is_match_at(haystack, start, end).unwrap() - } - - /// Returns the first position at which a match is found. - /// - /// This routine stops scanning input in precisely the same circumstances - /// as `is_match`. The key difference is that this routine returns the - /// position at which it stopped scanning input if and only if a match - /// was found. If no match is found, then `None` is returned. - /// - /// # Searching a substring of the haystack - /// - /// Being an "at" search routine, this permits callers to search a - /// substring of `haystack` by specifying a range in `haystack`. - /// Why expose this as an API instead of just asking callers to use - /// `&input[start..end]`? The reason is that regex matching often wants - /// to take the surrounding context into account in order to handle - /// look-around (`^`, `$` and `\b`). - /// - /// This is useful when implementing an iterator over matches - /// within the same haystack, which cannot be done correctly by simply - /// providing a subslice of `haystack`. - /// - /// # Panics - /// - /// If the underlying DFAs return an error, then this routine panics. This - /// only occurs in non-default configurations where quit bytes are used or - /// Unicode word boundaries are heuristically enabled. - /// - /// The fallible version of this routine is - /// [`try_find_earliest_at`](Regex::try_find_earliest_at). - pub fn find_earliest_at( - &self, - haystack: &[u8], - start: usize, - end: usize, - ) -> Option { - self.try_find_earliest_at(haystack, start, end).unwrap() - } - - /// Returns the same as `find_leftmost`, but starts the search at the given - /// offset. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, if the DFA is anchored, then - /// a match can only occur when `start == 0`. - /// - /// # Searching a substring of the haystack - /// - /// Being an "at" search routine, this permits callers to search a - /// substring of `haystack` by specifying a range in `haystack`. - /// Why expose this as an API instead of just asking callers to use - /// `&input[start..end]`? The reason is that regex matching often wants - /// to take the surrounding context into account in order to handle - /// look-around (`^`, `$` and `\b`). - /// - /// This is useful when implementing an iterator over matches within the - /// same haystack, which cannot be done correctly by simply providing a - /// subslice of `haystack`. - /// - /// # Panics - /// - /// If the underlying DFAs return an error, then this routine panics. This - /// only occurs in non-default configurations where quit bytes are used or - /// Unicode word boundaries are heuristically enabled. - /// - /// The fallible version of this routine is - /// [`try_find_leftmost_at`](Regex::try_find_leftmost_at). - pub fn find_leftmost_at( - &self, - haystack: &[u8], - start: usize, - end: usize, - ) -> Option { - self.try_find_leftmost_at(haystack, start, end).unwrap() - } - - /// Search for the first overlapping match within a given range of - /// `haystack`. - /// - /// This routine is principally useful when searching for multiple patterns - /// on inputs where multiple patterns may match the same regions of text. - /// In particular, callers must preserve the automaton's search state from - /// prior calls so that the implementation knows where the last match - /// occurred and which pattern was reported. - /// - /// # Searching a substring of the haystack - /// - /// Being an "at" search routine, this permits callers to search a - /// substring of `haystack` by specifying a range in `haystack`. - /// Why expose this as an API instead of just asking callers to use - /// `&input[start..end]`? The reason is that regex matching often wants - /// to take the surrounding context into account in order to handle - /// look-around (`^`, `$` and `\b`). - /// - /// This is useful when implementing an iterator over matches - /// within the same haystack, which cannot be done correctly by simply - /// providing a subslice of `haystack`. - /// - /// # Panics - /// - /// If the underlying DFAs return an error, then this routine panics. This - /// only occurs in non-default configurations where quit bytes are used or - /// Unicode word boundaries are heuristically enabled. - /// - /// The fallible version of this routine is - /// [`try_find_overlapping_at`](Regex::try_find_overlapping_at). - pub fn find_overlapping_at( - &self, - haystack: &[u8], - start: usize, - end: usize, - state: &mut OverlappingState, - ) -> Option { - self.try_find_overlapping_at(haystack, start, end, state).unwrap() - } -} - -/// Fallible search routines. These may return an error when the underlying -/// DFAs have been configured in a way that permits them to fail during a -/// search. -/// -/// Errors during search only occur when the DFA has been explicitly -/// configured to do so, usually by specifying one or more "quit" bytes or by -/// heuristically enabling Unicode word boundaries. -/// -/// Errors will never be returned using the default configuration. So these -/// fallible routines are only needed for particular configurations. -impl Regex { - /// Returns true if and only if this regex matches the given haystack. - /// - /// This routine may short circuit if it knows that scanning future input - /// will never lead to a different result. In particular, if the underlying - /// DFA enters a match state or a dead state, then this routine will return - /// `true` or `false`, respectively, without inspecting any future input. - /// - /// # Errors - /// - /// This routine only errors if the search could not complete. For - /// DFA-based regexes, this only occurs in a non-default configuration - /// where quit bytes are used or Unicode word boundaries are heuristically - /// enabled. - /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// The infallible (panics on error) version of this routine is - /// [`is_match`](Regex::is_match). - pub fn try_is_match(&self, haystack: &[u8]) -> Result { - self.try_is_match_at(haystack, 0, haystack.len()) - } - - /// Returns the first position at which a match is found. - /// - /// This routine stops scanning input in precisely the same circumstances - /// as `is_match`. The key difference is that this routine returns the - /// position at which it stopped scanning input if and only if a match - /// was found. If no match is found, then `None` is returned. - /// - /// # Errors - /// - /// This routine only errors if the search could not complete. For - /// DFA-based regexes, this only occurs in a non-default configuration - /// where quit bytes are used or Unicode word boundaries are heuristically - /// enabled. - /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// The infallible (panics on error) version of this routine is - /// [`find_earliest`](Regex::find_earliest). - pub fn try_find_earliest( - &self, - haystack: &[u8], - ) -> Result, MatchError> { - self.try_find_earliest_at(haystack, 0, haystack.len()) - } - - /// Returns the start and end offset of the leftmost match. If no match - /// exists, then `None` is returned. - /// - /// # Errors - /// - /// This routine only errors if the search could not complete. For - /// DFA-based regexes, this only occurs in a non-default configuration - /// where quit bytes are used or Unicode word boundaries are heuristically - /// enabled. - /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// The infallible (panics on error) version of this routine is - /// [`find_leftmost`](Regex::find_leftmost). - pub fn try_find_leftmost( - &self, - haystack: &[u8], - ) -> Result, MatchError> { - self.try_find_leftmost_at(haystack, 0, haystack.len()) - } - - /// Search for the first overlapping match in `haystack`. - /// - /// This routine is principally useful when searching for multiple patterns - /// on inputs where multiple patterns may match the same regions of text. - /// In particular, callers must preserve the automaton's search state from - /// prior calls so that the implementation knows where the last match - /// occurred and which pattern was reported. - /// - /// # Errors - /// - /// This routine only errors if the search could not complete. For - /// DFA-based regexes, this only occurs in a non-default configuration - /// where quit bytes are used or Unicode word boundaries are heuristically - /// enabled. - /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// The infallible (panics on error) version of this routine is - /// [`find_overlapping`](Regex::find_overlapping). - pub fn try_find_overlapping( - &self, - haystack: &[u8], - state: &mut OverlappingState, - ) -> Result, MatchError> { - self.try_find_overlapping_at(haystack, 0, haystack.len(), state) - } - - /// Returns an iterator over all non-overlapping "earliest" matches. - /// - /// Match positions are reported as soon as a match is known to occur, even - /// if the standard leftmost match would be longer. - /// - /// # Errors - /// - /// This iterator only yields errors if the search could not complete. For - /// DFA-based regexes, this only occurs in a non-default configuration - /// where quit bytes are used or Unicode word boundaries are heuristically - /// enabled. - /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// The infallible (panics on error) version of this routine is - /// [`find_earliest_iter`](Regex::find_earliest_iter). - pub fn try_find_earliest_iter<'r, 't>( - &'r self, - haystack: &'t [u8], - ) -> TryFindEarliestMatches<'r, 't, A, P> { - TryFindEarliestMatches::new(self, haystack) - } - - /// Returns an iterator over all non-overlapping leftmost matches in the - /// given bytes. If no match exists, then the iterator yields no elements. - /// - /// This corresponds to the "standard" regex search iterator. - /// - /// # Errors - /// - /// This iterator only yields errors if the search could not complete. For - /// DFA-based regexes, this only occurs in a non-default configuration - /// where quit bytes are used or Unicode word boundaries are heuristically - /// enabled. - /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// The infallible (panics on error) version of this routine is - /// [`find_leftmost_iter`](Regex::find_leftmost_iter). - pub fn try_find_leftmost_iter<'r, 't>( - &'r self, - haystack: &'t [u8], - ) -> TryFindLeftmostMatches<'r, 't, A, P> { - TryFindLeftmostMatches::new(self, haystack) - } - - /// Returns an iterator over all overlapping matches in the given haystack. - /// - /// This routine is principally useful when searching for multiple patterns - /// on inputs where multiple patterns may match the same regions of text. - /// The iterator takes care of handling the overlapping state that must be - /// threaded through every search. - /// - /// # Errors - /// - /// This iterator only yields errors if the search could not complete. For - /// DFA-based regexes, this only occurs in a non-default configuration - /// where quit bytes are used or Unicode word boundaries are heuristically - /// enabled. - /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// The infallible (panics on error) version of this routine is - /// [`find_overlapping_iter`](Regex::find_overlapping_iter). - pub fn try_find_overlapping_iter<'r, 't>( + #[inline] + pub fn find_iter<'r, 'h, I: Into>>( &'r self, - haystack: &'t [u8], - ) -> TryFindOverlappingMatches<'r, 't, A, P> { - TryFindOverlappingMatches::new(self, haystack) + input: I, + ) -> FindMatches<'r, 'h, A> { + let it = iter::Searcher::new(input.into()); + FindMatches { re: self, it } } } /// Lower level fallible search routines that permit controlling where the /// search starts and ends in a particular sequence. -impl Regex { - /// Returns true if and only if this regex matches the given haystack. - /// - /// This routine may short circuit if it knows that scanning future input - /// will never lead to a different result. In particular, if the underlying - /// DFA enters a match state or a dead state, then this routine will return - /// `true` or `false`, respectively, without inspecting any future input. - /// - /// # Searching a substring of the haystack - /// - /// Being an "at" search routine, this permits callers to search a - /// substring of `haystack` by specifying a range in `haystack`. - /// Why expose this as an API instead of just asking callers to use - /// `&input[start..end]`? The reason is that regex matching often wants - /// to take the surrounding context into account in order to handle - /// look-around (`^`, `$` and `\b`). - /// - /// # Errors - /// - /// This routine only errors if the search could not complete. For - /// DFA-based regexes, this only occurs in a non-default configuration - /// where quit bytes are used, Unicode word boundaries are heuristically - /// enabled or limits are set on the number of times the lazy DFA's cache - /// may be cleared. - /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// The infallible (panics on error) version of this routine is - /// [`is_match_at`](Regex::is_match_at). - pub fn try_is_match_at( - &self, - haystack: &[u8], - start: usize, - end: usize, - ) -> Result { - self.forward() - .find_earliest_fwd_at( - self.scanner().as_mut(), - None, - haystack, - start, - end, - ) - .map(|x| x.is_some()) - } - - /// Returns the first position at which a match is found. - /// - /// This routine stops scanning input in precisely the same circumstances - /// as `is_match`. The key difference is that this routine returns the - /// position at which it stopped scanning input if and only if a match - /// was found. If no match is found, then `None` is returned. - /// - /// # Searching a substring of the haystack - /// - /// Being an "at" search routine, this permits callers to search a - /// substring of `haystack` by specifying a range in `haystack`. - /// Why expose this as an API instead of just asking callers to use - /// `&input[start..end]`? The reason is that regex matching often wants - /// to take the surrounding context into account in order to handle - /// look-around (`^`, `$` and `\b`). - /// - /// This is useful when implementing an iterator over matches - /// within the same haystack, which cannot be done correctly by simply - /// providing a subslice of `haystack`. - /// - /// # Errors - /// - /// This routine only errors if the search could not complete. For - /// DFA-based regexes, this only occurs in a non-default configuration - /// where quit bytes are used or Unicode word boundaries are heuristically - /// enabled. - /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// The infallible (panics on error) version of this routine is - /// [`find_earliest_at`](Regex::find_earliest_at). - pub fn try_find_earliest_at( - &self, - haystack: &[u8], - start: usize, - end: usize, - ) -> Result, MatchError> { - self.try_find_earliest_at_imp( - self.scanner().as_mut(), - haystack, - start, - end, - ) - } - - /// The implementation of "earliest" searching, where a prefilter scanner - /// may be given. - fn try_find_earliest_at_imp( - &self, - pre: Option<&mut prefilter::Scanner>, - haystack: &[u8], - start: usize, - end: usize, - ) -> Result, MatchError> { - // N.B. We use `&&A` here to call `Automaton` methods, which ensures - // that we always use the `impl Automaton for &A` for calling methods. - // Since this is the usual way that automata are used, this helps - // reduce the number of monomorphized copies of the search code. - let (fwd, rev) = (self.forward(), self.reverse()); - let end = match (&fwd) - .find_earliest_fwd_at(pre, None, haystack, start, end)? - { - None => return Ok(None), - Some(end) => end, - }; - // N.B. The only time we need to tell the reverse searcher the pattern - // to match is in the overlapping case, since it's ambiguous. In the - // leftmost case, I have tentatively convinced myself that it isn't - // necessary and the reverse search will always find the same pattern - // to match as the forward search. But I lack a rigorous proof. - let start = (&rev) - .find_earliest_rev_at(None, haystack, start, end.offset())? - .expect("reverse search must match if forward search does"); - assert_eq!( - start.pattern(), - end.pattern(), - "forward and reverse search must match same pattern" - ); - assert!(start.offset() <= end.offset()); - Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) - } - +impl Regex { /// Returns the start and end offset of the leftmost match. If no match /// exists, then `None` is returned. /// - /// # Searching a substring of the haystack + /// This is like [`Regex::find`] but with two differences: /// - /// Being an "at" search routine, this permits callers to search a - /// substring of `haystack` by specifying a range in `haystack`. - /// Why expose this as an API instead of just asking callers to use - /// `&input[start..end]`? The reason is that regex matching often wants - /// to take the surrounding context into account in order to handle - /// look-around (`^`, `$` and `\b`). - /// - /// This is useful when implementing an iterator over matches - /// within the same haystack, which cannot be done correctly by simply - /// providing a subslice of `haystack`. + /// 1. It is not generic over `Into` and instead accepts a + /// `&Input`. This permits reusing the same `Input` for multiple searches + /// without needing to create a new one. This _may_ help with latency. + /// 2. It returns an error if the search could not complete where as + /// [`Regex::find`] will panic. /// /// # Errors /// - /// This routine only errors if the search could not complete. For - /// DFA-based regexes, this only occurs in a non-default configuration - /// where quit bytes are used or Unicode word boundaries are heuristically - /// enabled. + /// This routine errors if the search could not complete. This can occur + /// in the following circumstances: /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. + /// * The configuration of the DFA may permit it to "quit" the search. + /// For example, setting quit bytes or enabling heuristic support for + /// Unicode word boundaries. The default configuration does not enable any + /// option that could result in the DFA quitting. + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. /// - /// The infallible (panics on error) version of this routine is - /// [`find_leftmost_at`](Regex::find_leftmost_at). - pub fn try_find_leftmost_at( - &self, - haystack: &[u8], - start: usize, - end: usize, - ) -> Result, MatchError> { - self.try_find_leftmost_at_imp( - self.scanner().as_mut(), - haystack, - start, - end, - ) - } - - /// The implementation of leftmost searching, where a prefilter scanner - /// may be given. - fn try_find_leftmost_at_imp( + /// When a search returns an error, callers cannot know whether a match + /// exists or not. + #[inline] + pub fn try_search( &self, - scanner: Option<&mut prefilter::Scanner>, - haystack: &[u8], - start: usize, - end: usize, - ) -> Result, MatchError> { - // N.B. We use `&&A` here to call `Automaton` methods, which ensures - // that we always use the `impl Automaton for &A` for calling methods. - // Since this is the usual way that automata are used, this helps - // reduce the number of monomorphized copies of the search code. + input: &Input<'_>, + ) -> Result, MatchError> { let (fwd, rev) = (self.forward(), self.reverse()); - let end = match (&fwd) - .find_leftmost_fwd_at(scanner, None, haystack, start, end)? - { + let end = match fwd.try_search_fwd(input)? { None => return Ok(None), Some(end) => end, }; - // N.B. The only time we need to tell the reverse searcher the pattern - // to match is in the overlapping case, since it's ambiguous. In the - // leftmost case, I have tentatively convinced myself that it isn't - // necessary and the reverse search will always find the same pattern - // to match as the forward search. But I lack a rigorous proof. Why not - // just provide the pattern anyway? Well, if it is needed, then leaving - // it out gives us a chance to find a witness. - let start = (&rev) - .find_leftmost_rev_at(None, haystack, start, end.offset())? + // This special cases an empty match at the beginning of the search. If + // our end matches our start, then since a reverse DFA can't match past + // the start, it must follow that our starting position is also our end + // position. So short circuit and skip the reverse search. + if input.start() == end.offset() { + return Ok(Some(Match::new( + end.pattern(), + end.offset()..end.offset(), + ))); + } + // We can also skip the reverse search if we know our search was + // anchored. This occurs either when the input config is anchored or + // when we know the regex itself is anchored. In this case, we know the + // start of the match, if one is found, must be the start of the + // search. + if self.is_anchored(input) { + return Ok(Some(Match::new( + end.pattern(), + input.start()..end.offset(), + ))); + } + // N.B. I have tentatively convinced myself that it isn't necessary + // to specify the specific pattern for the reverse search since the + // reverse search will always find the same pattern to match as the + // forward search. But I lack a rigorous proof. Why not just provide + // the pattern anyway? Well, if it is needed, then leaving it out + // gives us a chance to find a witness. (Also, if we don't need to + // specify the pattern, then we don't need to build the reverse DFA + // with 'starts_for_each_pattern' enabled.) + // + // We also need to be careful to disable 'earliest' for the reverse + // search, since it could be enabled for the forward search. In the + // reverse case, to satisfy "leftmost" criteria, we need to match + // as much as we can. We also need to be careful to make the search + // anchored. We don't want the reverse search to report any matches + // other than the one beginning at the end of our forward search. + let revsearch = input + .clone() + .span(input.start()..end.offset()) + .anchored(Anchored::Yes) + .earliest(false); + let start = rev + .try_search_rev(&revsearch)? .expect("reverse search must match if forward search does"); assert_eq!( start.pattern(), @@ -1250,132 +531,22 @@ impl Regex { "forward and reverse search must match same pattern", ); assert!(start.offset() <= end.offset()); - Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) - } - - /// Search for the first overlapping match within a given range of - /// `haystack`. - /// - /// This routine is principally useful when searching for multiple patterns - /// on inputs where multiple patterns may match the same regions of text. - /// In particular, callers must preserve the automaton's search state from - /// prior calls so that the implementation knows where the last match - /// occurred and which pattern was reported. - /// - /// # Searching a substring of the haystack - /// - /// Being an "at" search routine, this permits callers to search a - /// substring of `haystack` by specifying a range in `haystack`. - /// Why expose this as an API instead of just asking callers to use - /// `&input[start..end]`? The reason is that regex matching often wants - /// to take the surrounding context into account in order to handle - /// look-around (`^`, `$` and `\b`). - /// - /// This is useful when implementing an iterator over matches - /// within the same haystack, which cannot be done correctly by simply - /// providing a subslice of `haystack`. - /// - /// # Errors - /// - /// This routine only errors if the search could not complete. For - /// DFA-based regexes, this only occurs in a non-default configuration - /// where quit bytes are used or Unicode word boundaries are heuristically - /// enabled. - /// - /// When a search cannot complete, callers cannot know whether a match - /// exists or not. - /// - /// The infallible (panics on error) version of this routine is - /// [`find_overlapping_at`](Regex::find_overlapping_at). - pub fn try_find_overlapping_at( - &self, - haystack: &[u8], - start: usize, - end: usize, - state: &mut OverlappingState, - ) -> Result, MatchError> { - self.try_find_overlapping_at_imp( - self.scanner().as_mut(), - haystack, - start, - end, - state, - ) + Ok(Some(Match::new(end.pattern(), start.offset()..end.offset()))) } - /// The implementation of overlapping search at a given range in - /// `haystack`, where `scanner` is a prefilter (if active) and `state` is - /// the current state of the search. - fn try_find_overlapping_at_imp( - &self, - scanner: Option<&mut prefilter::Scanner>, - haystack: &[u8], - start: usize, - end: usize, - state: &mut OverlappingState, - ) -> Result, MatchError> { - // N.B. We use `&&A` here to call `Automaton` methods, which ensures - // that we always use the `impl Automaton for &A` for calling methods. - // Since this is the usual way that automata are used, this helps - // reduce the number of monomorphized copies of the search code. - let (fwd, rev) = (self.forward(), self.reverse()); - // TODO: Decide whether it's worth making this assert work. It doesn't - // work currently because 'has_starts_for_each_pattern' isn't on the - // Automaton trait. Without this assert, we still get a panic, but it's - // a bit more inscrutable. - // assert!( - // rev.has_starts_for_each_pattern(), - // "overlapping searches require that the reverse DFA is \ - // compiled with the 'starts_for_each_pattern' option", - // ); - let end = match (&fwd).find_overlapping_fwd_at( - scanner, None, haystack, start, end, state, - )? { - None => return Ok(None), - Some(end) => end, - }; - // Unlike the leftmost cases, the reverse overlapping search may match - // a different pattern than the forward search. See test failures when - // using `None` instead of `Some(end.pattern())` below. Thus, we must - // run our reverse search using the pattern that matched in the forward - // direction. - let start = (&rev) - .find_leftmost_rev_at( - Some(end.pattern()), - haystack, - 0, - end.offset(), - )? - .expect("reverse search must match if forward search does"); - assert!(start.offset() <= end.offset()); - assert_eq!(start.pattern(), end.pattern()); - Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) + /// Returns true if either the given input specifies an anchored search + /// or if the underlying DFA is always anchored. + fn is_anchored(&self, input: &Input<'_>) -> bool { + match input.get_anchored() { + Anchored::No => self.forward().is_always_start_anchored(), + Anchored::Yes | Anchored::Pattern(_) => true, + } } } /// Non-search APIs for querying information about the regex and setting a /// prefilter. -impl Regex { - /// Attach the given prefilter to this regex. - pub fn with_prefilter(self, prefilter: Q) -> Regex { - Regex { - prefilter: Some(prefilter), - forward: self.forward, - reverse: self.reverse, - utf8: self.utf8, - } - } - - /// Remove any prefilter from this regex. - pub fn without_prefilter(self) -> Regex { - Regex { - prefilter: None, - forward: self.forward, - reverse: self.reverse, - utf8: self.utf8, - } - } - +impl Regex { /// Return the underlying DFA responsible for forward matching. /// /// This is useful for accessing the underlying DFA and converting it to @@ -1399,471 +570,48 @@ impl Regex { /// # Example /// /// ``` - /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::dfa::regex::Regex; /// /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?; - /// assert_eq!(3, re.pattern_count()); + /// assert_eq!(3, re.pattern_len()); /// # Ok::<(), Box>(()) /// ``` - pub fn pattern_count(&self) -> usize { - assert_eq!( - self.forward().pattern_count(), - self.reverse().pattern_count() - ); - self.forward().pattern_count() - } - - /// Convenience function for returning this regex's prefilter as a trait - /// object. - /// - /// If this regex doesn't have a prefilter, then `None` is returned. - pub fn prefilter(&self) -> Option<&dyn Prefilter> { - match self.prefilter { - None => None, - Some(ref x) => Some(&*x), - } - } - - /// Convenience function for returning a prefilter scanner. - fn scanner(&self) -> Option { - self.prefilter().map(prefilter::Scanner::new) + pub fn pattern_len(&self) -> usize { + assert_eq!(self.forward().pattern_len(), self.reverse().pattern_len()); + self.forward().pattern_len() } } -/// An iterator over all non-overlapping earliest matches for a particular -/// infallible search. +/// An iterator over all non-overlapping matches for an infallible search. /// -/// The iterator yields a [`MultiMatch`] value until no more matches could be -/// found. If the underlying search returns an error, then this panics. +/// The iterator yields a [`Match`] value until no more matches could be found. +/// If the underlying regex engine returns an error, then a panic occurs. /// -/// `A` is the type used to represent the underlying DFAs used by the regex, -/// while `P` is the type of prefilter used, if any. The lifetime variables are -/// as follows: +/// The type parameters are as follows: /// -/// * `'r` is the lifetime of the regular expression itself. -/// * `'t` is the lifetime of the text being searched. -#[derive(Clone, Debug)] -pub struct FindEarliestMatches<'r, 't, A, P>( - TryFindEarliestMatches<'r, 't, A, P>, -); - -impl<'r, 't, A: Automaton, P: Prefilter> FindEarliestMatches<'r, 't, A, P> { - fn new( - re: &'r Regex, - text: &'t [u8], - ) -> FindEarliestMatches<'r, 't, A, P> { - FindEarliestMatches(TryFindEarliestMatches::new(re, text)) - } -} - -impl<'r, 't, A: Automaton, P: Prefilter> Iterator - for FindEarliestMatches<'r, 't, A, P> -{ - type Item = MultiMatch; - - fn next(&mut self) -> Option { - next_unwrap(self.0.next()) - } -} - -/// An iterator over all non-overlapping leftmost matches for a particular -/// infallible search. +/// * `A` represents the type of the underlying DFA that implements the +/// [`Automaton`] trait. /// -/// The iterator yields a [`MultiMatch`] value until no more matches could be -/// found. If the underlying search returns an error, then this panics. +/// The lifetime parameters are as follows: /// -/// `A` is the type used to represent the underlying DFAs used by the regex, -/// while `P` is the type of prefilter used, if any. The lifetime variables are -/// as follows: +/// * `'h` represents the lifetime of the haystack being searched. +/// * `'r` represents the lifetime of the regex object itself. /// -/// * `'r` is the lifetime of the regular expression itself. -/// * `'t` is the lifetime of the text being searched. -#[derive(Clone, Debug)] -pub struct FindLeftmostMatches<'r, 't, A, P>( - TryFindLeftmostMatches<'r, 't, A, P>, -); - -impl<'r, 't, A: Automaton, P: Prefilter> FindLeftmostMatches<'r, 't, A, P> { - fn new( - re: &'r Regex, - text: &'t [u8], - ) -> FindLeftmostMatches<'r, 't, A, P> { - FindLeftmostMatches(TryFindLeftmostMatches::new(re, text)) - } -} - -impl<'r, 't, A: Automaton, P: Prefilter> Iterator - for FindLeftmostMatches<'r, 't, A, P> -{ - type Item = MultiMatch; - - fn next(&mut self) -> Option { - next_unwrap(self.0.next()) - } -} - -/// An iterator over all overlapping matches for a particular infallible -/// search. -/// -/// The iterator yields a [`MultiMatch`] value until no more matches could be -/// found. If the underlying search returns an error, then this panics. -/// -/// `A` is the type used to represent the underlying DFAs used by the regex, -/// while `P` is the type of prefilter used, if any. The lifetime variables are -/// as follows: -/// -/// * `'r` is the lifetime of the regular expression itself. -/// * `'t` is the lifetime of the text being searched. -#[derive(Clone, Debug)] -pub struct FindOverlappingMatches<'r, 't, A: Automaton, P>( - TryFindOverlappingMatches<'r, 't, A, P>, -); - -impl<'r, 't, A: Automaton, P: Prefilter> FindOverlappingMatches<'r, 't, A, P> { - fn new( - re: &'r Regex, - text: &'t [u8], - ) -> FindOverlappingMatches<'r, 't, A, P> { - FindOverlappingMatches(TryFindOverlappingMatches::new(re, text)) - } -} - -impl<'r, 't, A: Automaton, P: Prefilter> Iterator - for FindOverlappingMatches<'r, 't, A, P> -{ - type Item = MultiMatch; - - fn next(&mut self) -> Option { - next_unwrap(self.0.next()) - } -} - -/// An iterator over all non-overlapping earliest matches for a particular -/// fallible search. -/// -/// The iterator yields a [`MultiMatch`] value until no more matches could be -/// found. -/// -/// `A` is the type used to represent the underlying DFAs used by the regex, -/// while `P` is the type of prefilter used, if any. The lifetime variables are -/// as follows: -/// -/// * `'r` is the lifetime of the regular expression itself. -/// * `'t` is the lifetime of the text being searched. -#[derive(Clone, Debug)] -pub struct TryFindEarliestMatches<'r, 't, A, P> { - re: &'r Regex, - scanner: Option>, - text: &'t [u8], - last_end: usize, - last_match: Option, -} - -impl<'r, 't, A: Automaton, P: Prefilter> TryFindEarliestMatches<'r, 't, A, P> { - fn new( - re: &'r Regex, - text: &'t [u8], - ) -> TryFindEarliestMatches<'r, 't, A, P> { - let scanner = re.scanner(); - TryFindEarliestMatches { - re, - scanner, - text, - last_end: 0, - last_match: None, - } - } -} - -impl<'r, 't, A: Automaton, P: Prefilter> Iterator - for TryFindEarliestMatches<'r, 't, A, P> -{ - type Item = Result; - - fn next(&mut self) -> Option> { - if self.last_end > self.text.len() { - return None; - } - let result = self.re.try_find_earliest_at_imp( - self.scanner.as_mut(), - self.text, - self.last_end, - self.text.len(), - ); - let m = match result { - Err(err) => return Some(Err(err)), - Ok(None) => return None, - Ok(Some(m)) => m, - }; - if m.is_empty() { - // This is an empty match. To ensure we make progress, start - // the next search at the smallest possible starting position - // of the next match following this one. - self.last_end = if self.re.utf8 { - crate::util::next_utf8(self.text, m.end()) - } else { - m.end() + 1 - }; - // Don't accept empty matches immediately following a match. - // Just move on to the next match. - if Some(m.end()) == self.last_match { - return self.next(); - } - } else { - self.last_end = m.end(); - } - self.last_match = Some(m.end()); - Some(Ok(m)) - } -} - -/// An iterator over all non-overlapping leftmost matches for a particular -/// fallible search. -/// -/// The iterator yields a [`MultiMatch`] value until no more matches could be -/// found. -/// -/// `A` is the type used to represent the underlying DFAs used by the regex, -/// while `P` is the type of prefilter used, if any. The lifetime variables are -/// as follows: -/// -/// * `'r` is the lifetime of the regular expression itself. -/// * `'t` is the lifetime of the text being searched. -#[derive(Clone, Debug)] -pub struct TryFindLeftmostMatches<'r, 't, A, P> { - re: &'r Regex, - scanner: Option>, - text: &'t [u8], - last_end: usize, - last_match: Option, -} - -impl<'r, 't, A: Automaton, P: Prefilter> TryFindLeftmostMatches<'r, 't, A, P> { - fn new( - re: &'r Regex, - text: &'t [u8], - ) -> TryFindLeftmostMatches<'r, 't, A, P> { - let scanner = re.scanner(); - TryFindLeftmostMatches { - re, - scanner, - text, - last_end: 0, - last_match: None, - } - } -} - -impl<'r, 't, A: Automaton, P: Prefilter> Iterator - for TryFindLeftmostMatches<'r, 't, A, P> -{ - type Item = Result; - - fn next(&mut self) -> Option> { - if self.last_end > self.text.len() { - return None; - } - let result = self.re.try_find_leftmost_at_imp( - self.scanner.as_mut(), - self.text, - self.last_end, - self.text.len(), - ); - let m = match result { - Err(err) => return Some(Err(err)), - Ok(None) => return None, - Ok(Some(m)) => m, - }; - if m.is_empty() { - // This is an empty match. To ensure we make progress, start - // the next search at the smallest possible starting position - // of the next match following this one. - self.last_end = if self.re.utf8 { - crate::util::next_utf8(self.text, m.end()) - } else { - m.end() + 1 - }; - // Don't accept empty matches immediately following a match. - // Just move on to the next match. - if Some(m.end()) == self.last_match { - return self.next(); - } - } else { - self.last_end = m.end(); - } - self.last_match = Some(m.end()); - Some(Ok(m)) - } -} - -/// An iterator over all overlapping matches for a particular fallible search. -/// -/// The iterator yields a [`MultiMatch`] value until no more matches could be -/// found. -/// -/// `A` is the type used to represent the underlying DFAs used by the regex, -/// while `P` is the type of prefilter used, if any. The lifetime variables are -/// as follows: -/// -/// * `'r` is the lifetime of the regular expression itself. -/// * `'t` is the lifetime of the text being searched. -#[derive(Clone, Debug)] -pub struct TryFindOverlappingMatches<'r, 't, A: Automaton, P> { - re: &'r Regex, - scanner: Option>, - text: &'t [u8], - last_end: usize, - state: OverlappingState, -} - -impl<'r, 't, A: Automaton, P: Prefilter> - TryFindOverlappingMatches<'r, 't, A, P> -{ - fn new( - re: &'r Regex, - text: &'t [u8], - ) -> TryFindOverlappingMatches<'r, 't, A, P> { - let scanner = re.scanner(); - TryFindOverlappingMatches { - re, - scanner, - text, - last_end: 0, - state: OverlappingState::start(), - } - } +/// This iterator can be created with the [`Regex::find_iter`] method. +#[derive(Debug)] +pub struct FindMatches<'r, 'h, A> { + re: &'r Regex, + it: iter::Searcher<'h>, } -impl<'r, 't, A: Automaton, P: Prefilter> Iterator - for TryFindOverlappingMatches<'r, 't, A, P> -{ - type Item = Result; +impl<'r, 'h, A: Automaton> Iterator for FindMatches<'r, 'h, A> { + type Item = Match; - fn next(&mut self) -> Option> { - if self.last_end > self.text.len() { - return None; - } - let result = self.re.try_find_overlapping_at_imp( - self.scanner.as_mut(), - self.text, - self.last_end, - self.text.len(), - &mut self.state, - ); - let m = match result { - Err(err) => return Some(Err(err)), - Ok(None) => return None, - Ok(Some(m)) => m, - }; - // Unlike the non-overlapping case, we're OK with empty matches at this - // level. In particular, the overlapping search algorithm is itself - // responsible for ensuring that progress is always made. - self.last_end = m.end(); - Some(Ok(m)) - } -} - -/// The configuration used for compiling a DFA-backed regex. -/// -/// A regex configuration is a simple data object that is typically used with -/// [`Builder::configure`]. -#[cfg(feature = "alloc")] -#[derive(Clone, Copy, Debug, Default)] -pub struct Config { - utf8: Option, -} - -#[cfg(feature = "alloc")] -impl Config { - /// Return a new default regex compiler configuration. - pub fn new() -> Config { - Config::default() - } - - /// Whether to enable UTF-8 mode or not. - /// - /// When UTF-8 mode is enabled (the default) and an empty match is seen, - /// the iterators on [`Regex`] will always start the next search at the - /// next UTF-8 encoded codepoint when searching valid UTF-8. When UTF-8 - /// mode is disabled, such searches are begun at the next byte offset. - /// - /// If this mode is enabled and invalid UTF-8 is given to search, then - /// behavior is unspecified. - /// - /// Generally speaking, one should enable this when - /// [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) - /// and - /// [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) - /// are enabled, and disable it otherwise. - /// - /// # Example - /// - /// This example demonstrates the differences between when this option is - /// enabled and disabled. The differences only arise when the regex can - /// return matches of length zero. - /// - /// In this first snippet, we show the results when UTF-8 mode is disabled. - /// - /// ``` - /// use regex_automata::{dfa::regex::Regex, MultiMatch}; - /// - /// let re = Regex::builder() - /// .configure(Regex::config().utf8(false)) - /// .build(r"")?; - /// let haystack = "a☃z".as_bytes(); - /// let mut it = re.find_leftmost_iter(haystack); - /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); - /// assert_eq!(None, it.next()); - /// - /// # Ok::<(), Box>(()) - /// ``` - /// - /// And in this snippet, we execute the same search on the same haystack, - /// but with UTF-8 mode enabled. Notice that byte offsets that would - /// otherwise split the encoding of `☃` are not returned. - /// - /// ``` - /// use regex_automata::{dfa::regex::Regex, MultiMatch}; - /// - /// let re = Regex::builder() - /// .configure(Regex::config().utf8(true)) - /// .build(r"")?; - /// let haystack = "a☃z".as_bytes(); - /// let mut it = re.find_leftmost_iter(haystack); - /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); - /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); - /// assert_eq!(None, it.next()); - /// - /// # Ok::<(), Box>(()) - /// ``` - pub fn utf8(mut self, yes: bool) -> Config { - self.utf8 = Some(yes); - self - } - - /// Returns true if and only if this configuration has UTF-8 mode enabled. - /// - /// When UTF-8 mode is enabled and an empty match is seen, the iterators on - /// [`Regex`] will always start the next search at the next UTF-8 encoded - /// codepoint. When UTF-8 mode is disabled, such searches are begun at the - /// next byte offset. - pub fn get_utf8(&self) -> bool { - self.utf8.unwrap_or(true) - } - - /// Overwrite the default configuration such that the options in `o` are - /// always used. If an option in `o` is not set, then the corresponding - /// option in `self` is used. If it's not set in `self` either, then it - /// remains not set. - pub(crate) fn overwrite(self, o: Config) -> Config { - Config { utf8: o.utf8.or(self.utf8) } + #[inline] + fn next(&mut self) -> Option { + let FindMatches { re, ref mut it } = *self; + it.advance(|input| re.try_search(input)) } } @@ -1874,17 +622,15 @@ impl Config { /// itself. This builder is different from a general purpose regex builder in /// that it permits fine grain configuration of the construction process. The /// trade off for this is complexity, and the possibility of setting a -/// configuration that might not make sense. For example, there are three +/// configuration that might not make sense. For example, there are two /// different UTF-8 modes: /// -/// * [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) controls whether the -/// pattern itself can contain sub-expressions that match invalid UTF-8. -/// * [`nfa::thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) -/// controls whether the implicit unanchored prefix added to the NFA can -/// match through invalid UTF-8 or not. -/// * [`Config::utf8`] controls how the regex iterators themselves advance -/// the starting position of the next search when a match with zero length is -/// found. +/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls +/// whether the pattern itself can contain sub-expressions that match invalid +/// UTF-8. +/// * [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) controls +/// how the regex iterators themselves advance the starting position of the +/// next search when a match with zero length is found. /// /// Generally speaking, callers will want to either enable all of these or /// disable all of these. @@ -1919,57 +665,51 @@ impl Config { /// /// # Example /// -/// This example shows how to disable UTF-8 mode in the syntax, the NFA and -/// the regex itself. This is generally what you want for matching on -/// arbitrary bytes. +/// This example shows how to disable UTF-8 mode in the syntax and the regex +/// itself. This is generally what you want for matching on arbitrary bytes. /// /// ``` +/// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{ -/// dfa::regex::Regex, nfa::thompson, MultiMatch, SyntaxConfig +/// dfa::regex::Regex, nfa::thompson, util::syntax, Match, /// }; /// /// let re = Regex::builder() -/// .configure(Regex::config().utf8(false)) -/// .syntax(SyntaxConfig::new().utf8(false)) +/// .syntax(syntax::Config::new().utf8(false)) /// .thompson(thompson::Config::new().utf8(false)) /// .build(r"foo(?-u:[^b])ar.*")?; /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; -/// let expected = Some(MultiMatch::must(0, 1, 9)); -/// let got = re.find_leftmost(haystack); +/// let expected = Some(Match::must(0, 1..9)); +/// let got = re.find(haystack); /// assert_eq!(expected, got); /// // Notice that `(?-u:[^b])` matches invalid UTF-8, /// // but the subsequent `.*` does not! Disabling UTF-8 -/// // on the syntax permits this. Notice also that the -/// // search was unanchored and skipped over invalid UTF-8. -/// // Disabling UTF-8 on the Thompson NFA permits this. -/// // -/// // N.B. This example does not show the impact of -/// // disabling UTF-8 mode on Config, since that -/// // only impacts regexes that can produce matches of -/// // length 0. +/// // on the syntax permits this. /// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]); /// /// # Ok::<(), Box>(()) /// ``` -#[cfg(feature = "alloc")] #[derive(Clone, Debug)] pub struct Builder { - config: Config, + #[cfg(feature = "dfa-build")] dfa: dense::Builder, } -#[cfg(feature = "alloc")] impl Builder { /// Create a new regex builder with the default configuration. pub fn new() -> Builder { - Builder { config: Config::default(), dfa: dense::Builder::new() } + Builder { + #[cfg(feature = "dfa-build")] + dfa: dense::Builder::new(), + } } /// Build a regex from the given pattern. /// /// If there was a problem parsing or compiling the pattern, then an error /// is returned. - pub fn build(&self, pattern: &str) -> Result { + #[cfg(all(feature = "syntax", feature = "dfa-build"))] + pub fn build(&self, pattern: &str) -> Result { self.build_many(&[pattern]) } @@ -1977,38 +717,42 @@ impl Builder { /// /// If there was a problem parsing or compiling the pattern, then an error /// is returned. + #[cfg(all(feature = "syntax", feature = "dfa-build"))] pub fn build_sparse( &self, pattern: &str, - ) -> Result>>, Error> { + ) -> Result>>, BuildError> { self.build_many_sparse(&[pattern]) } /// Build a regex from the given patterns. + #[cfg(all(feature = "syntax", feature = "dfa-build"))] pub fn build_many>( &self, patterns: &[P], - ) -> Result { + ) -> Result { let forward = self.dfa.build_many(patterns)?; let reverse = self .dfa .clone() .configure( dense::Config::new() - .anchored(true) - .match_kind(MatchKind::All) - .starts_for_each_pattern(true), + .prefilter(None) + .specialize_start_states(false) + .start_kind(StartKind::Anchored) + .match_kind(MatchKind::All), ) - .thompson(thompson::Config::new().reverse(true)) + .thompson(crate::nfa::thompson::Config::new().reverse(true)) .build_many(patterns)?; Ok(self.build_from_dfas(forward, reverse)) } /// Build a sparse regex from the given patterns. + #[cfg(all(feature = "syntax", feature = "dfa-build"))] pub fn build_many_sparse>( &self, patterns: &[P], - ) -> Result>>, Error> { + ) -> Result>>, BuildError> { let re = self.build_many(patterns)?; let forward = re.forward().to_sparse()?; let reverse = re.reverse().to_sparse()?; @@ -2028,16 +772,14 @@ impl Builder { /// * It should be anchored. /// * It should use [`MatchKind::All`] semantics. /// * It should match in reverse. - /// * It should have anchored start states compiled for each pattern. /// * Otherwise, its configuration should match the forward DFA. /// - /// If these conditions are satisfied, then behavior of searches is + /// If these conditions aren't satisfied, then the behavior of searches is /// unspecified. /// - /// Note that when using this constructor, only the configuration from - /// [`Config`] is applied. The only configuration settings on this builder - /// only apply when the builder owns the construction of the DFAs - /// themselves. + /// Note that when using this constructor, no configuration is applied. + /// Since this routine provides the DFAs to the builder, there is no + /// opportunity to apply other configuration options. /// /// # Example /// @@ -2079,35 +821,33 @@ impl Builder { forward: A, reverse: A, ) -> Regex { - let utf8 = self.config.get_utf8(); - Regex { prefilter: None, forward, reverse, utf8 } - } - - /// Apply the given regex configuration options to this builder. - pub fn configure(&mut self, config: Config) -> &mut Builder { - self.config = self.config.overwrite(config); - self + Regex { forward, reverse } } /// Set the syntax configuration for this builder using - /// [`SyntaxConfig`](crate::SyntaxConfig). + /// [`syntax::Config`](crate::util::syntax::Config). /// /// This permits setting things like case insensitivity, Unicode and multi /// line mode. + #[cfg(all(feature = "syntax", feature = "dfa-build"))] pub fn syntax( &mut self, - config: crate::util::syntax::SyntaxConfig, + config: crate::util::syntax::Config, ) -> &mut Builder { self.dfa.syntax(config); self } /// Set the Thompson NFA configuration for this builder using - /// [`nfa::thompson::Config`](thompson::Config). + /// [`nfa::thompson::Config`](crate::nfa::thompson::Config). /// /// This permits setting things like whether additional time should be /// spent shrinking the size of the NFA. - pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { + #[cfg(all(feature = "syntax", feature = "dfa-build"))] + pub fn thompson( + &mut self, + config: crate::nfa::thompson::Config, + ) -> &mut Builder { self.dfa.thompson(config); self } @@ -2117,30 +857,15 @@ impl Builder { /// /// This permits setting things like whether the underlying DFAs should /// be minimized. + #[cfg(feature = "dfa-build")] pub fn dense(&mut self, config: dense::Config) -> &mut Builder { self.dfa.configure(config); self } } -#[cfg(feature = "alloc")] impl Default for Builder { fn default() -> Builder { Builder::new() } } - -#[inline(always)] -fn next_unwrap( - item: Option>, -) -> Option { - match item { - None => None, - Some(Ok(m)) => Some(m), - Some(Err(err)) => panic!( - "unexpected regex search error: {}\n\ - to handle search errors, use try_ methods", - err, - ), - } -} -- cgit v1.2.3