From 4547b622d8d29df964fa2914213088b148c498fc Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 17 Apr 2024 14:18:32 +0200 Subject: Merging upstream version 1.67.1+dfsg1. Signed-off-by: Daniel Baumann --- vendor/regex-automata/src/dfa/regex.rs | 2146 ++++++++++++++++++++++++++++++++ 1 file changed, 2146 insertions(+) create mode 100644 vendor/regex-automata/src/dfa/regex.rs (limited to 'vendor/regex-automata/src/dfa/regex.rs') diff --git a/vendor/regex-automata/src/dfa/regex.rs b/vendor/regex-automata/src/dfa/regex.rs new file mode 100644 index 000000000..d0917e17d --- /dev/null +++ b/vendor/regex-automata/src/dfa/regex.rs @@ -0,0 +1,2146 @@ +/*! +A DFA-backed `Regex`. + +This module provides [`Regex`], which is defined generically over the +[`Automaton`] trait. A `Regex` implements convenience routines you might have +come to expect, such as finding the start/end of a match and iterating over +all non-overlapping matches. This `Regex` type is limited in its capabilities +to what a DFA can provide. Therefore, APIs involving capturing groups, for +example, are not provided. + +Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that +finds the end offset of a match, where as the other is a "reverse" DFA that +find the start offset of a match. + +See the [parent module](crate::dfa) for examples. +*/ + +#[cfg(feature = "alloc")] +use alloc::vec::Vec; + +use crate::{ + dfa::automaton::{Automaton, OverlappingState}, + util::prefilter::{self, Prefilter}, + MatchError, MultiMatch, +}; +#[cfg(feature = "alloc")] +use crate::{ + dfa::{dense, error::Error, sparse}, + nfa::thompson, + util::matchtypes::MatchKind, +}; + +// When the alloc feature is enabled, the regex type sets its A type parameter +// to default to an owned dense DFA. But without alloc, we set no default. This +// makes things a lot more convenient in the common case, since writing out the +// DFA types is pretty annoying. +// +// Since we have two different definitions but only want to write one doc +// string, we use a macro to capture the doc and other attributes once and then +// repeat them for each definition. +macro_rules! define_regex_type { + ($(#[$doc:meta])*) => { + #[cfg(feature = "alloc")] + $(#[$doc])* + pub struct Regex { + prefilter: Option

, + forward: A, + reverse: A, + utf8: bool, + } + + #[cfg(not(feature = "alloc"))] + $(#[$doc])* + pub struct Regex { + prefilter: Option

, + forward: A, + reverse: A, + utf8: bool, + } + }; +} + +define_regex_type!( + /// A regular expression that uses deterministic finite automata for fast + /// searching. + /// + /// A regular expression is comprised of two DFAs, a "forward" DFA and a + /// "reverse" DFA. The forward DFA is responsible for detecting the end of + /// a match while the reverse DFA is responsible for detecting the start + /// of a match. Thus, in order to find the bounds of any given match, a + /// forward search must first be run followed by a reverse search. A match + /// found by the forward DFA guarantees that the reverse DFA will also find + /// a match. + /// + /// The type of the DFA used by a `Regex` corresponds to the `A` type + /// parameter, which must satisfy the [`Automaton`] trait. Typically, + /// `A` is either a [`dense::DFA`](crate::dfa::dense::DFA) or a + /// [`sparse::DFA`](crate::dfa::sparse::DFA), where dense DFAs use more + /// memory but search faster, while sparse DFAs use less memory but search + /// more slowly. + /// + /// By default, a regex's automaton type parameter is set to + /// `dense::DFA>` when the `alloc` feature is enabled. For most + /// in-memory work loads, this is the most convenient type that gives the + /// best search performance. When the `alloc` feature is disabled, no + /// default type is used. + /// + /// A `Regex` also has a `P` type parameter, which is used to select the + /// prefilter used during search. By default, no prefilter is enabled by + /// setting the type to default to [`prefilter::None`]. A prefilter can be + /// enabled by using the [`Regex::prefilter`] method. + /// + /// # When should I use this? + /// + /// Generally speaking, if you can afford the overhead of building a full + /// DFA for your regex, and you don't need things like capturing groups, + /// then this is a good choice if you're looking to optimize for matching + /// speed. Note however that its speed may be worse than a general purpose + /// regex engine if you don't select a good [prefilter]. + /// + /// # Earliest vs Leftmost vs Overlapping + /// + /// The search routines exposed on a `Regex` reflect three different ways + /// of searching: + /// + /// * "earliest" means to stop as soon as a match has been detected. + /// * "leftmost" means to continue matching until the underlying + /// automaton cannot advance. This reflects "standard" searching you + /// might be used to in other regex engines. e.g., This permits + /// non-greedy and greedy searching to work as you would expect. + /// * "overlapping" means to find all possible matches, even if they + /// overlap. + /// + /// Generally speaking, when doing an overlapping search, you'll want to + /// build your regex DFAs with [`MatchKind::All`] semantics. Using + /// [`MatchKind::LeftmostFirst`] semantics with overlapping searches is + /// likely to lead to odd behavior since `LeftmostFirst` specifically omits + /// some matches that can never be reported due to its semantics. + /// + /// The following example shows the differences between how these different + /// types of searches impact looking for matches of `[a-z]+` in the + /// haystack `abc`. + /// + /// ``` + /// use regex_automata::{dfa::{self, dense}, MatchKind, MultiMatch}; + /// + /// let pattern = r"[a-z]+"; + /// let haystack = "abc".as_bytes(); + /// + /// // With leftmost-first semantics, we test "earliest" and "leftmost". + /// let re = dfa::regex::Builder::new() + /// .dense(dense::Config::new().match_kind(MatchKind::LeftmostFirst)) + /// .build(pattern)?; + /// + /// // "earliest" searching isn't impacted by greediness + /// let mut it = re.find_earliest_iter(haystack); + /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// // "leftmost" searching supports greediness (and non-greediness) + /// let mut it = re.find_leftmost_iter(haystack); + /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// // For overlapping, we want "all" match kind semantics. + /// let re = dfa::regex::Builder::new() + /// .dense(dense::Config::new().match_kind(MatchKind::All)) + /// .build(pattern)?; + /// + /// // In the overlapping search, we find all three possible matches + /// // starting at the beginning of the haystack. + /// let mut it = re.find_overlapping_iter(haystack); + /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 0, 2)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box>(()) + /// ``` + /// + /// # Sparse DFAs + /// + /// Since a `Regex` is generic over the [`Automaton`] trait, it can be + /// used with any kind of DFA. While this crate constructs dense DFAs by + /// default, it is easy enough to build corresponding sparse DFAs, and then + /// build a regex from them: + /// + /// ``` + /// use regex_automata::dfa::regex::Regex; + /// + /// // First, build a regex that uses dense DFAs. + /// let dense_re = Regex::new("foo[0-9]+")?; + /// + /// // Second, build sparse DFAs from the forward and reverse dense DFAs. + /// let fwd = dense_re.forward().to_sparse()?; + /// let rev = dense_re.reverse().to_sparse()?; + /// + /// // Third, build a new regex from the constituent sparse DFAs. + /// let sparse_re = Regex::builder().build_from_dfas(fwd, rev); + /// + /// // A regex that uses sparse DFAs can be used just like with dense DFAs. + /// assert_eq!(true, sparse_re.is_match(b"foo123")); + /// + /// # Ok::<(), Box>(()) + /// ``` + /// + /// Alternatively, one can use a [`Builder`] to construct a sparse DFA + /// more succinctly. (Note though that dense DFAs are still constructed + /// first internally, and then converted to sparse DFAs, as in the example + /// above.) + /// + /// ``` + /// use regex_automata::dfa::regex::Regex; + /// + /// let sparse_re = Regex::builder().build_sparse(r"foo[0-9]+")?; + /// // A regex that uses sparse DFAs can be used just like with dense DFAs. + /// assert!(sparse_re.is_match(b"foo123")); + /// + /// # Ok::<(), Box>(()) + /// ``` + /// + /// # Fallibility + /// + /// In non-default configurations, the DFAs generated in this module may + /// return an error during a search. (Currently, the only way this happens + /// is if quit bytes are added or Unicode word boundaries are heuristically + /// enabled, both of which are turned off by default.) For convenience, the + /// main search routines, like [`find_leftmost`](Regex::find_leftmost), + /// will panic if an error occurs. However, if you need to use DFAs + /// which may produce an error at search time, then there are fallible + /// equivalents of all search routines. For example, for `find_leftmost`, + /// its fallible analog is [`try_find_leftmost`](Regex::try_find_leftmost). + /// The routines prefixed with `try_` return `Result, + /// MatchError>`, where as the infallible routines simply return + /// `Option`. + /// + /// # Example + /// + /// This example shows how to cause a search to terminate if it sees a + /// `\n` byte, and handle the error returned. This could be useful if, for + /// example, you wanted to prevent a user supplied pattern from matching + /// across a line boundary. + /// + /// ``` + /// use regex_automata::{dfa::{self, regex::Regex}, MatchError}; + /// + /// let re = Regex::builder() + /// .dense(dfa::dense::Config::new().quit(b'\n', true)) + /// .build(r"foo\p{any}+bar")?; + /// + /// let haystack = "foo\nbar".as_bytes(); + /// // Normally this would produce a match, since \p{any} contains '\n'. + /// // But since we instructed the automaton to enter a quit state if a + /// // '\n' is observed, this produces a match error instead. + /// let expected = MatchError::Quit { byte: 0x0A, offset: 3 }; + /// let got = re.try_find_leftmost(haystack).unwrap_err(); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box>(()) + /// ``` + #[derive(Clone, Debug)] +); + +#[cfg(feature = "alloc")] +impl Regex { + /// Parse the given regular expression using the default configuration and + /// return the corresponding regex. + /// + /// If you want a non-default configuration, then use the [`Builder`] to + /// set your own configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// + /// let re = Regex::new("foo[0-9]+bar")?; + /// assert_eq!( + /// Some(MultiMatch::must(0, 3, 14)), + /// re.find_leftmost(b"zzzfoo12345barzzz"), + /// ); + /// # Ok::<(), Box>(()) + /// ``` + pub fn new(pattern: &str) -> Result { + Builder::new().build(pattern) + } + + /// Like `new`, but parses multiple patterns into a single "regex set." + /// This similarly uses the default regex configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// + /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?; + /// + /// let mut it = re.find_leftmost_iter(b"abc 1 foo 4567 0 quux"); + /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); + /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next()); + /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next()); + /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next()); + /// assert_eq!(None, it.next()); + /// # Ok::<(), Box>(()) + /// ``` + pub fn new_many>(patterns: &[P]) -> Result { + Builder::new().build_many(patterns) + } +} + +#[cfg(feature = "alloc")] +impl Regex>> { + /// Parse the given regular expression using the default configuration, + /// except using sparse DFAs, and return the corresponding regex. + /// + /// If you want a non-default configuration, then use the [`Builder`] to + /// set your own configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// + /// let re = Regex::new_sparse("foo[0-9]+bar")?; + /// assert_eq!( + /// Some(MultiMatch::must(0, 3, 14)), + /// re.find_leftmost(b"zzzfoo12345barzzz"), + /// ); + /// # Ok::<(), Box>(()) + /// ``` + pub fn new_sparse( + pattern: &str, + ) -> Result>>, Error> { + Builder::new().build_sparse(pattern) + } + + /// Like `new`, but parses multiple patterns into a single "regex set" + /// using sparse DFAs. This otherwise similarly uses the default regex + /// configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// + /// let re = Regex::new_many_sparse(&["[a-z]+", "[0-9]+"])?; + /// + /// let mut it = re.find_leftmost_iter(b"abc 1 foo 4567 0 quux"); + /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); + /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next()); + /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next()); + /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next()); + /// assert_eq!(None, it.next()); + /// # Ok::<(), Box>(()) + /// ``` + pub fn new_many_sparse>( + patterns: &[P], + ) -> Result>>, Error> { + Builder::new().build_many_sparse(patterns) + } +} + +/// Convenience routines for regex construction. +#[cfg(feature = "alloc")] +impl Regex { + /// Return a default configuration for a `Regex`. + /// + /// This is a convenience routine to avoid needing to import the `Config` + /// type when customizing the construction of a regex. + /// + /// # Example + /// + /// This example shows how to disable UTF-8 mode for `Regex` iteration. + /// When UTF-8 mode is disabled, the position immediately following an + /// empty match is where the next search begins, instead of the next + /// position of a UTF-8 encoded codepoint. + /// + /// ``` + /// use regex_automata::{dfa::regex::Regex, MultiMatch}; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().utf8(false)) + /// .build(r"")?; + /// let haystack = "a☃z".as_bytes(); + /// let mut it = re.find_leftmost_iter(haystack); + /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn config() -> Config { + Config::new() + } + + /// Return a builder for configuring the construction of a `Regex`. + /// + /// This is a convenience routine to avoid needing to import the + /// [`Builder`] type in common cases. + /// + /// # Example + /// + /// This example shows how to use the builder to disable UTF-8 mode + /// everywhere. + /// + /// ``` + /// use regex_automata::{ + /// dfa::regex::Regex, + /// nfa::thompson, + /// MultiMatch, SyntaxConfig, + /// }; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().utf8(false)) + /// .syntax(SyntaxConfig::new().utf8(false)) + /// .thompson(thompson::Config::new().utf8(false)) + /// .build(r"foo(?-u:[^b])ar.*")?; + /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; + /// let expected = Some(MultiMatch::must(0, 1, 9)); + /// let got = re.find_leftmost(haystack); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn builder() -> Builder { + Builder::new() + } +} + +/// Standard search routines for finding and iterating over matches. +impl Regex { + /// Returns true if and only if this regex matches the given haystack. + /// + /// This routine may short circuit if it knows that scanning future input + /// will never lead to a different result. In particular, if the underlying + /// DFA enters a match state or a dead state, then this routine will return + /// `true` or `false`, respectively, without inspecting any future input. + /// + /// # Panics + /// + /// If the underlying DFAs return an error, then this routine panics. This + /// only occurs in non-default configurations where quit bytes are used or + /// Unicode word boundaries are heuristically enabled. + /// + /// The fallible version of this routine is + /// [`try_is_match`](Regex::try_is_match). + /// + /// # Example + /// + /// ``` + /// use regex_automata::dfa::regex::Regex; + /// + /// let re = Regex::new("foo[0-9]+bar")?; + /// assert_eq!(true, re.is_match(b"foo12345bar")); + /// assert_eq!(false, re.is_match(b"foobar")); + /// # Ok::<(), Box>(()) + /// ``` + pub fn is_match(&self, haystack: &[u8]) -> bool { + self.is_match_at(haystack, 0, haystack.len()) + } + + /// Returns the first position at which a match is found. + /// + /// This routine stops scanning input in precisely the same circumstances + /// as `is_match`. The key difference is that this routine returns the + /// position at which it stopped scanning input if and only if a match + /// was found. If no match is found, then `None` is returned. + /// + /// # Panics + /// + /// If the underlying DFAs return an error, then this routine panics. This + /// only occurs in non-default configurations where quit bytes are used or + /// Unicode word boundaries are heuristically enabled. + /// + /// The fallible version of this routine is + /// [`try_find_earliest`](Regex::try_find_earliest). + /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// + /// // Normally, the leftmost first match would greedily consume as many + /// // decimal digits as it could. But a match is detected as soon as one + /// // digit is seen. + /// let re = Regex::new("foo[0-9]+")?; + /// assert_eq!( + /// Some(MultiMatch::must(0, 0, 4)), + /// re.find_earliest(b"foo12345"), + /// ); + /// + /// // Normally, the end of the leftmost first match here would be 3, + /// // but the "earliest" match semantics detect a match earlier. + /// let re = Regex::new("abc|a")?; + /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), re.find_earliest(b"abc")); + /// # Ok::<(), Box>(()) + /// ``` + pub fn find_earliest(&self, haystack: &[u8]) -> Option { + self.find_earliest_at(haystack, 0, haystack.len()) + } + + /// Returns the start and end offset of the leftmost match. If no match + /// exists, then `None` is returned. + /// + /// # Panics + /// + /// If the underlying DFAs return an error, then this routine panics. This + /// only occurs in non-default configurations where quit bytes are used or + /// Unicode word boundaries are heuristically enabled. + /// + /// The fallible version of this routine is + /// [`try_find_leftmost`](Regex::try_find_leftmost). + /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// + /// // Greediness is applied appropriately when compared to find_earliest. + /// let re = Regex::new("foo[0-9]+")?; + /// assert_eq!( + /// Some(MultiMatch::must(0, 3, 11)), + /// re.find_leftmost(b"zzzfoo12345zzz"), + /// ); + /// + /// // Even though a match is found after reading the first byte (`a`), + /// // the default leftmost-first match semantics demand that we find the + /// // earliest match that prefers earlier parts of the pattern over latter + /// // parts. + /// let re = Regex::new("abc|a")?; + /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), re.find_leftmost(b"abc")); + /// # Ok::<(), Box>(()) + /// ``` + pub fn find_leftmost(&self, haystack: &[u8]) -> Option { + self.find_leftmost_at(haystack, 0, haystack.len()) + } + + /// Search for the first overlapping match in `haystack`. + /// + /// This routine is principally useful when searching for multiple patterns + /// on inputs where multiple patterns may match the same regions of text. + /// In particular, callers must preserve the automaton's search state from + /// prior calls so that the implementation knows where the last match + /// occurred and which pattern was reported. + /// + /// # Panics + /// + /// If the underlying DFAs return an error, then this routine panics. This + /// only occurs in non-default configurations where quit bytes are used or + /// Unicode word boundaries are heuristically enabled. + /// + /// The fallible version of this routine is + /// [`try_find_overlapping`](Regex::try_find_overlapping). + /// + /// # Example + /// + /// This example shows how to run an overlapping search with multiple + /// regexes. + /// + /// ``` + /// use regex_automata::{dfa::{self, regex::Regex}, MatchKind, MultiMatch}; + /// + /// let re = Regex::builder() + /// .dense(dfa::dense::Config::new().match_kind(MatchKind::All)) + /// .build_many(&[r"\w+$", r"\S+$"])?; + /// let haystack = "@foo".as_bytes(); + /// let mut state = dfa::OverlappingState::start(); + /// + /// let expected = Some(MultiMatch::must(1, 0, 4)); + /// let got = re.find_overlapping(haystack, &mut state); + /// assert_eq!(expected, got); + /// + /// // The first pattern also matches at the same position, so re-running + /// // the search will yield another match. Notice also that the first + /// // pattern is returned after the second. This is because the second + /// // pattern begins its match before the first, is therefore an earlier + /// // match and is thus reported first. + /// let expected = Some(MultiMatch::must(0, 1, 4)); + /// let got = re.find_overlapping(haystack, &mut state); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn find_overlapping( + &self, + haystack: &[u8], + state: &mut OverlappingState, + ) -> Option { + self.find_overlapping_at(haystack, 0, haystack.len(), state) + } + + /// Returns an iterator over all non-overlapping "earliest" matches. + /// + /// Match positions are reported as soon as a match is known to occur, even + /// if the standard leftmost match would be longer. + /// + /// # Panics + /// + /// If the underlying DFAs return an error during iteration, then iteration + /// panics. This only occurs in non-default configurations where quit bytes + /// are used or Unicode word boundaries are heuristically enabled. + /// + /// The fallible version of this routine is + /// [`try_find_earliest_iter`](Regex::try_find_earliest_iter). + /// + /// # Example + /// + /// This example shows how to run an "earliest" iterator. + /// + /// ``` + /// use regex_automata::{dfa::regex::Regex, MultiMatch}; + /// + /// let re = Regex::new("[0-9]+")?; + /// let haystack = "123".as_bytes(); + /// + /// // Normally, a standard leftmost iterator would return a single + /// // match, but since "earliest" detects matches earlier, we get + /// // three matches. + /// let mut it = re.find_earliest_iter(haystack); + /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn find_earliest_iter<'r, 't>( + &'r self, + haystack: &'t [u8], + ) -> FindEarliestMatches<'r, 't, A, P> { + FindEarliestMatches::new(self, haystack) + } + + /// Returns an iterator over all non-overlapping leftmost matches in the + /// given bytes. If no match exists, then the iterator yields no elements. + /// + /// This corresponds to the "standard" regex search iterator. + /// + /// # Panics + /// + /// If the underlying DFAs return an error during iteration, then iteration + /// panics. This only occurs in non-default configurations where quit bytes + /// are used or Unicode word boundaries are heuristically enabled. + /// + /// The fallible version of this routine is + /// [`try_find_leftmost_iter`](Regex::try_find_leftmost_iter). + /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// + /// let re = Regex::new("foo[0-9]+")?; + /// let text = b"foo1 foo12 foo123"; + /// let matches: Vec = re.find_leftmost_iter(text).collect(); + /// assert_eq!(matches, vec![ + /// MultiMatch::must(0, 0, 4), + /// MultiMatch::must(0, 5, 10), + /// MultiMatch::must(0, 11, 17), + /// ]); + /// # Ok::<(), Box>(()) + /// ``` + pub fn find_leftmost_iter<'r, 't>( + &'r self, + haystack: &'t [u8], + ) -> FindLeftmostMatches<'r, 't, A, P> { + FindLeftmostMatches::new(self, haystack) + } + + /// Returns an iterator over all overlapping matches in the given haystack. + /// + /// This routine is principally useful when searching for multiple patterns + /// on inputs where multiple patterns may match the same regions of text. + /// The iterator takes care of handling the overlapping state that must be + /// threaded through every search. + /// + /// # Panics + /// + /// If the underlying DFAs return an error during iteration, then iteration + /// panics. This only occurs in non-default configurations where quit bytes + /// are used or Unicode word boundaries are heuristically enabled. + /// + /// The fallible version of this routine is + /// [`try_find_overlapping_iter`](Regex::try_find_overlapping_iter). + /// + /// # Example + /// + /// This example shows how to run an overlapping search with multiple + /// regexes. + /// + /// ``` + /// use regex_automata::{dfa::{self, regex::Regex}, MatchKind, MultiMatch}; + /// + /// let re = Regex::builder() + /// .dense(dfa::dense::Config::new().match_kind(MatchKind::All)) + /// .build_many(&[r"\w+$", r"\S+$"])?; + /// let haystack = "@foo".as_bytes(); + /// + /// let mut it = re.find_overlapping_iter(haystack); + /// assert_eq!(Some(MultiMatch::must(1, 0, 4)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 1, 4)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn find_overlapping_iter<'r, 't>( + &'r self, + haystack: &'t [u8], + ) -> FindOverlappingMatches<'r, 't, A, P> { + FindOverlappingMatches::new(self, haystack) + } +} + +/// Lower level infallible search routines that permit controlling where +/// the search starts and ends in a particular sequence. This is useful for +/// executing searches that need to take surrounding context into account. This +/// is required for correctly implementing iteration because of look-around +/// operators (`^`, `$`, `\b`). +impl Regex { + /// Returns true if and only if this regex matches the given haystack. + /// + /// This routine may short circuit if it knows that scanning future input + /// will never lead to a different result. In particular, if the underlying + /// DFA enters a match state or a dead state, then this routine will return + /// `true` or `false`, respectively, without inspecting any future input. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// # Panics + /// + /// If the underlying DFAs return an error, then this routine panics. This + /// only occurs in non-default configurations where quit bytes are used or + /// Unicode word boundaries are heuristically enabled. + /// + /// The fallible version of this routine is + /// [`try_is_match_at`](Regex::try_is_match_at). + pub fn is_match_at( + &self, + haystack: &[u8], + start: usize, + end: usize, + ) -> bool { + self.try_is_match_at(haystack, start, end).unwrap() + } + + /// Returns the first position at which a match is found. + /// + /// This routine stops scanning input in precisely the same circumstances + /// as `is_match`. The key difference is that this routine returns the + /// position at which it stopped scanning input if and only if a match + /// was found. If no match is found, then `None` is returned. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// This is useful when implementing an iterator over matches + /// within the same haystack, which cannot be done correctly by simply + /// providing a subslice of `haystack`. + /// + /// # Panics + /// + /// If the underlying DFAs return an error, then this routine panics. This + /// only occurs in non-default configurations where quit bytes are used or + /// Unicode word boundaries are heuristically enabled. + /// + /// The fallible version of this routine is + /// [`try_find_earliest_at`](Regex::try_find_earliest_at). + pub fn find_earliest_at( + &self, + haystack: &[u8], + start: usize, + end: usize, + ) -> Option { + self.try_find_earliest_at(haystack, start, end).unwrap() + } + + /// Returns the same as `find_leftmost`, but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, if the DFA is anchored, then + /// a match can only occur when `start == 0`. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// This is useful when implementing an iterator over matches within the + /// same haystack, which cannot be done correctly by simply providing a + /// subslice of `haystack`. + /// + /// # Panics + /// + /// If the underlying DFAs return an error, then this routine panics. This + /// only occurs in non-default configurations where quit bytes are used or + /// Unicode word boundaries are heuristically enabled. + /// + /// The fallible version of this routine is + /// [`try_find_leftmost_at`](Regex::try_find_leftmost_at). + pub fn find_leftmost_at( + &self, + haystack: &[u8], + start: usize, + end: usize, + ) -> Option { + self.try_find_leftmost_at(haystack, start, end).unwrap() + } + + /// Search for the first overlapping match within a given range of + /// `haystack`. + /// + /// This routine is principally useful when searching for multiple patterns + /// on inputs where multiple patterns may match the same regions of text. + /// In particular, callers must preserve the automaton's search state from + /// prior calls so that the implementation knows where the last match + /// occurred and which pattern was reported. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// This is useful when implementing an iterator over matches + /// within the same haystack, which cannot be done correctly by simply + /// providing a subslice of `haystack`. + /// + /// # Panics + /// + /// If the underlying DFAs return an error, then this routine panics. This + /// only occurs in non-default configurations where quit bytes are used or + /// Unicode word boundaries are heuristically enabled. + /// + /// The fallible version of this routine is + /// [`try_find_overlapping_at`](Regex::try_find_overlapping_at). + pub fn find_overlapping_at( + &self, + haystack: &[u8], + start: usize, + end: usize, + state: &mut OverlappingState, + ) -> Option { + self.try_find_overlapping_at(haystack, start, end, state).unwrap() + } +} + +/// Fallible search routines. These may return an error when the underlying +/// DFAs have been configured in a way that permits them to fail during a +/// search. +/// +/// Errors during search only occur when the DFA has been explicitly +/// configured to do so, usually by specifying one or more "quit" bytes or by +/// heuristically enabling Unicode word boundaries. +/// +/// Errors will never be returned using the default configuration. So these +/// fallible routines are only needed for particular configurations. +impl Regex { + /// Returns true if and only if this regex matches the given haystack. + /// + /// This routine may short circuit if it knows that scanning future input + /// will never lead to a different result. In particular, if the underlying + /// DFA enters a match state or a dead state, then this routine will return + /// `true` or `false`, respectively, without inspecting any future input. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used or Unicode word boundaries are heuristically + /// enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`is_match`](Regex::is_match). + pub fn try_is_match(&self, haystack: &[u8]) -> Result { + self.try_is_match_at(haystack, 0, haystack.len()) + } + + /// Returns the first position at which a match is found. + /// + /// This routine stops scanning input in precisely the same circumstances + /// as `is_match`. The key difference is that this routine returns the + /// position at which it stopped scanning input if and only if a match + /// was found. If no match is found, then `None` is returned. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used or Unicode word boundaries are heuristically + /// enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_earliest`](Regex::find_earliest). + pub fn try_find_earliest( + &self, + haystack: &[u8], + ) -> Result, MatchError> { + self.try_find_earliest_at(haystack, 0, haystack.len()) + } + + /// Returns the start and end offset of the leftmost match. If no match + /// exists, then `None` is returned. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used or Unicode word boundaries are heuristically + /// enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_leftmost`](Regex::find_leftmost). + pub fn try_find_leftmost( + &self, + haystack: &[u8], + ) -> Result, MatchError> { + self.try_find_leftmost_at(haystack, 0, haystack.len()) + } + + /// Search for the first overlapping match in `haystack`. + /// + /// This routine is principally useful when searching for multiple patterns + /// on inputs where multiple patterns may match the same regions of text. + /// In particular, callers must preserve the automaton's search state from + /// prior calls so that the implementation knows where the last match + /// occurred and which pattern was reported. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used or Unicode word boundaries are heuristically + /// enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_overlapping`](Regex::find_overlapping). + pub fn try_find_overlapping( + &self, + haystack: &[u8], + state: &mut OverlappingState, + ) -> Result, MatchError> { + self.try_find_overlapping_at(haystack, 0, haystack.len(), state) + } + + /// Returns an iterator over all non-overlapping "earliest" matches. + /// + /// Match positions are reported as soon as a match is known to occur, even + /// if the standard leftmost match would be longer. + /// + /// # Errors + /// + /// This iterator only yields errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used or Unicode word boundaries are heuristically + /// enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_earliest_iter`](Regex::find_earliest_iter). + pub fn try_find_earliest_iter<'r, 't>( + &'r self, + haystack: &'t [u8], + ) -> TryFindEarliestMatches<'r, 't, A, P> { + TryFindEarliestMatches::new(self, haystack) + } + + /// Returns an iterator over all non-overlapping leftmost matches in the + /// given bytes. If no match exists, then the iterator yields no elements. + /// + /// This corresponds to the "standard" regex search iterator. + /// + /// # Errors + /// + /// This iterator only yields errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used or Unicode word boundaries are heuristically + /// enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_leftmost_iter`](Regex::find_leftmost_iter). + pub fn try_find_leftmost_iter<'r, 't>( + &'r self, + haystack: &'t [u8], + ) -> TryFindLeftmostMatches<'r, 't, A, P> { + TryFindLeftmostMatches::new(self, haystack) + } + + /// Returns an iterator over all overlapping matches in the given haystack. + /// + /// This routine is principally useful when searching for multiple patterns + /// on inputs where multiple patterns may match the same regions of text. + /// The iterator takes care of handling the overlapping state that must be + /// threaded through every search. + /// + /// # Errors + /// + /// This iterator only yields errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used or Unicode word boundaries are heuristically + /// enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_overlapping_iter`](Regex::find_overlapping_iter). + pub fn try_find_overlapping_iter<'r, 't>( + &'r self, + haystack: &'t [u8], + ) -> TryFindOverlappingMatches<'r, 't, A, P> { + TryFindOverlappingMatches::new(self, haystack) + } +} + +/// Lower level fallible search routines that permit controlling where the +/// search starts and ends in a particular sequence. +impl Regex { + /// Returns true if and only if this regex matches the given haystack. + /// + /// This routine may short circuit if it knows that scanning future input + /// will never lead to a different result. In particular, if the underlying + /// DFA enters a match state or a dead state, then this routine will return + /// `true` or `false`, respectively, without inspecting any future input. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used, Unicode word boundaries are heuristically + /// enabled or limits are set on the number of times the lazy DFA's cache + /// may be cleared. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`is_match_at`](Regex::is_match_at). + pub fn try_is_match_at( + &self, + haystack: &[u8], + start: usize, + end: usize, + ) -> Result { + self.forward() + .find_earliest_fwd_at( + self.scanner().as_mut(), + None, + haystack, + start, + end, + ) + .map(|x| x.is_some()) + } + + /// Returns the first position at which a match is found. + /// + /// This routine stops scanning input in precisely the same circumstances + /// as `is_match`. The key difference is that this routine returns the + /// position at which it stopped scanning input if and only if a match + /// was found. If no match is found, then `None` is returned. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// This is useful when implementing an iterator over matches + /// within the same haystack, which cannot be done correctly by simply + /// providing a subslice of `haystack`. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used or Unicode word boundaries are heuristically + /// enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_earliest_at`](Regex::find_earliest_at). + pub fn try_find_earliest_at( + &self, + haystack: &[u8], + start: usize, + end: usize, + ) -> Result, MatchError> { + self.try_find_earliest_at_imp( + self.scanner().as_mut(), + haystack, + start, + end, + ) + } + + /// The implementation of "earliest" searching, where a prefilter scanner + /// may be given. + fn try_find_earliest_at_imp( + &self, + pre: Option<&mut prefilter::Scanner>, + haystack: &[u8], + start: usize, + end: usize, + ) -> Result, MatchError> { + // N.B. We use `&&A` here to call `Automaton` methods, which ensures + // that we always use the `impl Automaton for &A` for calling methods. + // Since this is the usual way that automata are used, this helps + // reduce the number of monomorphized copies of the search code. + let (fwd, rev) = (self.forward(), self.reverse()); + let end = match (&fwd) + .find_earliest_fwd_at(pre, None, haystack, start, end)? + { + None => return Ok(None), + Some(end) => end, + }; + // N.B. The only time we need to tell the reverse searcher the pattern + // to match is in the overlapping case, since it's ambiguous. In the + // leftmost case, I have tentatively convinced myself that it isn't + // necessary and the reverse search will always find the same pattern + // to match as the forward search. But I lack a rigorous proof. + let start = (&rev) + .find_earliest_rev_at(None, haystack, start, end.offset())? + .expect("reverse search must match if forward search does"); + assert_eq!( + start.pattern(), + end.pattern(), + "forward and reverse search must match same pattern" + ); + assert!(start.offset() <= end.offset()); + Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) + } + + /// Returns the start and end offset of the leftmost match. If no match + /// exists, then `None` is returned. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// This is useful when implementing an iterator over matches + /// within the same haystack, which cannot be done correctly by simply + /// providing a subslice of `haystack`. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used or Unicode word boundaries are heuristically + /// enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_leftmost_at`](Regex::find_leftmost_at). + pub fn try_find_leftmost_at( + &self, + haystack: &[u8], + start: usize, + end: usize, + ) -> Result, MatchError> { + self.try_find_leftmost_at_imp( + self.scanner().as_mut(), + haystack, + start, + end, + ) + } + + /// The implementation of leftmost searching, where a prefilter scanner + /// may be given. + fn try_find_leftmost_at_imp( + &self, + scanner: Option<&mut prefilter::Scanner>, + haystack: &[u8], + start: usize, + end: usize, + ) -> Result, MatchError> { + // N.B. We use `&&A` here to call `Automaton` methods, which ensures + // that we always use the `impl Automaton for &A` for calling methods. + // Since this is the usual way that automata are used, this helps + // reduce the number of monomorphized copies of the search code. + let (fwd, rev) = (self.forward(), self.reverse()); + let end = match (&fwd) + .find_leftmost_fwd_at(scanner, None, haystack, start, end)? + { + None => return Ok(None), + Some(end) => end, + }; + // N.B. The only time we need to tell the reverse searcher the pattern + // to match is in the overlapping case, since it's ambiguous. In the + // leftmost case, I have tentatively convinced myself that it isn't + // necessary and the reverse search will always find the same pattern + // to match as the forward search. But I lack a rigorous proof. Why not + // just provide the pattern anyway? Well, if it is needed, then leaving + // it out gives us a chance to find a witness. + let start = (&rev) + .find_leftmost_rev_at(None, haystack, start, end.offset())? + .expect("reverse search must match if forward search does"); + assert_eq!( + start.pattern(), + end.pattern(), + "forward and reverse search must match same pattern", + ); + assert!(start.offset() <= end.offset()); + Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) + } + + /// Search for the first overlapping match within a given range of + /// `haystack`. + /// + /// This routine is principally useful when searching for multiple patterns + /// on inputs where multiple patterns may match the same regions of text. + /// In particular, callers must preserve the automaton's search state from + /// prior calls so that the implementation knows where the last match + /// occurred and which pattern was reported. + /// + /// # Searching a substring of the haystack + /// + /// Being an "at" search routine, this permits callers to search a + /// substring of `haystack` by specifying a range in `haystack`. + /// Why expose this as an API instead of just asking callers to use + /// `&input[start..end]`? The reason is that regex matching often wants + /// to take the surrounding context into account in order to handle + /// look-around (`^`, `$` and `\b`). + /// + /// This is useful when implementing an iterator over matches + /// within the same haystack, which cannot be done correctly by simply + /// providing a subslice of `haystack`. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For + /// DFA-based regexes, this only occurs in a non-default configuration + /// where quit bytes are used or Unicode word boundaries are heuristically + /// enabled. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// The infallible (panics on error) version of this routine is + /// [`find_overlapping_at`](Regex::find_overlapping_at). + pub fn try_find_overlapping_at( + &self, + haystack: &[u8], + start: usize, + end: usize, + state: &mut OverlappingState, + ) -> Result, MatchError> { + self.try_find_overlapping_at_imp( + self.scanner().as_mut(), + haystack, + start, + end, + state, + ) + } + + /// The implementation of overlapping search at a given range in + /// `haystack`, where `scanner` is a prefilter (if active) and `state` is + /// the current state of the search. + fn try_find_overlapping_at_imp( + &self, + scanner: Option<&mut prefilter::Scanner>, + haystack: &[u8], + start: usize, + end: usize, + state: &mut OverlappingState, + ) -> Result, MatchError> { + // N.B. We use `&&A` here to call `Automaton` methods, which ensures + // that we always use the `impl Automaton for &A` for calling methods. + // Since this is the usual way that automata are used, this helps + // reduce the number of monomorphized copies of the search code. + let (fwd, rev) = (self.forward(), self.reverse()); + // TODO: Decide whether it's worth making this assert work. It doesn't + // work currently because 'has_starts_for_each_pattern' isn't on the + // Automaton trait. Without this assert, we still get a panic, but it's + // a bit more inscrutable. + // assert!( + // rev.has_starts_for_each_pattern(), + // "overlapping searches require that the reverse DFA is \ + // compiled with the 'starts_for_each_pattern' option", + // ); + let end = match (&fwd).find_overlapping_fwd_at( + scanner, None, haystack, start, end, state, + )? { + None => return Ok(None), + Some(end) => end, + }; + // Unlike the leftmost cases, the reverse overlapping search may match + // a different pattern than the forward search. See test failures when + // using `None` instead of `Some(end.pattern())` below. Thus, we must + // run our reverse search using the pattern that matched in the forward + // direction. + let start = (&rev) + .find_leftmost_rev_at( + Some(end.pattern()), + haystack, + 0, + end.offset(), + )? + .expect("reverse search must match if forward search does"); + assert!(start.offset() <= end.offset()); + assert_eq!(start.pattern(), end.pattern()); + Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) + } +} + +/// Non-search APIs for querying information about the regex and setting a +/// prefilter. +impl Regex { + /// Attach the given prefilter to this regex. + pub fn with_prefilter(self, prefilter: Q) -> Regex { + Regex { + prefilter: Some(prefilter), + forward: self.forward, + reverse: self.reverse, + utf8: self.utf8, + } + } + + /// Remove any prefilter from this regex. + pub fn without_prefilter(self) -> Regex { + Regex { + prefilter: None, + forward: self.forward, + reverse: self.reverse, + utf8: self.utf8, + } + } + + /// Return the underlying DFA responsible for forward matching. + /// + /// This is useful for accessing the underlying DFA and converting it to + /// some other format or size. See the [`Builder::build_from_dfas`] docs + /// for an example of where this might be useful. + pub fn forward(&self) -> &A { + &self.forward + } + + /// Return the underlying DFA responsible for reverse matching. + /// + /// This is useful for accessing the underlying DFA and converting it to + /// some other format or size. See the [`Builder::build_from_dfas`] docs + /// for an example of where this might be useful. + pub fn reverse(&self) -> &A { + &self.reverse + } + + /// Returns the total number of patterns matched by this regex. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{MultiMatch, dfa::regex::Regex}; + /// + /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?; + /// assert_eq!(3, re.pattern_count()); + /// # Ok::<(), Box>(()) + /// ``` + pub fn pattern_count(&self) -> usize { + assert_eq!( + self.forward().pattern_count(), + self.reverse().pattern_count() + ); + self.forward().pattern_count() + } + + /// Convenience function for returning this regex's prefilter as a trait + /// object. + /// + /// If this regex doesn't have a prefilter, then `None` is returned. + pub fn prefilter(&self) -> Option<&dyn Prefilter> { + match self.prefilter { + None => None, + Some(ref x) => Some(&*x), + } + } + + /// Convenience function for returning a prefilter scanner. + fn scanner(&self) -> Option { + self.prefilter().map(prefilter::Scanner::new) + } +} + +/// An iterator over all non-overlapping earliest matches for a particular +/// infallible search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. If the underlying search returns an error, then this panics. +/// +/// `A` is the type used to represent the underlying DFAs used by the regex, +/// while `P` is the type of prefilter used, if any. The lifetime variables are +/// as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'t` is the lifetime of the text being searched. +#[derive(Clone, Debug)] +pub struct FindEarliestMatches<'r, 't, A, P>( + TryFindEarliestMatches<'r, 't, A, P>, +); + +impl<'r, 't, A: Automaton, P: Prefilter> FindEarliestMatches<'r, 't, A, P> { + fn new( + re: &'r Regex, + text: &'t [u8], + ) -> FindEarliestMatches<'r, 't, A, P> { + FindEarliestMatches(TryFindEarliestMatches::new(re, text)) + } +} + +impl<'r, 't, A: Automaton, P: Prefilter> Iterator + for FindEarliestMatches<'r, 't, A, P> +{ + type Item = MultiMatch; + + fn next(&mut self) -> Option { + next_unwrap(self.0.next()) + } +} + +/// An iterator over all non-overlapping leftmost matches for a particular +/// infallible search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. If the underlying search returns an error, then this panics. +/// +/// `A` is the type used to represent the underlying DFAs used by the regex, +/// while `P` is the type of prefilter used, if any. The lifetime variables are +/// as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'t` is the lifetime of the text being searched. +#[derive(Clone, Debug)] +pub struct FindLeftmostMatches<'r, 't, A, P>( + TryFindLeftmostMatches<'r, 't, A, P>, +); + +impl<'r, 't, A: Automaton, P: Prefilter> FindLeftmostMatches<'r, 't, A, P> { + fn new( + re: &'r Regex, + text: &'t [u8], + ) -> FindLeftmostMatches<'r, 't, A, P> { + FindLeftmostMatches(TryFindLeftmostMatches::new(re, text)) + } +} + +impl<'r, 't, A: Automaton, P: Prefilter> Iterator + for FindLeftmostMatches<'r, 't, A, P> +{ + type Item = MultiMatch; + + fn next(&mut self) -> Option { + next_unwrap(self.0.next()) + } +} + +/// An iterator over all overlapping matches for a particular infallible +/// search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. If the underlying search returns an error, then this panics. +/// +/// `A` is the type used to represent the underlying DFAs used by the regex, +/// while `P` is the type of prefilter used, if any. The lifetime variables are +/// as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'t` is the lifetime of the text being searched. +#[derive(Clone, Debug)] +pub struct FindOverlappingMatches<'r, 't, A: Automaton, P>( + TryFindOverlappingMatches<'r, 't, A, P>, +); + +impl<'r, 't, A: Automaton, P: Prefilter> FindOverlappingMatches<'r, 't, A, P> { + fn new( + re: &'r Regex, + text: &'t [u8], + ) -> FindOverlappingMatches<'r, 't, A, P> { + FindOverlappingMatches(TryFindOverlappingMatches::new(re, text)) + } +} + +impl<'r, 't, A: Automaton, P: Prefilter> Iterator + for FindOverlappingMatches<'r, 't, A, P> +{ + type Item = MultiMatch; + + fn next(&mut self) -> Option { + next_unwrap(self.0.next()) + } +} + +/// An iterator over all non-overlapping earliest matches for a particular +/// fallible search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. +/// +/// `A` is the type used to represent the underlying DFAs used by the regex, +/// while `P` is the type of prefilter used, if any. The lifetime variables are +/// as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'t` is the lifetime of the text being searched. +#[derive(Clone, Debug)] +pub struct TryFindEarliestMatches<'r, 't, A, P> { + re: &'r Regex, + scanner: Option>, + text: &'t [u8], + last_end: usize, + last_match: Option, +} + +impl<'r, 't, A: Automaton, P: Prefilter> TryFindEarliestMatches<'r, 't, A, P> { + fn new( + re: &'r Regex, + text: &'t [u8], + ) -> TryFindEarliestMatches<'r, 't, A, P> { + let scanner = re.scanner(); + TryFindEarliestMatches { + re, + scanner, + text, + last_end: 0, + last_match: None, + } + } +} + +impl<'r, 't, A: Automaton, P: Prefilter> Iterator + for TryFindEarliestMatches<'r, 't, A, P> +{ + type Item = Result; + + fn next(&mut self) -> Option> { + if self.last_end > self.text.len() { + return None; + } + let result = self.re.try_find_earliest_at_imp( + self.scanner.as_mut(), + self.text, + self.last_end, + self.text.len(), + ); + let m = match result { + Err(err) => return Some(Err(err)), + Ok(None) => return None, + Ok(Some(m)) => m, + }; + if m.is_empty() { + // This is an empty match. To ensure we make progress, start + // the next search at the smallest possible starting position + // of the next match following this one. + self.last_end = if self.re.utf8 { + crate::util::next_utf8(self.text, m.end()) + } else { + m.end() + 1 + }; + // Don't accept empty matches immediately following a match. + // Just move on to the next match. + if Some(m.end()) == self.last_match { + return self.next(); + } + } else { + self.last_end = m.end(); + } + self.last_match = Some(m.end()); + Some(Ok(m)) + } +} + +/// An iterator over all non-overlapping leftmost matches for a particular +/// fallible search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. +/// +/// `A` is the type used to represent the underlying DFAs used by the regex, +/// while `P` is the type of prefilter used, if any. The lifetime variables are +/// as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'t` is the lifetime of the text being searched. +#[derive(Clone, Debug)] +pub struct TryFindLeftmostMatches<'r, 't, A, P> { + re: &'r Regex, + scanner: Option>, + text: &'t [u8], + last_end: usize, + last_match: Option, +} + +impl<'r, 't, A: Automaton, P: Prefilter> TryFindLeftmostMatches<'r, 't, A, P> { + fn new( + re: &'r Regex, + text: &'t [u8], + ) -> TryFindLeftmostMatches<'r, 't, A, P> { + let scanner = re.scanner(); + TryFindLeftmostMatches { + re, + scanner, + text, + last_end: 0, + last_match: None, + } + } +} + +impl<'r, 't, A: Automaton, P: Prefilter> Iterator + for TryFindLeftmostMatches<'r, 't, A, P> +{ + type Item = Result; + + fn next(&mut self) -> Option> { + if self.last_end > self.text.len() { + return None; + } + let result = self.re.try_find_leftmost_at_imp( + self.scanner.as_mut(), + self.text, + self.last_end, + self.text.len(), + ); + let m = match result { + Err(err) => return Some(Err(err)), + Ok(None) => return None, + Ok(Some(m)) => m, + }; + if m.is_empty() { + // This is an empty match. To ensure we make progress, start + // the next search at the smallest possible starting position + // of the next match following this one. + self.last_end = if self.re.utf8 { + crate::util::next_utf8(self.text, m.end()) + } else { + m.end() + 1 + }; + // Don't accept empty matches immediately following a match. + // Just move on to the next match. + if Some(m.end()) == self.last_match { + return self.next(); + } + } else { + self.last_end = m.end(); + } + self.last_match = Some(m.end()); + Some(Ok(m)) + } +} + +/// An iterator over all overlapping matches for a particular fallible search. +/// +/// The iterator yields a [`MultiMatch`] value until no more matches could be +/// found. +/// +/// `A` is the type used to represent the underlying DFAs used by the regex, +/// while `P` is the type of prefilter used, if any. The lifetime variables are +/// as follows: +/// +/// * `'r` is the lifetime of the regular expression itself. +/// * `'t` is the lifetime of the text being searched. +#[derive(Clone, Debug)] +pub struct TryFindOverlappingMatches<'r, 't, A: Automaton, P> { + re: &'r Regex, + scanner: Option>, + text: &'t [u8], + last_end: usize, + state: OverlappingState, +} + +impl<'r, 't, A: Automaton, P: Prefilter> + TryFindOverlappingMatches<'r, 't, A, P> +{ + fn new( + re: &'r Regex, + text: &'t [u8], + ) -> TryFindOverlappingMatches<'r, 't, A, P> { + let scanner = re.scanner(); + TryFindOverlappingMatches { + re, + scanner, + text, + last_end: 0, + state: OverlappingState::start(), + } + } +} + +impl<'r, 't, A: Automaton, P: Prefilter> Iterator + for TryFindOverlappingMatches<'r, 't, A, P> +{ + type Item = Result; + + fn next(&mut self) -> Option> { + if self.last_end > self.text.len() { + return None; + } + let result = self.re.try_find_overlapping_at_imp( + self.scanner.as_mut(), + self.text, + self.last_end, + self.text.len(), + &mut self.state, + ); + let m = match result { + Err(err) => return Some(Err(err)), + Ok(None) => return None, + Ok(Some(m)) => m, + }; + // Unlike the non-overlapping case, we're OK with empty matches at this + // level. In particular, the overlapping search algorithm is itself + // responsible for ensuring that progress is always made. + self.last_end = m.end(); + Some(Ok(m)) + } +} + +/// The configuration used for compiling a DFA-backed regex. +/// +/// A regex configuration is a simple data object that is typically used with +/// [`Builder::configure`]. +#[cfg(feature = "alloc")] +#[derive(Clone, Copy, Debug, Default)] +pub struct Config { + utf8: Option, +} + +#[cfg(feature = "alloc")] +impl Config { + /// Return a new default regex compiler configuration. + pub fn new() -> Config { + Config::default() + } + + /// Whether to enable UTF-8 mode or not. + /// + /// When UTF-8 mode is enabled (the default) and an empty match is seen, + /// the iterators on [`Regex`] will always start the next search at the + /// next UTF-8 encoded codepoint when searching valid UTF-8. When UTF-8 + /// mode is disabled, such searches are begun at the next byte offset. + /// + /// If this mode is enabled and invalid UTF-8 is given to search, then + /// behavior is unspecified. + /// + /// Generally speaking, one should enable this when + /// [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) + /// and + /// [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) + /// are enabled, and disable it otherwise. + /// + /// # Example + /// + /// This example demonstrates the differences between when this option is + /// enabled and disabled. The differences only arise when the regex can + /// return matches of length zero. + /// + /// In this first snippet, we show the results when UTF-8 mode is disabled. + /// + /// ``` + /// use regex_automata::{dfa::regex::Regex, MultiMatch}; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().utf8(false)) + /// .build(r"")?; + /// let haystack = "a☃z".as_bytes(); + /// let mut it = re.find_leftmost_iter(haystack); + /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box>(()) + /// ``` + /// + /// And in this snippet, we execute the same search on the same haystack, + /// but with UTF-8 mode enabled. Notice that byte offsets that would + /// otherwise split the encoding of `☃` are not returned. + /// + /// ``` + /// use regex_automata::{dfa::regex::Regex, MultiMatch}; + /// + /// let re = Regex::builder() + /// .configure(Regex::config().utf8(true)) + /// .build(r"")?; + /// let haystack = "a☃z".as_bytes(); + /// let mut it = re.find_leftmost_iter(haystack); + /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); + /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box>(()) + /// ``` + pub fn utf8(mut self, yes: bool) -> Config { + self.utf8 = Some(yes); + self + } + + /// Returns true if and only if this configuration has UTF-8 mode enabled. + /// + /// When UTF-8 mode is enabled and an empty match is seen, the iterators on + /// [`Regex`] will always start the next search at the next UTF-8 encoded + /// codepoint. When UTF-8 mode is disabled, such searches are begun at the + /// next byte offset. + pub fn get_utf8(&self) -> bool { + self.utf8.unwrap_or(true) + } + + /// Overwrite the default configuration such that the options in `o` are + /// always used. If an option in `o` is not set, then the corresponding + /// option in `self` is used. If it's not set in `self` either, then it + /// remains not set. + pub(crate) fn overwrite(self, o: Config) -> Config { + Config { utf8: o.utf8.or(self.utf8) } + } +} + +/// A builder for a regex based on deterministic finite automatons. +/// +/// This builder permits configuring options for the syntax of a pattern, the +/// NFA construction, the DFA construction and finally the regex searching +/// itself. This builder is different from a general purpose regex builder in +/// that it permits fine grain configuration of the construction process. The +/// trade off for this is complexity, and the possibility of setting a +/// configuration that might not make sense. For example, there are three +/// different UTF-8 modes: +/// +/// * [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) controls whether the +/// pattern itself can contain sub-expressions that match invalid UTF-8. +/// * [`nfa::thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) +/// controls whether the implicit unanchored prefix added to the NFA can +/// match through invalid UTF-8 or not. +/// * [`Config::utf8`] controls how the regex iterators themselves advance +/// the starting position of the next search when a match with zero length is +/// found. +/// +/// Generally speaking, callers will want to either enable all of these or +/// disable all of these. +/// +/// Internally, building a regex requires building two DFAs, where one is +/// responsible for finding the end of a match and the other is responsible +/// for finding the start of a match. If you only need to detect whether +/// something matched, or only the end of a match, then you should use a +/// [`dense::Builder`] to construct a single DFA, which is cheaper than +/// building two DFAs. +/// +/// # Build methods +/// +/// This builder has a few "build" methods. In general, it's the result of +/// combining the following parameters: +/// +/// * Building one or many regexes. +/// * Building a regex with dense or sparse DFAs. +/// +/// The simplest "build" method is [`Builder::build`]. It accepts a single +/// pattern and builds a dense DFA using `usize` for the state identifier +/// representation. +/// +/// The most general "build" method is [`Builder::build_many`], which permits +/// building a regex that searches for multiple patterns simultaneously while +/// using a specific state identifier representation. +/// +/// The most flexible "build" method, but hardest to use, is +/// [`Builder::build_from_dfas`]. This exposes the fact that a [`Regex`] is +/// just a pair of DFAs, and this method allows you to specify those DFAs +/// exactly. +/// +/// # Example +/// +/// This example shows how to disable UTF-8 mode in the syntax, the NFA and +/// the regex itself. This is generally what you want for matching on +/// arbitrary bytes. +/// +/// ``` +/// use regex_automata::{ +/// dfa::regex::Regex, nfa::thompson, MultiMatch, SyntaxConfig +/// }; +/// +/// let re = Regex::builder() +/// .configure(Regex::config().utf8(false)) +/// .syntax(SyntaxConfig::new().utf8(false)) +/// .thompson(thompson::Config::new().utf8(false)) +/// .build(r"foo(?-u:[^b])ar.*")?; +/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; +/// let expected = Some(MultiMatch::must(0, 1, 9)); +/// let got = re.find_leftmost(haystack); +/// assert_eq!(expected, got); +/// // Notice that `(?-u:[^b])` matches invalid UTF-8, +/// // but the subsequent `.*` does not! Disabling UTF-8 +/// // on the syntax permits this. Notice also that the +/// // search was unanchored and skipped over invalid UTF-8. +/// // Disabling UTF-8 on the Thompson NFA permits this. +/// // +/// // N.B. This example does not show the impact of +/// // disabling UTF-8 mode on Config, since that +/// // only impacts regexes that can produce matches of +/// // length 0. +/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]); +/// +/// # Ok::<(), Box>(()) +/// ``` +#[cfg(feature = "alloc")] +#[derive(Clone, Debug)] +pub struct Builder { + config: Config, + dfa: dense::Builder, +} + +#[cfg(feature = "alloc")] +impl Builder { + /// Create a new regex builder with the default configuration. + pub fn new() -> Builder { + Builder { config: Config::default(), dfa: dense::Builder::new() } + } + + /// Build a regex from the given pattern. + /// + /// If there was a problem parsing or compiling the pattern, then an error + /// is returned. + pub fn build(&self, pattern: &str) -> Result { + self.build_many(&[pattern]) + } + + /// Build a regex from the given pattern using sparse DFAs. + /// + /// If there was a problem parsing or compiling the pattern, then an error + /// is returned. + pub fn build_sparse( + &self, + pattern: &str, + ) -> Result>>, Error> { + self.build_many_sparse(&[pattern]) + } + + /// Build a regex from the given patterns. + pub fn build_many>( + &self, + patterns: &[P], + ) -> Result { + let forward = self.dfa.build_many(patterns)?; + let reverse = self + .dfa + .clone() + .configure( + dense::Config::new() + .anchored(true) + .match_kind(MatchKind::All) + .starts_for_each_pattern(true), + ) + .thompson(thompson::Config::new().reverse(true)) + .build_many(patterns)?; + Ok(self.build_from_dfas(forward, reverse)) + } + + /// Build a sparse regex from the given patterns. + pub fn build_many_sparse>( + &self, + patterns: &[P], + ) -> Result>>, Error> { + let re = self.build_many(patterns)?; + let forward = re.forward().to_sparse()?; + let reverse = re.reverse().to_sparse()?; + Ok(self.build_from_dfas(forward, reverse)) + } + + /// Build a regex from its component forward and reverse DFAs. + /// + /// This is useful when deserializing a regex from some arbitrary + /// memory region. This is also useful for building regexes from other + /// types of DFAs. + /// + /// If you're building the DFAs from scratch instead of building new DFAs + /// from other DFAs, then you'll need to make sure that the reverse DFA is + /// configured correctly to match the intended semantics. Namely: + /// + /// * It should be anchored. + /// * It should use [`MatchKind::All`] semantics. + /// * It should match in reverse. + /// * It should have anchored start states compiled for each pattern. + /// * Otherwise, its configuration should match the forward DFA. + /// + /// If these conditions are satisfied, then behavior of searches is + /// unspecified. + /// + /// Note that when using this constructor, only the configuration from + /// [`Config`] is applied. The only configuration settings on this builder + /// only apply when the builder owns the construction of the DFAs + /// themselves. + /// + /// # Example + /// + /// This example is a bit a contrived. The usual use of these methods + /// would involve serializing `initial_re` somewhere and then deserializing + /// it later to build a regex. But in this case, we do everything in + /// memory. + /// + /// ``` + /// use regex_automata::dfa::regex::Regex; + /// + /// let initial_re = Regex::new("foo[0-9]+")?; + /// assert_eq!(true, initial_re.is_match(b"foo123")); + /// + /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse()); + /// let re = Regex::builder().build_from_dfas(fwd, rev); + /// assert_eq!(true, re.is_match(b"foo123")); + /// # Ok::<(), Box>(()) + /// ``` + /// + /// This example shows how to build a `Regex` that uses sparse DFAs instead + /// of dense DFAs without using one of the convenience `build_sparse` + /// routines: + /// + /// ``` + /// use regex_automata::dfa::regex::Regex; + /// + /// let initial_re = Regex::new("foo[0-9]+")?; + /// assert_eq!(true, initial_re.is_match(b"foo123")); + /// + /// let fwd = initial_re.forward().to_sparse()?; + /// let rev = initial_re.reverse().to_sparse()?; + /// let re = Regex::builder().build_from_dfas(fwd, rev); + /// assert_eq!(true, re.is_match(b"foo123")); + /// # Ok::<(), Box>(()) + /// ``` + pub fn build_from_dfas( + &self, + forward: A, + reverse: A, + ) -> Regex { + let utf8 = self.config.get_utf8(); + Regex { prefilter: None, forward, reverse, utf8 } + } + + /// Apply the given regex configuration options to this builder. + pub fn configure(&mut self, config: Config) -> &mut Builder { + self.config = self.config.overwrite(config); + self + } + + /// Set the syntax configuration for this builder using + /// [`SyntaxConfig`](crate::SyntaxConfig). + /// + /// This permits setting things like case insensitivity, Unicode and multi + /// line mode. + pub fn syntax( + &mut self, + config: crate::util::syntax::SyntaxConfig, + ) -> &mut Builder { + self.dfa.syntax(config); + self + } + + /// Set the Thompson NFA configuration for this builder using + /// [`nfa::thompson::Config`](thompson::Config). + /// + /// This permits setting things like whether additional time should be + /// spent shrinking the size of the NFA. + pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { + self.dfa.thompson(config); + self + } + + /// Set the dense DFA compilation configuration for this builder using + /// [`dense::Config`](dense::Config). + /// + /// This permits setting things like whether the underlying DFAs should + /// be minimized. + pub fn dense(&mut self, config: dense::Config) -> &mut Builder { + self.dfa.configure(config); + self + } +} + +#[cfg(feature = "alloc")] +impl Default for Builder { + fn default() -> Builder { + Builder::new() + } +} + +#[inline(always)] +fn next_unwrap( + item: Option>, +) -> Option { + match item { + None => None, + Some(Ok(m)) => Some(m), + Some(Err(err)) => panic!( + "unexpected regex search error: {}\n\ + to handle search errors, use try_ methods", + err, + ), + } +} -- cgit v1.2.3