/*! A DFA-backed `Regex`. This module provides [`Regex`], which is defined generically over the [`Automaton`] trait. A `Regex` implements convenience routines you might have come to expect, such as finding the start/end of a match and iterating over all non-overlapping matches. This `Regex` type is limited in its capabilities to what a DFA can provide. Therefore, APIs involving capturing groups, for example, are not provided. Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that finds the end offset of a match, where as the other is a "reverse" DFA that find the start offset of a match. See the [parent module](crate::dfa) for examples. */ #[cfg(feature = "alloc")] use alloc::vec::Vec; use crate::{ dfa::automaton::{Automaton, OverlappingState}, util::prefilter::{self, Prefilter}, MatchError, MultiMatch, }; #[cfg(feature = "alloc")] use crate::{ dfa::{dense, error::Error, sparse}, nfa::thompson, util::matchtypes::MatchKind, }; // When the alloc feature is enabled, the regex type sets its A type parameter // to default to an owned dense DFA. But without alloc, we set no default. This // makes things a lot more convenient in the common case, since writing out the // DFA types is pretty annoying. // // Since we have two different definitions but only want to write one doc // string, we use a macro to capture the doc and other attributes once and then // repeat them for each definition. macro_rules! define_regex_type { ($(#[$doc:meta])*) => { #[cfg(feature = "alloc")] $(#[$doc])* pub struct Regex { prefilter: Option

, forward: A, reverse: A, utf8: bool, } #[cfg(not(feature = "alloc"))] $(#[$doc])* pub struct Regex { prefilter: Option

, forward: A, reverse: A, utf8: bool, } }; } define_regex_type!( /// A regular expression that uses deterministic finite automata for fast /// searching. /// /// A regular expression is comprised of two DFAs, a "forward" DFA and a /// "reverse" DFA. The forward DFA is responsible for detecting the end of /// a match while the reverse DFA is responsible for detecting the start /// of a match. Thus, in order to find the bounds of any given match, a /// forward search must first be run followed by a reverse search. A match /// found by the forward DFA guarantees that the reverse DFA will also find /// a match. /// /// The type of the DFA used by a `Regex` corresponds to the `A` type /// parameter, which must satisfy the [`Automaton`] trait. Typically, /// `A` is either a [`dense::DFA`](crate::dfa::dense::DFA) or a /// [`sparse::DFA`](crate::dfa::sparse::DFA), where dense DFAs use more /// memory but search faster, while sparse DFAs use less memory but search /// more slowly. /// /// By default, a regex's automaton type parameter is set to /// `dense::DFA>` when the `alloc` feature is enabled. For most /// in-memory work loads, this is the most convenient type that gives the /// best search performance. When the `alloc` feature is disabled, no /// default type is used. /// /// A `Regex` also has a `P` type parameter, which is used to select the /// prefilter used during search. By default, no prefilter is enabled by /// setting the type to default to [`prefilter::None`]. A prefilter can be /// enabled by using the [`Regex::prefilter`] method. /// /// # When should I use this? /// /// Generally speaking, if you can afford the overhead of building a full /// DFA for your regex, and you don't need things like capturing groups, /// then this is a good choice if you're looking to optimize for matching /// speed. Note however that its speed may be worse than a general purpose /// regex engine if you don't select a good [prefilter]. /// /// # Earliest vs Leftmost vs Overlapping /// /// The search routines exposed on a `Regex` reflect three different ways /// of searching: /// /// * "earliest" means to stop as soon as a match has been detected. /// * "leftmost" means to continue matching until the underlying /// automaton cannot advance. This reflects "standard" searching you /// might be used to in other regex engines. e.g., This permits /// non-greedy and greedy searching to work as you would expect. /// * "overlapping" means to find all possible matches, even if they /// overlap. /// /// Generally speaking, when doing an overlapping search, you'll want to /// build your regex DFAs with [`MatchKind::All`] semantics. Using /// [`MatchKind::LeftmostFirst`] semantics with overlapping searches is /// likely to lead to odd behavior since `LeftmostFirst` specifically omits /// some matches that can never be reported due to its semantics. /// /// The following example shows the differences between how these different /// types of searches impact looking for matches of `[a-z]+` in the /// haystack `abc`. /// /// ``` /// use regex_automata::{dfa::{self, dense}, MatchKind, MultiMatch}; /// /// let pattern = r"[a-z]+"; /// let haystack = "abc".as_bytes(); /// /// // With leftmost-first semantics, we test "earliest" and "leftmost". /// let re = dfa::regex::Builder::new() /// .dense(dense::Config::new().match_kind(MatchKind::LeftmostFirst)) /// .build(pattern)?; /// /// // "earliest" searching isn't impacted by greediness /// let mut it = re.find_earliest_iter(haystack); /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next()); /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next()); /// assert_eq!(None, it.next()); /// /// // "leftmost" searching supports greediness (and non-greediness) /// let mut it = re.find_leftmost_iter(haystack); /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); /// assert_eq!(None, it.next()); /// /// // For overlapping, we want "all" match kind semantics. /// let re = dfa::regex::Builder::new() /// .dense(dense::Config::new().match_kind(MatchKind::All)) /// .build(pattern)?; /// /// // In the overlapping search, we find all three possible matches /// // starting at the beginning of the haystack. /// let mut it = re.find_overlapping_iter(haystack); /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); /// assert_eq!(Some(MultiMatch::must(0, 0, 2)), it.next()); /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); /// assert_eq!(None, it.next()); /// /// # Ok::<(), Box>(()) /// ``` /// /// # Sparse DFAs /// /// Since a `Regex` is generic over the [`Automaton`] trait, it can be /// used with any kind of DFA. While this crate constructs dense DFAs by /// default, it is easy enough to build corresponding sparse DFAs, and then /// build a regex from them: /// /// ``` /// use regex_automata::dfa::regex::Regex; /// /// // First, build a regex that uses dense DFAs. /// let dense_re = Regex::new("foo[0-9]+")?; /// /// // Second, build sparse DFAs from the forward and reverse dense DFAs. /// let fwd = dense_re.forward().to_sparse()?; /// let rev = dense_re.reverse().to_sparse()?; /// /// // Third, build a new regex from the constituent sparse DFAs. /// let sparse_re = Regex::builder().build_from_dfas(fwd, rev); /// /// // A regex that uses sparse DFAs can be used just like with dense DFAs. /// assert_eq!(true, sparse_re.is_match(b"foo123")); /// /// # Ok::<(), Box>(()) /// ``` /// /// Alternatively, one can use a [`Builder`] to construct a sparse DFA /// more succinctly. (Note though that dense DFAs are still constructed /// first internally, and then converted to sparse DFAs, as in the example /// above.) /// /// ``` /// use regex_automata::dfa::regex::Regex; /// /// let sparse_re = Regex::builder().build_sparse(r"foo[0-9]+")?; /// // A regex that uses sparse DFAs can be used just like with dense DFAs. /// assert!(sparse_re.is_match(b"foo123")); /// /// # Ok::<(), Box>(()) /// ``` /// /// # Fallibility /// /// In non-default configurations, the DFAs generated in this module may /// return an error during a search. (Currently, the only way this happens /// is if quit bytes are added or Unicode word boundaries are heuristically /// enabled, both of which are turned off by default.) For convenience, the /// main search routines, like [`find_leftmost`](Regex::find_leftmost), /// will panic if an error occurs. However, if you need to use DFAs /// which may produce an error at search time, then there are fallible /// equivalents of all search routines. For example, for `find_leftmost`, /// its fallible analog is [`try_find_leftmost`](Regex::try_find_leftmost). /// The routines prefixed with `try_` return `Result, /// MatchError>`, where as the infallible routines simply return /// `Option`. /// /// # Example /// /// This example shows how to cause a search to terminate if it sees a /// `\n` byte, and handle the error returned. This could be useful if, for /// example, you wanted to prevent a user supplied pattern from matching /// across a line boundary. /// /// ``` /// use regex_automata::{dfa::{self, regex::Regex}, MatchError}; /// /// let re = Regex::builder() /// .dense(dfa::dense::Config::new().quit(b'\n', true)) /// .build(r"foo\p{any}+bar")?; /// /// let haystack = "foo\nbar".as_bytes(); /// // Normally this would produce a match, since \p{any} contains '\n'. /// // But since we instructed the automaton to enter a quit state if a /// // '\n' is observed, this produces a match error instead. /// let expected = MatchError::Quit { byte: 0x0A, offset: 3 }; /// let got = re.try_find_leftmost(haystack).unwrap_err(); /// assert_eq!(expected, got); /// /// # Ok::<(), Box>(()) /// ``` #[derive(Clone, Debug)] ); #[cfg(feature = "alloc")] impl Regex { /// Parse the given regular expression using the default configuration and /// return the corresponding regex. /// /// If you want a non-default configuration, then use the [`Builder`] to /// set your own configuration. /// /// # Example /// /// ``` /// use regex_automata::{MultiMatch, dfa::regex::Regex}; /// /// let re = Regex::new("foo[0-9]+bar")?; /// assert_eq!( /// Some(MultiMatch::must(0, 3, 14)), /// re.find_leftmost(b"zzzfoo12345barzzz"), /// ); /// # Ok::<(), Box>(()) /// ``` pub fn new(pattern: &str) -> Result { Builder::new().build(pattern) } /// Like `new`, but parses multiple patterns into a single "regex set." /// This similarly uses the default regex configuration. /// /// # Example /// /// ``` /// use regex_automata::{MultiMatch, dfa::regex::Regex}; /// /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?; /// /// let mut it = re.find_leftmost_iter(b"abc 1 foo 4567 0 quux"); /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next()); /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next()); /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next()); /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next()); /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next()); /// assert_eq!(None, it.next()); /// # Ok::<(), Box>(()) /// ``` pub fn new_many>(patterns: &[P]) -> Result { Builder::new().build_many(patterns) } } #[cfg(feature = "alloc")] impl Regex>> { /// Parse the given regular expression using the default configuration, /// except using sparse DFAs, and return the corresponding regex. /// /// If you want a non-default configuration, then use the [`Builder`] to /// set your own configuration. /// /// # Example /// /// ``` /// use regex_automata::{MultiMatch, dfa::regex::Regex}; /// /// let re = Regex::new_sparse("foo[0-9]+bar")?; /// assert_eq!( /// Some(MultiMatch::must(0, 3, 14)), /// re.find_leftmost(b"zzzfoo12345barzzz"), /// ); /// # Ok::<(), Box>(()) /// ``` pub fn new_sparse( pattern: &str, ) -> Result>>, Error> { Builder::new().build_sparse(pattern) } /// Like `new`, but parses multiple patterns into a single "regex set" /// using sparse DFAs. This otherwise similarly uses the default regex /// configuration. /// /// # Example /// /// ``` /// use regex_automata::{MultiMatch, dfa::regex::Regex}; /// /// let re = Regex::new_many_sparse(&["[a-z]+", "[0-9]+"])?; /// /// let mut it = re.find_leftmost_iter(b"abc 1 foo 4567 0 quux"); /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next()); /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next()); /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next()); /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next()); /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next()); /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next()); /// assert_eq!(None, it.next()); /// # Ok::<(), Box>(()) /// ``` pub fn new_many_sparse>( patterns: &[P], ) -> Result>>, Error> { Builder::new().build_many_sparse(patterns) } } /// Convenience routines for regex construction. #[cfg(feature = "alloc")] impl Regex { /// Return a default configuration for a `Regex`. /// /// This is a convenience routine to avoid needing to import the `Config` /// type when customizing the construction of a regex. /// /// # Example /// /// This example shows how to disable UTF-8 mode for `Regex` iteration. /// When UTF-8 mode is disabled, the position immediately following an /// empty match is where the next search begins, instead of the next /// position of a UTF-8 encoded codepoint. /// /// ``` /// use regex_automata::{dfa::regex::Regex, MultiMatch}; /// /// let re = Regex::builder() /// .configure(Regex::config().utf8(false)) /// .build(r"")?; /// let haystack = "a☃z".as_bytes(); /// let mut it = re.find_leftmost_iter(haystack); /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next()); /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next()); /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); /// assert_eq!(None, it.next()); /// /// # Ok::<(), Box>(()) /// ``` pub fn config() -> Config { Config::new() } /// Return a builder for configuring the construction of a `Regex`. /// /// This is a convenience routine to avoid needing to import the /// [`Builder`] type in common cases. /// /// # Example /// /// This example shows how to use the builder to disable UTF-8 mode /// everywhere. /// /// ``` /// use regex_automata::{ /// dfa::regex::Regex, /// nfa::thompson, /// MultiMatch, SyntaxConfig, /// }; /// /// let re = Regex::builder() /// .configure(Regex::config().utf8(false)) /// .syntax(SyntaxConfig::new().utf8(false)) /// .thompson(thompson::Config::new().utf8(false)) /// .build(r"foo(?-u:[^b])ar.*")?; /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; /// let expected = Some(MultiMatch::must(0, 1, 9)); /// let got = re.find_leftmost(haystack); /// assert_eq!(expected, got); /// /// # Ok::<(), Box>(()) /// ``` pub fn builder() -> Builder { Builder::new() } } /// Standard search routines for finding and iterating over matches. impl Regex { /// Returns true if and only if this regex matches the given haystack. /// /// This routine may short circuit if it knows that scanning future input /// will never lead to a different result. In particular, if the underlying /// DFA enters a match state or a dead state, then this routine will return /// `true` or `false`, respectively, without inspecting any future input. /// /// # Panics /// /// If the underlying DFAs return an error, then this routine panics. This /// only occurs in non-default configurations where quit bytes are used or /// Unicode word boundaries are heuristically enabled. /// /// The fallible version of this routine is /// [`try_is_match`](Regex::try_is_match). /// /// # Example /// /// ``` /// use regex_automata::dfa::regex::Regex; /// /// let re = Regex::new("foo[0-9]+bar")?; /// assert_eq!(true, re.is_match(b"foo12345bar")); /// assert_eq!(false, re.is_match(b"foobar")); /// # Ok::<(), Box>(()) /// ``` pub fn is_match(&self, haystack: &[u8]) -> bool { self.is_match_at(haystack, 0, haystack.len()) } /// Returns the first position at which a match is found. /// /// This routine stops scanning input in precisely the same circumstances /// as `is_match`. The key difference is that this routine returns the /// position at which it stopped scanning input if and only if a match /// was found. If no match is found, then `None` is returned. /// /// # Panics /// /// If the underlying DFAs return an error, then this routine panics. This /// only occurs in non-default configurations where quit bytes are used or /// Unicode word boundaries are heuristically enabled. /// /// The fallible version of this routine is /// [`try_find_earliest`](Regex::try_find_earliest). /// /// # Example /// /// ``` /// use regex_automata::{MultiMatch, dfa::regex::Regex}; /// /// // Normally, the leftmost first match would greedily consume as many /// // decimal digits as it could. But a match is detected as soon as one /// // digit is seen. /// let re = Regex::new("foo[0-9]+")?; /// assert_eq!( /// Some(MultiMatch::must(0, 0, 4)), /// re.find_earliest(b"foo12345"), /// ); /// /// // Normally, the end of the leftmost first match here would be 3, /// // but the "earliest" match semantics detect a match earlier. /// let re = Regex::new("abc|a")?; /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), re.find_earliest(b"abc")); /// # Ok::<(), Box>(()) /// ``` pub fn find_earliest(&self, haystack: &[u8]) -> Option { self.find_earliest_at(haystack, 0, haystack.len()) } /// Returns the start and end offset of the leftmost match. If no match /// exists, then `None` is returned. /// /// # Panics /// /// If the underlying DFAs return an error, then this routine panics. This /// only occurs in non-default configurations where quit bytes are used or /// Unicode word boundaries are heuristically enabled. /// /// The fallible version of this routine is /// [`try_find_leftmost`](Regex::try_find_leftmost). /// /// # Example /// /// ``` /// use regex_automata::{MultiMatch, dfa::regex::Regex}; /// /// // Greediness is applied appropriately when compared to find_earliest. /// let re = Regex::new("foo[0-9]+")?; /// assert_eq!( /// Some(MultiMatch::must(0, 3, 11)), /// re.find_leftmost(b"zzzfoo12345zzz"), /// ); /// /// // Even though a match is found after reading the first byte (`a`), /// // the default leftmost-first match semantics demand that we find the /// // earliest match that prefers earlier parts of the pattern over latter /// // parts. /// let re = Regex::new("abc|a")?; /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), re.find_leftmost(b"abc")); /// # Ok::<(), Box>(()) /// ``` pub fn find_leftmost(&self, haystack: &[u8]) -> Option { self.find_leftmost_at(haystack, 0, haystack.len()) } /// Search for the first overlapping match in `haystack`. /// /// This routine is principally useful when searching for multiple patterns /// on inputs where multiple patterns may match the same regions of text. /// In particular, callers must preserve the automaton's search state from /// prior calls so that the implementation knows where the last match /// occurred and which pattern was reported. /// /// # Panics /// /// If the underlying DFAs return an error, then this routine panics. This /// only occurs in non-default configurations where quit bytes are used or /// Unicode word boundaries are heuristically enabled. /// /// The fallible version of this routine is /// [`try_find_overlapping`](Regex::try_find_overlapping). /// /// # Example /// /// This example shows how to run an overlapping search with multiple /// regexes. /// /// ``` /// use regex_automata::{dfa::{self, regex::Regex}, MatchKind, MultiMatch}; /// /// let re = Regex::builder() /// .dense(dfa::dense::Config::new().match_kind(MatchKind::All)) /// .build_many(&[r"\w+$", r"\S+$"])?; /// let haystack = "@foo".as_bytes(); /// let mut state = dfa::OverlappingState::start(); /// /// let expected = Some(MultiMatch::must(1, 0, 4)); /// let got = re.find_overlapping(haystack, &mut state); /// assert_eq!(expected, got); /// /// // The first pattern also matches at the same position, so re-running /// // the search will yield another match. Notice also that the first /// // pattern is returned after the second. This is because the second /// // pattern begins its match before the first, is therefore an earlier /// // match and is thus reported first. /// let expected = Some(MultiMatch::must(0, 1, 4)); /// let got = re.find_overlapping(haystack, &mut state); /// assert_eq!(expected, got); /// /// # Ok::<(), Box>(()) /// ``` pub fn find_overlapping( &self, haystack: &[u8], state: &mut OverlappingState, ) -> Option { self.find_overlapping_at(haystack, 0, haystack.len(), state) } /// Returns an iterator over all non-overlapping "earliest" matches. /// /// Match positions are reported as soon as a match is known to occur, even /// if the standard leftmost match would be longer. /// /// # Panics /// /// If the underlying DFAs return an error during iteration, then iteration /// panics. This only occurs in non-default configurations where quit bytes /// are used or Unicode word boundaries are heuristically enabled. /// /// The fallible version of this routine is /// [`try_find_earliest_iter`](Regex::try_find_earliest_iter). /// /// # Example /// /// This example shows how to run an "earliest" iterator. /// /// ``` /// use regex_automata::{dfa::regex::Regex, MultiMatch}; /// /// let re = Regex::new("[0-9]+")?; /// let haystack = "123".as_bytes(); /// /// // Normally, a standard leftmost iterator would return a single /// // match, but since "earliest" detects matches earlier, we get /// // three matches. /// let mut it = re.find_earliest_iter(haystack); /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next()); /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next()); /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next()); /// assert_eq!(None, it.next()); /// /// # Ok::<(), Box>(()) /// ``` pub fn find_earliest_iter<'r, 't>( &'r self, haystack: &'t [u8], ) -> FindEarliestMatches<'r, 't, A, P> { FindEarliestMatches::new(self, haystack) } /// Returns an iterator over all non-overlapping leftmost matches in the /// given bytes. If no match exists, then the iterator yields no elements. /// /// This corresponds to the "standard" regex search iterator. /// /// # Panics /// /// If the underlying DFAs return an error during iteration, then iteration /// panics. This only occurs in non-default configurations where quit bytes /// are used or Unicode word boundaries are heuristically enabled. /// /// The fallible version of this routine is /// [`try_find_leftmost_iter`](Regex::try_find_leftmost_iter). /// /// # Example /// /// ``` /// use regex_automata::{MultiMatch, dfa::regex::Regex}; /// /// let re = Regex::new("foo[0-9]+")?; /// let text = b"foo1 foo12 foo123"; /// let matches: Vec = re.find_leftmost_iter(text).collect(); /// assert_eq!(matches, vec![ /// MultiMatch::must(0, 0, 4), /// MultiMatch::must(0, 5, 10), /// MultiMatch::must(0, 11, 17), /// ]); /// # Ok::<(), Box>(()) /// ``` pub fn find_leftmost_iter<'r, 't>( &'r self, haystack: &'t [u8], ) -> FindLeftmostMatches<'r, 't, A, P> { FindLeftmostMatches::new(self, haystack) } /// Returns an iterator over all overlapping matches in the given haystack. /// /// This routine is principally useful when searching for multiple patterns /// on inputs where multiple patterns may match the same regions of text. /// The iterator takes care of handling the overlapping state that must be /// threaded through every search. /// /// # Panics /// /// If the underlying DFAs return an error during iteration, then iteration /// panics. This only occurs in non-default configurations where quit bytes /// are used or Unicode word boundaries are heuristically enabled. /// /// The fallible version of this routine is /// [`try_find_overlapping_iter`](Regex::try_find_overlapping_iter). /// /// # Example /// /// This example shows how to run an overlapping search with multiple /// regexes. /// /// ``` /// use regex_automata::{dfa::{self, regex::Regex}, MatchKind, MultiMatch}; /// /// let re = Regex::builder() /// .dense(dfa::dense::Config::new().match_kind(MatchKind::All)) /// .build_many(&[r"\w+$", r"\S+$"])?; /// let haystack = "@foo".as_bytes(); /// /// let mut it = re.find_overlapping_iter(haystack); /// assert_eq!(Some(MultiMatch::must(1, 0, 4)), it.next()); /// assert_eq!(Some(MultiMatch::must(0, 1, 4)), it.next()); /// assert_eq!(None, it.next()); /// /// # Ok::<(), Box>(()) /// ``` pub fn find_overlapping_iter<'r, 't>( &'r self, haystack: &'t [u8], ) -> FindOverlappingMatches<'r, 't, A, P> { FindOverlappingMatches::new(self, haystack) } } /// Lower level infallible search routines that permit controlling where /// the search starts and ends in a particular sequence. This is useful for /// executing searches that need to take surrounding context into account. This /// is required for correctly implementing iteration because of look-around /// operators (`^`, `$`, `\b`). impl Regex { /// Returns true if and only if this regex matches the given haystack. /// /// This routine may short circuit if it knows that scanning future input /// will never lead to a different result. In particular, if the underlying /// DFA enters a match state or a dead state, then this routine will return /// `true` or `false`, respectively, without inspecting any future input. /// /// # Searching a substring of the haystack /// /// Being an "at" search routine, this permits callers to search a /// substring of `haystack` by specifying a range in `haystack`. /// Why expose this as an API instead of just asking callers to use /// `&input[start..end]`? The reason is that regex matching often wants /// to take the surrounding context into account in order to handle /// look-around (`^`, `$` and `\b`). /// /// # Panics /// /// If the underlying DFAs return an error, then this routine panics. This /// only occurs in non-default configurations where quit bytes are used or /// Unicode word boundaries are heuristically enabled. /// /// The fallible version of this routine is /// [`try_is_match_at`](Regex::try_is_match_at). pub fn is_match_at( &self, haystack: &[u8], start: usize, end: usize, ) -> bool { self.try_is_match_at(haystack, start, end).unwrap() } /// Returns the first position at which a match is found. /// /// This routine stops scanning input in precisely the same circumstances /// as `is_match`. The key difference is that this routine returns the /// position at which it stopped scanning input if and only if a match /// was found. If no match is found, then `None` is returned. /// /// # Searching a substring of the haystack /// /// Being an "at" search routine, this permits callers to search a /// substring of `haystack` by specifying a range in `haystack`. /// Why expose this as an API instead of just asking callers to use /// `&input[start..end]`? The reason is that regex matching often wants /// to take the surrounding context into account in order to handle /// look-around (`^`, `$` and `\b`). /// /// This is useful when implementing an iterator over matches /// within the same haystack, which cannot be done correctly by simply /// providing a subslice of `haystack`. /// /// # Panics /// /// If the underlying DFAs return an error, then this routine panics. This /// only occurs in non-default configurations where quit bytes are used or /// Unicode word boundaries are heuristically enabled. /// /// The fallible version of this routine is /// [`try_find_earliest_at`](Regex::try_find_earliest_at). pub fn find_earliest_at( &self, haystack: &[u8], start: usize, end: usize, ) -> Option { self.try_find_earliest_at(haystack, start, end).unwrap() } /// Returns the same as `find_leftmost`, but starts the search at the given /// offset. /// /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, if the DFA is anchored, then /// a match can only occur when `start == 0`. /// /// # Searching a substring of the haystack /// /// Being an "at" search routine, this permits callers to search a /// substring of `haystack` by specifying a range in `haystack`. /// Why expose this as an API instead of just asking callers to use /// `&input[start..end]`? The reason is that regex matching often wants /// to take the surrounding context into account in order to handle /// look-around (`^`, `$` and `\b`). /// /// This is useful when implementing an iterator over matches within the /// same haystack, which cannot be done correctly by simply providing a /// subslice of `haystack`. /// /// # Panics /// /// If the underlying DFAs return an error, then this routine panics. This /// only occurs in non-default configurations where quit bytes are used or /// Unicode word boundaries are heuristically enabled. /// /// The fallible version of this routine is /// [`try_find_leftmost_at`](Regex::try_find_leftmost_at). pub fn find_leftmost_at( &self, haystack: &[u8], start: usize, end: usize, ) -> Option { self.try_find_leftmost_at(haystack, start, end).unwrap() } /// Search for the first overlapping match within a given range of /// `haystack`. /// /// This routine is principally useful when searching for multiple patterns /// on inputs where multiple patterns may match the same regions of text. /// In particular, callers must preserve the automaton's search state from /// prior calls so that the implementation knows where the last match /// occurred and which pattern was reported. /// /// # Searching a substring of the haystack /// /// Being an "at" search routine, this permits callers to search a /// substring of `haystack` by specifying a range in `haystack`. /// Why expose this as an API instead of just asking callers to use /// `&input[start..end]`? The reason is that regex matching often wants /// to take the surrounding context into account in order to handle /// look-around (`^`, `$` and `\b`). /// /// This is useful when implementing an iterator over matches /// within the same haystack, which cannot be done correctly by simply /// providing a subslice of `haystack`. /// /// # Panics /// /// If the underlying DFAs return an error, then this routine panics. This /// only occurs in non-default configurations where quit bytes are used or /// Unicode word boundaries are heuristically enabled. /// /// The fallible version of this routine is /// [`try_find_overlapping_at`](Regex::try_find_overlapping_at). pub fn find_overlapping_at( &self, haystack: &[u8], start: usize, end: usize, state: &mut OverlappingState, ) -> Option { self.try_find_overlapping_at(haystack, start, end, state).unwrap() } } /// Fallible search routines. These may return an error when the underlying /// DFAs have been configured in a way that permits them to fail during a /// search. /// /// Errors during search only occur when the DFA has been explicitly /// configured to do so, usually by specifying one or more "quit" bytes or by /// heuristically enabling Unicode word boundaries. /// /// Errors will never be returned using the default configuration. So these /// fallible routines are only needed for particular configurations. impl Regex { /// Returns true if and only if this regex matches the given haystack. /// /// This routine may short circuit if it knows that scanning future input /// will never lead to a different result. In particular, if the underlying /// DFA enters a match state or a dead state, then this routine will return /// `true` or `false`, respectively, without inspecting any future input. /// /// # Errors /// /// This routine only errors if the search could not complete. For /// DFA-based regexes, this only occurs in a non-default configuration /// where quit bytes are used or Unicode word boundaries are heuristically /// enabled. /// /// When a search cannot complete, callers cannot know whether a match /// exists or not. /// /// The infallible (panics on error) version of this routine is /// [`is_match`](Regex::is_match). pub fn try_is_match(&self, haystack: &[u8]) -> Result { self.try_is_match_at(haystack, 0, haystack.len()) } /// Returns the first position at which a match is found. /// /// This routine stops scanning input in precisely the same circumstances /// as `is_match`. The key difference is that this routine returns the /// position at which it stopped scanning input if and only if a match /// was found. If no match is found, then `None` is returned. /// /// # Errors /// /// This routine only errors if the search could not complete. For /// DFA-based regexes, this only occurs in a non-default configuration /// where quit bytes are used or Unicode word boundaries are heuristically /// enabled. /// /// When a search cannot complete, callers cannot know whether a match /// exists or not. /// /// The infallible (panics on error) version of this routine is /// [`find_earliest`](Regex::find_earliest). pub fn try_find_earliest( &self, haystack: &[u8], ) -> Result, MatchError> { self.try_find_earliest_at(haystack, 0, haystack.len()) } /// Returns the start and end offset of the leftmost match. If no match /// exists, then `None` is returned. /// /// # Errors /// /// This routine only errors if the search could not complete. For /// DFA-based regexes, this only occurs in a non-default configuration /// where quit bytes are used or Unicode word boundaries are heuristically /// enabled. /// /// When a search cannot complete, callers cannot know whether a match /// exists or not. /// /// The infallible (panics on error) version of this routine is /// [`find_leftmost`](Regex::find_leftmost). pub fn try_find_leftmost( &self, haystack: &[u8], ) -> Result, MatchError> { self.try_find_leftmost_at(haystack, 0, haystack.len()) } /// Search for the first overlapping match in `haystack`. /// /// This routine is principally useful when searching for multiple patterns /// on inputs where multiple patterns may match the same regions of text. /// In particular, callers must preserve the automaton's search state from /// prior calls so that the implementation knows where the last match /// occurred and which pattern was reported. /// /// # Errors /// /// This routine only errors if the search could not complete. For /// DFA-based regexes, this only occurs in a non-default configuration /// where quit bytes are used or Unicode word boundaries are heuristically /// enabled. /// /// When a search cannot complete, callers cannot know whether a match /// exists or not. /// /// The infallible (panics on error) version of this routine is /// [`find_overlapping`](Regex::find_overlapping). pub fn try_find_overlapping( &self, haystack: &[u8], state: &mut OverlappingState, ) -> Result, MatchError> { self.try_find_overlapping_at(haystack, 0, haystack.len(), state) } /// Returns an iterator over all non-overlapping "earliest" matches. /// /// Match positions are reported as soon as a match is known to occur, even /// if the standard leftmost match would be longer. /// /// # Errors /// /// This iterator only yields errors if the search could not complete. For /// DFA-based regexes, this only occurs in a non-default configuration /// where quit bytes are used or Unicode word boundaries are heuristically /// enabled. /// /// When a search cannot complete, callers cannot know whether a match /// exists or not. /// /// The infallible (panics on error) version of this routine is /// [`find_earliest_iter`](Regex::find_earliest_iter). pub fn try_find_earliest_iter<'r, 't>( &'r self, haystack: &'t [u8], ) -> TryFindEarliestMatches<'r, 't, A, P> { TryFindEarliestMatches::new(self, haystack) } /// Returns an iterator over all non-overlapping leftmost matches in the /// given bytes. If no match exists, then the iterator yields no elements. /// /// This corresponds to the "standard" regex search iterator. /// /// # Errors /// /// This iterator only yields errors if the search could not complete. For /// DFA-based regexes, this only occurs in a non-default configuration /// where quit bytes are used or Unicode word boundaries are heuristically /// enabled. /// /// When a search cannot complete, callers cannot know whether a match /// exists or not. /// /// The infallible (panics on error) version of this routine is /// [`find_leftmost_iter`](Regex::find_leftmost_iter). pub fn try_find_leftmost_iter<'r, 't>( &'r self, haystack: &'t [u8], ) -> TryFindLeftmostMatches<'r, 't, A, P> { TryFindLeftmostMatches::new(self, haystack) } /// Returns an iterator over all overlapping matches in the given haystack. /// /// This routine is principally useful when searching for multiple patterns /// on inputs where multiple patterns may match the same regions of text. /// The iterator takes care of handling the overlapping state that must be /// threaded through every search. /// /// # Errors /// /// This iterator only yields errors if the search could not complete. For /// DFA-based regexes, this only occurs in a non-default configuration /// where quit bytes are used or Unicode word boundaries are heuristically /// enabled. /// /// When a search cannot complete, callers cannot know whether a match /// exists or not. /// /// The infallible (panics on error) version of this routine is /// [`find_overlapping_iter`](Regex::find_overlapping_iter). pub fn try_find_overlapping_iter<'r, 't>( &'r self, haystack: &'t [u8], ) -> TryFindOverlappingMatches<'r, 't, A, P> { TryFindOverlappingMatches::new(self, haystack) } } /// Lower level fallible search routines that permit controlling where the /// search starts and ends in a particular sequence. impl Regex { /// Returns true if and only if this regex matches the given haystack. /// /// This routine may short circuit if it knows that scanning future input /// will never lead to a different result. In particular, if the underlying /// DFA enters a match state or a dead state, then this routine will return /// `true` or `false`, respectively, without inspecting any future input. /// /// # Searching a substring of the haystack /// /// Being an "at" search routine, this permits callers to search a /// substring of `haystack` by specifying a range in `haystack`. /// Why expose this as an API instead of just asking callers to use /// `&input[start..end]`? The reason is that regex matching often wants /// to take the surrounding context into account in order to handle /// look-around (`^`, `$` and `\b`). /// /// # Errors /// /// This routine only errors if the search could not complete. For /// DFA-based regexes, this only occurs in a non-default configuration /// where quit bytes are used, Unicode word boundaries are heuristically /// enabled or limits are set on the number of times the lazy DFA's cache /// may be cleared. /// /// When a search cannot complete, callers cannot know whether a match /// exists or not. /// /// The infallible (panics on error) version of this routine is /// [`is_match_at`](Regex::is_match_at). pub fn try_is_match_at( &self, haystack: &[u8], start: usize, end: usize, ) -> Result { self.forward() .find_earliest_fwd_at( self.scanner().as_mut(), None, haystack, start, end, ) .map(|x| x.is_some()) } /// Returns the first position at which a match is found. /// /// This routine stops scanning input in precisely the same circumstances /// as `is_match`. The key difference is that this routine returns the /// position at which it stopped scanning input if and only if a match /// was found. If no match is found, then `None` is returned. /// /// # Searching a substring of the haystack /// /// Being an "at" search routine, this permits callers to search a /// substring of `haystack` by specifying a range in `haystack`. /// Why expose this as an API instead of just asking callers to use /// `&input[start..end]`? The reason is that regex matching often wants /// to take the surrounding context into account in order to handle /// look-around (`^`, `$` and `\b`). /// /// This is useful when implementing an iterator over matches /// within the same haystack, which cannot be done correctly by simply /// providing a subslice of `haystack`. /// /// # Errors /// /// This routine only errors if the search could not complete. For /// DFA-based regexes, this only occurs in a non-default configuration /// where quit bytes are used or Unicode word boundaries are heuristically /// enabled. /// /// When a search cannot complete, callers cannot know whether a match /// exists or not. /// /// The infallible (panics on error) version of this routine is /// [`find_earliest_at`](Regex::find_earliest_at). pub fn try_find_earliest_at( &self, haystack: &[u8], start: usize, end: usize, ) -> Result, MatchError> { self.try_find_earliest_at_imp( self.scanner().as_mut(), haystack, start, end, ) } /// The implementation of "earliest" searching, where a prefilter scanner /// may be given. fn try_find_earliest_at_imp( &self, pre: Option<&mut prefilter::Scanner>, haystack: &[u8], start: usize, end: usize, ) -> Result, MatchError> { // N.B. We use `&&A` here to call `Automaton` methods, which ensures // that we always use the `impl Automaton for &A` for calling methods. // Since this is the usual way that automata are used, this helps // reduce the number of monomorphized copies of the search code. let (fwd, rev) = (self.forward(), self.reverse()); let end = match (&fwd) .find_earliest_fwd_at(pre, None, haystack, start, end)? { None => return Ok(None), Some(end) => end, }; // N.B. The only time we need to tell the reverse searcher the pattern // to match is in the overlapping case, since it's ambiguous. In the // leftmost case, I have tentatively convinced myself that it isn't // necessary and the reverse search will always find the same pattern // to match as the forward search. But I lack a rigorous proof. let start = (&rev) .find_earliest_rev_at(None, haystack, start, end.offset())? .expect("reverse search must match if forward search does"); assert_eq!( start.pattern(), end.pattern(), "forward and reverse search must match same pattern" ); assert!(start.offset() <= end.offset()); Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) } /// Returns the start and end offset of the leftmost match. If no match /// exists, then `None` is returned. /// /// # Searching a substring of the haystack /// /// Being an "at" search routine, this permits callers to search a /// substring of `haystack` by specifying a range in `haystack`. /// Why expose this as an API instead of just asking callers to use /// `&input[start..end]`? The reason is that regex matching often wants /// to take the surrounding context into account in order to handle /// look-around (`^`, `$` and `\b`). /// /// This is useful when implementing an iterator over matches /// within the same haystack, which cannot be done correctly by simply /// providing a subslice of `haystack`. /// /// # Errors /// /// This routine only errors if the search could not complete. For /// DFA-based regexes, this only occurs in a non-default configuration /// where quit bytes are used or Unicode word boundaries are heuristically /// enabled. /// /// When a search cannot complete, callers cannot know whether a match /// exists or not. /// /// The infallible (panics on error) version of this routine is /// [`find_leftmost_at`](Regex::find_leftmost_at). pub fn try_find_leftmost_at( &self, haystack: &[u8], start: usize, end: usize, ) -> Result, MatchError> { self.try_find_leftmost_at_imp( self.scanner().as_mut(), haystack, start, end, ) } /// The implementation of leftmost searching, where a prefilter scanner /// may be given. fn try_find_leftmost_at_imp( &self, scanner: Option<&mut prefilter::Scanner>, haystack: &[u8], start: usize, end: usize, ) -> Result, MatchError> { // N.B. We use `&&A` here to call `Automaton` methods, which ensures // that we always use the `impl Automaton for &A` for calling methods. // Since this is the usual way that automata are used, this helps // reduce the number of monomorphized copies of the search code. let (fwd, rev) = (self.forward(), self.reverse()); let end = match (&fwd) .find_leftmost_fwd_at(scanner, None, haystack, start, end)? { None => return Ok(None), Some(end) => end, }; // N.B. The only time we need to tell the reverse searcher the pattern // to match is in the overlapping case, since it's ambiguous. In the // leftmost case, I have tentatively convinced myself that it isn't // necessary and the reverse search will always find the same pattern // to match as the forward search. But I lack a rigorous proof. Why not // just provide the pattern anyway? Well, if it is needed, then leaving // it out gives us a chance to find a witness. let start = (&rev) .find_leftmost_rev_at(None, haystack, start, end.offset())? .expect("reverse search must match if forward search does"); assert_eq!( start.pattern(), end.pattern(), "forward and reverse search must match same pattern", ); assert!(start.offset() <= end.offset()); Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) } /// Search for the first overlapping match within a given range of /// `haystack`. /// /// This routine is principally useful when searching for multiple patterns /// on inputs where multiple patterns may match the same regions of text. /// In particular, callers must preserve the automaton's search state from /// prior calls so that the implementation knows where the last match /// occurred and which pattern was reported. /// /// # Searching a substring of the haystack /// /// Being an "at" search routine, this permits callers to search a /// substring of `haystack` by specifying a range in `haystack`. /// Why expose this as an API instead of just asking callers to use /// `&input[start..end]`? The reason is that regex matching often wants /// to take the surrounding context into account in order to handle /// look-around (`^`, `$` and `\b`). /// /// This is useful when implementing an iterator over matches /// within the same haystack, which cannot be done correctly by simply /// providing a subslice of `haystack`. /// /// # Errors /// /// This routine only errors if the search could not complete. For /// DFA-based regexes, this only occurs in a non-default configuration /// where quit bytes are used or Unicode word boundaries are heuristically /// enabled. /// /// When a search cannot complete, callers cannot know whether a match /// exists or not. /// /// The infallible (panics on error) version of this routine is /// [`find_overlapping_at`](Regex::find_overlapping_at). pub fn try_find_overlapping_at( &self, haystack: &[u8], start: usize, end: usize, state: &mut OverlappingState, ) -> Result, MatchError> { self.try_find_overlapping_at_imp( self.scanner().as_mut(), haystack, start, end, state, ) } /// The implementation of overlapping search at a given range in /// `haystack`, where `scanner` is a prefilter (if active) and `state` is /// the current state of the search. fn try_find_overlapping_at_imp( &self, scanner: Option<&mut prefilter::Scanner>, haystack: &[u8], start: usize, end: usize, state: &mut OverlappingState, ) -> Result, MatchError> { // N.B. We use `&&A` here to call `Automaton` methods, which ensures // that we always use the `impl Automaton for &A` for calling methods. // Since this is the usual way that automata are used, this helps // reduce the number of monomorphized copies of the search code. let (fwd, rev) = (self.forward(), self.reverse()); // TODO: Decide whether it's worth making this assert work. It doesn't // work currently because 'has_starts_for_each_pattern' isn't on the // Automaton trait. Without this assert, we still get a panic, but it's // a bit more inscrutable. // assert!( // rev.has_starts_for_each_pattern(), // "overlapping searches require that the reverse DFA is \ // compiled with the 'starts_for_each_pattern' option", // ); let end = match (&fwd).find_overlapping_fwd_at( scanner, None, haystack, start, end, state, )? { None => return Ok(None), Some(end) => end, }; // Unlike the leftmost cases, the reverse overlapping search may match // a different pattern than the forward search. See test failures when // using `None` instead of `Some(end.pattern())` below. Thus, we must // run our reverse search using the pattern that matched in the forward // direction. let start = (&rev) .find_leftmost_rev_at( Some(end.pattern()), haystack, 0, end.offset(), )? .expect("reverse search must match if forward search does"); assert!(start.offset() <= end.offset()); assert_eq!(start.pattern(), end.pattern()); Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset()))) } } /// Non-search APIs for querying information about the regex and setting a /// prefilter. impl Regex { /// Attach the given prefilter to this regex. pub fn with_prefilter(self, prefilter: Q) -> Regex { Regex { prefilter: Some(prefilter), forward: self.forward, reverse: self.reverse, utf8: self.utf8, } } /// Remove any prefilter from this regex. pub fn without_prefilter(self) -> Regex { Regex { prefilter: None, forward: self.forward, reverse: self.reverse, utf8: self.utf8, } } /// Return the underlying DFA responsible for forward matching. /// /// This is useful for accessing the underlying DFA and converting it to /// some other format or size. See the [`Builder::build_from_dfas`] docs /// for an example of where this might be useful. pub fn forward(&self) -> &A { &self.forward } /// Return the underlying DFA responsible for reverse matching. /// /// This is useful for accessing the underlying DFA and converting it to /// some other format or size. See the [`Builder::build_from_dfas`] docs /// for an example of where this might be useful. pub fn reverse(&self) -> &A { &self.reverse } /// Returns the total number of patterns matched by this regex. /// /// # Example /// /// ``` /// use regex_automata::{MultiMatch, dfa::regex::Regex}; /// /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?; /// assert_eq!(3, re.pattern_count()); /// # Ok::<(), Box>(()) /// ``` pub fn pattern_count(&self) -> usize { assert_eq!( self.forward().pattern_count(), self.reverse().pattern_count() ); self.forward().pattern_count() } /// Convenience function for returning this regex's prefilter as a trait /// object. /// /// If this regex doesn't have a prefilter, then `None` is returned. pub fn prefilter(&self) -> Option<&dyn Prefilter> { match self.prefilter { None => None, Some(ref x) => Some(&*x), } } /// Convenience function for returning a prefilter scanner. fn scanner(&self) -> Option { self.prefilter().map(prefilter::Scanner::new) } } /// An iterator over all non-overlapping earliest matches for a particular /// infallible search. /// /// The iterator yields a [`MultiMatch`] value until no more matches could be /// found. If the underlying search returns an error, then this panics. /// /// `A` is the type used to represent the underlying DFAs used by the regex, /// while `P` is the type of prefilter used, if any. The lifetime variables are /// as follows: /// /// * `'r` is the lifetime of the regular expression itself. /// * `'t` is the lifetime of the text being searched. #[derive(Clone, Debug)] pub struct FindEarliestMatches<'r, 't, A, P>( TryFindEarliestMatches<'r, 't, A, P>, ); impl<'r, 't, A: Automaton, P: Prefilter> FindEarliestMatches<'r, 't, A, P> { fn new( re: &'r Regex, text: &'t [u8], ) -> FindEarliestMatches<'r, 't, A, P> { FindEarliestMatches(TryFindEarliestMatches::new(re, text)) } } impl<'r, 't, A: Automaton, P: Prefilter> Iterator for FindEarliestMatches<'r, 't, A, P> { type Item = MultiMatch; fn next(&mut self) -> Option { next_unwrap(self.0.next()) } } /// An iterator over all non-overlapping leftmost matches for a particular /// infallible search. /// /// The iterator yields a [`MultiMatch`] value until no more matches could be /// found. If the underlying search returns an error, then this panics. /// /// `A` is the type used to represent the underlying DFAs used by the regex, /// while `P` is the type of prefilter used, if any. The lifetime variables are /// as follows: /// /// * `'r` is the lifetime of the regular expression itself. /// * `'t` is the lifetime of the text being searched. #[derive(Clone, Debug)] pub struct FindLeftmostMatches<'r, 't, A, P>( TryFindLeftmostMatches<'r, 't, A, P>, ); impl<'r, 't, A: Automaton, P: Prefilter> FindLeftmostMatches<'r, 't, A, P> { fn new( re: &'r Regex, text: &'t [u8], ) -> FindLeftmostMatches<'r, 't, A, P> { FindLeftmostMatches(TryFindLeftmostMatches::new(re, text)) } } impl<'r, 't, A: Automaton, P: Prefilter> Iterator for FindLeftmostMatches<'r, 't, A, P> { type Item = MultiMatch; fn next(&mut self) -> Option { next_unwrap(self.0.next()) } } /// An iterator over all overlapping matches for a particular infallible /// search. /// /// The iterator yields a [`MultiMatch`] value until no more matches could be /// found. If the underlying search returns an error, then this panics. /// /// `A` is the type used to represent the underlying DFAs used by the regex, /// while `P` is the type of prefilter used, if any. The lifetime variables are /// as follows: /// /// * `'r` is the lifetime of the regular expression itself. /// * `'t` is the lifetime of the text being searched. #[derive(Clone, Debug)] pub struct FindOverlappingMatches<'r, 't, A: Automaton, P>( TryFindOverlappingMatches<'r, 't, A, P>, ); impl<'r, 't, A: Automaton, P: Prefilter> FindOverlappingMatches<'r, 't, A, P> { fn new( re: &'r Regex, text: &'t [u8], ) -> FindOverlappingMatches<'r, 't, A, P> { FindOverlappingMatches(TryFindOverlappingMatches::new(re, text)) } } impl<'r, 't, A: Automaton, P: Prefilter> Iterator for FindOverlappingMatches<'r, 't, A, P> { type Item = MultiMatch; fn next(&mut self) -> Option { next_unwrap(self.0.next()) } } /// An iterator over all non-overlapping earliest matches for a particular /// fallible search. /// /// The iterator yields a [`MultiMatch`] value until no more matches could be /// found. /// /// `A` is the type used to represent the underlying DFAs used by the regex, /// while `P` is the type of prefilter used, if any. The lifetime variables are /// as follows: /// /// * `'r` is the lifetime of the regular expression itself. /// * `'t` is the lifetime of the text being searched. #[derive(Clone, Debug)] pub struct TryFindEarliestMatches<'r, 't, A, P> { re: &'r Regex, scanner: Option>, text: &'t [u8], last_end: usize, last_match: Option, } impl<'r, 't, A: Automaton, P: Prefilter> TryFindEarliestMatches<'r, 't, A, P> { fn new( re: &'r Regex, text: &'t [u8], ) -> TryFindEarliestMatches<'r, 't, A, P> { let scanner = re.scanner(); TryFindEarliestMatches { re, scanner, text, last_end: 0, last_match: None, } } } impl<'r, 't, A: Automaton, P: Prefilter> Iterator for TryFindEarliestMatches<'r, 't, A, P> { type Item = Result; fn next(&mut self) -> Option> { if self.last_end > self.text.len() { return None; } let result = self.re.try_find_earliest_at_imp( self.scanner.as_mut(), self.text, self.last_end, self.text.len(), ); let m = match result { Err(err) => return Some(Err(err)), Ok(None) => return None, Ok(Some(m)) => m, }; if m.is_empty() { // This is an empty match. To ensure we make progress, start // the next search at the smallest possible starting position // of the next match following this one. self.last_end = if self.re.utf8 { crate::util::next_utf8(self.text, m.end()) } else { m.end() + 1 }; // Don't accept empty matches immediately following a match. // Just move on to the next match. if Some(m.end()) == self.last_match { return self.next(); } } else { self.last_end = m.end(); } self.last_match = Some(m.end()); Some(Ok(m)) } } /// An iterator over all non-overlapping leftmost matches for a particular /// fallible search. /// /// The iterator yields a [`MultiMatch`] value until no more matches could be /// found. /// /// `A` is the type used to represent the underlying DFAs used by the regex, /// while `P` is the type of prefilter used, if any. The lifetime variables are /// as follows: /// /// * `'r` is the lifetime of the regular expression itself. /// * `'t` is the lifetime of the text being searched. #[derive(Clone, Debug)] pub struct TryFindLeftmostMatches<'r, 't, A, P> { re: &'r Regex, scanner: Option>, text: &'t [u8], last_end: usize, last_match: Option, } impl<'r, 't, A: Automaton, P: Prefilter> TryFindLeftmostMatches<'r, 't, A, P> { fn new( re: &'r Regex, text: &'t [u8], ) -> TryFindLeftmostMatches<'r, 't, A, P> { let scanner = re.scanner(); TryFindLeftmostMatches { re, scanner, text, last_end: 0, last_match: None, } } } impl<'r, 't, A: Automaton, P: Prefilter> Iterator for TryFindLeftmostMatches<'r, 't, A, P> { type Item = Result; fn next(&mut self) -> Option> { if self.last_end > self.text.len() { return None; } let result = self.re.try_find_leftmost_at_imp( self.scanner.as_mut(), self.text, self.last_end, self.text.len(), ); let m = match result { Err(err) => return Some(Err(err)), Ok(None) => return None, Ok(Some(m)) => m, }; if m.is_empty() { // This is an empty match. To ensure we make progress, start // the next search at the smallest possible starting position // of the next match following this one. self.last_end = if self.re.utf8 { crate::util::next_utf8(self.text, m.end()) } else { m.end() + 1 }; // Don't accept empty matches immediately following a match. // Just move on to the next match. if Some(m.end()) == self.last_match { return self.next(); } } else { self.last_end = m.end(); } self.last_match = Some(m.end()); Some(Ok(m)) } } /// An iterator over all overlapping matches for a particular fallible search. /// /// The iterator yields a [`MultiMatch`] value until no more matches could be /// found. /// /// `A` is the type used to represent the underlying DFAs used by the regex, /// while `P` is the type of prefilter used, if any. The lifetime variables are /// as follows: /// /// * `'r` is the lifetime of the regular expression itself. /// * `'t` is the lifetime of the text being searched. #[derive(Clone, Debug)] pub struct TryFindOverlappingMatches<'r, 't, A: Automaton, P> { re: &'r Regex, scanner: Option>, text: &'t [u8], last_end: usize, state: OverlappingState, } impl<'r, 't, A: Automaton, P: Prefilter> TryFindOverlappingMatches<'r, 't, A, P> { fn new( re: &'r Regex, text: &'t [u8], ) -> TryFindOverlappingMatches<'r, 't, A, P> { let scanner = re.scanner(); TryFindOverlappingMatches { re, scanner, text, last_end: 0, state: OverlappingState::start(), } } } impl<'r, 't, A: Automaton, P: Prefilter> Iterator for TryFindOverlappingMatches<'r, 't, A, P> { type Item = Result; fn next(&mut self) -> Option> { if self.last_end > self.text.len() { return None; } let result = self.re.try_find_overlapping_at_imp( self.scanner.as_mut(), self.text, self.last_end, self.text.len(), &mut self.state, ); let m = match result { Err(err) => return Some(Err(err)), Ok(None) => return None, Ok(Some(m)) => m, }; // Unlike the non-overlapping case, we're OK with empty matches at this // level. In particular, the overlapping search algorithm is itself // responsible for ensuring that progress is always made. self.last_end = m.end(); Some(Ok(m)) } } /// The configuration used for compiling a DFA-backed regex. /// /// A regex configuration is a simple data object that is typically used with /// [`Builder::configure`]. #[cfg(feature = "alloc")] #[derive(Clone, Copy, Debug, Default)] pub struct Config { utf8: Option, } #[cfg(feature = "alloc")] impl Config { /// Return a new default regex compiler configuration. pub fn new() -> Config { Config::default() } /// Whether to enable UTF-8 mode or not. /// /// When UTF-8 mode is enabled (the default) and an empty match is seen, /// the iterators on [`Regex`] will always start the next search at the /// next UTF-8 encoded codepoint when searching valid UTF-8. When UTF-8 /// mode is disabled, such searches are begun at the next byte offset. /// /// If this mode is enabled and invalid UTF-8 is given to search, then /// behavior is unspecified. /// /// Generally speaking, one should enable this when /// [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) /// and /// [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) /// are enabled, and disable it otherwise. /// /// # Example /// /// This example demonstrates the differences between when this option is /// enabled and disabled. The differences only arise when the regex can /// return matches of length zero. /// /// In this first snippet, we show the results when UTF-8 mode is disabled. /// /// ``` /// use regex_automata::{dfa::regex::Regex, MultiMatch}; /// /// let re = Regex::builder() /// .configure(Regex::config().utf8(false)) /// .build(r"")?; /// let haystack = "a☃z".as_bytes(); /// let mut it = re.find_leftmost_iter(haystack); /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next()); /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next()); /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); /// assert_eq!(None, it.next()); /// /// # Ok::<(), Box>(()) /// ``` /// /// And in this snippet, we execute the same search on the same haystack, /// but with UTF-8 mode enabled. Notice that byte offsets that would /// otherwise split the encoding of `☃` are not returned. /// /// ``` /// use regex_automata::{dfa::regex::Regex, MultiMatch}; /// /// let re = Regex::builder() /// .configure(Regex::config().utf8(true)) /// .build(r"")?; /// let haystack = "a☃z".as_bytes(); /// let mut it = re.find_leftmost_iter(haystack); /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next()); /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next()); /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next()); /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next()); /// assert_eq!(None, it.next()); /// /// # Ok::<(), Box>(()) /// ``` pub fn utf8(mut self, yes: bool) -> Config { self.utf8 = Some(yes); self } /// Returns true if and only if this configuration has UTF-8 mode enabled. /// /// When UTF-8 mode is enabled and an empty match is seen, the iterators on /// [`Regex`] will always start the next search at the next UTF-8 encoded /// codepoint. When UTF-8 mode is disabled, such searches are begun at the /// next byte offset. pub fn get_utf8(&self) -> bool { self.utf8.unwrap_or(true) } /// Overwrite the default configuration such that the options in `o` are /// always used. If an option in `o` is not set, then the corresponding /// option in `self` is used. If it's not set in `self` either, then it /// remains not set. pub(crate) fn overwrite(self, o: Config) -> Config { Config { utf8: o.utf8.or(self.utf8) } } } /// A builder for a regex based on deterministic finite automatons. /// /// This builder permits configuring options for the syntax of a pattern, the /// NFA construction, the DFA construction and finally the regex searching /// itself. This builder is different from a general purpose regex builder in /// that it permits fine grain configuration of the construction process. The /// trade off for this is complexity, and the possibility of setting a /// configuration that might not make sense. For example, there are three /// different UTF-8 modes: /// /// * [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) controls whether the /// pattern itself can contain sub-expressions that match invalid UTF-8. /// * [`nfa::thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) /// controls whether the implicit unanchored prefix added to the NFA can /// match through invalid UTF-8 or not. /// * [`Config::utf8`] controls how the regex iterators themselves advance /// the starting position of the next search when a match with zero length is /// found. /// /// Generally speaking, callers will want to either enable all of these or /// disable all of these. /// /// Internally, building a regex requires building two DFAs, where one is /// responsible for finding the end of a match and the other is responsible /// for finding the start of a match. If you only need to detect whether /// something matched, or only the end of a match, then you should use a /// [`dense::Builder`] to construct a single DFA, which is cheaper than /// building two DFAs. /// /// # Build methods /// /// This builder has a few "build" methods. In general, it's the result of /// combining the following parameters: /// /// * Building one or many regexes. /// * Building a regex with dense or sparse DFAs. /// /// The simplest "build" method is [`Builder::build`]. It accepts a single /// pattern and builds a dense DFA using `usize` for the state identifier /// representation. /// /// The most general "build" method is [`Builder::build_many`], which permits /// building a regex that searches for multiple patterns simultaneously while /// using a specific state identifier representation. /// /// The most flexible "build" method, but hardest to use, is /// [`Builder::build_from_dfas`]. This exposes the fact that a [`Regex`] is /// just a pair of DFAs, and this method allows you to specify those DFAs /// exactly. /// /// # Example /// /// This example shows how to disable UTF-8 mode in the syntax, the NFA and /// the regex itself. This is generally what you want for matching on /// arbitrary bytes. /// /// ``` /// use regex_automata::{ /// dfa::regex::Regex, nfa::thompson, MultiMatch, SyntaxConfig /// }; /// /// let re = Regex::builder() /// .configure(Regex::config().utf8(false)) /// .syntax(SyntaxConfig::new().utf8(false)) /// .thompson(thompson::Config::new().utf8(false)) /// .build(r"foo(?-u:[^b])ar.*")?; /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; /// let expected = Some(MultiMatch::must(0, 1, 9)); /// let got = re.find_leftmost(haystack); /// assert_eq!(expected, got); /// // Notice that `(?-u:[^b])` matches invalid UTF-8, /// // but the subsequent `.*` does not! Disabling UTF-8 /// // on the syntax permits this. Notice also that the /// // search was unanchored and skipped over invalid UTF-8. /// // Disabling UTF-8 on the Thompson NFA permits this. /// // /// // N.B. This example does not show the impact of /// // disabling UTF-8 mode on Config, since that /// // only impacts regexes that can produce matches of /// // length 0. /// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]); /// /// # Ok::<(), Box>(()) /// ``` #[cfg(feature = "alloc")] #[derive(Clone, Debug)] pub struct Builder { config: Config, dfa: dense::Builder, } #[cfg(feature = "alloc")] impl Builder { /// Create a new regex builder with the default configuration. pub fn new() -> Builder { Builder { config: Config::default(), dfa: dense::Builder::new() } } /// Build a regex from the given pattern. /// /// If there was a problem parsing or compiling the pattern, then an error /// is returned. pub fn build(&self, pattern: &str) -> Result { self.build_many(&[pattern]) } /// Build a regex from the given pattern using sparse DFAs. /// /// If there was a problem parsing or compiling the pattern, then an error /// is returned. pub fn build_sparse( &self, pattern: &str, ) -> Result>>, Error> { self.build_many_sparse(&[pattern]) } /// Build a regex from the given patterns. pub fn build_many>( &self, patterns: &[P], ) -> Result { let forward = self.dfa.build_many(patterns)?; let reverse = self .dfa .clone() .configure( dense::Config::new() .anchored(true) .match_kind(MatchKind::All) .starts_for_each_pattern(true), ) .thompson(thompson::Config::new().reverse(true)) .build_many(patterns)?; Ok(self.build_from_dfas(forward, reverse)) } /// Build a sparse regex from the given patterns. pub fn build_many_sparse>( &self, patterns: &[P], ) -> Result>>, Error> { let re = self.build_many(patterns)?; let forward = re.forward().to_sparse()?; let reverse = re.reverse().to_sparse()?; Ok(self.build_from_dfas(forward, reverse)) } /// Build a regex from its component forward and reverse DFAs. /// /// This is useful when deserializing a regex from some arbitrary /// memory region. This is also useful for building regexes from other /// types of DFAs. /// /// If you're building the DFAs from scratch instead of building new DFAs /// from other DFAs, then you'll need to make sure that the reverse DFA is /// configured correctly to match the intended semantics. Namely: /// /// * It should be anchored. /// * It should use [`MatchKind::All`] semantics. /// * It should match in reverse. /// * It should have anchored start states compiled for each pattern. /// * Otherwise, its configuration should match the forward DFA. /// /// If these conditions are satisfied, then behavior of searches is /// unspecified. /// /// Note that when using this constructor, only the configuration from /// [`Config`] is applied. The only configuration settings on this builder /// only apply when the builder owns the construction of the DFAs /// themselves. /// /// # Example /// /// This example is a bit a contrived. The usual use of these methods /// would involve serializing `initial_re` somewhere and then deserializing /// it later to build a regex. But in this case, we do everything in /// memory. /// /// ``` /// use regex_automata::dfa::regex::Regex; /// /// let initial_re = Regex::new("foo[0-9]+")?; /// assert_eq!(true, initial_re.is_match(b"foo123")); /// /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse()); /// let re = Regex::builder().build_from_dfas(fwd, rev); /// assert_eq!(true, re.is_match(b"foo123")); /// # Ok::<(), Box>(()) /// ``` /// /// This example shows how to build a `Regex` that uses sparse DFAs instead /// of dense DFAs without using one of the convenience `build_sparse` /// routines: /// /// ``` /// use regex_automata::dfa::regex::Regex; /// /// let initial_re = Regex::new("foo[0-9]+")?; /// assert_eq!(true, initial_re.is_match(b"foo123")); /// /// let fwd = initial_re.forward().to_sparse()?; /// let rev = initial_re.reverse().to_sparse()?; /// let re = Regex::builder().build_from_dfas(fwd, rev); /// assert_eq!(true, re.is_match(b"foo123")); /// # Ok::<(), Box>(()) /// ``` pub fn build_from_dfas( &self, forward: A, reverse: A, ) -> Regex { let utf8 = self.config.get_utf8(); Regex { prefilter: None, forward, reverse, utf8 } } /// Apply the given regex configuration options to this builder. pub fn configure(&mut self, config: Config) -> &mut Builder { self.config = self.config.overwrite(config); self } /// Set the syntax configuration for this builder using /// [`SyntaxConfig`](crate::SyntaxConfig). /// /// This permits setting things like case insensitivity, Unicode and multi /// line mode. pub fn syntax( &mut self, config: crate::util::syntax::SyntaxConfig, ) -> &mut Builder { self.dfa.syntax(config); self } /// Set the Thompson NFA configuration for this builder using /// [`nfa::thompson::Config`](thompson::Config). /// /// This permits setting things like whether additional time should be /// spent shrinking the size of the NFA. pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { self.dfa.thompson(config); self } /// Set the dense DFA compilation configuration for this builder using /// [`dense::Config`](dense::Config). /// /// This permits setting things like whether the underlying DFAs should /// be minimized. pub fn dense(&mut self, config: dense::Config) -> &mut Builder { self.dfa.configure(config); self } } #[cfg(feature = "alloc")] impl Default for Builder { fn default() -> Builder { Builder::new() } } #[inline(always)] fn next_unwrap( item: Option>, ) -> Option { match item { None => None, Some(Ok(m)) => Some(m), Some(Err(err)) => panic!( "unexpected regex search error: {}\n\ to handle search errors, use try_ methods", err, ), } }