summaryrefslogtreecommitdiffstats
path: root/vendor/regex-automata/src/hybrid/regex.rs
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/regex-automata/src/hybrid/regex.rs')
-rw-r--r--vendor/regex-automata/src/hybrid/regex.rs1925
1 files changed, 348 insertions, 1577 deletions
diff --git a/vendor/regex-automata/src/hybrid/regex.rs b/vendor/regex-automata/src/hybrid/regex.rs
index 7cc6b9064..75667daf9 100644
--- a/vendor/regex-automata/src/hybrid/regex.rs
+++ b/vendor/regex-automata/src/hybrid/regex.rs
@@ -1,10 +1,10 @@
/*!
A lazy DFA backed `Regex`.
-This module provides [`Regex`] using lazy DFA. A `Regex` implements convenience
-routines you might have come to expect, such as finding a match and iterating
-over all non-overlapping matches. This `Regex` type is limited in its
-capabilities to what a lazy DFA can provide. Therefore, APIs involving
+This module provides a [`Regex`] backed by a lazy DFA. A `Regex` implements
+convenience routines you might have come to expect, such as finding a match
+and iterating over all non-overlapping matches. This `Regex` type is limited
+in its capabilities to what a lazy DFA can provide. Therefore, APIs involving
capturing groups, for example, are not provided.
Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that
@@ -14,20 +14,15 @@ find the start offset of a match.
See the [parent module](crate::hybrid) for examples.
*/
-use core::borrow::Borrow;
-
-use alloc::boxed::Box;
-
use crate::{
hybrid::{
dfa::{self, DFA},
error::BuildError,
- OverlappingState,
},
nfa::thompson,
util::{
- matchtypes::{MatchError, MatchKind, MultiMatch},
- prefilter::{self, Prefilter},
+ iter,
+ search::{Anchored, Input, Match, MatchError, MatchKind},
},
};
@@ -42,89 +37,20 @@ use crate::{
/// found by the forward DFA guarantees that the reverse DFA will also find
/// a match.
///
-/// A `Regex` can also have a prefilter set via the
-/// [`set_prefilter`](Regex::set_prefilter) method. By default, no prefilter is
-/// enabled.
-///
-/// # Earliest vs Leftmost vs Overlapping
-///
-/// The search routines exposed on a `Regex` reflect three different ways
-/// of searching:
-///
-/// * "earliest" means to stop as soon as a match has been detected.
-/// * "leftmost" means to continue matching until the underlying
-/// automaton cannot advance. This reflects "standard" searching you
-/// might be used to in other regex engines. e.g., This permits
-/// non-greedy and greedy searching to work as you would expect.
-/// * "overlapping" means to find all possible matches, even if they
-/// overlap.
-///
-/// Generally speaking, when doing an overlapping search, you'll want to
-/// build your regex lazy DFAs with [`MatchKind::All`] semantics. Using
-/// [`MatchKind::LeftmostFirst`] semantics with overlapping searches is
-/// likely to lead to odd behavior since `LeftmostFirst` specifically omits
-/// some matches that can never be reported due to its semantics.
-///
-/// The following example shows the differences between how these different
-/// types of searches impact looking for matches of `[a-z]+` in the
-/// haystack `abc`.
-///
-/// ```
-/// use regex_automata::{hybrid::{dfa, regex}, MatchKind, MultiMatch};
-///
-/// let pattern = r"[a-z]+";
-/// let haystack = "abc".as_bytes();
-///
-/// // With leftmost-first semantics, we test "earliest" and "leftmost".
-/// let re = regex::Builder::new()
-/// .dfa(dfa::Config::new().match_kind(MatchKind::LeftmostFirst))
-/// .build(pattern)?;
-/// let mut cache = re.create_cache();
-///
-/// // "earliest" searching isn't impacted by greediness
-/// let mut it = re.find_earliest_iter(&mut cache, haystack);
-/// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next());
-/// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next());
-/// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next());
-/// assert_eq!(None, it.next());
-///
-/// // "leftmost" searching supports greediness (and non-greediness)
-/// let mut it = re.find_leftmost_iter(&mut cache, haystack);
-/// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
-/// assert_eq!(None, it.next());
-///
-/// // For overlapping, we want "all" match kind semantics.
-/// let re = regex::Builder::new()
-/// .dfa(dfa::Config::new().match_kind(MatchKind::All))
-/// .build(pattern)?;
-/// let mut cache = re.create_cache();
-///
-/// // In the overlapping search, we find all three possible matches
-/// // starting at the beginning of the haystack.
-/// let mut it = re.find_overlapping_iter(&mut cache, haystack);
-/// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next());
-/// assert_eq!(Some(MultiMatch::must(0, 0, 2)), it.next());
-/// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
-/// assert_eq!(None, it.next());
-///
-/// # Ok::<(), Box<dyn std::error::Error>>(())
-/// ```
-///
/// # Fallibility
///
-/// In non-default configurations, the lazy DFAs generated in this module may
-/// return an error during a search. (Currently, the only way this happens is
-/// if quit bytes are added, Unicode word boundaries are heuristically enabled,
-/// or if the cache is configured to "give up" on a search if it has been
-/// cleared too many times. All of these are turned off by default, which means
-/// a search can never fail in the default configuration.) For convenience,
-/// the main search routines, like [`find_leftmost`](Regex::find_leftmost),
-/// will panic if an error occurs. However, if you need to use DFAs which may
-/// produce an error at search time, then there are fallible equivalents of
-/// all search routines. For example, for `find_leftmost`, its fallible analog
-/// is [`try_find_leftmost`](Regex::try_find_leftmost). The routines prefixed
-/// with `try_` return `Result<Option<MultiMatch>, MatchError>`, where as the
-/// infallible routines simply return `Option<MultiMatch>`.
+/// Most of the search routines defined on this type will _panic_ when the
+/// underlying search fails. This might be because the DFA gave up because it
+/// saw a quit byte, whether configured explicitly or via heuristic Unicode
+/// word boundary support, although neither are enabled by default. It might
+/// also fail if the underlying DFA determines it isn't making effective use of
+/// the cache (which also never happens by default). Or it might fail because
+/// an invalid `Input` configuration is given, for example, with an unsupported
+/// [`Anchored`] mode.
+///
+/// If you need to handle these error cases instead of allowing them to trigger
+/// a panic, then the lower level [`Regex::try_search`] provides a fallible API
+/// that never panics.
///
/// # Example
///
@@ -134,28 +60,26 @@ use crate::{
/// across a line boundary.
///
/// ```
-/// use regex_automata::{hybrid::{dfa, regex::Regex}, MatchError};
+/// # if cfg!(miri) { return Ok(()); } // miri takes too long
+/// use regex_automata::{hybrid::{dfa, regex::Regex}, Input, MatchError};
///
/// let re = Regex::builder()
/// .dfa(dfa::Config::new().quit(b'\n', true))
/// .build(r"foo\p{any}+bar")?;
/// let mut cache = re.create_cache();
///
-/// let haystack = "foo\nbar".as_bytes();
+/// let input = Input::new("foo\nbar");
/// // Normally this would produce a match, since \p{any} contains '\n'.
/// // But since we instructed the automaton to enter a quit state if a
/// // '\n' is observed, this produces a match error instead.
-/// let expected = MatchError::Quit { byte: 0x0A, offset: 3 };
-/// let got = re.try_find_leftmost(&mut cache, haystack).unwrap_err();
+/// let expected = MatchError::quit(b'\n', 3);
+/// let got = re.try_search(&mut cache, &input).unwrap_err();
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Debug)]
pub struct Regex {
- /// An optional prefilter that is passed down to the lazy DFA search
- /// routines when present. By default, no prefilter is set.
- pre: Option<Box<dyn Prefilter>>,
/// The forward lazy DFA. This can only find the end of a match.
forward: DFA,
/// The reverse lazy DFA. This can only find the start of a match.
@@ -169,9 +93,6 @@ pub struct Regex {
/// we might wind up finding the "leftmost" starting position of a totally
/// different pattern!
reverse: DFA,
- /// Whether iterators on this type should advance by one codepoint or one
- /// byte when an empty match is seen.
- utf8: bool,
}
/// Convenience routines for regex and cache construction.
@@ -185,86 +106,49 @@ impl Regex {
/// # Example
///
/// ```
- /// use regex_automata::{MultiMatch, hybrid::regex::Regex};
+ /// use regex_automata::{hybrid::regex::Regex, Match};
///
/// let re = Regex::new("foo[0-9]+bar")?;
/// let mut cache = re.create_cache();
/// assert_eq!(
- /// Some(MultiMatch::must(0, 3, 14)),
- /// re.find_leftmost(&mut cache, b"zzzfoo12345barzzz"),
+ /// Some(Match::must(0, 3..14)),
+ /// re.find(&mut cache, "zzzfoo12345barzzz"),
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
+ #[cfg(feature = "syntax")]
pub fn new(pattern: &str) -> Result<Regex, BuildError> {
Regex::builder().build(pattern)
}
- /// Like `new`, but parses multiple patterns into a single "regex set."
+ /// Like `new`, but parses multiple patterns into a single "multi regex."
/// This similarly uses the default regex configuration.
///
/// # Example
///
/// ```
- /// use regex_automata::{MultiMatch, hybrid::regex::Regex};
+ /// use regex_automata::{hybrid::regex::Regex, Match};
///
/// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?;
/// let mut cache = re.create_cache();
///
- /// let mut it = re.find_leftmost_iter(
- /// &mut cache,
- /// b"abc 1 foo 4567 0 quux",
- /// );
- /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
- /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next());
- /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next());
- /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next());
+ /// let mut it = re.find_iter(&mut cache, "abc 1 foo 4567 0 quux");
+ /// assert_eq!(Some(Match::must(0, 0..3)), it.next());
+ /// assert_eq!(Some(Match::must(1, 4..5)), it.next());
+ /// assert_eq!(Some(Match::must(0, 6..9)), it.next());
+ /// assert_eq!(Some(Match::must(1, 10..14)), it.next());
+ /// assert_eq!(Some(Match::must(1, 15..16)), it.next());
+ /// assert_eq!(Some(Match::must(0, 17..21)), it.next());
/// assert_eq!(None, it.next());
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
+ #[cfg(feature = "syntax")]
pub fn new_many<P: AsRef<str>>(
patterns: &[P],
) -> Result<Regex, BuildError> {
Regex::builder().build_many(patterns)
}
- /// Return a default configuration for a `Regex`.
- ///
- /// This is a convenience routine to avoid needing to import the `Config`
- /// type when customizing the construction of a regex.
- ///
- /// # Example
- ///
- /// This example shows how to disable UTF-8 mode for `Regex` iteration.
- /// When UTF-8 mode is disabled, the position immediately following an
- /// empty match is where the next search begins, instead of the next
- /// position of a UTF-8 encoded codepoint.
- ///
- /// ```
- /// use regex_automata::{hybrid::regex::Regex, MultiMatch};
- ///
- /// let re = Regex::builder()
- /// .configure(Regex::config().utf8(false))
- /// .build(r"")?;
- /// let mut cache = re.create_cache();
- ///
- /// let haystack = "a☃z".as_bytes();
- /// let mut it = re.find_leftmost_iter(&mut cache, haystack);
- /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next());
- /// assert_eq!(None, it.next());
- ///
- /// # Ok::<(), Box<dyn std::error::Error>>(())
- /// ```
- pub fn config() -> Config {
- Config::new()
- }
-
/// Return a builder for configuring the construction of a `Regex`.
///
/// This is a convenience routine to avoid needing to import the
@@ -276,22 +160,20 @@ impl Regex {
/// everywhere.
///
/// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{
- /// hybrid::regex::Regex,
- /// nfa::thompson,
- /// MultiMatch, SyntaxConfig,
+ /// hybrid::regex::Regex, nfa::thompson, util::syntax, Match,
/// };
///
/// let re = Regex::builder()
- /// .configure(Regex::config().utf8(false))
- /// .syntax(SyntaxConfig::new().utf8(false))
+ /// .syntax(syntax::Config::new().utf8(false))
/// .thompson(thompson::Config::new().utf8(false))
/// .build(r"foo(?-u:[^b])ar.*")?;
/// let mut cache = re.create_cache();
///
/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
- /// let expected = Some(MultiMatch::must(0, 1, 9));
- /// let got = re.find_leftmost(&mut cache, haystack);
+ /// let expected = Some(Match::must(0, 1..9));
+ /// let got = re.find(&mut cache, haystack);
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
@@ -325,15 +207,16 @@ impl Regex {
/// This shows how to re-purpose a cache for use with a different `Regex`.
///
/// ```
- /// use regex_automata::{hybrid::regex::Regex, MultiMatch};
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{hybrid::regex::Regex, Match};
///
/// let re1 = Regex::new(r"\w")?;
/// let re2 = Regex::new(r"\W")?;
///
/// let mut cache = re1.create_cache();
/// assert_eq!(
- /// Some(MultiMatch::must(0, 0, 2)),
- /// re1.find_leftmost(&mut cache, "Δ".as_bytes()),
+ /// Some(Match::must(0, 0..2)),
+ /// re1.find(&mut cache, "Δ"),
/// );
///
/// // Using 'cache' with re2 is not allowed. It may result in panics or
@@ -344,8 +227,8 @@ impl Regex {
/// // allowed.
/// re2.reset_cache(&mut cache);
/// assert_eq!(
- /// Some(MultiMatch::must(0, 0, 3)),
- /// re2.find_leftmost(&mut cache, "☃".as_bytes()),
+ /// Some(Match::must(0, 0..3)),
+ /// re2.find(&mut cache, "☃"),
/// );
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
@@ -367,78 +250,49 @@ impl Regex {
///
/// # Panics
///
- /// If the underlying lazy DFAs return an error, then this routine panics.
- /// This only occurs in non-default configurations where quit bytes are
- /// used, Unicode word boundaries are heuristically enabled or limits are
- /// set on the number of times the lazy DFA's cache may be cleared.
- ///
- /// The fallible version of this routine is
- /// [`try_is_match`](Regex::try_is_match).
- ///
- /// # Example
- ///
- /// ```
- /// use regex_automata::hybrid::regex::Regex;
- ///
- /// let re = Regex::new("foo[0-9]+bar")?;
- /// let mut cache = re.create_cache();
+ /// This routine panics if the search could not complete. This can occur
+ /// in a number of circumstances:
///
- /// assert_eq!(true, re.is_match(&mut cache, b"foo12345bar"));
- /// assert_eq!(false, re.is_match(&mut cache, b"foobar"));
- /// # Ok::<(), Box<dyn std::error::Error>>(())
- /// ```
- pub fn is_match(&self, cache: &mut Cache, haystack: &[u8]) -> bool {
- self.try_is_match(cache, haystack).unwrap()
- }
-
- /// Returns the first position at which a match is found.
+ /// * The configuration of the lazy DFA may permit it to "quit" the search.
+ /// For example, setting quit bytes or enabling heuristic support for
+ /// Unicode word boundaries. The default configuration does not enable any
+ /// option that could result in the lazy DFA quitting.
+ /// * The configuration of the lazy DFA may also permit it to "give up"
+ /// on a search if it makes ineffective use of its transition table
+ /// cache. The default configuration does not enable this by default,
+ /// although it is typically a good idea to.
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode.
///
- /// This routine stops scanning input in precisely the same circumstances
- /// as `is_match`. The key difference is that this routine returns the
- /// position at which it stopped scanning input if and only if a match
- /// was found. If no match is found, then `None` is returned.
+ /// When a search panics, callers cannot know whether a match exists or
+ /// not.
///
- /// # Panics
- ///
- /// If the underlying lazy DFAs return an error, then this routine panics.
- /// This only occurs in non-default configurations where quit bytes are
- /// used, Unicode word boundaries are heuristically enabled or limits are
- /// set on the number of times the lazy DFA's cache may be cleared.
- ///
- /// The fallible version of this routine is
- /// [`try_find_earliest`](Regex::try_find_earliest).
+ /// Use [`Regex::try_search`] if you want to handle these error conditions.
///
/// # Example
///
/// ```
- /// use regex_automata::{MultiMatch, hybrid::regex::Regex};
+ /// use regex_automata::hybrid::regex::Regex;
///
- /// // Normally, the leftmost first match would greedily consume as many
- /// // decimal digits as it could. But a match is detected as soon as one
- /// // digit is seen.
- /// let re = Regex::new("foo[0-9]+")?;
+ /// let re = Regex::new("foo[0-9]+bar")?;
/// let mut cache = re.create_cache();
- /// assert_eq!(
- /// Some(MultiMatch::must(0, 0, 4)),
- /// re.find_earliest(&mut cache, b"foo12345"),
- /// );
///
- /// // Normally, the end of the leftmost first match here would be 3,
- /// // but the "earliest" match semantics detect a match earlier.
- /// let re = Regex::new("abc|a")?;
- /// let mut cache = re.create_cache();
- /// assert_eq!(
- /// Some(MultiMatch::must(0, 0, 1)),
- /// re.find_earliest(&mut cache, b"abc"),
- /// );
+ /// assert!(re.is_match(&mut cache, "foo12345bar"));
+ /// assert!(!re.is_match(&mut cache, "foobar"));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
- pub fn find_earliest(
+ #[inline]
+ pub fn is_match<'h, I: Into<Input<'h>>>(
&self,
cache: &mut Cache,
- haystack: &[u8],
- ) -> Option<MultiMatch> {
- self.try_find_earliest(cache, haystack).unwrap()
+ input: I,
+ ) -> bool {
+ // Not only can we do an "earliest" search, but we can avoid doing a
+ // reverse scan too.
+ self.forward()
+ .try_search_fwd(&mut cache.forward, &input.into().earliest(true))
+ .unwrap()
+ .is_some()
}
/// Returns the start and end offset of the leftmost match. If no match
@@ -446,25 +300,35 @@ impl Regex {
///
/// # Panics
///
- /// If the underlying lazy DFAs return an error, then this routine panics.
- /// This only occurs in non-default configurations where quit bytes are
- /// used, Unicode word boundaries are heuristically enabled or limits are
- /// set on the number of times the lazy DFA's cache may be cleared.
+ /// This routine panics if the search could not complete. This can occur
+ /// in a number of circumstances:
+ ///
+ /// * The configuration of the lazy DFA may permit it to "quit" the search.
+ /// For example, setting quit bytes or enabling heuristic support for
+ /// Unicode word boundaries. The default configuration does not enable any
+ /// option that could result in the lazy DFA quitting.
+ /// * The configuration of the lazy DFA may also permit it to "give up"
+ /// on a search if it makes ineffective use of its transition table
+ /// cache. The default configuration does not enable this by default,
+ /// although it is typically a good idea to.
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode.
///
- /// The fallible version of this routine is
- /// [`try_find_leftmost`](Regex::try_find_leftmost).
+ /// When a search panics, callers cannot know whether a match exists or
+ /// not.
+ ///
+ /// Use [`Regex::try_search`] if you want to handle these error conditions.
///
/// # Example
///
/// ```
- /// use regex_automata::{MultiMatch, hybrid::regex::Regex};
+ /// use regex_automata::{Match, hybrid::regex::Regex};
///
- /// // Greediness is applied appropriately when compared to find_earliest.
/// let re = Regex::new("foo[0-9]+")?;
/// let mut cache = re.create_cache();
/// assert_eq!(
- /// Some(MultiMatch::must(0, 3, 11)),
- /// re.find_leftmost(&mut cache, b"zzzfoo12345zzz"),
+ /// Some(Match::must(0, 3..11)),
+ /// re.find(&mut cache, "zzzfoo12345zzz"),
/// );
///
/// // Even though a match is found after reading the first byte (`a`),
@@ -473,892 +337,182 @@ impl Regex {
/// // parts.
/// let re = Regex::new("abc|a")?;
/// let mut cache = re.create_cache();
- /// assert_eq!(
- /// Some(MultiMatch::must(0, 0, 3)),
- /// re.find_leftmost(&mut cache, b"abc"),
- /// );
+ /// assert_eq!(Some(Match::must(0, 0..3)), re.find(&mut cache, "abc"));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
- pub fn find_leftmost(
+ #[inline]
+ pub fn find<'h, I: Into<Input<'h>>>(
&self,
cache: &mut Cache,
- haystack: &[u8],
- ) -> Option<MultiMatch> {
- self.try_find_leftmost(cache, haystack).unwrap()
+ input: I,
+ ) -> Option<Match> {
+ self.try_search(cache, &input.into()).unwrap()
}
- /// Search for the first overlapping match in `haystack`.
- ///
- /// This routine is principally useful when searching for multiple patterns
- /// on inputs where multiple patterns may match the same regions of text.
- /// In particular, callers must preserve the automaton's search state from
- /// prior calls so that the implementation knows where the last match
- /// occurred and which pattern was reported.
- ///
- /// # Panics
- ///
- /// If the underlying lazy DFAs return an error, then this routine panics.
- /// This only occurs in non-default configurations where quit bytes are
- /// used, Unicode word boundaries are heuristically enabled or limits are
- /// set on the number of times the lazy DFA's cache may be cleared.
- ///
- /// The fallible version of this routine is
- /// [`try_find_overlapping`](Regex::try_find_overlapping).
- ///
- /// # Example
- ///
- /// This example shows how to run an overlapping search with multiple
- /// regexes.
- ///
- /// ```
- /// use regex_automata::{
- /// hybrid::{dfa::DFA, regex::Regex, OverlappingState},
- /// MatchKind,
- /// MultiMatch,
- /// };
- ///
- /// let re = Regex::builder()
- /// .dfa(DFA::config().match_kind(MatchKind::All))
- /// .build_many(&[r"\w+$", r"\S+$"])?;
- /// let mut cache = re.create_cache();
- ///
- /// let haystack = "@foo".as_bytes();
- /// let mut state = OverlappingState::start();
- ///
- /// let expected = Some(MultiMatch::must(1, 0, 4));
- /// let got = re.find_overlapping(&mut cache, haystack, &mut state);
- /// assert_eq!(expected, got);
- ///
- /// // The first pattern also matches at the same position, so re-running
- /// // the search will yield another match. Notice also that the first
- /// // pattern is returned after the second. This is because the second
- /// // pattern begins its match before the first, is therefore an earlier
- /// // match and is thus reported first.
- /// let expected = Some(MultiMatch::must(0, 1, 4));
- /// let got = re.find_overlapping(&mut cache, haystack, &mut state);
- /// assert_eq!(expected, got);
- ///
- /// # Ok::<(), Box<dyn std::error::Error>>(())
- /// ```
- pub fn find_overlapping(
- &self,
- cache: &mut Cache,
- haystack: &[u8],
- state: &mut OverlappingState,
- ) -> Option<MultiMatch> {
- self.try_find_overlapping(cache, haystack, state).unwrap()
- }
-
- /// Returns an iterator over all non-overlapping "earliest" matches.
- ///
- /// Match positions are reported as soon as a match is known to occur, even
- /// if the standard leftmost match would be longer.
+ /// Returns an iterator over all non-overlapping leftmost matches in the
+ /// given bytes. If no match exists, then the iterator yields no elements.
///
/// # Panics
///
- /// If the underlying lazy DFAs return an error, then this routine panics.
- /// This only occurs in non-default configurations where quit bytes are
- /// used, Unicode word boundaries are heuristically enabled or limits are
- /// set on the number of times the lazy DFA's cache may be cleared.
- ///
- /// The fallible version of this routine is
- /// [`try_find_earliest_iter`](Regex::try_find_earliest_iter).
- ///
- /// # Example
- ///
- /// This example shows how to run an "earliest" iterator.
- ///
- /// ```
- /// use regex_automata::{hybrid::regex::Regex, MultiMatch};
- ///
- /// let re = Regex::new("[0-9]+")?;
- /// let mut cache = re.create_cache();
- /// let haystack = "123".as_bytes();
- ///
- /// // Normally, a standard leftmost iterator would return a single
- /// // match, but since "earliest" detects matches earlier, we get
- /// // three matches.
- /// let mut it = re.find_earliest_iter(&mut cache, haystack);
- /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next());
- /// assert_eq!(None, it.next());
- ///
- /// # Ok::<(), Box<dyn std::error::Error>>(())
- /// ```
- pub fn find_earliest_iter<'r, 'c, 't>(
- &'r self,
- cache: &'c mut Cache,
- haystack: &'t [u8],
- ) -> FindEarliestMatches<'r, 'c, 't> {
- FindEarliestMatches::new(self, cache, haystack)
- }
-
- /// Returns an iterator over all non-overlapping leftmost matches in the
- /// given bytes. If no match exists, then the iterator yields no elements.
+ /// This routine panics if the search could not complete. This can occur
+ /// in a number of circumstances:
///
- /// This corresponds to the "standard" regex search iterator.
+ /// * The configuration of the lazy DFA may permit it to "quit" the search.
+ /// For example, setting quit bytes or enabling heuristic support for
+ /// Unicode word boundaries. The default configuration does not enable any
+ /// option that could result in the lazy DFA quitting.
+ /// * The configuration of the lazy DFA may also permit it to "give up"
+ /// on a search if it makes ineffective use of its transition table
+ /// cache. The default configuration does not enable this by default,
+ /// although it is typically a good idea to.
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode.
///
- /// # Panics
+ /// When a search panics, callers cannot know whether a match exists or
+ /// not.
///
- /// If the underlying lazy DFAs return an error, then this routine panics.
- /// This only occurs in non-default configurations where quit bytes are
- /// used, Unicode word boundaries are heuristically enabled or limits are
- /// set on the number of times the lazy DFA's cache may be cleared.
+ /// The above conditions also apply to the iterator returned as well. For
+ /// example, if the lazy DFA gives up or quits during a search using this
+ /// method, then a panic will occur during iteration.
///
- /// The fallible version of this routine is
- /// [`try_find_leftmost_iter`](Regex::try_find_leftmost_iter).
+ /// Use [`Regex::try_search`] with [`util::iter::Searcher`](iter::Searcher)
+ /// if you want to handle these error conditions.
///
/// # Example
///
/// ```
- /// use regex_automata::{MultiMatch, hybrid::regex::Regex};
+ /// use regex_automata::{hybrid::regex::Regex, Match};
///
/// let re = Regex::new("foo[0-9]+")?;
/// let mut cache = re.create_cache();
///
- /// let text = b"foo1 foo12 foo123";
- /// let matches: Vec<MultiMatch> = re
- /// .find_leftmost_iter(&mut cache, text)
- /// .collect();
+ /// let text = "foo1 foo12 foo123";
+ /// let matches: Vec<Match> = re.find_iter(&mut cache, text).collect();
/// assert_eq!(matches, vec![
- /// MultiMatch::must(0, 0, 4),
- /// MultiMatch::must(0, 5, 10),
- /// MultiMatch::must(0, 11, 17),
+ /// Match::must(0, 0..4),
+ /// Match::must(0, 5..10),
+ /// Match::must(0, 11..17),
/// ]);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
- pub fn find_leftmost_iter<'r, 'c, 't>(
- &'r self,
- cache: &'c mut Cache,
- haystack: &'t [u8],
- ) -> FindLeftmostMatches<'r, 'c, 't> {
- FindLeftmostMatches::new(self, cache, haystack)
- }
-
- /// Returns an iterator over all overlapping matches in the given haystack.
- ///
- /// This routine is principally useful when searching for multiple patterns
- /// on inputs where multiple patterns may match the same regions of text.
- /// The iterator takes care of handling the overlapping state that must be
- /// threaded through every search.
- ///
- /// # Panics
- ///
- /// If the underlying lazy DFAs return an error, then this routine panics.
- /// This only occurs in non-default configurations where quit bytes are
- /// used, Unicode word boundaries are heuristically enabled or limits are
- /// set on the number of times the lazy DFA's cache may be cleared.
- ///
- /// The fallible version of this routine is
- /// [`try_find_overlapping_iter`](Regex::try_find_overlapping_iter).
- ///
- /// # Example
- ///
- /// This example shows how to run an overlapping search with multiple
- /// regexes.
- ///
- /// ```
- /// use regex_automata::{
- /// hybrid::{dfa::DFA, regex::Regex},
- /// MatchKind,
- /// MultiMatch,
- /// };
- ///
- /// let re = Regex::builder()
- /// .dfa(DFA::config().match_kind(MatchKind::All))
- /// .build_many(&[r"\w+$", r"\S+$"])?;
- /// let mut cache = re.create_cache();
- /// let haystack = "@foo".as_bytes();
- ///
- /// let mut it = re.find_overlapping_iter(&mut cache, haystack);
- /// assert_eq!(Some(MultiMatch::must(1, 0, 4)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 1, 4)), it.next());
- /// assert_eq!(None, it.next());
- ///
- /// # Ok::<(), Box<dyn std::error::Error>>(())
- /// ```
- pub fn find_overlapping_iter<'r, 'c, 't>(
- &'r self,
- cache: &'c mut Cache,
- haystack: &'t [u8],
- ) -> FindOverlappingMatches<'r, 'c, 't> {
- FindOverlappingMatches::new(self, cache, haystack)
- }
-}
-
-/// Lower level infallible search routines that permit controlling where
-/// the search starts and ends in a particular sequence. This is useful for
-/// executing searches that need to take surrounding context into account. This
-/// is required for correctly implementing iteration because of look-around
-/// operators (`^`, `$`, `\b`).
-impl Regex {
- /// Returns true if and only if this regex matches the given haystack.
- ///
- /// This routine may short circuit if it knows that scanning future input
- /// will never lead to a different result. In particular, if the underlying
- /// DFA enters a match state or a dead state, then this routine will return
- /// `true` or `false`, respectively, without inspecting any future input.
- ///
- /// # Searching a substring of the haystack
- ///
- /// Being an "at" search routine, this permits callers to search a
- /// substring of `haystack` by specifying a range in `haystack`.
- /// Why expose this as an API instead of just asking callers to use
- /// `&input[start..end]`? The reason is that regex matching often wants
- /// to take the surrounding context into account in order to handle
- /// look-around (`^`, `$` and `\b`).
- ///
- /// # Panics
- ///
- /// If the underlying lazy DFAs return an error, then this routine panics.
- /// This only occurs in non-default configurations where quit bytes are
- /// used, Unicode word boundaries are heuristically enabled or limits are
- /// set on the number of times the lazy DFA's cache may be cleared.
- ///
- /// The fallible version of this routine is
- /// [`try_is_match_at`](Regex::try_is_match_at).
- pub fn is_match_at(
- &self,
- cache: &mut Cache,
- haystack: &[u8],
- start: usize,
- end: usize,
- ) -> bool {
- self.try_is_match_at(cache, haystack, start, end).unwrap()
- }
-
- /// Returns the first position at which a match is found.
- ///
- /// This routine stops scanning input in precisely the same circumstances
- /// as `is_match`. The key difference is that this routine returns the
- /// position at which it stopped scanning input if and only if a match
- /// was found. If no match is found, then `None` is returned.
- ///
- /// # Searching a substring of the haystack
- ///
- /// Being an "at" search routine, this permits callers to search a
- /// substring of `haystack` by specifying a range in `haystack`.
- /// Why expose this as an API instead of just asking callers to use
- /// `&input[start..end]`? The reason is that regex matching often wants
- /// to take the surrounding context into account in order to handle
- /// look-around (`^`, `$` and `\b`).
- ///
- /// This is useful when implementing an iterator over matches
- /// within the same haystack, which cannot be done correctly by simply
- /// providing a subslice of `haystack`.
- ///
- /// # Panics
- ///
- /// If the underlying lazy DFAs return an error, then this routine panics.
- /// This only occurs in non-default configurations where quit bytes are
- /// used, Unicode word boundaries are heuristically enabled or limits are
- /// set on the number of times the lazy DFA's cache may be cleared.
- ///
- /// The fallible version of this routine is
- /// [`try_find_earliest_at`](Regex::try_find_earliest_at).
- pub fn find_earliest_at(
- &self,
- cache: &mut Cache,
- haystack: &[u8],
- start: usize,
- end: usize,
- ) -> Option<MultiMatch> {
- self.try_find_earliest_at(cache, haystack, start, end).unwrap()
- }
-
- /// Returns the same as `find_leftmost`, but starts the search at the given
- /// offset.
- ///
- /// # Searching a substring of the haystack
- ///
- /// Being an "at" search routine, this permits callers to search a
- /// substring of `haystack` by specifying a range in `haystack`.
- /// Why expose this as an API instead of just asking callers to use
- /// `&input[start..end]`? The reason is that regex matching often wants
- /// to take the surrounding context into account in order to handle
- /// look-around (`^`, `$` and `\b`).
- ///
- /// This is useful when implementing an iterator over matches within the
- /// same haystack, which cannot be done correctly by simply providing a
- /// subslice of `haystack`.
- ///
- /// # Panics
- ///
- /// If the underlying lazy DFAs return an error, then this routine panics.
- /// This only occurs in non-default configurations where quit bytes are
- /// used, Unicode word boundaries are heuristically enabled or limits are
- /// set on the number of times the lazy DFA's cache may be cleared.
- ///
- /// The fallible version of this routine is
- /// [`try_find_leftmost_at`](Regex::try_find_leftmost_at).
- pub fn find_leftmost_at(
- &self,
- cache: &mut Cache,
- haystack: &[u8],
- start: usize,
- end: usize,
- ) -> Option<MultiMatch> {
- self.try_find_leftmost_at(cache, haystack, start, end).unwrap()
- }
-
- /// Search for the first overlapping match within a given range of
- /// `haystack`.
- ///
- /// This routine is principally useful when searching for multiple patterns
- /// on inputs where multiple patterns may match the same regions of text.
- /// In particular, callers must preserve the automaton's search state from
- /// prior calls so that the implementation knows where the last match
- /// occurred and which pattern was reported.
- ///
- /// # Searching a substring of the haystack
- ///
- /// Being an "at" search routine, this permits callers to search a
- /// substring of `haystack` by specifying a range in `haystack`.
- /// Why expose this as an API instead of just asking callers to use
- /// `&input[start..end]`? The reason is that regex matching often wants
- /// to take the surrounding context into account in order to handle
- /// look-around (`^`, `$` and `\b`).
- ///
- /// This is useful when implementing an iterator over matches
- /// within the same haystack, which cannot be done correctly by simply
- /// providing a subslice of `haystack`.
- ///
- /// # Panics
- ///
- /// If the underlying lazy DFAs return an error, then this routine panics.
- /// This only occurs in non-default configurations where quit bytes are
- /// used, Unicode word boundaries are heuristically enabled or limits are
- /// set on the number of times the lazy DFA's cache may be cleared.
- ///
- /// The fallible version of this routine is
- /// [`try_find_overlapping_at`](Regex::try_find_overlapping_at).
- pub fn find_overlapping_at(
- &self,
- cache: &mut Cache,
- haystack: &[u8],
- start: usize,
- end: usize,
- state: &mut OverlappingState,
- ) -> Option<MultiMatch> {
- self.try_find_overlapping_at(cache, haystack, start, end, state)
- .unwrap()
- }
-}
-
-/// Fallible search routines. These may return an error when the underlying
-/// lazy DFAs have been configured in a way that permits them to fail during a
-/// search.
-///
-/// Errors during search only occur when the lazy DFA has been explicitly
-/// configured to do so, usually by specifying one or more "quit" bytes or by
-/// heuristically enabling Unicode word boundaries.
-///
-/// Errors will never be returned using the default configuration. So these
-/// fallible routines are only needed for particular configurations.
-impl Regex {
- /// Returns true if and only if this regex matches the given haystack.
- ///
- /// This routine may short circuit if it knows that scanning future input
- /// will never lead to a different result. In particular, if the underlying
- /// DFA enters a match state or a dead state, then this routine will return
- /// `true` or `false`, respectively, without inspecting any future input.
- ///
- /// # Errors
- ///
- /// This routine only errors if the search could not complete. For
- /// DFA-based regexes, this only occurs in a non-default configuration
- /// where quit bytes are used, Unicode word boundaries are heuristically
- /// enabled or limits are set on the number of times the lazy DFA's cache
- /// may be cleared.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// The infallible (panics on error) version of this routine is
- /// [`is_match`](Regex::is_match).
- pub fn try_is_match(
- &self,
- cache: &mut Cache,
- haystack: &[u8],
- ) -> Result<bool, MatchError> {
- self.try_is_match_at(cache, haystack, 0, haystack.len())
- }
-
- /// Returns the first position at which a match is found.
- ///
- /// This routine stops scanning input in precisely the same circumstances
- /// as `is_match`. The key difference is that this routine returns the
- /// position at which it stopped scanning input if and only if a match
- /// was found. If no match is found, then `None` is returned.
- ///
- /// # Errors
- ///
- /// This routine only errors if the search could not complete. For
- /// DFA-based regexes, this only occurs in a non-default configuration
- /// where quit bytes are used, Unicode word boundaries are heuristically
- /// enabled or limits are set on the number of times the lazy DFA's cache
- /// may be cleared.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// The infallible (panics on error) version of this routine is
- /// [`find_earliest`](Regex::find_earliest).
- pub fn try_find_earliest(
- &self,
- cache: &mut Cache,
- haystack: &[u8],
- ) -> Result<Option<MultiMatch>, MatchError> {
- self.try_find_earliest_at(cache, haystack, 0, haystack.len())
- }
-
- /// Returns the start and end offset of the leftmost match. If no match
- /// exists, then `None` is returned.
- ///
- /// # Errors
- ///
- /// This routine only errors if the search could not complete. For
- /// DFA-based regexes, this only occurs in a non-default configuration
- /// where quit bytes are used, Unicode word boundaries are heuristically
- /// enabled or limits are set on the number of times the lazy DFA's cache
- /// may be cleared.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// The infallible (panics on error) version of this routine is
- /// [`find_leftmost`](Regex::find_leftmost).
- pub fn try_find_leftmost(
- &self,
- cache: &mut Cache,
- haystack: &[u8],
- ) -> Result<Option<MultiMatch>, MatchError> {
- self.try_find_leftmost_at(cache, haystack, 0, haystack.len())
- }
-
- /// Search for the first overlapping match in `haystack`.
- ///
- /// This routine is principally useful when searching for multiple patterns
- /// on inputs where multiple patterns may match the same regions of text.
- /// In particular, callers must preserve the automaton's search state from
- /// prior calls so that the implementation knows where the last match
- /// occurred and which pattern was reported.
- ///
- /// # Errors
- ///
- /// This routine only errors if the search could not complete. For
- /// DFA-based regexes, this only occurs in a non-default configuration
- /// where quit bytes are used, Unicode word boundaries are heuristically
- /// enabled or limits are set on the number of times the lazy DFA's cache
- /// may be cleared.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// The infallible (panics on error) version of this routine is
- /// [`find_overlapping`](Regex::find_overlapping).
- pub fn try_find_overlapping(
- &self,
- cache: &mut Cache,
- haystack: &[u8],
- state: &mut OverlappingState,
- ) -> Result<Option<MultiMatch>, MatchError> {
- self.try_find_overlapping_at(cache, haystack, 0, haystack.len(), state)
- }
-
- /// Returns an iterator over all non-overlapping "earliest" matches.
- ///
- /// Match positions are reported as soon as a match is known to occur, even
- /// if the standard leftmost match would be longer.
- ///
- /// # Errors
- ///
- /// This iterator only yields errors if the search could not complete. For
- /// DFA-based regexes, this only occurs in a non-default configuration
- /// where quit bytes are used, Unicode word boundaries are heuristically
- /// enabled or limits are set on the number of times the lazy DFA's cache
- /// may be cleared.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// The infallible (panics on error) version of this routine is
- /// [`find_earliest_iter`](Regex::find_earliest_iter).
- pub fn try_find_earliest_iter<'r, 'c, 't>(
- &'r self,
- cache: &'c mut Cache,
- haystack: &'t [u8],
- ) -> TryFindEarliestMatches<'r, 'c, 't> {
- TryFindEarliestMatches::new(self, cache, haystack)
- }
-
- /// Returns an iterator over all non-overlapping leftmost matches in the
- /// given bytes. If no match exists, then the iterator yields no elements.
- ///
- /// This corresponds to the "standard" regex search iterator.
- ///
- /// # Errors
- ///
- /// This iterator only yields errors if the search could not complete. For
- /// DFA-based regexes, this only occurs in a non-default configuration
- /// where quit bytes are used, Unicode word boundaries are heuristically
- /// enabled or limits are set on the number of times the lazy DFA's cache
- /// may be cleared.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// The infallible (panics on error) version of this routine is
- /// [`find_leftmost_iter`](Regex::find_leftmost_iter).
- pub fn try_find_leftmost_iter<'r, 'c, 't>(
+ #[inline]
+ pub fn find_iter<'r, 'c, 'h, I: Into<Input<'h>>>(
&'r self,
cache: &'c mut Cache,
- haystack: &'t [u8],
- ) -> TryFindLeftmostMatches<'r, 'c, 't> {
- TryFindLeftmostMatches::new(self, cache, haystack)
- }
-
- /// Returns an iterator over all overlapping matches in the given haystack.
- ///
- /// This routine is principally useful when searching for multiple patterns
- /// on inputs where multiple patterns may match the same regions of text.
- /// The iterator takes care of handling the overlapping state that must be
- /// threaded through every search.
- ///
- /// # Errors
- ///
- /// This iterator only yields errors if the search could not complete. For
- /// DFA-based regexes, this only occurs in a non-default configuration
- /// where quit bytes are used, Unicode word boundaries are heuristically
- /// enabled or limits are set on the number of times the lazy DFA's cache
- /// may be cleared.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// The infallible (panics on error) version of this routine is
- /// [`find_overlapping_iter`](Regex::find_overlapping_iter).
- pub fn try_find_overlapping_iter<'r, 'c, 't>(
- &'r self,
- cache: &'c mut Cache,
- haystack: &'t [u8],
- ) -> TryFindOverlappingMatches<'r, 'c, 't> {
- TryFindOverlappingMatches::new(self, cache, haystack)
+ input: I,
+ ) -> FindMatches<'r, 'c, 'h> {
+ let it = iter::Searcher::new(input.into());
+ FindMatches { re: self, cache, it }
}
}
-/// Lower level fallible search routines that permit controlling where the
-/// search starts and ends in a particular sequence.
+/// Lower level "search" primitives that accept a `&Input` for cheap reuse
+/// and return an error if one occurs instead of panicking.
impl Regex {
- /// Returns true if and only if this regex matches the given haystack.
- ///
- /// This routine may short circuit if it knows that scanning future input
- /// will never lead to a different result. In particular, if the underlying
- /// DFA enters a match state or a dead state, then this routine will return
- /// `true` or `false`, respectively, without inspecting any future input.
- ///
- /// # Searching a substring of the haystack
- ///
- /// Being an "at" search routine, this permits callers to search a
- /// substring of `haystack` by specifying a range in `haystack`.
- /// Why expose this as an API instead of just asking callers to use
- /// `&input[start..end]`? The reason is that regex matching often wants
- /// to take the surrounding context into account in order to handle
- /// look-around (`^`, `$` and `\b`).
- ///
- /// # Errors
- ///
- /// This routine only errors if the search could not complete. For
- /// DFA-based regexes, this only occurs in a non-default configuration
- /// where quit bytes are used, Unicode word boundaries are heuristically
- /// enabled or limits are set on the number of times the lazy DFA's cache
- /// may be cleared.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// The infallible (panics on error) version of this routine is
- /// [`is_match_at`](Regex::is_match_at).
- pub fn try_is_match_at(
- &self,
- cache: &mut Cache,
- haystack: &[u8],
- start: usize,
- end: usize,
- ) -> Result<bool, MatchError> {
- self.forward()
- .find_leftmost_fwd_at(
- &mut cache.forward,
- self.scanner().as_mut(),
- None,
- haystack,
- start,
- end,
- )
- .map(|x| x.is_some())
- }
-
- /// Returns the first position at which a match is found.
- ///
- /// This routine stops scanning input in precisely the same circumstances
- /// as `is_match`. The key difference is that this routine returns the
- /// position at which it stopped scanning input if and only if a match
- /// was found. If no match is found, then `None` is returned.
- ///
- /// # Searching a substring of the haystack
- ///
- /// Being an "at" search routine, this permits callers to search a
- /// substring of `haystack` by specifying a range in `haystack`.
- /// Why expose this as an API instead of just asking callers to use
- /// `&input[start..end]`? The reason is that regex matching often wants
- /// to take the surrounding context into account in order to handle
- /// look-around (`^`, `$` and `\b`).
- ///
- /// This is useful when implementing an iterator over matches
- /// within the same haystack, which cannot be done correctly by simply
- /// providing a subslice of `haystack`.
- ///
- /// # Errors
- ///
- /// This routine only errors if the search could not complete. For
- /// DFA-based regexes, this only occurs in a non-default configuration
- /// where quit bytes are used, Unicode word boundaries are heuristically
- /// enabled or limits are set on the number of times the lazy DFA's cache
- /// may be cleared.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// The infallible (panics on error) version of this routine is
- /// [`find_earliest_at`](Regex::find_earliest_at).
- pub fn try_find_earliest_at(
- &self,
- cache: &mut Cache,
- haystack: &[u8],
- start: usize,
- end: usize,
- ) -> Result<Option<MultiMatch>, MatchError> {
- self.try_find_earliest_at_imp(
- self.scanner().as_mut(),
- cache,
- haystack,
- start,
- end,
- )
- }
-
/// Returns the start and end offset of the leftmost match. If no match
/// exists, then `None` is returned.
///
- /// # Searching a substring of the haystack
+ /// This is like [`Regex::find`] but with two differences:
///
- /// Being an "at" search routine, this permits callers to search a
- /// substring of `haystack` by specifying a range in `haystack`.
- /// Why expose this as an API instead of just asking callers to use
- /// `&input[start..end]`? The reason is that regex matching often wants
- /// to take the surrounding context into account in order to handle
- /// look-around (`^`, `$` and `\b`).
- ///
- /// This is useful when implementing an iterator over matches
- /// within the same haystack, which cannot be done correctly by simply
- /// providing a subslice of `haystack`.
+ /// 1. It is not generic over `Into<Input>` and instead accepts a
+ /// `&Input`. This permits reusing the same `Input` for multiple searches
+ /// without needing to create a new one. This _may_ help with latency.
+ /// 2. It returns an error if the search could not complete where as
+ /// [`Regex::find`] will panic.
///
/// # Errors
///
- /// This routine only errors if the search could not complete. For
- /// DFA-based regexes, this only occurs in a non-default configuration
- /// where quit bytes are used, Unicode word boundaries are heuristically
- /// enabled or limits are set on the number of times the lazy DFA's cache
- /// may be cleared.
- ///
- /// When a search cannot complete, callers cannot know whether a match
+ /// This routine errors if the search could not complete. This can occur
+ /// in a number of circumstances:
+ ///
+ /// * The configuration of the lazy DFA may permit it to "quit" the search.
+ /// For example, setting quit bytes or enabling heuristic support for
+ /// Unicode word boundaries. The default configuration does not enable any
+ /// option that could result in the lazy DFA quitting.
+ /// * The configuration of the lazy DFA may also permit it to "give up"
+ /// on a search if it makes ineffective use of its transition table
+ /// cache. The default configuration does not enable this by default,
+ /// although it is typically a good idea to.
+ /// * When the provided `Input` configuration is not supported. For
+ /// example, by providing an unsupported anchor mode.
+ ///
+ /// When a search returns an error, callers cannot know whether a match
/// exists or not.
- ///
- /// The infallible (panics on error) version of this routine is
- /// [`find_leftmost_at`](Regex::find_leftmost_at).
- pub fn try_find_leftmost_at(
+ #[inline]
+ pub fn try_search(
&self,
cache: &mut Cache,
- haystack: &[u8],
- start: usize,
- end: usize,
- ) -> Result<Option<MultiMatch>, MatchError> {
- self.try_find_leftmost_at_imp(
- self.scanner().as_mut(),
- cache,
- haystack,
- start,
- end,
- )
- }
-
- /// Search for the first overlapping match within a given range of
- /// `haystack`.
- ///
- /// This routine is principally useful when searching for multiple patterns
- /// on inputs where multiple patterns may match the same regions of text.
- /// In particular, callers must preserve the automaton's search state from
- /// prior calls so that the implementation knows where the last match
- /// occurred and which pattern was reported.
- ///
- /// # Searching a substring of the haystack
- ///
- /// Being an "at" search routine, this permits callers to search a
- /// substring of `haystack` by specifying a range in `haystack`.
- /// Why expose this as an API instead of just asking callers to use
- /// `&input[start..end]`? The reason is that regex matching often wants
- /// to take the surrounding context into account in order to handle
- /// look-around (`^`, `$` and `\b`).
- ///
- /// This is useful when implementing an iterator over matches
- /// within the same haystack, which cannot be done correctly by simply
- /// providing a subslice of `haystack`.
- ///
- /// # Errors
- ///
- /// This routine only errors if the search could not complete. For
- /// DFA-based regexes, this only occurs in a non-default configuration
- /// where quit bytes are used, Unicode word boundaries are heuristically
- /// enabled or limits are set on the number of times the lazy DFA's cache
- /// may be cleared.
- ///
- /// When a search cannot complete, callers cannot know whether a match
- /// exists or not.
- ///
- /// The infallible (panics on error) version of this routine is
- /// [`find_overlapping_at`](Regex::find_overlapping_at).
- pub fn try_find_overlapping_at(
- &self,
- cache: &mut Cache,
- haystack: &[u8],
- start: usize,
- end: usize,
- state: &mut OverlappingState,
- ) -> Result<Option<MultiMatch>, MatchError> {
- self.try_find_overlapping_at_imp(
- self.scanner().as_mut(),
- cache,
- haystack,
- start,
- end,
- state,
- )
- }
-}
-
-impl Regex {
- #[inline(always)]
- fn try_find_earliest_at_imp(
- &self,
- pre: Option<&mut prefilter::Scanner>,
- cache: &mut Cache,
- haystack: &[u8],
- start: usize,
- end: usize,
- ) -> Result<Option<MultiMatch>, MatchError> {
- let (fdfa, rdfa) = (self.forward(), self.reverse());
+ input: &Input<'_>,
+ ) -> Result<Option<Match>, MatchError> {
let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse);
- let end = match fdfa
- .find_earliest_fwd_at(fcache, pre, None, haystack, start, end)?
- {
+ let end = match self.forward().try_search_fwd(fcache, input)? {
None => return Ok(None),
Some(end) => end,
};
- // N.B. The only time we need to tell the reverse searcher the pattern
- // to match is in the overlapping case, since it's ambiguous. In the
- // earliest case, I have tentatively convinced myself that it isn't
- // necessary and the reverse search will always find the same pattern
- // to match as the forward search. But I lack a rigorous proof. Why not
- // just provide the pattern anyway? Well, if it is needed, then leaving
- // it out gives us a chance to find a witness.
- let start = rdfa
- .find_earliest_rev_at(rcache, None, haystack, start, end.offset())?
- .expect("reverse search must match if forward search does");
- assert_eq!(
- start.pattern(),
- end.pattern(),
- "forward and reverse search must match same pattern",
- );
- assert!(start.offset() <= end.offset());
- Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset())))
- }
-
- #[inline(always)]
- fn try_find_leftmost_at_imp(
- &self,
- pre: Option<&mut prefilter::Scanner>,
- cache: &mut Cache,
- haystack: &[u8],
- start: usize,
- end: usize,
- ) -> Result<Option<MultiMatch>, MatchError> {
- let (fdfa, rdfa) = (self.forward(), self.reverse());
- let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse);
- let end = match fdfa
- .find_leftmost_fwd_at(fcache, pre, None, haystack, start, end)?
- {
- None => return Ok(None),
- Some(end) => end,
- };
- // N.B. The only time we need to tell the reverse searcher the pattern
- // to match is in the overlapping case, since it's ambiguous. In the
- // leftmost case, I have tentatively convinced myself that it isn't
- // necessary and the reverse search will always find the same pattern
- // to match as the forward search. But I lack a rigorous proof. Why not
- // just provide the pattern anyway? Well, if it is needed, then leaving
- // it out gives us a chance to find a witness.
- let start = rdfa
- .find_leftmost_rev_at(rcache, None, haystack, start, end.offset())?
+ // This special cases an empty match at the beginning of the search. If
+ // our end matches our start, then since a reverse DFA can't match past
+ // the start, it must follow that our starting position is also our end
+ // position. So short circuit and skip the reverse search.
+ if input.start() == end.offset() {
+ return Ok(Some(Match::new(
+ end.pattern(),
+ end.offset()..end.offset(),
+ )));
+ }
+ // We can also skip the reverse search if we know our search was
+ // anchored. This occurs either when the input config is anchored or
+ // when we know the regex itself is anchored. In this case, we know the
+ // start of the match, if one is found, must be the start of the
+ // search.
+ if self.is_anchored(input) {
+ return Ok(Some(Match::new(
+ end.pattern(),
+ input.start()..end.offset(),
+ )));
+ }
+ // N.B. I have tentatively convinced myself that it isn't necessary
+ // to specify the specific pattern for the reverse search since the
+ // reverse search will always find the same pattern to match as the
+ // forward search. But I lack a rigorous proof. Why not just provide
+ // the pattern anyway? Well, if it is needed, then leaving it out
+ // gives us a chance to find a witness. (Also, if we don't need to
+ // specify the pattern, then we don't need to build the reverse DFA
+ // with 'starts_for_each_pattern' enabled. It doesn't matter too much
+ // for the lazy DFA, but does make the overall DFA bigger.)
+ //
+ // We also need to be careful to disable 'earliest' for the reverse
+ // search, since it could be enabled for the forward search. In the
+ // reverse case, to satisfy "leftmost" criteria, we need to match as
+ // much as we can. We also need to be careful to make the search
+ // anchored. We don't want the reverse search to report any matches
+ // other than the one beginning at the end of our forward search.
+ let revsearch = input
+ .clone()
+ .span(input.start()..end.offset())
+ .anchored(Anchored::Yes)
+ .earliest(false);
+ let start = self
+ .reverse()
+ .try_search_rev(rcache, &revsearch)?
.expect("reverse search must match if forward search does");
- assert_eq!(
+ debug_assert_eq!(
start.pattern(),
end.pattern(),
"forward and reverse search must match same pattern",
);
- assert!(start.offset() <= end.offset());
- Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset())))
+ debug_assert!(start.offset() <= end.offset());
+ Ok(Some(Match::new(end.pattern(), start.offset()..end.offset())))
}
- #[inline(always)]
- fn try_find_overlapping_at_imp(
- &self,
- pre: Option<&mut prefilter::Scanner>,
- cache: &mut Cache,
- haystack: &[u8],
- start: usize,
- end: usize,
- state: &mut OverlappingState,
- ) -> Result<Option<MultiMatch>, MatchError> {
- let (fdfa, rdfa) = (self.forward(), self.reverse());
- let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse);
- let end = match fdfa.find_overlapping_fwd_at(
- fcache, pre, None, haystack, start, end, state,
- )? {
- None => return Ok(None),
- Some(end) => end,
- };
- // Unlike the leftmost cases, the reverse overlapping search may match
- // a different pattern than the forward search. See test failures when
- // using `None` instead of `Some(end.pattern())` below. Thus, we must
- // run our reverse search using the pattern that matched in the forward
- // direction.
- let start = rdfa
- .find_leftmost_rev_at(
- rcache,
- Some(end.pattern()),
- haystack,
- 0,
- end.offset(),
- )?
- .expect("reverse search must match if forward search does");
- assert_eq!(
- start.pattern(),
- end.pattern(),
- "forward and reverse search must match same pattern",
- );
- assert!(start.offset() <= end.offset());
- Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset())))
+ /// Returns true if either the given input specifies an anchored search
+ /// or if the underlying NFA is always anchored.
+ fn is_anchored(&self, input: &Input<'_>) -> bool {
+ match input.get_anchored() {
+ Anchored::No => {
+ self.forward().get_nfa().is_always_start_anchored()
+ }
+ Anchored::Yes | Anchored::Pattern(_) => true,
+ }
}
}
@@ -1386,360 +540,45 @@ impl Regex {
/// # Example
///
/// ```
- /// use regex_automata::{MultiMatch, hybrid::regex::Regex};
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::hybrid::regex::Regex;
///
/// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?;
- /// assert_eq!(3, re.pattern_count());
+ /// assert_eq!(3, re.pattern_len());
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
- pub fn pattern_count(&self) -> usize {
- assert_eq!(
- self.forward().pattern_count(),
- self.reverse().pattern_count()
- );
- self.forward().pattern_count()
- }
-
- /// Convenience function for returning this regex's prefilter as a trait
- /// object.
- ///
- /// If this regex doesn't have a prefilter, then `None` is returned.
- pub fn prefilter(&self) -> Option<&dyn Prefilter> {
- self.pre.as_ref().map(|x| &**x)
- }
-
- /// Attach the given prefilter to this regex.
- pub fn set_prefilter(&mut self, pre: Option<Box<dyn Prefilter>>) {
- self.pre = pre;
- }
-
- /// Convenience function for returning a prefilter scanner.
- fn scanner(&self) -> Option<prefilter::Scanner> {
- self.prefilter().map(prefilter::Scanner::new)
+ pub fn pattern_len(&self) -> usize {
+ assert_eq!(self.forward().pattern_len(), self.reverse().pattern_len());
+ self.forward().pattern_len()
}
}
-/// An iterator over all non-overlapping earliest matches for a particular
-/// infallible search.
+/// An iterator over all non-overlapping matches for an infallible search.
///
-/// The iterator yields a [`MultiMatch`] value until no more matches could be
-/// found. If the underlying search returns an error, then this panics.
+/// The iterator yields a [`Match`] value until no more matches could be found.
+/// If the underlying regex engine returns an error, then a panic occurs.
///
-/// The lifetime variables are as follows:
+/// The lifetime parameters are as follows:
///
-/// * `'r` is the lifetime of the regular expression itself.
-/// * `'c` is the lifetime of the mutable cache used during search.
-/// * `'t` is the lifetime of the text being searched.
-#[derive(Debug)]
-pub struct FindEarliestMatches<'r, 'c, 't>(TryFindEarliestMatches<'r, 'c, 't>);
-
-impl<'r, 'c, 't> FindEarliestMatches<'r, 'c, 't> {
- fn new(
- re: &'r Regex,
- cache: &'c mut Cache,
- text: &'t [u8],
- ) -> FindEarliestMatches<'r, 'c, 't> {
- FindEarliestMatches(TryFindEarliestMatches::new(re, cache, text))
- }
-}
-
-impl<'r, 'c, 't> Iterator for FindEarliestMatches<'r, 'c, 't> {
- type Item = MultiMatch;
-
- fn next(&mut self) -> Option<MultiMatch> {
- next_unwrap(self.0.next())
- }
-}
-
-/// An iterator over all non-overlapping leftmost matches for a particular
-/// infallible search.
-///
-/// The iterator yields a [`MultiMatch`] value until no more matches could be
-/// found. If the underlying search returns an error, then this panics.
-///
-/// The lifetime variables are as follows:
+/// * `'r` represents the lifetime of the regex object.
+/// * `'h` represents the lifetime of the haystack being searched.
+/// * `'c` represents the lifetime of the regex cache.
///
-/// * `'r` is the lifetime of the regular expression itself.
-/// * `'c` is the lifetime of the mutable cache used during search.
-/// * `'t` is the lifetime of the text being searched.
+/// This iterator can be created with the [`Regex::find_iter`] method.
#[derive(Debug)]
-pub struct FindLeftmostMatches<'r, 'c, 't>(TryFindLeftmostMatches<'r, 'c, 't>);
-
-impl<'r, 'c, 't> FindLeftmostMatches<'r, 'c, 't> {
- fn new(
- re: &'r Regex,
- cache: &'c mut Cache,
- text: &'t [u8],
- ) -> FindLeftmostMatches<'r, 'c, 't> {
- FindLeftmostMatches(TryFindLeftmostMatches::new(re, cache, text))
- }
-}
-
-impl<'r, 'c, 't> Iterator for FindLeftmostMatches<'r, 'c, 't> {
- type Item = MultiMatch;
-
- fn next(&mut self) -> Option<MultiMatch> {
- next_unwrap(self.0.next())
- }
-}
-
-/// An iterator over all overlapping matches for a particular infallible
-/// search.
-///
-/// The iterator yields a [`MultiMatch`] value until no more matches could be
-/// found. If the underlying search returns an error, then this panics.
-///
-/// The lifetime variables are as follows:
-///
-/// * `'r` is the lifetime of the regular expression itself.
-/// * `'c` is the lifetime of the mutable cache used during search.
-/// * `'t` is the lifetime of the text being searched.
-#[derive(Debug)]
-pub struct FindOverlappingMatches<'r, 'c, 't>(
- TryFindOverlappingMatches<'r, 'c, 't>,
-);
-
-impl<'r, 'c, 't> FindOverlappingMatches<'r, 'c, 't> {
- fn new(
- re: &'r Regex,
- cache: &'c mut Cache,
- text: &'t [u8],
- ) -> FindOverlappingMatches<'r, 'c, 't> {
- FindOverlappingMatches(TryFindOverlappingMatches::new(re, cache, text))
- }
-}
-
-impl<'r, 'c, 't> Iterator for FindOverlappingMatches<'r, 'c, 't> {
- type Item = MultiMatch;
-
- fn next(&mut self) -> Option<MultiMatch> {
- next_unwrap(self.0.next())
- }
-}
-
-/// An iterator over all non-overlapping earliest matches for a particular
-/// fallible search.
-///
-/// The iterator yields a [`MultiMatch`] value until no more matches could be
-/// found.
-///
-/// The lifetime variables are as follows:
-///
-/// * `'r` is the lifetime of the regular expression itself.
-/// * `'c` is the lifetime of the mutable cache used during search.
-/// * `'t` is the lifetime of the text being searched.
-#[derive(Debug)]
-pub struct TryFindEarliestMatches<'r, 'c, 't> {
- re: &'r Regex,
- cache: &'c mut Cache,
- scanner: Option<prefilter::Scanner<'r>>,
- text: &'t [u8],
- last_end: usize,
- last_match: Option<usize>,
-}
-
-impl<'r, 'c, 't> TryFindEarliestMatches<'r, 'c, 't> {
- fn new(
- re: &'r Regex,
- cache: &'c mut Cache,
- text: &'t [u8],
- ) -> TryFindEarliestMatches<'r, 'c, 't> {
- let scanner = re.scanner();
- TryFindEarliestMatches {
- re,
- cache,
- scanner,
- text,
- last_end: 0,
- last_match: None,
- }
- }
-}
-
-impl<'r, 'c, 't> Iterator for TryFindEarliestMatches<'r, 'c, 't> {
- type Item = Result<MultiMatch, MatchError>;
-
- fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> {
- if self.last_end > self.text.len() {
- return None;
- }
- let result = self.re.try_find_earliest_at_imp(
- self.scanner.as_mut(),
- self.cache,
- self.text,
- self.last_end,
- self.text.len(),
- );
- let m = match result {
- Err(err) => return Some(Err(err)),
- Ok(None) => return None,
- Ok(Some(m)) => m,
- };
- if m.is_empty() {
- // This is an empty match. To ensure we make progress, start
- // the next search at the smallest possible starting position
- // of the next match following this one.
- self.last_end = if self.re.utf8 {
- crate::util::next_utf8(self.text, m.end())
- } else {
- m.end() + 1
- };
- // Don't accept empty matches immediately following a match.
- // Just move on to the next match.
- if Some(m.end()) == self.last_match {
- return self.next();
- }
- } else {
- self.last_end = m.end();
- }
- self.last_match = Some(m.end());
- Some(Ok(m))
- }
-}
-
-/// An iterator over all non-overlapping leftmost matches for a particular
-/// fallible search.
-///
-/// The iterator yields a [`MultiMatch`] value until no more matches could be
-/// found.
-///
-/// The lifetime variables are as follows:
-///
-/// * `'r` is the lifetime of the regular expression itself.
-/// * `'c` is the lifetime of the mutable cache used during search.
-/// * `'t` is the lifetime of the text being searched.
-#[derive(Debug)]
-pub struct TryFindLeftmostMatches<'r, 'c, 't> {
- re: &'r Regex,
- cache: &'c mut Cache,
- scanner: Option<prefilter::Scanner<'r>>,
- text: &'t [u8],
- last_end: usize,
- last_match: Option<usize>,
-}
-
-impl<'r, 'c, 't> TryFindLeftmostMatches<'r, 'c, 't> {
- fn new(
- re: &'r Regex,
- cache: &'c mut Cache,
- text: &'t [u8],
- ) -> TryFindLeftmostMatches<'r, 'c, 't> {
- let scanner = re.scanner();
- TryFindLeftmostMatches {
- re,
- cache,
- scanner,
- text,
- last_end: 0,
- last_match: None,
- }
- }
-}
-
-impl<'r, 'c, 't> Iterator for TryFindLeftmostMatches<'r, 'c, 't> {
- type Item = Result<MultiMatch, MatchError>;
-
- fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> {
- if self.last_end > self.text.len() {
- return None;
- }
- let result = self.re.try_find_leftmost_at_imp(
- self.scanner.as_mut(),
- self.cache,
- self.text,
- self.last_end,
- self.text.len(),
- );
- let m = match result {
- Err(err) => return Some(Err(err)),
- Ok(None) => return None,
- Ok(Some(m)) => m,
- };
- if m.is_empty() {
- // This is an empty match. To ensure we make progress, start
- // the next search at the smallest possible starting position
- // of the next match following this one.
- self.last_end = if self.re.utf8 {
- crate::util::next_utf8(self.text, m.end())
- } else {
- m.end() + 1
- };
- // Don't accept empty matches immediately following a match.
- // Just move on to the next match.
- if Some(m.end()) == self.last_match {
- return self.next();
- }
- } else {
- self.last_end = m.end();
- }
- self.last_match = Some(m.end());
- Some(Ok(m))
- }
-}
-
-/// An iterator over all overlapping matches for a particular fallible search.
-///
-/// The iterator yields a [`MultiMatch`] value until no more matches could be
-/// found.
-///
-/// The lifetime variables are as follows:
-///
-/// * `'r` is the lifetime of the regular expression itself.
-/// * `'c` is the lifetime of the mutable cache used during search.
-/// * `'t` is the lifetime of the text being searched.
-#[derive(Debug)]
-pub struct TryFindOverlappingMatches<'r, 'c, 't> {
+pub struct FindMatches<'r, 'c, 'h> {
re: &'r Regex,
cache: &'c mut Cache,
- scanner: Option<prefilter::Scanner<'r>>,
- text: &'t [u8],
- last_end: usize,
- state: OverlappingState,
+ it: iter::Searcher<'h>,
}
-impl<'r, 'c, 't> TryFindOverlappingMatches<'r, 'c, 't> {
- fn new(
- re: &'r Regex,
- cache: &'c mut Cache,
- text: &'t [u8],
- ) -> TryFindOverlappingMatches<'r, 'c, 't> {
- let scanner = re.scanner();
- TryFindOverlappingMatches {
- re,
- cache,
- scanner,
- text,
- last_end: 0,
- state: OverlappingState::start(),
- }
- }
-}
+impl<'r, 'c, 'h> Iterator for FindMatches<'r, 'c, 'h> {
+ type Item = Match;
-impl<'r, 'c, 't> Iterator for TryFindOverlappingMatches<'r, 'c, 't> {
- type Item = Result<MultiMatch, MatchError>;
-
- fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> {
- if self.last_end > self.text.len() {
- return None;
- }
- let result = self.re.try_find_overlapping_at_imp(
- self.scanner.as_mut(),
- self.cache,
- self.text,
- self.last_end,
- self.text.len(),
- &mut self.state,
- );
- let m = match result {
- Err(err) => return Some(Err(err)),
- Ok(None) => return None,
- Ok(Some(m)) => m,
- };
- // Unlike the non-overlapping case, we're OK with empty matches at this
- // level. In particular, the overlapping search algorithm is itself
- // responsible for ensuring that progress is always made.
- self.last_end = m.end();
- Some(Ok(m))
+ #[inline]
+ fn next(&mut self) -> Option<Match> {
+ let FindMatches { re, ref mut cache, ref mut it } = *self;
+ it.advance(|input| re.try_search(cache, input))
}
}
@@ -1791,15 +630,16 @@ impl Cache {
/// This shows how to re-purpose a cache for use with a different `Regex`.
///
/// ```
- /// use regex_automata::{hybrid::regex::Regex, MultiMatch};
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{hybrid::regex::Regex, Match};
///
/// let re1 = Regex::new(r"\w")?;
/// let re2 = Regex::new(r"\W")?;
///
/// let mut cache = re1.create_cache();
/// assert_eq!(
- /// Some(MultiMatch::must(0, 0, 2)),
- /// re1.find_leftmost(&mut cache, "Δ".as_bytes()),
+ /// Some(Match::must(0, 0..2)),
+ /// re1.find(&mut cache, "Δ"),
/// );
///
/// // Using 'cache' with re2 is not allowed. It may result in panics or
@@ -1810,8 +650,8 @@ impl Cache {
/// // allowed.
/// cache.reset(&re2);
/// assert_eq!(
- /// Some(MultiMatch::must(0, 0, 3)),
- /// re2.find_leftmost(&mut cache, "☃".as_bytes()),
+ /// Some(Match::must(0, 0..3)),
+ /// re2.find(&mut cache, "☃"),
/// );
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
@@ -1821,13 +661,30 @@ impl Cache {
self.reverse.reset(re.reverse());
}
- /// Returns the heap memory usage, in bytes, as a sum of the forward and
- /// reverse lazy DFA caches.
+ /// Return a reference to the forward cache.
+ pub fn forward(&mut self) -> &dfa::Cache {
+ &self.forward
+ }
+
+ /// Return a reference to the reverse cache.
+ pub fn reverse(&mut self) -> &dfa::Cache {
+ &self.reverse
+ }
+
+ /// Return a mutable reference to the forward cache.
///
- /// This does **not** include the stack size used up by this cache. To
- /// compute that, use `std::mem::size_of::<Cache>()`.
- pub fn memory_usage(&self) -> usize {
- self.forward.memory_usage() + self.reverse.memory_usage()
+ /// If you need mutable references to both the forward and reverse caches,
+ /// then use [`Cache::as_parts_mut`].
+ pub fn forward_mut(&mut self) -> &mut dfa::Cache {
+ &mut self.forward
+ }
+
+ /// Return a mutable reference to the reverse cache.
+ ///
+ /// If you need mutable references to both the forward and reverse caches,
+ /// then use [`Cache::as_parts_mut`].
+ pub fn reverse_mut(&mut self) -> &mut dfa::Cache {
+ &mut self.reverse
}
/// Return references to the forward and reverse caches, respectively.
@@ -1840,111 +697,14 @@ impl Cache {
pub fn as_parts_mut(&mut self) -> (&mut dfa::Cache, &mut dfa::Cache) {
(&mut self.forward, &mut self.reverse)
}
-}
-/// The configuration used for compiling a hybrid NFA/DFA regex.
-///
-/// A regex configuration is a simple data object that is typically used with
-/// [`Builder::configure`].
-#[derive(Clone, Copy, Debug, Default)]
-pub struct Config {
- utf8: Option<bool>,
-}
-
-impl Config {
- /// Return a new default regex compiler configuration.
- pub fn new() -> Config {
- Config::default()
- }
-
- /// Whether to enable UTF-8 mode or not.
- ///
- /// When UTF-8 mode is enabled (the default) and an empty match is seen,
- /// the iterators on [`Regex`] will always start the next search at the
- /// next UTF-8 encoded codepoint when searching valid UTF-8. When UTF-8
- /// mode is disabled, such searches are begun at the next byte offset.
- ///
- /// If this mode is enabled and invalid UTF-8 is given to search, then
- /// behavior is unspecified.
- ///
- /// Generally speaking, one should enable this when
- /// [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8)
- /// and
- /// [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8)
- /// are enabled, and disable it otherwise.
- ///
- /// # Example
- ///
- /// This example demonstrates the differences between when this option is
- /// enabled and disabled. The differences only arise when the regex can
- /// return matches of length zero.
- ///
- /// In this first snippet, we show the results when UTF-8 mode is disabled.
- ///
- /// ```
- /// use regex_automata::{hybrid::regex::Regex, MultiMatch};
- ///
- /// let re = Regex::builder()
- /// .configure(Regex::config().utf8(false))
- /// .build(r"")?;
- /// let mut cache = re.create_cache();
- ///
- /// let haystack = "a☃z".as_bytes();
- /// let mut it = re.find_leftmost_iter(&mut cache, haystack);
- /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next());
- /// assert_eq!(None, it.next());
- ///
- /// # Ok::<(), Box<dyn std::error::Error>>(())
- /// ```
- ///
- /// And in this snippet, we execute the same search on the same haystack,
- /// but with UTF-8 mode enabled. Notice that byte offsets that would
- /// otherwise split the encoding of `☃` are not returned.
- ///
- /// ```
- /// use regex_automata::{hybrid::regex::Regex, MultiMatch};
- ///
- /// let re = Regex::builder()
- /// .configure(Regex::config().utf8(true))
- /// .build(r"")?;
- /// let mut cache = re.create_cache();
- ///
- /// let haystack = "a☃z".as_bytes();
- /// let mut it = re.find_leftmost_iter(&mut cache, haystack);
- /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next());
- /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next());
- /// assert_eq!(None, it.next());
+ /// Returns the heap memory usage, in bytes, as a sum of the forward and
+ /// reverse lazy DFA caches.
///
- /// # Ok::<(), Box<dyn std::error::Error>>(())
- /// ```
- pub fn utf8(mut self, yes: bool) -> Config {
- self.utf8 = Some(yes);
- self
- }
-
- /// Returns true if and only if this configuration has UTF-8 mode enabled.
- ///
- /// When UTF-8 mode is enabled and an empty match is seen, the iterators on
- /// [`Regex`] will always start the next search at the next UTF-8 encoded
- /// codepoint. When UTF-8 mode is disabled, such searches are begun at the
- /// next byte offset.
- pub fn get_utf8(&self) -> bool {
- self.utf8.unwrap_or(true)
- }
-
- /// Overwrite the default configuration such that the options in `o` are
- /// always used. If an option in `o` is not set, then the corresponding
- /// option in `self` is used. If it's not set in `self` either, then it
- /// remains not set.
- pub(crate) fn overwrite(self, o: Config) -> Config {
- Config { utf8: o.utf8.or(self.utf8) }
+ /// This does **not** include the stack size used up by this cache. To
+ /// compute that, use `std::mem::size_of::<Cache>()`.
+ pub fn memory_usage(&self) -> usize {
+ self.forward.memory_usage() + self.reverse.memory_usage()
}
}
@@ -1955,17 +715,15 @@ impl Config {
/// itself. This builder is different from a general purpose regex builder
/// in that it permits fine grain configuration of the construction process.
/// The trade off for this is complexity, and the possibility of setting a
-/// configuration that might not make sense. For example, there are three
+/// configuration that might not make sense. For example, there are two
/// different UTF-8 modes:
///
-/// * [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) controls whether the
-/// pattern itself can contain sub-expressions that match invalid UTF-8.
-/// * [`nfa::thompson::Config::utf8`](crate::nfa::thompson::Config::utf8)
-/// controls whether the implicit unanchored prefix added to the NFA can
-/// match through invalid UTF-8 or not.
-/// * [`Config::utf8`] controls how the regex iterators themselves advance
-/// the starting position of the next search when a match with zero length is
-/// found.
+/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls
+/// whether the pattern itself can contain sub-expressions that match invalid
+/// UTF-8.
+/// * [`thompson::Config::utf8`] controls how the regex iterators themselves
+/// advance the starting position of the next search when a match with zero
+/// length is found.
///
/// Generally speaking, callers will want to either enable all of these or
/// disable all of these.
@@ -1979,61 +737,54 @@ impl Config {
///
/// # Example
///
-/// This example shows how to disable UTF-8 mode in the syntax, the NFA and
-/// the regex itself. This is generally what you want for matching on
-/// arbitrary bytes.
+/// This example shows how to disable UTF-8 mode in the syntax and the regex
+/// itself. This is generally what you want for matching on arbitrary bytes.
///
/// ```
+/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{
-/// hybrid::regex::Regex, nfa::thompson, MultiMatch, SyntaxConfig
+/// hybrid::regex::Regex, nfa::thompson, util::syntax, Match,
/// };
///
/// let re = Regex::builder()
-/// .configure(Regex::config().utf8(false))
-/// .syntax(SyntaxConfig::new().utf8(false))
+/// .syntax(syntax::Config::new().utf8(false))
/// .thompson(thompson::Config::new().utf8(false))
/// .build(r"foo(?-u:[^b])ar.*")?;
/// let mut cache = re.create_cache();
///
/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
-/// let expected = Some(MultiMatch::must(0, 1, 9));
-/// let got = re.find_leftmost(&mut cache, haystack);
+/// let expected = Some(Match::must(0, 1..9));
+/// let got = re.find(&mut cache, haystack);
/// assert_eq!(expected, got);
/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
/// // but the subsequent `.*` does not! Disabling UTF-8
-/// // on the syntax permits this. Notice also that the
-/// // search was unanchored and skipped over invalid UTF-8.
-/// // Disabling UTF-8 on the Thompson NFA permits this.
-/// //
-/// // N.B. This example does not show the impact of
-/// // disabling UTF-8 mode on Config, since that
-/// // only impacts regexes that can produce matches of
-/// // length 0.
+/// // on the syntax permits this.
/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Clone, Debug)]
pub struct Builder {
- config: Config,
dfa: dfa::Builder,
}
impl Builder {
/// Create a new regex builder with the default configuration.
pub fn new() -> Builder {
- Builder { config: Config::default(), dfa: DFA::builder() }
+ Builder { dfa: DFA::builder() }
}
/// Build a regex from the given pattern.
///
/// If there was a problem parsing or compiling the pattern, then an error
/// is returned.
+ #[cfg(feature = "syntax")]
pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> {
self.build_many(&[pattern])
}
/// Build a regex from the given patterns.
+ #[cfg(feature = "syntax")]
pub fn build_many<P: AsRef<str>>(
&self,
patterns: &[P],
@@ -2044,9 +795,9 @@ impl Builder {
.clone()
.configure(
DFA::config()
- .anchored(true)
- .match_kind(MatchKind::All)
- .starts_for_each_pattern(true),
+ .prefilter(None)
+ .specialize_start_states(false)
+ .match_kind(MatchKind::All),
)
.thompson(thompson::Config::new().reverse(true))
.build_many(patterns)?;
@@ -2054,28 +805,62 @@ impl Builder {
}
/// Build a regex from its component forward and reverse hybrid NFA/DFAs.
- fn build_from_dfas(&self, forward: DFA, reverse: DFA) -> Regex {
- // The congruous method on DFA-backed regexes is exposed, but it's
- // not clear this builder is useful here since lazy DFAs can't be
- // serialized and there is only one type of them.
- let utf8 = self.config.get_utf8();
- Regex { pre: None, forward, reverse, utf8 }
- }
-
- /// Apply the given regex configuration options to this builder.
- pub fn configure(&mut self, config: Config) -> &mut Builder {
- self.config = self.config.overwrite(config);
- self
+ ///
+ /// This is useful when you've built a forward and reverse lazy DFA
+ /// separately, and want to combine them into a single regex. Once build,
+ /// the individual DFAs given can still be accessed via [`Regex::forward`]
+ /// and [`Regex::reverse`].
+ ///
+ /// It is important that the reverse lazy DFA be compiled under the
+ /// following conditions:
+ ///
+ /// * It should use [`MatchKind::All`] semantics.
+ /// * It should match in reverse.
+ /// * Otherwise, its configuration should match the forward DFA.
+ ///
+ /// If these conditions aren't satisfied, then the behavior of searches is
+ /// unspecified.
+ ///
+ /// Note that when using this constructor, no configuration is applied.
+ /// Since this routine provides the DFAs to the builder, there is no
+ /// opportunity to apply other configuration options.
+ ///
+ /// # Example
+ ///
+ /// This shows how to build individual lazy forward and reverse DFAs, and
+ /// then combine them into a single `Regex`.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// hybrid::{dfa::DFA, regex::Regex},
+ /// nfa::thompson,
+ /// MatchKind,
+ /// };
+ ///
+ /// let fwd = DFA::new(r"foo[0-9]+")?;
+ /// let rev = DFA::builder()
+ /// .configure(DFA::config().match_kind(MatchKind::All))
+ /// .thompson(thompson::Config::new().reverse(true))
+ /// .build(r"foo[0-9]+")?;
+ ///
+ /// let re = Regex::builder().build_from_dfas(fwd, rev);
+ /// let mut cache = re.create_cache();
+ /// assert_eq!(true, re.is_match(&mut cache, "foo123"));
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn build_from_dfas(&self, forward: DFA, reverse: DFA) -> Regex {
+ Regex { forward, reverse }
}
/// Set the syntax configuration for this builder using
- /// [`SyntaxConfig`](crate::SyntaxConfig).
+ /// [`syntax::Config`](crate::util::syntax::Config).
///
/// This permits setting things like case insensitivity, Unicode and multi
/// line mode.
+ #[cfg(feature = "syntax")]
pub fn syntax(
&mut self,
- config: crate::util::syntax::SyntaxConfig,
+ config: crate::util::syntax::Config,
) -> &mut Builder {
self.dfa.syntax(config);
self
@@ -2086,6 +871,7 @@ impl Builder {
///
/// This permits setting things like whether additional time should be
/// spent shrinking the size of the NFA.
+ #[cfg(feature = "syntax")]
pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
self.dfa.thompson(config);
self
@@ -2107,18 +893,3 @@ impl Default for Builder {
Builder::new()
}
}
-
-#[inline(always)]
-fn next_unwrap(
- item: Option<Result<MultiMatch, MatchError>>,
-) -> Option<MultiMatch> {
- match item {
- None => None,
- Some(Ok(m)) => Some(m),
- Some(Err(err)) => panic!(
- "unexpected regex search error: {}\n\
- to handle search errors, use try_ methods",
- err,
- ),
- }
-}