summaryrefslogtreecommitdiffstats
path: root/vendor/regex-automata/src/hybrid
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-17 12:18:32 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-17 12:18:32 +0000
commit4547b622d8d29df964fa2914213088b148c498fc (patch)
tree9fc6b25f3c3add6b745be9a2400a6e96140046e9 /vendor/regex-automata/src/hybrid
parentReleasing progress-linux version 1.66.0+dfsg1-1~progress7.99u1. (diff)
downloadrustc-4547b622d8d29df964fa2914213088b148c498fc.tar.xz
rustc-4547b622d8d29df964fa2914213088b148c498fc.zip
Merging upstream version 1.67.1+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/regex-automata/src/hybrid')
-rw-r--r--vendor/regex-automata/src/hybrid/dfa.rs3817
-rw-r--r--vendor/regex-automata/src/hybrid/error.rs130
-rw-r--r--vendor/regex-automata/src/hybrid/id.rs415
-rw-r--r--vendor/regex-automata/src/hybrid/mod.rs179
-rw-r--r--vendor/regex-automata/src/hybrid/regex.rs2124
-rw-r--r--vendor/regex-automata/src/hybrid/search.rs663
6 files changed, 7328 insertions, 0 deletions
diff --git a/vendor/regex-automata/src/hybrid/dfa.rs b/vendor/regex-automata/src/hybrid/dfa.rs
new file mode 100644
index 000000000..1fbce5f5f
--- /dev/null
+++ b/vendor/regex-automata/src/hybrid/dfa.rs
@@ -0,0 +1,3817 @@
+/*!
+Types and routines specific to lazy DFAs.
+
+This module is the home of [`hybrid::dfa::DFA`](DFA).
+
+This module also contains a [`hybrid::dfa::Builder`](Builder) and a
+[`hybrid::dfa::Config`](Config) for configuring and building a lazy DFA.
+*/
+
+use core::{borrow::Borrow, iter, mem::size_of};
+
+use alloc::{sync::Arc, vec::Vec};
+
+use crate::{
+ hybrid::{
+ error::{BuildError, CacheError},
+ id::{LazyStateID, LazyStateIDError, OverlappingState},
+ search,
+ },
+ nfa::thompson,
+ util::{
+ alphabet::{self, ByteClasses, ByteSet},
+ determinize::{self, State, StateBuilderEmpty, StateBuilderNFA},
+ id::{PatternID, StateID as NFAStateID},
+ matchtypes::{HalfMatch, MatchError, MatchKind},
+ prefilter,
+ sparse_set::SparseSets,
+ start::Start,
+ },
+};
+
+/// The minimum number of states that a lazy DFA's cache size must support.
+///
+/// This is checked at time of construction to ensure that at least some small
+/// number of states can fit in the given capacity allotment. If we can't fit
+/// at least this number of states, then the thinking is that it's pretty
+/// senseless to use the lazy DFA. More to the point, parts of the code do
+/// assume that the cache can fit at least some small number of states.
+const MIN_STATES: usize = 5;
+
+/// A hybrid NFA/DFA (also called a "lazy DFA") for regex searching.
+///
+/// A lazy DFA is a DFA that builds itself at search time. It otherwise has
+/// very similar characteristics as a [`dense::DFA`](crate::dfa::dense::DFA).
+/// Indeed, both support precisely the same regex features with precisely the
+/// same semantics.
+///
+/// Whereas a `dense::DFA` must be completely built to handle any input before
+/// it may be used for search, a lazy DFA starts off effectively empty. During
+/// a search, a lazy DFA will build itself depending on whether it has already
+/// computed the next transition or not. If it has, then it looks a lot like
+/// a `dense::DFA` internally: it does a very fast table based access to find
+/// the next transition. Otherwise, if the state hasn't been computed, then it
+/// does determinization _for that specific transition_ to compute the next DFA
+/// state.
+///
+/// The main selling point of a lazy DFA is that, in practice, it has
+/// the performance profile of a `dense::DFA` without the weakness of it
+/// taking worst case exponential time to build. Indeed, for each byte of
+/// input, the lazy DFA will construct at most one new DFA state. Thus, a
+/// lazy DFA achieves worst case `O(mn)` time for regex search (where `m ~
+/// pattern.len()` and `n ~ haystack.len()`).
+///
+/// The main downsides of a lazy DFA are:
+///
+/// 1. It requires mutable "cache" space during search. This is where the
+/// transition table, among other things, is stored.
+/// 2. In pathological cases (e.g., if the cache is too small), it will run
+/// out of room and either require a bigger cache capacity or will repeatedly
+/// clear the cache and thus repeatedly regenerate DFA states. Overall, this
+/// will tend to be slower than a typical NFA simulation.
+///
+/// # Capabilities
+///
+/// Like a `dense::DFA`, a single lazy DFA fundamentally supports the following
+/// operations:
+///
+/// 1. Detection of a match.
+/// 2. Location of the end of a match.
+/// 3. In the case of a lazy DFA with multiple patterns, which pattern matched
+/// is reported as well.
+///
+/// A notable absence from the above list of capabilities is the location of
+/// the *start* of a match. In order to provide both the start and end of
+/// a match, *two* lazy DFAs are required. This functionality is provided by a
+/// [`Regex`](crate::hybrid::regex::Regex).
+///
+/// # Example
+///
+/// This shows how to build a lazy DFA with the default configuration and
+/// execute a search. Notice how, in contrast to a `dense::DFA`, we must create
+/// a cache and pass it to our search routine.
+///
+/// ```
+/// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+///
+/// let dfa = DFA::new("foo[0-9]+")?;
+/// let mut cache = dfa.create_cache();
+///
+/// let expected = Some(HalfMatch::must(0, 8));
+/// assert_eq!(expected, dfa.find_leftmost_fwd(&mut cache, b"foo12345")?);
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone, Debug)]
+pub struct DFA {
+ nfa: Arc<thompson::NFA>,
+ stride2: usize,
+ classes: ByteClasses,
+ quitset: ByteSet,
+ anchored: bool,
+ match_kind: MatchKind,
+ starts_for_each_pattern: bool,
+ cache_capacity: usize,
+ minimum_cache_clear_count: Option<usize>,
+}
+
+impl DFA {
+ /// Parse the given regular expression using a default configuration and
+ /// return the corresponding lazy DFA.
+ ///
+ /// If you want a non-default configuration, then use the [`Builder`] to
+ /// set your own configuration.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+ ///
+ /// let dfa = DFA::new("foo[0-9]+bar")?;
+ /// let mut cache = dfa.create_cache();
+ ///
+ /// let expected = HalfMatch::must(0, 11);
+ /// assert_eq!(
+ /// Some(expected),
+ /// dfa.find_leftmost_fwd(&mut cache, b"foo12345bar")?,
+ /// );
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn new(pattern: &str) -> Result<DFA, BuildError> {
+ DFA::builder().build(pattern)
+ }
+
+ /// Parse the given regular expressions using a default configuration and
+ /// return the corresponding lazy multi-DFA.
+ ///
+ /// If you want a non-default configuration, then use the [`Builder`] to
+ /// set your own configuration.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+ ///
+ /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+"])?;
+ /// let mut cache = dfa.create_cache();
+ ///
+ /// let expected = HalfMatch::must(1, 3);
+ /// assert_eq!(
+ /// Some(expected),
+ /// dfa.find_leftmost_fwd(&mut cache, b"foo12345bar")?,
+ /// );
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<DFA, BuildError> {
+ DFA::builder().build_many(patterns)
+ }
+
+ /// Create a new lazy DFA that matches every input.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+ ///
+ /// let dfa = DFA::always_match()?;
+ /// let mut cache = dfa.create_cache();
+ ///
+ /// let expected = HalfMatch::must(0, 0);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(&mut cache, b"")?);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(&mut cache, b"foo")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn always_match() -> Result<DFA, BuildError> {
+ let nfa = thompson::NFA::always_match();
+ Builder::new().build_from_nfa(Arc::new(nfa))
+ }
+
+ /// Create a new lazy DFA that never matches any input.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::hybrid::dfa::DFA;
+ ///
+ /// let dfa = DFA::never_match()?;
+ /// let mut cache = dfa.create_cache();
+ ///
+ /// assert_eq!(None, dfa.find_leftmost_fwd(&mut cache, b"")?);
+ /// assert_eq!(None, dfa.find_leftmost_fwd(&mut cache, b"foo")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn never_match() -> Result<DFA, BuildError> {
+ let nfa = thompson::NFA::never_match();
+ Builder::new().build_from_nfa(Arc::new(nfa))
+ }
+
+ /// Return a default configuration for a `DFA`.
+ ///
+ /// This is a convenience routine to avoid needing to import the `Config`
+ /// type when customizing the construction of a lazy DFA.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to build a lazy DFA that only executes searches
+ /// in anchored mode.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+ ///
+ /// let re = DFA::builder()
+ /// .configure(DFA::config().anchored(true))
+ /// .build(r"[0-9]+")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// let haystack = "abc123xyz".as_bytes();
+ /// assert_eq!(None, re.find_leftmost_fwd(&mut cache, haystack)?);
+ /// assert_eq!(
+ /// Some(HalfMatch::must(0, 3)),
+ /// re.find_leftmost_fwd(&mut cache, &haystack[3..6])?,
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn config() -> Config {
+ Config::new()
+ }
+
+ /// Return a builder for configuring the construction of a `Regex`.
+ ///
+ /// This is a convenience routine to avoid needing to import the
+ /// [`Builder`] type in common cases.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to use the builder to disable UTF-8 mode
+ /// everywhere for lazy DFAs. This includes disabling it for both the
+ /// concrete syntax (e.g., `.` matches any byte and Unicode character
+ /// classes like `\p{Letter}` are not allowed) and for the unanchored
+ /// search prefix. The latter enables the regex to match anywhere in a
+ /// sequence of arbitrary bytes. (Typically, the unanchored search prefix
+ /// will only permit matching valid UTF-8.)
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// hybrid::dfa::DFA,
+ /// nfa::thompson,
+ /// HalfMatch, SyntaxConfig,
+ /// };
+ ///
+ /// let re = DFA::builder()
+ /// .syntax(SyntaxConfig::new().utf8(false))
+ /// .thompson(thompson::Config::new().utf8(false))
+ /// .build(r"foo(?-u:[^b])ar.*")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
+ /// let expected = Some(HalfMatch::must(0, 9));
+ /// let got = re.find_leftmost_fwd(&mut cache, haystack)?;
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn builder() -> Builder {
+ Builder::new()
+ }
+
+ /// Create a new cache for this lazy DFA.
+ ///
+ /// The cache returned should only be used for searches for this
+ /// lazy DFA. If you want to reuse the cache for another DFA, then
+ /// you must call [`Cache::reset`] with that DFA (or, equivalently,
+ /// [`DFA::reset_cache`]).
+ pub fn create_cache(&self) -> Cache {
+ Cache::new(self)
+ }
+
+ /// Reset the given cache such that it can be used for searching with the
+ /// this lazy DFA (and only this DFA).
+ ///
+ /// A cache reset permits reusing memory already allocated in this cache
+ /// with a different lazy DFA.
+ ///
+ /// Resetting a cache sets its "clear count" to 0. This is relevant if the
+ /// lazy DFA has been configured to "give up" after it has cleared the
+ /// cache a certain number of times.
+ ///
+ /// Any lazy state ID generated by the cache prior to resetting it is
+ /// invalid after the reset.
+ ///
+ /// # Example
+ ///
+ /// This shows how to re-purpose a cache for use with a different DFA.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+ ///
+ /// let dfa1 = DFA::new(r"\w")?;
+ /// let dfa2 = DFA::new(r"\W")?;
+ ///
+ /// let mut cache = dfa1.create_cache();
+ /// assert_eq!(
+ /// Some(HalfMatch::must(0, 2)),
+ /// dfa1.find_leftmost_fwd(&mut cache, "Δ".as_bytes())?,
+ /// );
+ ///
+ /// // Using 'cache' with dfa2 is not allowed. It may result in panics or
+ /// // incorrect results. In order to re-purpose the cache, we must reset
+ /// // it with the DFA we'd like to use it with.
+ /// //
+ /// // Similarly, after this reset, using the cache with 'dfa1' is also not
+ /// // allowed.
+ /// dfa2.reset_cache(&mut cache);
+ /// assert_eq!(
+ /// Some(HalfMatch::must(0, 3)),
+ /// dfa2.find_leftmost_fwd(&mut cache, "☃".as_bytes())?,
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn reset_cache(&self, cache: &mut Cache) {
+ Lazy::new(self, cache).reset_cache()
+ }
+
+ /// Returns the total number of patterns compiled into this lazy DFA.
+ ///
+ /// In the case of a DFA that contains no patterns, this returns `0`.
+ ///
+ /// # Example
+ ///
+ /// This example shows the pattern count for a DFA that never matches:
+ ///
+ /// ```
+ /// use regex_automata::hybrid::dfa::DFA;
+ ///
+ /// let dfa = DFA::never_match()?;
+ /// assert_eq!(dfa.pattern_count(), 0);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// And another example for a DFA that matches at every position:
+ ///
+ /// ```
+ /// use regex_automata::hybrid::dfa::DFA;
+ ///
+ /// let dfa = DFA::always_match()?;
+ /// assert_eq!(dfa.pattern_count(), 1);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// And finally, a DFA that was constructed from multiple patterns:
+ ///
+ /// ```
+ /// use regex_automata::hybrid::dfa::DFA;
+ ///
+ /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
+ /// assert_eq!(dfa.pattern_count(), 3);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn pattern_count(&self) -> usize {
+ self.nfa.pattern_len()
+ }
+
+ /// Returns a reference to the underlying NFA.
+ pub fn nfa(&self) -> &Arc<thompson::NFA> {
+ &self.nfa
+ }
+
+ /// Returns the stride, as a base-2 exponent, required for these
+ /// equivalence classes.
+ ///
+ /// The stride is always the smallest power of 2 that is greater than or
+ /// equal to the alphabet length. This is done so that converting between
+ /// state IDs and indices can be done with shifts alone, which is much
+ /// faster than integer division.
+ fn stride2(&self) -> usize {
+ self.stride2
+ }
+
+ /// Returns the total stride for every state in this lazy DFA. This
+ /// corresponds to the total number of transitions used by each state in
+ /// this DFA's transition table.
+ fn stride(&self) -> usize {
+ 1 << self.stride2()
+ }
+
+ /// Returns the total number of elements in the alphabet for this
+ /// transition table. This is always less than or equal to `self.stride()`.
+ /// It is only equal when the alphabet length is a power of 2. Otherwise,
+ /// it is always strictly less.
+ fn alphabet_len(&self) -> usize {
+ self.classes.alphabet_len()
+ }
+
+ /// Returns the memory usage, in bytes, of this lazy DFA.
+ ///
+ /// This does **not** include the stack size used up by this lazy DFA. To
+ /// compute that, use `std::mem::size_of::<DFA>()`. This also does
+ /// not include the size of the `Cache` used.
+ pub fn memory_usage(&self) -> usize {
+ // Everything else is on the stack.
+ self.nfa.memory_usage()
+ }
+}
+
+impl DFA {
+ /// Executes a forward search and returns the end position of the first
+ /// match that is found as early as possible. If no match exists, then
+ /// `None` is returned.
+ ///
+ /// This routine stops scanning input as soon as the search observes a
+ /// match state. This is useful for implementing boolean `is_match`-like
+ /// routines, where as little work is done as possible.
+ ///
+ /// See [`DFA::find_earliest_fwd_at`] for additional functionality, such as
+ /// providing a prefilter, a specific pattern to match and the bounds of
+ /// the search within the haystack. This routine is meant as a convenience
+ /// for common cases where the additional functionality is not needed.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// lazy DFAs generated by this crate, this only occurs in non-default
+ /// configurations where quit bytes are used, Unicode word boundaries are
+ /// heuristically enabled or limits are set on the number of times the lazy
+ /// DFA's cache may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Example
+ ///
+ /// This example demonstrates how the position returned might differ from
+ /// what one might expect when executing a traditional leftmost search.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+ ///
+ /// let dfa = DFA::new("foo[0-9]+")?;
+ /// let mut cache = dfa.create_cache();
+ /// // Normally, the end of the leftmost first match here would be 8,
+ /// // corresponding to the end of the input. But the "earliest" semantics
+/// // of this routine cause it to stop as soon as a match is known, which
+ /// // occurs once 'foo[0-9]' has matched.
+ /// let expected = HalfMatch::must(0, 4);
+ /// assert_eq!(
+ /// Some(expected),
+ /// dfa.find_earliest_fwd(&mut cache, b"foo12345")?,
+ /// );
+ ///
+ /// let dfa = DFA::new("abc|a")?;
+ /// let mut cache = dfa.create_cache();
+ /// // Normally, the end of the leftmost first match here would be 3,
+ /// // but the shortest match semantics detect a match earlier.
+ /// let expected = HalfMatch::must(0, 1);
+ /// assert_eq!(Some(expected), dfa.find_earliest_fwd(&mut cache, b"abc")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn find_earliest_fwd(
+ &self,
+ cache: &mut Cache,
+ bytes: &[u8],
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ self.find_earliest_fwd_at(cache, None, None, bytes, 0, bytes.len())
+ }
+
+ /// Executes a reverse search and returns the start position of the first
+ /// match that is found as early as possible. If no match exists, then
+ /// `None` is returned.
+ ///
+ /// This routine stops scanning input as soon as the search observes a
+ /// match state.
+ ///
+ /// Note that while it is not technically necessary to build a reverse
+ /// automaton to use a reverse search, it is likely that you'll want to do
+ /// so. Namely, the typical use of a reverse search is to find the starting
+ /// location of a match once its end is discovered from a forward search. A
+ /// reverse DFA automaton can be built by configuring the intermediate NFA
+ /// to be reversed via
+ /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse).
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// lazy DFAs generated by this crate, this only occurs in non-default
+ /// configurations where quit bytes are used, Unicode word boundaries are
+ /// heuristically enabled or limits are set on the number of times the lazy
+ /// DFA's cache may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Example
+ ///
+ /// This example demonstrates how the position returned might differ from
+ /// what one might expect when executing a traditional leftmost reverse
+ /// search.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, nfa::thompson, HalfMatch};
+ ///
+ /// let dfa = DFA::builder()
+ /// .thompson(thompson::Config::new().reverse(true))
+ /// .build("[a-z]+[0-9]+")?;
+ /// let mut cache = dfa.create_cache();
+ /// // Normally, the end of the leftmost first match here would be 0,
+ /// // corresponding to the beginning of the input. But the "earliest"
+ /// // semantics of this routine cause it to stop as soon as a match is
+ /// // known, which occurs once '[a-z][0-9]+' has matched.
+ /// let expected = HalfMatch::must(0, 2);
+ /// assert_eq!(
+ /// Some(expected),
+ /// dfa.find_earliest_rev(&mut cache, b"foo12345")?,
+ /// );
+ ///
+ /// let dfa = DFA::builder()
+ /// .thompson(thompson::Config::new().reverse(true))
+ /// .build("abc|c")?;
+ /// let mut cache = dfa.create_cache();
+ /// // Normally, the end of the leftmost first match here would be 0,
+ /// // but the shortest match semantics detect a match earlier.
+ /// let expected = HalfMatch::must(0, 2);
+ /// assert_eq!(Some(expected), dfa.find_earliest_rev(&mut cache, b"abc")?);
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn find_earliest_rev(
+ &self,
+ cache: &mut Cache,
+ bytes: &[u8],
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ self.find_earliest_rev_at(cache, None, bytes, 0, bytes.len())
+ }
+
+ /// Executes a forward search and returns the end position of the leftmost
+ /// match that is found. If no match exists, then `None` is returned.
+ ///
+ /// In particular, this method continues searching even after it enters
+ /// a match state. The search only terminates once it has reached the
+ /// end of the input or when it has entered a dead or quit state. Upon
+ /// termination, the position of the last byte seen while still in a match
+ /// state is returned.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// lazy DFAs generated by this crate, this only occurs in non-default
+ /// configurations where quit bytes are used, Unicode word boundaries are
+ /// heuristically enabled or limits are set on the number of times the lazy
+ /// DFA's cache may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Example
+ ///
+ /// Leftmost first match semantics corresponds to the match with the
+ /// smallest starting offset, but where the end offset is determined by
+ /// preferring earlier branches in the original regular expression. For
+ /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam`
+ /// will match `Samwise` in `Samwise`.
+ ///
+ /// Generally speaking, the "leftmost first" match is how most backtracking
+ /// regular expressions tend to work. This is in contrast to POSIX-style
+ /// regular expressions that yield "leftmost longest" matches. Namely,
+ /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using
+ /// leftmost longest semantics. (This crate does not currently support
+ /// leftmost longest semantics.)
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+ ///
+ /// let dfa = DFA::new("foo[0-9]+")?;
+ /// let mut cache = dfa.create_cache();
+ /// let expected = HalfMatch::must(0, 8);
+ /// assert_eq!(
+ /// Some(expected),
+ /// dfa.find_leftmost_fwd(&mut cache, b"foo12345")?,
+ /// );
+ ///
+ /// // Even though a match is found after reading the first byte (`a`),
+ /// // the leftmost first match semantics demand that we find the earliest
+ /// // match that prefers earlier parts of the pattern over latter parts.
+ /// let dfa = DFA::new("abc|a")?;
+ /// let mut cache = dfa.create_cache();
+ /// let expected = HalfMatch::must(0, 3);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(&mut cache, b"abc")?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn find_leftmost_fwd(
+ &self,
+ cache: &mut Cache,
+ bytes: &[u8],
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ self.find_leftmost_fwd_at(cache, None, None, bytes, 0, bytes.len())
+ }
+
+ /// Executes a reverse search and returns the start of the position of the
+ /// leftmost match that is found. If no match exists, then `None` is
+ /// returned.
+ ///
+ /// In particular, this method continues searching even after it enters
+ /// a match state. The search only terminates once it has reached the
+ /// end of the input or when it has entered a dead or quit state. Upon
+ /// termination, the position of the last byte seen while still in a match
+ /// state is returned.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// lazy DFAs generated by this crate, this only occurs in non-default
+ /// configurations where quit bytes are used, Unicode word boundaries are
+ /// heuristically enabled or limits are set on the number of times the lazy
+ /// DFA's cache may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Example
+ ///
+ /// In particular, this routine is principally
+ /// useful when used in conjunction with the
+/// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse)
+/// configuration. In general, it's unlikely to be correct to use both
+ /// `find_leftmost_fwd` and `find_leftmost_rev` with the same DFA since
+ /// any particular DFA will only support searching in one direction with
+ /// respect to the pattern.
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson, hybrid::dfa::DFA, HalfMatch};
+ ///
+ /// let dfa = DFA::builder()
+ /// .thompson(thompson::Config::new().reverse(true))
+ /// .build("foo[0-9]+")?;
+ /// let mut cache = dfa.create_cache();
+ /// let expected = HalfMatch::must(0, 0);
+ /// assert_eq!(
+ /// Some(expected),
+ /// dfa.find_leftmost_rev(&mut cache, b"foo12345")?,
+ /// );
+ ///
+ /// // Even though a match is found after reading the last byte (`c`),
+ /// // the leftmost first match semantics demand that we find the earliest
+ /// // match that prefers earlier parts of the pattern over latter parts.
+ /// let dfa = DFA::builder()
+ /// .thompson(thompson::Config::new().reverse(true))
+ /// .build("abc|c")?;
+ /// let mut cache = dfa.create_cache();
+ /// let expected = HalfMatch::must(0, 0);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_rev(&mut cache, b"abc")?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn find_leftmost_rev(
+ &self,
+ cache: &mut Cache,
+ bytes: &[u8],
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ self.find_leftmost_rev_at(cache, None, bytes, 0, bytes.len())
+ }
+
+ /// Executes an overlapping forward search and returns the end position of
+ /// matches as they are found. If no match exists, then `None` is returned.
+ ///
+ /// This routine is principally only useful when searching for multiple
+ /// patterns on inputs where multiple patterns may match the same regions
+ /// of text. In particular, callers must preserve the automaton's search
+ /// state from prior calls so that the implementation knows where the last
+ /// match occurred.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// lazy DFAs generated by this crate, this only occurs in non-default
+ /// configurations where quit bytes are used, Unicode word boundaries are
+ /// heuristically enabled or limits are set on the number of times the lazy
+ /// DFA's cache may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to run a basic overlapping search. Notice
+ /// that we build the automaton with a `MatchKind::All` configuration.
+ /// Overlapping searches are unlikely to work as one would expect when
+ /// using the default `MatchKind::LeftmostFirst` match semantics, since
+ /// leftmost-first matching is fundamentally incompatible with overlapping
+ /// searches. Namely, overlapping searches need to report matches as they
+/// are seen, whereas leftmost-first searches will continue searching even
+ /// after a match has been observed in order to find the conventional end
+ /// position of the match. More concretely, leftmost-first searches use
+ /// dead states to terminate a search after a specific match can no longer
+ /// be extended. Overlapping searches instead do the opposite by continuing
+ /// the search to find totally new matches (potentially of other patterns).
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// hybrid::{dfa::DFA, OverlappingState},
+ /// HalfMatch,
+ /// MatchKind,
+ /// };
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().match_kind(MatchKind::All))
+ /// .build_many(&[r"\w+$", r"\S+$"])?;
+ /// let mut cache = dfa.create_cache();
+ ///
+ /// let haystack = "@foo".as_bytes();
+ /// let mut state = OverlappingState::start();
+ ///
+ /// let expected = Some(HalfMatch::must(1, 4));
+ /// let got = dfa.find_overlapping_fwd(&mut cache, haystack, &mut state)?;
+ /// assert_eq!(expected, got);
+ ///
+ /// // The first pattern also matches at the same position, so re-running
+ /// // the search will yield another match. Notice also that the first
+ /// // pattern is returned after the second. This is because the second
+ /// // pattern begins its match before the first, is therefore an earlier
+ /// // match and is thus reported first.
+ /// let expected = Some(HalfMatch::must(0, 4));
+ /// let got = dfa.find_overlapping_fwd(&mut cache, haystack, &mut state)?;
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn find_overlapping_fwd(
+ &self,
+ cache: &mut Cache,
+ bytes: &[u8],
+ state: &mut OverlappingState,
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ self.find_overlapping_fwd_at(
+ cache,
+ None,
+ None,
+ bytes,
+ 0,
+ bytes.len(),
+ state,
+ )
+ }
+
+ /// Executes a forward search and returns the end position of the first
+ /// match that is found as early as possible. If no match exists, then
+ /// `None` is returned.
+ ///
+ /// This routine stops scanning input as soon as the search observes a
+ /// match state. This is useful for implementing boolean `is_match`-like
+ /// routines, where as little work is done as possible.
+ ///
+ /// This is like [`DFA::find_earliest_fwd`], except it provides some
+ /// additional control over how the search is executed:
+ ///
+ /// * `pre` is a prefilter scanner that, when given, is used whenever the
+ /// DFA enters its starting state. This is meant to speed up searches where
+ /// one or a small number of literal prefixes are known.
+ /// * `pattern_id` specifies a specific pattern in the DFA to run an
+ /// anchored search for. If not given, then a search for any pattern is
+ /// performed. For lazy DFAs, [`Config::starts_for_each_pattern`] must be
+ /// enabled to use this functionality.
+ /// * `start` and `end` permit searching a specific region of the haystack
+ /// `bytes`. This is useful when implementing an iterator over matches
+ /// within the same haystack, which cannot be done correctly by simply
+ /// providing a subslice of `bytes`. (Because the existence of look-around
+ /// operations such as `\b`, `^` and `$` need to take the surrounding
+ /// context into account. This cannot be done if the haystack doesn't
+ /// contain it.)
+ ///
+ /// The examples below demonstrate each of these additional parameters.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// lazy DFAs generated by this crate, this only occurs in non-default
+ /// configurations where quit bytes are used, Unicode word boundaries are
+ /// heuristically enabled or limits are set on the number of times the lazy
+ /// DFA's cache may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Panics
+ ///
+ /// This routine panics if a `pattern_id` is given and this lazy DFA does
+ /// not support specific pattern searches.
+ ///
+ /// It also panics if the given haystack range is not valid.
+ ///
+ /// # Example: prefilter
+ ///
+ /// This example shows how to provide a prefilter for a pattern where all
+ /// matches start with a `z` byte.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// hybrid::dfa::DFA,
+ /// util::prefilter::{Candidate, Prefilter, Scanner, State},
+ /// HalfMatch,
+ /// };
+ ///
+ /// #[derive(Debug)]
+ /// pub struct ZPrefilter;
+ ///
+ /// impl Prefilter for ZPrefilter {
+ /// fn next_candidate(
+ /// &self,
+ /// _: &mut State,
+ /// haystack: &[u8],
+ /// at: usize,
+ /// ) -> Candidate {
+ /// // Try changing b'z' to b'q' and observe this test fail since
+ /// // the prefilter will skip right over the match.
+ /// match haystack.iter().position(|&b| b == b'z') {
+ /// None => Candidate::None,
+ /// Some(i) => Candidate::PossibleStartOfMatch(at + i),
+ /// }
+ /// }
+ ///
+ /// fn heap_bytes(&self) -> usize {
+ /// 0
+ /// }
+ /// }
+ ///
+ /// let dfa = DFA::new("z[0-9]{3}")?;
+ /// let mut cache = dfa.create_cache();
+ ///
+ /// let haystack = "foobar z123 q123".as_bytes();
+ /// // A scanner executes a prefilter while tracking some state that helps
+ /// // determine whether a prefilter is still "effective" or not.
+ /// let mut scanner = Scanner::new(&ZPrefilter);
+ ///
+ /// let expected = Some(HalfMatch::must(0, 11));
+ /// let got = dfa.find_earliest_fwd_at(
+ /// &mut cache,
+ /// Some(&mut scanner),
+ /// None,
+ /// haystack,
+ /// 0,
+ /// haystack.len(),
+ /// )?;
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: specific pattern search
+ ///
+ /// This example shows how to build a lazy multi-DFA that permits searching
+ /// for specific patterns.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// hybrid::dfa::DFA,
+ /// HalfMatch,
+ /// PatternID,
+ /// };
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().starts_for_each_pattern(true))
+ /// .build_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?;
+ /// let mut cache = dfa.create_cache();
+ /// let haystack = "foo123".as_bytes();
+ ///
+ /// // Since we are using the default leftmost-first match and both
+ /// // patterns match at the same starting position, only the first pattern
+ /// // will be returned in this case when doing a search for any of the
+ /// // patterns.
+ /// let expected = Some(HalfMatch::must(0, 6));
+ /// let got = dfa.find_earliest_fwd_at(
+ /// &mut cache,
+ /// None,
+ /// None,
+ /// haystack,
+ /// 0,
+ /// haystack.len(),
+ /// )?;
+ /// assert_eq!(expected, got);
+ ///
+ /// // But if we want to check whether some other pattern matches, then we
+ /// // can provide its pattern ID.
+ /// let expected = Some(HalfMatch::must(1, 6));
+ /// let got = dfa.find_earliest_fwd_at(
+ /// &mut cache,
+ /// None,
+ /// Some(PatternID::must(1)),
+ /// haystack,
+ /// 0,
+ /// haystack.len(),
+ /// )?;
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: specifying the bounds of a search
+ ///
+ /// This example shows how providing the bounds of a search can produce
+ /// different results than simply sub-slicing the haystack.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+ ///
+ /// // N.B. We disable Unicode here so that we use a simple ASCII word
+ /// // boundary. Alternatively, we could enable heuristic support for
+ /// // Unicode word boundaries since our haystack is pure ASCII.
+ /// let dfa = DFA::new(r"(?-u)\b[0-9]{3}\b")?;
+ /// let mut cache = dfa.create_cache();
+ /// let haystack = "foo123bar".as_bytes();
+ ///
+ /// // Since we sub-slice the haystack, the search doesn't know about the
+ /// // larger context and assumes that `123` is surrounded by word
+ /// // boundaries. And of course, the match position is reported relative
+ /// // to the sub-slice as well, which means we get `3` instead of `6`.
+ /// let expected = Some(HalfMatch::must(0, 3));
+ /// let got = dfa.find_earliest_fwd_at(
+ /// &mut cache,
+ /// None,
+ /// None,
+ /// &haystack[3..6],
+ /// 0,
+ /// haystack[3..6].len(),
+ /// )?;
+ /// assert_eq!(expected, got);
+ ///
+ /// // But if we provide the bounds of the search within the context of the
+ /// // entire haystack, then the search can take the surrounding context
+ /// // into account. (And if we did find a match, it would be reported
+ /// // as a valid offset into `haystack` instead of its sub-slice.)
+ /// let expected = None;
+ /// let got = dfa.find_earliest_fwd_at(
+ /// &mut cache,
+ /// None,
+ /// None,
+ /// haystack,
+ /// 3,
+ /// 6,
+ /// )?;
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn find_earliest_fwd_at(
+ &self,
+ cache: &mut Cache,
+ pre: Option<&mut prefilter::Scanner>,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ // The actual search loop lives in the crate-internal `search`
+ // module; this method merely forwards its parameters.
+ search::find_earliest_fwd(
+ pre, self, cache, pattern_id, bytes, start, end,
+ )
+ }
+
+ /// Executes a reverse search and returns the start position of the first
+ /// match found as early as possible. `None` is returned when no match
+ /// exists.
+ ///
+ /// Scanning stops as soon as the search observes a match state.
+ ///
+ /// This is the lower-level counterpart to [`DFA::find_earliest_rev`]:
+ /// it exposes additional knobs controlling how the search executes. For
+ /// a full description of those additional parameters, along with usage
+ /// examples, see the documentation for [`DFA::find_earliest_fwd_at`].
+ ///
+ /// # Errors
+ ///
+ /// An error is returned only when the search could not run to
+ /// completion. For lazy DFAs built by this crate, that can only happen
+ /// under non-default configurations: when quit bytes are in use, when
+ /// Unicode word boundaries are heuristically enabled, or when a limit
+ /// is placed on how many times the lazy DFA's cache may be cleared.
+ ///
+ /// If the search does not complete, callers learn nothing about whether
+ /// a match exists.
+ ///
+ /// # Panics
+ ///
+ /// Panics when a `pattern_id` is supplied but this lazy DFA was not
+ /// built with support for specific pattern searches.
+ ///
+ /// Also panics when the given haystack range is invalid.
+ #[inline]
+ pub fn find_earliest_rev_at(
+ &self,
+ cache: &mut Cache,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ // Forward everything to the shared reverse-search implementation.
+ search::find_earliest_rev(
+ self, cache, pattern_id, bytes, start, end,
+ )
+ }
+
+ /// Executes a forward search and returns the end position of the leftmost
+ /// match that is found. If no match exists, then `None` is returned.
+ ///
+ /// This is like [`DFA::find_leftmost_fwd`], except it provides some
+ /// additional control over how the search is executed. See the
+ /// documentation of [`DFA::find_earliest_fwd_at`] for more details on the
+ /// additional parameters along with examples of their usage.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// lazy DFAs generated by this crate, this only occurs in non-default
+ /// configurations where quit bytes are used, Unicode word boundaries are
+ /// heuristically enabled or limits are set on the number of times the lazy
+ /// DFA's cache may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Panics
+ ///
+ /// This routine panics if a `pattern_id` is given and the underlying
+ /// DFA does not support specific pattern searches.
+ ///
+ /// It also panics if the given haystack range is not valid.
+ #[inline]
+ pub fn find_leftmost_fwd_at(
+ &self,
+ cache: &mut Cache,
+ pre: Option<&mut prefilter::Scanner>,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ // The actual search loop lives in the crate-internal `search`
+ // module; this method merely forwards its parameters.
+ search::find_leftmost_fwd(
+ pre, self, cache, pattern_id, bytes, start, end,
+ )
+ }
+
+ /// Executes a reverse search and returns the start position of the
+ /// leftmost match that is found. `None` is returned when no match
+ /// exists.
+ ///
+ /// This is the lower-level counterpart to [`DFA::find_leftmost_rev`]:
+ /// it exposes additional knobs controlling how the search executes. For
+ /// a full description of those additional parameters, along with usage
+ /// examples, see the documentation for [`DFA::find_earliest_fwd_at`].
+ ///
+ /// # Errors
+ ///
+ /// An error is returned only when the search could not run to
+ /// completion. For lazy DFAs built by this crate, that can only happen
+ /// under non-default configurations: when quit bytes are in use, when
+ /// Unicode word boundaries are heuristically enabled, or when a limit
+ /// is placed on how many times the lazy DFA's cache may be cleared.
+ ///
+ /// If the search does not complete, callers learn nothing about whether
+ /// a match exists.
+ ///
+ /// # Panics
+ ///
+ /// Panics when a `pattern_id` is supplied but the underlying DFA was
+ /// not built with support for specific pattern searches.
+ ///
+ /// Also panics when the given haystack range is invalid.
+ #[inline]
+ pub fn find_leftmost_rev_at(
+ &self,
+ cache: &mut Cache,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ // Forward everything to the shared reverse-search implementation.
+ search::find_leftmost_rev(
+ self, cache, pattern_id, bytes, start, end,
+ )
+ }
+
+ /// Executes an overlapping forward search and returns the end position of
+ /// matches as they are found. If no match exists, then `None` is returned.
+ ///
+ /// This routine is principally only useful when searching for multiple
+ /// patterns on inputs where multiple patterns may match the same regions
+ /// of text. In particular, callers must preserve the automaton's search
+ /// state from prior calls so that the implementation knows where the last
+ /// match occurred.
+ ///
+ /// This is like [`DFA::find_overlapping_fwd`], except it provides
+ /// some additional control over how the search is executed. See the
+ /// documentation of [`DFA::find_earliest_fwd_at`] for more details
+ /// on the additional parameters along with examples of their usage.
+ ///
+ /// When using this routine to implement an iterator of overlapping
+ /// matches, the `start` of the search should always be set to the end
+ /// of the last match. If more patterns match at the previous location,
+ /// then they will be immediately returned. (This is tracked by the given
+ /// overlapping state.) Otherwise, the search continues at the starting
+ /// position given.
+ ///
+ /// If for some reason you want the search to forget about its previous
+ /// state and restart the search at a particular position, then setting the
+ /// state to [`OverlappingState::start`] will accomplish that.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// lazy DFAs generated by this crate, this only occurs in non-default
+ /// configurations where quit bytes are used, Unicode word boundaries are
+ /// heuristically enabled or limits are set on the number of times the lazy
+ /// DFA's cache may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// # Panics
+ ///
+ /// This routine panics if a `pattern_id` is given and the underlying
+ /// DFA does not support specific pattern searches.
+ ///
+ /// It also panics if the given haystack range is not valid.
+ #[inline]
+ pub fn find_overlapping_fwd_at(
+ &self,
+ cache: &mut Cache,
+ pre: Option<&mut prefilter::Scanner>,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ state: &mut OverlappingState,
+ ) -> Result<Option<HalfMatch>, MatchError> {
+ // The actual search loop lives in the crate-internal `search`
+ // module; this method merely forwards its parameters.
+ search::find_overlapping_fwd(
+ pre, self, cache, pattern_id, bytes, start, end, state,
+ )
+ }
+}
+
+impl DFA {
+ /// Transitions from the current state to the next state, given the next
+ /// byte of input.
+ ///
+ /// The given cache is used to either reuse pre-computed state
+ /// transitions, or to store this newly computed transition for future
+ /// reuse. Thus, this routine guarantees that it will never return a state
+ /// ID that has an "unknown" tag.
+ ///
+ /// # State identifier validity
+ ///
+ /// The only valid value for `current` is the lazy state ID returned
+ /// by the most recent call to `next_state`, `next_state_untagged`,
+ /// `next_state_untagged_unchecked`, `start_state_forward` or
+ /// `start_state_reverse` for the given `cache`. Any state ID returned from
+ /// prior calls to these routines (with the same `cache`) is considered
+ /// invalid (even if it gives an appearance of working). State IDs returned
+ /// from _any_ prior call for different `cache` values are also always
+ /// invalid.
+ ///
+ /// The returned ID is always a valid ID when `current` refers to a valid
+ /// ID. Moreover, this routine is defined for all possible values of
+ /// `input`.
+ ///
+ /// These validity rules are not checked, even in debug mode. Callers are
+ /// required to uphold these rules themselves.
+ ///
+ /// Violating these state ID validity rules will not sacrifice memory
+ /// safety, but _may_ produce an incorrect result or a panic.
+ ///
+ /// # Panics
+ ///
+ /// If the given ID does not refer to a valid state, then this routine
+ /// may panic but it also may not panic and instead return an invalid or
+ /// incorrect ID.
+ ///
+ /// # Example
+ ///
+ /// This shows a simplistic example for walking a lazy DFA for a given
+ /// haystack by using the `next_state` method.
+ ///
+ /// ```
+ /// use regex_automata::hybrid::dfa::DFA;
+ ///
+ /// let dfa = DFA::new(r"[a-z]+r")?;
+ /// let mut cache = dfa.create_cache();
+ /// let haystack = "bar".as_bytes();
+ ///
+ /// // The start state is determined by inspecting the position and the
+ /// // initial bytes of the haystack.
+ /// let mut sid = dfa.start_state_forward(
+ /// &mut cache, None, haystack, 0, haystack.len(),
+ /// )?;
+ /// // Walk all the bytes in the haystack.
+ /// for &b in haystack {
+ /// sid = dfa.next_state(&mut cache, sid, b)?;
+ /// }
+ /// // Matches are always delayed by 1 byte, so we must explicitly walk the
+ /// // special "EOI" transition at the end of the search.
+ /// sid = dfa.next_eoi_state(&mut cache, sid)?;
+ /// assert!(sid.is_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn next_state(
+ &self,
+ cache: &mut Cache,
+ current: LazyStateID,
+ input: u8,
+ ) -> Result<LazyStateID, CacheError> {
+ // Map the input byte to its equivalence class and use that to index
+ // the cached transition table.
+ let class = usize::from(self.classes.get(input));
+ let offset = current.as_usize_untagged() + class;
+ let sid = cache.trans[offset];
+ if !sid.is_unknown() {
+ // Fast path: this transition was computed previously.
+ return Ok(sid);
+ }
+ // Slow path: compute the transition now and store it in the cache
+ // for future reuse.
+ let unit = alphabet::Unit::u8(input);
+ Lazy::new(self, cache).cache_next_state(current, unit)
+ }
+
+ /// Transitions from the current state to the next state, given the next
+ /// byte of input and a state ID that is not tagged.
+ ///
+ /// The only reason to use this routine is performance. In particular, the
+ /// `next_state` method needs to do some additional checks, among them is
+ /// to account for identifiers to states that are not yet computed. In
+ /// such a case, the transition is computed on the fly. However, if it is
+ /// known that the `current` state ID is untagged, then these checks can be
+ /// omitted.
+ ///
+ /// Since this routine does not compute states on the fly, it does not
+ /// modify the cache and thus cannot return an error. Consequently, `cache`
+ /// does not need to be mutable and it is possible for this routine to
+ /// return a state ID corresponding to the special "unknown" state. In
+ /// this case, it is the caller's responsibility to use the prior state
+ /// ID and `input` with `next_state` in order to force the computation of
+ /// the unknown transition. Otherwise, trying to use the "unknown" state
+ /// ID will just result in transitioning back to itself, and thus never
+ /// terminating. (This is technically a special exemption to the state ID
+ /// validity rules, but is permissible since this routine is guaranteed to
+ /// never mutate the given `cache`, and thus the identifier is guaranteed
+ /// to remain valid.)
+ ///
+ /// See [`LazyStateID`] for more details on what it means for a state ID
+ /// to be tagged. Also, see
+ /// [`next_state_untagged_unchecked`](DFA::next_state_untagged_unchecked)
+ /// for this same idea, but with bounds checks forcefully elided.
+ ///
+ /// # State identifier validity
+ ///
+ /// The only valid value for `current` is an **untagged** lazy
+ /// state ID returned by the most recent call to `next_state`,
+ /// `next_state_untagged`, `next_state_untagged_unchecked`,
+ /// `start_state_forward` or `start_state_reverse` for the given `cache`.
+ /// Any state ID returned from prior calls to these routines (with the
+ /// same `cache`) is considered invalid (even if it gives an appearance
+ /// of working). State IDs returned from _any_ prior call for different
+ /// `cache` values are also always invalid.
+ ///
+ /// The returned ID is always a valid ID when `current` refers to a valid
+ /// ID, although it may be tagged. Moreover, this routine is defined for
+ /// all possible values of `input`.
+ ///
+ /// Not all validity rules are checked, even in debug mode. Callers are
+ /// required to uphold these rules themselves.
+ ///
+ /// Violating these state ID validity rules will not sacrifice memory
+ /// safety, but _may_ produce an incorrect result or a panic.
+ ///
+ /// # Panics
+ ///
+ /// If the given ID does not refer to a valid state, then this routine
+ /// may panic but it also may not panic and instead return an invalid or
+ /// incorrect ID.
+ ///
+ /// # Example
+ ///
+ /// This shows a simplistic example for walking a lazy DFA for a given
+ /// haystack by using the `next_state_untagged` method where possible.
+ ///
+ /// ```
+ /// use regex_automata::hybrid::dfa::DFA;
+ ///
+ /// let dfa = DFA::new(r"[a-z]+r")?;
+ /// let mut cache = dfa.create_cache();
+ /// let haystack = "bar".as_bytes();
+ ///
+ /// // The start state is determined by inspecting the position and the
+ /// // initial bytes of the haystack.
+ /// let mut sid = dfa.start_state_forward(
+ /// &mut cache, None, haystack, 0, haystack.len(),
+ /// )?;
+ /// // Walk all the bytes in the haystack.
+ /// let mut at = 0;
+ /// while at < haystack.len() {
+ /// if sid.is_tagged() {
+ /// sid = dfa.next_state(&mut cache, sid, haystack[at])?;
+ /// } else {
+ /// let mut prev_sid = sid;
+ /// // We attempt to chew through as much as we can while moving
+ /// // through untagged state IDs. Thus, the transition function
+ /// // does less work on average per byte. (Unrolling this loop
+ /// // may help even more.)
+ /// while at < haystack.len() {
+ /// prev_sid = sid;
+ /// sid = dfa.next_state_untagged(
+ /// &mut cache, sid, haystack[at],
+ /// );
+ /// at += 1;
+ /// if sid.is_tagged() {
+ /// break;
+ /// }
+ /// }
+ /// // We must ensure that we never proceed to the next iteration
+ /// // with an unknown state ID. If we don't account for this
+ /// // case, then search isn't guaranteed to terminate since all
+ /// // transitions on unknown states loop back to itself.
+ /// if sid.is_unknown() {
+ /// sid = dfa.next_state(
+ /// &mut cache, prev_sid, haystack[at - 1],
+ /// )?;
+ /// }
+ /// }
+ /// }
+ /// // Matches are always delayed by 1 byte, so we must explicitly walk the
+ /// // special "EOI" transition at the end of the search.
+ /// sid = dfa.next_eoi_state(&mut cache, sid)?;
+ /// assert!(sid.is_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn next_state_untagged(
+ &self,
+ cache: &Cache,
+ current: LazyStateID,
+ input: u8,
+ ) -> LazyStateID {
+ debug_assert!(!current.is_tagged());
+ // An untagged ID is used directly as an offset into the transition
+ // table, so the lookup is a single bounds-checked indexed read. The
+ // result may be the "unknown" sentinel; see the docs above.
+ let class = usize::from(self.classes.get(input));
+ let offset = current.as_usize_unchecked() + class;
+ cache.trans[offset]
+ }
+
+ /// Transitions from the current state to the next state, eliding bounds
+ /// checks, given the next byte of input and a state ID that is not tagged.
+ ///
+ /// The only reason to use this routine is performance. In particular, the
+ /// `next_state` method needs to do some additional checks, among them is
+ /// to account for identifiers to states that are not yet computed. In
+ /// such a case, the transition is computed on the fly. However, if it is
+ /// known that the `current` state ID is untagged, then these checks can be
+ /// omitted.
+ ///
+ /// Since this routine does not compute states on the fly, it does not
+ /// modify the cache and thus cannot return an error. Consequently, `cache`
+ /// does not need to be mutable and it is possible for this routine to
+ /// return a state ID corresponding to the special "unknown" state. In
+ /// this case, it is the caller's responsibility to use the prior state
+ /// ID and `input` with `next_state` in order to force the computation of
+ /// the unknown transition. Otherwise, trying to use the "unknown" state
+ /// ID will just result in transitioning back to itself, and thus never
+ /// terminating. (This is technically a special exemption to the state ID
+ /// validity rules, but is permissible since this routine is guaranteed to
+ /// never mutate the given `cache`, and thus the identifier is guaranteed
+ /// to remain valid.)
+ ///
+ /// See [`LazyStateID`] for more details on what it means for a state ID
+ /// to be tagged. Also, see
+ /// [`next_state_untagged`](DFA::next_state_untagged)
+ /// for this same idea, but with memory safety guaranteed by retaining
+ /// bounds checks.
+ ///
+ /// # State identifier validity
+ ///
+ /// The only valid value for `current` is an **untagged** lazy
+ /// state ID returned by the most recent call to `next_state`,
+ /// `next_state_untagged`, `next_state_untagged_unchecked`,
+ /// `start_state_forward` or `start_state_reverse` for the given `cache`.
+ /// Any state ID returned from prior calls to these routines (with the
+ /// same `cache`) is considered invalid (even if it gives an appearance
+ /// of working). State IDs returned from _any_ prior call for different
+ /// `cache` values are also always invalid.
+ ///
+ /// The returned ID is always a valid ID when `current` refers to a valid
+ /// ID, although it may be tagged. Moreover, this routine is defined for
+ /// all possible values of `input`.
+ ///
+ /// Not all validity rules are checked, even in debug mode. Callers are
+ /// required to uphold these rules themselves.
+ ///
+ /// Violating these state ID validity rules will not sacrifice memory
+ /// safety, but _may_ produce an incorrect result or a panic.
+ ///
+ /// # Safety
+ ///
+ /// Callers of this method must guarantee that `current` refers to a valid
+ /// state ID according to the rules described above. If `current` is not a
+ /// valid state ID for this automaton, then calling this routine may result
+ /// in undefined behavior.
+ ///
+ /// If `current` is valid, then the ID returned is valid for all possible
+ /// values of `input`.
+ #[inline]
+ pub unsafe fn next_state_untagged_unchecked(
+ &self,
+ cache: &Cache,
+ current: LazyStateID,
+ input: u8,
+ ) -> LazyStateID {
+ debug_assert!(!current.is_tagged());
+ let class = usize::from(self.classes.get(input));
+ let offset = current.as_usize_unchecked() + class;
+ // SAFETY: the caller guarantees that `current` is a valid untagged
+ // state ID for this `cache` (see "# Safety" above), which implies
+ // `offset` is within the bounds of the transition table.
+ *cache.trans.get_unchecked(offset)
+ }
+
+ /// Transitions from the current state to the next state for the special
+ /// EOI symbol.
+ ///
+ /// The given cache is used to either reuse pre-computed state
+ /// transitions, or to store this newly computed transition for future
+ /// reuse. Thus, this routine guarantees that it will never return a state
+ /// ID that has an "unknown" tag.
+ ///
+ /// This routine must be called at the end of every search in a correct
+ /// implementation of search. Namely, lazy DFAs in this crate delay matches
+ /// by one byte in order to support look-around operators. Thus, after
+ /// reaching the end of a haystack, a search implementation must follow one
+ /// last EOI transition.
+ ///
+ /// It is best to think of EOI as an additional symbol in the alphabet of a
+ /// DFA that is distinct from every other symbol. That is, the alphabet of
+ /// lazy DFAs in this crate has a logical size of 257 instead of 256, where
+ /// 256 corresponds to every possible inhabitant of `u8`. (In practice, the
+ /// physical alphabet size may be smaller because of alphabet compression
+ /// via equivalence classes, but EOI is always represented somehow in the
+ /// alphabet.)
+ ///
+ /// # State identifier validity
+ ///
+ /// The only valid value for `current` is the lazy state ID returned
+ /// by the most recent call to `next_state`, `next_state_untagged`,
+ /// `next_state_untagged_unchecked`, `start_state_forward` or
+ /// `start_state_reverse` for the given `cache`. Any state ID returned from
+ /// prior calls to these routines (with the same `cache`) is considered
+ /// invalid (even if it gives an appearance of working). State IDs returned
+ /// from _any_ prior call for different `cache` values are also always
+ /// invalid.
+ ///
+ /// The returned ID is always a valid ID when `current` refers to a valid
+ /// ID.
+ ///
+ /// These validity rules are not checked, even in debug mode. Callers are
+ /// required to uphold these rules themselves.
+ ///
+ /// Violating these state ID validity rules will not sacrifice memory
+ /// safety, but _may_ produce an incorrect result or a panic.
+ ///
+ /// # Panics
+ ///
+ /// If the given ID does not refer to a valid state, then this routine
+ /// may panic but it also may not panic and instead return an invalid or
+ /// incorrect ID.
+ ///
+ /// # Example
+ ///
+ /// This shows a simplistic example for walking a DFA for a given haystack,
+ /// and then finishing the search with the final EOI transition.
+ ///
+ /// ```
+ /// use regex_automata::hybrid::dfa::DFA;
+ ///
+ /// let dfa = DFA::new(r"[a-z]+r")?;
+ /// let mut cache = dfa.create_cache();
+ /// let haystack = "bar".as_bytes();
+ ///
+ /// // The start state is determined by inspecting the position and the
+ /// // initial bytes of the haystack.
+ /// let mut sid = dfa.start_state_forward(
+ /// &mut cache, None, haystack, 0, haystack.len(),
+ /// )?;
+ /// // Walk all the bytes in the haystack.
+ /// for &b in haystack {
+ /// sid = dfa.next_state(&mut cache, sid, b)?;
+ /// }
+ /// // Matches are always delayed by 1 byte, so we must explicitly walk
+ /// // the special "EOI" transition at the end of the search. Without this
+ /// // final transition, the assert below will fail since the DFA will not
+ /// // have entered a match state yet!
+ /// sid = dfa.next_eoi_state(&mut cache, sid)?;
+ /// assert!(sid.is_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn next_eoi_state(
+ &self,
+ cache: &mut Cache,
+ current: LazyStateID,
+ ) -> Result<LazyStateID, CacheError> {
+ // EOI is modeled as an extra symbol with its own equivalence class,
+ // so the lookup mirrors `next_state` but uses the EOI class.
+ let eoi = self.classes.eoi().as_usize();
+ let offset = current.as_usize_untagged() + eoi;
+ let sid = cache.trans[offset];
+ if !sid.is_unknown() {
+ // Fast path: this EOI transition was computed previously.
+ return Ok(sid);
+ }
+ // Slow path: compute the EOI transition now and cache it.
+ let unit = self.classes.eoi();
+ Lazy::new(self, cache).cache_next_state(current, unit)
+ }
+
+ /// Return the ID of the start state for this lazy DFA when executing a
+ /// forward search.
+ ///
+ /// Unlike typical DFA implementations, the start state for DFAs in this
+ /// crate is dependent on a few different factors:
+ ///
+ /// * The pattern ID, if present. When the underlying DFA has been
+ /// configured with multiple patterns _and_ the DFA has been configured to
+ /// build an anchored start state for each pattern, then a pattern ID may
+ /// be specified to execute an anchored search for that specific pattern.
+ /// If `pattern_id` is invalid or if the DFA isn't configured to build
+ /// start states for each pattern, then implementations must panic. DFAs in
+ /// this crate can be configured to build start states for each pattern via
+ /// [`Config::starts_for_each_pattern`].
+ /// * When `start > 0`, the byte at index `start - 1` may influence the
+ /// start state if the regex uses `^` or `\b`.
+ /// * Similarly, when `start == 0`, it may influence the start state when
+ /// the regex uses `^` or `\A`.
+ /// * Currently, `end` is unused.
+ /// * Whether the search is a forward or reverse search. This routine can
+ /// only be used for forward searches.
+ ///
+ /// # Panics
+ ///
+ /// This panics if `start..end` is not a valid sub-slice of `bytes`. This
+ /// also panics if `pattern_id` is non-None and does not refer to a valid
+ /// pattern, or if the DFA was not configured to build anchored start
+ /// states for each pattern.
+ #[inline]
+ pub fn start_state_forward(
+ &self,
+ cache: &mut Cache,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Result<LazyStateID, CacheError> {
+ let mut lazy = Lazy::new(self, cache);
+ // Classify the start configuration from the position and surrounding
+ // bytes, then look for a previously computed start state.
+ let start_type = Start::from_position_fwd(bytes, start, end);
+ let sid = lazy.as_ref().get_cached_start_id(pattern_id, start_type);
+ if !sid.is_unknown() {
+ return Ok(sid);
+ }
+ // Not cached yet: compute the start state and store it.
+ lazy.cache_start_group(pattern_id, start_type)
+ }
+
+ /// Return the ID of the start state for this lazy DFA when executing a
+ /// reverse search.
+ ///
+ /// Unlike typical DFA implementations, the start state for DFAs in this
+ /// crate is dependent on a few different factors:
+ ///
+ /// * The pattern ID, if present. When the underlying DFA has been
+ /// configured with multiple patterns _and_ the DFA has been configured to
+ /// build an anchored start state for each pattern, then a pattern ID may
+ /// be specified to execute an anchored search for that specific pattern.
+ /// If `pattern_id` is invalid or if the DFA isn't configured to build
+ /// start states for each pattern, then implementations must panic. DFAs in
+ /// this crate can be configured to build start states for each pattern via
+ /// [`Config::starts_for_each_pattern`].
+ /// * When `end < bytes.len()`, the byte at index `end` may influence the
+ /// start state if the regex uses `$` or `\b`.
+ /// * Similarly, when `end == bytes.len()`, it may influence the start
+ /// state when the regex uses `$` or `\z`.
+ /// * Currently, `start` is unused.
+ /// * Whether the search is a forward or reverse search. This routine can
+ /// only be used for reverse searches.
+ ///
+ /// # Panics
+ ///
+ /// This panics if `start..end` is not a valid sub-slice of `bytes`. This
+ /// also panics if `pattern_id` is non-None and does not refer to a valid
+ /// pattern, or if the DFA was not configured to build anchored start
+ /// states for each pattern.
+ #[inline]
+ pub fn start_state_reverse(
+ &self,
+ cache: &mut Cache,
+ pattern_id: Option<PatternID>,
+ bytes: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Result<LazyStateID, CacheError> {
+ let mut lazy = Lazy::new(self, cache);
+ // Classify the start configuration from the position and surrounding
+ // bytes, then look for a previously computed start state.
+ let start_type = Start::from_position_rev(bytes, start, end);
+ let sid = lazy.as_ref().get_cached_start_id(pattern_id, start_type);
+ if !sid.is_unknown() {
+ return Ok(sid);
+ }
+ // Not cached yet: compute the start state and store it.
+ lazy.cache_start_group(pattern_id, start_type)
+ }
+
+ /// Returns the total number of patterns that match in this state.
+ ///
+ /// If the lazy DFA was compiled with one pattern, then this must
+ /// necessarily always return `1` for all match states.
+ ///
+ /// A lazy DFA guarantees that [`DFA::match_pattern`] can be called with
+ /// indices up to (but not including) the count returned by this routine
+ /// without panicking.
+ ///
+ /// If the given state is not a match state, then this may either panic
+ /// or return an incorrect result.
+ ///
+ /// # Example
+ ///
+ /// This example shows a simple instance of implementing overlapping
+ /// matches. In particular, it shows not only how to determine how many
+ /// patterns have matched in a particular state, but also how to access
+ /// which specific patterns have matched.
+ ///
+ /// Notice that we must use [`MatchKind::All`](crate::MatchKind::All)
+ /// when building the DFA. If we used
+ /// [`MatchKind::LeftmostFirst`](crate::MatchKind::LeftmostFirst)
+ /// instead, then the DFA would not be constructed in a way that supports
+ /// overlapping matches. (It would only report a single pattern that
+ /// matches at any particular point in time.)
+ ///
+ /// Another thing to take note of is the patterns used and the order in
+ /// which the pattern IDs are reported. In the example below, pattern `3`
+ /// is yielded first. Why? Because it corresponds to the match that
+ /// appears first. Namely, the `@` symbol is part of `\S+` but not part
+ /// of any of the other patterns. Since the `\S+` pattern has a match that
+ /// starts to the left of any other pattern, its ID is returned before any
+ /// other.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, MatchKind};
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().match_kind(MatchKind::All))
+ /// .build_many(&[
+ /// r"\w+", r"[a-z]+", r"[A-Z]+", r"\S+",
+ /// ])?;
+ /// let mut cache = dfa.create_cache();
+ /// let haystack = "@bar".as_bytes();
+ ///
+ /// // The start state is determined by inspecting the position and the
+ /// // initial bytes of the haystack.
+ /// let mut sid = dfa.start_state_forward(
+ /// &mut cache, None, haystack, 0, haystack.len(),
+ /// )?;
+ /// // Walk all the bytes in the haystack.
+ /// for &b in haystack {
+ /// sid = dfa.next_state(&mut cache, sid, b)?;
+ /// }
+ /// sid = dfa.next_eoi_state(&mut cache, sid)?;
+ ///
+ /// assert!(sid.is_match());
+ /// assert_eq!(dfa.match_count(&mut cache, sid), 3);
+ /// // The following calls are guaranteed to not panic since `match_count`
+ /// // returned `3` above.
+ /// assert_eq!(dfa.match_pattern(&mut cache, sid, 0).as_usize(), 3);
+ /// assert_eq!(dfa.match_pattern(&mut cache, sid, 1).as_usize(), 0);
+ /// assert_eq!(dfa.match_pattern(&mut cache, sid, 2).as_usize(), 1);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn match_count(&self, cache: &Cache, id: LazyStateID) -> usize {
+ // Only match states carry a pattern list; calling this with any
+ // other state is a caller error.
+ assert!(id.is_match());
+ LazyRef::new(self, cache).get_cached_state(id).match_count()
+ }
+
+    /// Returns the pattern ID corresponding to the given match index in the
+    /// given state.
+    ///
+    /// See [`DFA::match_count`] for an example of how to use this method
+    /// correctly. Note that if you know your lazy DFA is configured with a
+    /// single pattern, then this routine is never necessary since it will
+    /// always return a pattern ID of `0` for an index of `0` when `id`
+    /// corresponds to a match state.
+    ///
+    /// Typically, this routine is used when implementing an overlapping
+    /// search, as the example for `DFA::match_count` does.
+    ///
+    /// # Panics
+    ///
+    /// If the state ID is not a match state or if the match index is out
+    /// of bounds for the given state, then this routine may either panic
+    /// or produce an incorrect result. If the state ID is correct and the
+    /// match index is correct, then this routine always produces a valid
+    /// `PatternID`.
+    #[inline]
+    pub fn match_pattern(
+        &self,
+        cache: &Cache,
+        id: LazyStateID,
+        match_index: usize,
+    ) -> PatternID {
+        // This is an optimization for the very common case of a DFA with a
+        // single pattern. This conditional avoids a somewhat more costly path
+        // that finds the pattern ID from the corresponding `State`, which
+        // requires a bit of slicing/pointer-chasing. This optimization tends
+        // to only matter when matches are frequent.
+        if self.pattern_count() == 1 {
+            return PatternID::ZERO;
+        }
+        // Multi-pattern case: look up the pattern ID recorded on the cached
+        // powerset state at the given match index.
+        LazyRef::new(self, cache)
+            .get_cached_state(id)
+            .match_pattern(match_index)
+    }
+}
+
+/// A cache represents a partially computed DFA.
+///
+/// A cache is the key component that differentiates a classical DFA and a
+/// hybrid NFA/DFA (also called a "lazy DFA"). Where a classical DFA builds a
+/// complete transition table that can handle all possible inputs, a hybrid
+/// NFA/DFA starts with an empty transition table and builds only the parts
+/// required during search. The parts that are built are stored in a cache. For
+/// this reason, a cache is a required parameter for nearly every operation on
+/// a [`DFA`].
+///
+/// Caches can be created from their corresponding DFA via
+/// [`DFA::create_cache`]. A cache can only be used with either the DFA that
+/// created it, or the DFA that was most recently used to reset it with
+/// [`Cache::reset`]. Using a cache with any other DFA may result in panics
+/// or incorrect results.
+#[derive(Clone, Debug)]
+pub struct Cache {
+    // N.B. If you're looking to understand how determinization works, it
+    // is probably simpler to first grok src/dfa/determinize.rs, since that
+    // doesn't have the "laziness" component.
+    /// The transition table.
+    ///
+    /// Given a `current` LazyStateID and an `input` byte, the next state can
+    /// be computed via `trans[untagged(current) + equiv_class(input)]`. Notice
+    /// that no multiplication is used. That's because state identifiers are
+    /// "premultiplied."
+    ///
+    /// Note that the next state may be the "unknown" state. In this case, the
+    /// next state is not known and determinization for `current` on `input`
+    /// must be performed.
+    trans: Vec<LazyStateID>,
+    /// The starting states for this DFA.
+    ///
+    /// These are computed lazily. Initially, these are all set to "unknown"
+    /// lazy state IDs.
+    ///
+    /// When 'starts_for_each_pattern' is disabled (the default), then the size
+    /// of this is constrained to the possible starting configurations based
+    /// on the search parameters. (At time of writing, that's 4.) However,
+    /// when starting states for each pattern is enabled, then there are N
+    /// additional groups of starting states, where each group reflects the
+    /// different possible configurations and N is the number of patterns.
+    ///
+    /// Layout: entry `start.as_usize()` for the pattern-agnostic states,
+    /// followed (when enabled) by one `Start::count()`-sized group per
+    /// pattern. See 'Lazy::set_start_state' for the indexing arithmetic.
+    starts: Vec<LazyStateID>,
+    /// A sequence of NFA/DFA powerset states that have been computed for this
+    /// lazy DFA. This sequence is indexable by untagged LazyStateIDs. (Every
+    /// tagged LazyStateID can be used to index this sequence by converting it
+    /// to its untagged form.)
+    states: Vec<State>,
+    /// A map from states to their corresponding IDs. This map may be accessed
+    /// via the raw byte representation of a state, which means that a `State`
+    /// does not need to be allocated to determine whether it already exists
+    /// in this map. Indeed, the existence of such a state is what determines
+    /// whether we allocate a new `State` or not.
+    ///
+    /// The higher level idea here is that we do just enough determinization
+    /// for a state to check whether we've already computed it. If we have,
+    /// then we can save a little (albeit not much) work. The real savings is
+    /// in memory usage. If we never checked for trivially duplicate states,
+    /// then our memory usage would explode to unreasonable levels.
+    states_to_id: StateMap,
+    /// Sparse sets used to track which NFA states have been visited during
+    /// various traversals.
+    sparses: SparseSets,
+    /// Scratch space for traversing the NFA graph. (We use space on the heap
+    /// instead of the call stack.)
+    stack: Vec<NFAStateID>,
+    /// Scratch space for building a NFA/DFA powerset state. This is used to
+    /// help amortize allocation since not every powerset state generated is
+    /// added to the cache. In particular, if it already exists in the cache,
+    /// then there is no need to allocate a new `State` for it.
+    scratch_state_builder: StateBuilderEmpty,
+    /// A simple abstraction for handling the saving of at most a single state
+    /// across a cache clearing. This is required for correctness. Namely, if
+    /// adding a new state after clearing the cache fails, then the caller
+    /// must retain the ability to continue using the state ID given. The
+    /// state corresponding to the state ID is what we preserve across cache
+    /// clearings.
+    state_saver: StateSaver,
+    /// The memory usage, in bytes, used by 'states' and 'states_to_id'. We
+    /// track this as new states are added since states use a variable amount
+    /// of heap. Tracking this as we add states makes it possible to compute
+    /// the total amount of memory used by the determinizer in constant time.
+    memory_usage_state: usize,
+    /// The number of times the cache has been cleared. When a minimum cache
+    /// clear count is set, then the cache will return an error instead of
+    /// clearing the cache if the count has been exceeded.
+    ///
+    /// This counter is reset to 0 by 'Cache::reset' but merely incremented
+    /// by ordinary cache clearings.
+    clear_count: usize,
+}
+
+impl Cache {
+    /// Create a new cache for the given lazy DFA.
+    ///
+    /// The cache returned should only be used for searches for the given DFA.
+    /// If you want to reuse the cache for another DFA, then you must call
+    /// [`Cache::reset`] with that DFA.
+    pub fn new(dfa: &DFA) -> Cache {
+        let mut cache = Cache {
+            trans: alloc::vec![],
+            starts: alloc::vec![],
+            states: alloc::vec![],
+            states_to_id: StateMap::new(),
+            sparses: SparseSets::new(dfa.nfa.len()),
+            stack: alloc::vec![],
+            scratch_state_builder: StateBuilderEmpty::new(),
+            state_saver: StateSaver::none(),
+            memory_usage_state: 0,
+            clear_count: 0,
+        };
+        // 'init_cache' installs the unknown/dead/quit sentinel states and
+        // the (initially unknown) start state slots.
+        Lazy { dfa, cache: &mut cache }.init_cache();
+        cache
+    }
+
+    /// Reset this cache such that it can be used for searching with the given
+    /// lazy DFA (and only that DFA).
+    ///
+    /// A cache reset permits reusing memory already allocated in this cache
+    /// with a different lazy DFA.
+    ///
+    /// Resetting a cache sets its "clear count" to 0. This is relevant if the
+    /// lazy DFA has been configured to "give up" after it has cleared the
+    /// cache a certain number of times.
+    ///
+    /// Any lazy state ID generated by the cache prior to resetting it is
+    /// invalid after the reset.
+    ///
+    /// # Example
+    ///
+    /// This shows how to re-purpose a cache for use with a different DFA.
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+    ///
+    /// let dfa1 = DFA::new(r"\w")?;
+    /// let dfa2 = DFA::new(r"\W")?;
+    ///
+    /// let mut cache = dfa1.create_cache();
+    /// assert_eq!(
+    ///     Some(HalfMatch::must(0, 2)),
+    ///     dfa1.find_leftmost_fwd(&mut cache, "Δ".as_bytes())?,
+    /// );
+    ///
+    /// // Using 'cache' with dfa2 is not allowed. It may result in panics or
+    /// // incorrect results. In order to re-purpose the cache, we must reset
+    /// // it with the DFA we'd like to use it with.
+    /// //
+    /// // Similarly, after this reset, using the cache with 'dfa1' is also not
+    /// // allowed.
+    /// cache.reset(&dfa2);
+    /// assert_eq!(
+    ///     Some(HalfMatch::must(0, 3)),
+    ///     dfa2.find_leftmost_fwd(&mut cache, "☃".as_bytes())?,
+    /// );
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn reset(&mut self, dfa: &DFA) {
+        Lazy::new(dfa, self).reset_cache()
+    }
+
+    /// Returns the total number of times this cache has been cleared since it
+    /// was either created or last reset.
+    ///
+    /// This is useful for informational purposes or if you want to change
+    /// search strategies based on the number of times the cache has been
+    /// cleared.
+    pub fn clear_count(&self) -> usize {
+        self.clear_count
+    }
+
+    /// Returns the heap memory usage, in bytes, of this cache.
+    ///
+    /// This does **not** include the stack size used up by this cache. To
+    /// compute that, use `std::mem::size_of::<Cache>()`.
+    pub fn memory_usage(&self) -> usize {
+        const ID_SIZE: usize = size_of::<LazyStateID>();
+        const STATE_SIZE: usize = size_of::<State>();
+
+        // Sum the fixed-size components of each container...
+        self.trans.len() * ID_SIZE
+        + self.starts.len() * ID_SIZE
+        + self.states.len() * STATE_SIZE
+        // Maps likely use more memory than this, but it's probably close.
+        + self.states_to_id.len() * (STATE_SIZE + ID_SIZE)
+        + self.sparses.memory_usage()
+        + self.stack.capacity() * ID_SIZE
+        + self.scratch_state_builder.capacity()
+        // Heap memory used by 'State' in both 'states' and 'states_to_id'.
+        + self.memory_usage_state
+    }
+}
+
+/// A map from states to state identifiers. When using std, we use a standard
+/// hashmap, since it's a bit faster for this use case. (Other maps, like
+/// one's based on FNV, have not yet been benchmarked.)
+///
+/// The main purpose of this map is to reuse states where possible. This won't
+/// fully minimize the DFA, but it works well in a lot of cases.
+#[cfg(feature = "std")]
+type StateMap = std::collections::HashMap<State, LazyStateID>;
+// Without std, fall back to the ordered map from 'alloc' so the crate still
+// builds in no_std (alloc-only) environments.
+#[cfg(not(feature = "std"))]
+type StateMap = alloc::collections::BTreeMap<State, LazyStateID>;
+
+/// A type that groups methods that require the base NFA/DFA and writable
+/// access to the cache.
+///
+/// This is the mutable counterpart to 'LazyRef': it borrows the cache
+/// mutably so it can add states and transitions during determinization.
+#[derive(Debug)]
+struct Lazy<'i, 'c> {
+    dfa: &'i DFA,
+    cache: &'c mut Cache,
+}
+
+impl<'i, 'c> Lazy<'i, 'c> {
+    /// Creates a new 'Lazy' wrapper for a DFA and its corresponding cache.
+    fn new(dfa: &'i DFA, cache: &'c mut Cache) -> Lazy<'i, 'c> {
+        Lazy { dfa, cache }
+    }
+
+    /// Return an immutable view by downgrading a writable cache to a read-only
+    /// cache.
+    fn as_ref<'a>(&'a self) -> LazyRef<'i, 'a> {
+        LazyRef::new(self.dfa, self.cache)
+    }
+
+    /// This is marked as 'inline(never)' to avoid bloating methods on 'DFA'
+    /// like 'next_state' and 'next_eoi_state' that are called in critical
+    /// areas. The idea is to let the optimizer focus on the other areas of
+    /// those methods as the hot path.
+    ///
+    /// Here's an example that justifies 'inline(never)'
+    ///
+    /// ```ignore
+    /// regex-cli find hybrid dfa \
+    /// @all-codepoints-utf8-100x '\pL{100}' --cache-capacity 10000000
+    /// ```
+    ///
+    /// Where 'all-codepoints-utf8-100x' is the UTF-8 encoding of every
+    /// codepoint, in sequence, repeated 100 times.
+    ///
+    /// With 'inline(never)' hyperfine reports 1.1s per run. With
+    /// 'inline(always)', hyperfine reports 1.23s. So that's a 10% improvement.
+    #[inline(never)]
+    fn cache_next_state(
+        &mut self,
+        mut current: LazyStateID,
+        unit: alphabet::Unit,
+    ) -> Result<LazyStateID, CacheError> {
+        let stride2 = self.dfa.stride2();
+        let empty_builder = self.get_state_builder();
+        let builder = determinize::next(
+            &self.dfa.nfa,
+            self.dfa.match_kind,
+            &mut self.cache.sparses,
+            &mut self.cache.stack,
+            &self.cache.states[current.as_usize_untagged() >> stride2],
+            unit,
+            empty_builder,
+        );
+        // If adding this state won't fit, the cache will be cleared below in
+        // 'add_builder_state'. Save 'current' so it survives the clearing;
+        // its ID may change, so we must re-fetch it via 'saved_state_id'.
+        let save_state = !self.as_ref().state_builder_fits_in_cache(&builder);
+        if save_state {
+            self.save_state(current);
+        }
+        let next = self.add_builder_state(builder, |sid| sid)?;
+        if save_state {
+            current = self.saved_state_id();
+        }
+        // This is the payoff. The next time 'next_state' is called with this
+        // state and alphabet unit, it will find this transition and avoid
+        // having to re-determinize this transition.
+        self.set_transition(current, unit, next);
+        Ok(next)
+    }
+
+    /// Compute and cache the starting state for the given pattern ID (if
+    /// present) and the starting configuration.
+    ///
+    /// This panics if a pattern ID is given and the DFA isn't configured to
+    /// build anchored start states for each pattern.
+    ///
+    /// This will never return an unknown lazy state ID.
+    ///
+    /// If caching this state would otherwise result in a cache that has been
+    /// cleared too many times, then an error is returned.
+    fn cache_start_group(
+        &mut self,
+        pattern_id: Option<PatternID>,
+        start: Start,
+    ) -> Result<LazyStateID, CacheError> {
+        // Pick the NFA start state that matches the requested configuration:
+        // per-pattern anchored, globally anchored, or unanchored.
+        let nfa_start_id = match pattern_id {
+            Some(pid) => {
+                assert!(
+                    self.dfa.starts_for_each_pattern,
+                    "attempted to search for a specific pattern \
+                     without enabling starts_for_each_pattern",
+                );
+                self.dfa.nfa.start_pattern(pid)
+            }
+            None if self.dfa.anchored => self.dfa.nfa.start_anchored(),
+            None => self.dfa.nfa.start_unanchored(),
+        };
+
+        let id = self.cache_start_one(nfa_start_id, start)?;
+        self.set_start_state(pattern_id, start, id);
+        Ok(id)
+    }
+
+    /// Compute and cache the starting state for the given NFA state ID and the
+    /// starting configuration. The NFA state ID might be one of the following:
+    ///
+    /// 1) An unanchored start state to match any pattern.
+    /// 2) An anchored start state to match any pattern.
+    /// 3) An anchored start state for a particular pattern.
+    ///
+    /// This will never return an unknown lazy state ID.
+    ///
+    /// If caching this state would otherwise result in a cache that has been
+    /// cleared too many times, then an error is returned.
+    fn cache_start_one(
+        &mut self,
+        nfa_start_id: NFAStateID,
+        start: Start,
+    ) -> Result<LazyStateID, CacheError> {
+        let mut builder_matches = self.get_state_builder().into_matches();
+        determinize::set_lookbehind_from_start(&start, &mut builder_matches);
+        self.cache.sparses.set1.clear();
+        determinize::epsilon_closure(
+            self.dfa.nfa.borrow(),
+            nfa_start_id,
+            *builder_matches.look_have(),
+            &mut self.cache.stack,
+            &mut self.cache.sparses.set1,
+        );
+        // 'sparses.set1' now holds the epsilon closure of the NFA start
+        // state; turn it into a powerset state.
+        let mut builder = builder_matches.into_nfa();
+        determinize::add_nfa_states(
+            self.dfa.nfa.borrow(),
+            &self.cache.sparses.set1,
+            &mut builder,
+        );
+        self.add_builder_state(builder, |id| id.to_start())
+    }
+
+    /// Either add the given builder state to this cache, or return an ID to an
+    /// equivalent state already in this cache.
+    ///
+    /// In the case where no equivalent state exists, the idmap function given
+    /// may be used to transform the identifier allocated. This is useful if
+    /// the caller needs to tag the ID with additional information.
+    ///
+    /// This will never return an unknown lazy state ID.
+    ///
+    /// If caching this state would otherwise result in a cache that has been
+    /// cleared too many times, then an error is returned.
+    fn add_builder_state(
+        &mut self,
+        builder: StateBuilderNFA,
+        idmap: impl Fn(LazyStateID) -> LazyStateID,
+    ) -> Result<LazyStateID, CacheError> {
+        // The map lookup uses the builder's raw bytes, so no 'State'
+        // allocation is needed to detect a duplicate.
+        if let Some(&cached_id) =
+            self.cache.states_to_id.get(builder.as_bytes())
+        {
+            // Since we have a cached state, put the constructed state's
+            // memory back into our scratch space, so that it can be reused.
+            self.put_state_builder(builder);
+            return Ok(cached_id);
+        }
+        let result = self.add_state(builder.to_state(), idmap);
+        self.put_state_builder(builder);
+        result
+    }
+
+    /// Allocate a new state ID and add the given state to this cache.
+    ///
+    /// The idmap function given may be used to transform the identifier
+    /// allocated. This is useful if the caller needs to tag the ID with
+    /// additional information.
+    ///
+    /// This will never return an unknown lazy state ID.
+    ///
+    /// If caching this state would otherwise result in a cache that has been
+    /// cleared too many times, then an error is returned.
+    fn add_state(
+        &mut self,
+        state: State,
+        idmap: impl Fn(LazyStateID) -> LazyStateID,
+    ) -> Result<LazyStateID, CacheError> {
+        if !self.as_ref().state_fits_in_cache(&state) {
+            self.try_clear_cache()?;
+        }
+        // It's important for this to come second, since the above may clear
+        // the cache. If we clear the cache after ID generation, then the ID
+        // is likely bunk since it would have been generated based on a larger
+        // transition table.
+        let mut id = idmap(self.next_state_id()?);
+        if state.is_match() {
+            id = id.to_match();
+        }
+        // Add room in the transition table. Since this is a fresh state, all
+        // of its transitions are unknown.
+        self.cache.trans.extend(
+            iter::repeat(self.as_ref().unknown_id()).take(self.dfa.stride()),
+        );
+        // When we add a sentinel state, we never want to set any quit
+        // transitions. Technically, this is harmless, since sentinel states
+        // have all of their transitions set to loop back to themselves. But
+        // when creating sentinel states before the quit sentinel state,
+        // this will try to call 'set_transition' on a state ID that doesn't
+        // actually exist yet, which isn't allowed. So we just skip doing so
+        // entirely.
+        if !self.dfa.quitset.is_empty() && !self.as_ref().is_sentinel(id) {
+            let quit_id = self.as_ref().quit_id();
+            for b in self.dfa.quitset.iter() {
+                self.set_transition(id, alphabet::Unit::u8(b), quit_id);
+            }
+        }
+        self.cache.memory_usage_state += state.memory_usage();
+        self.cache.states.push(state.clone());
+        self.cache.states_to_id.insert(state, id);
+        Ok(id)
+    }
+
+    /// Allocate a new state ID.
+    ///
+    /// This will never return an unknown lazy state ID.
+    ///
+    /// If caching this state would otherwise result in a cache that has been
+    /// cleared too many times, then an error is returned.
+    fn next_state_id(&mut self) -> Result<LazyStateID, CacheError> {
+        // IDs are premultiplied: the next ID is simply the current length of
+        // the transition table.
+        let sid = match LazyStateID::new(self.cache.trans.len()) {
+            Ok(sid) => sid,
+            Err(_) => {
+                self.try_clear_cache()?;
+                // This has to pass since we check that ID capacity at
+                // construction time can fit at least MIN_STATES states.
+                LazyStateID::new(self.cache.trans.len()).unwrap()
+            }
+        };
+        Ok(sid)
+    }
+
+    /// Attempt to clear the cache used by this lazy DFA.
+    ///
+    /// If clearing the cache exceeds the minimum number of required cache
+    /// clearings, then this will return a cache error. In this case,
+    /// callers should bubble this up as the cache can't be used until it is
+    /// reset. Implementations of search should convert this error into a
+    /// `MatchError::GaveUp`.
+    ///
+    /// If 'self.state_saver' is set to save a state, then this state is
+    /// persisted through cache clearing. Otherwise, the cache is returned to
+    /// its state after initialization with two exceptions: its clear count
+    /// is incremented and some of its memory likely has additional capacity.
+    /// That is, clearing a cache does _not_ release memory.
+    ///
+    /// Otherwise, any lazy state ID generated by the cache prior to resetting
+    /// it is invalid after the reset.
+    fn try_clear_cache(&mut self) -> Result<(), CacheError> {
+        // Currently, the only heuristic we use is the minimum cache clear
+        // count. If we pass that minimum, then we give up.
+        //
+        // It would be good to also add a heuristic based on "bytes searched
+        // per generated state," but this requires API design work. Namely,
+        // we really do not want to add a counter increment to the transition
+        // function, which implies we need to expose APIs to update the number
+        // of bytes searched by implementers of the search routines. And that
+        // doesn't seem great... But we should do it if this heuristic isn't
+        // enough. (The original lazy DFA implementation in the 'regex' crate
+        // had this heuristic, since the lazy DFA was coupled with the search
+        // routines.)
+        if let Some(min_count) = self.dfa.minimum_cache_clear_count {
+            if self.cache.clear_count >= min_count {
+                return Err(CacheError::too_many_cache_clears());
+            }
+        }
+        self.clear_cache();
+        Ok(())
+    }
+
+    /// Clears _and_ resets the cache. Resetting the cache means that no
+    /// states are persisted and the clear count is reset to 0. No heap memory
+    /// is released.
+    ///
+    /// Note that the caller may reset a cache with a different DFA than what
+    /// it was created from. In which case, the cache can now be used with the
+    /// new DFA (and not the old DFA).
+    fn reset_cache(&mut self) {
+        self.cache.state_saver = StateSaver::none();
+        self.clear_cache();
+        // If a new DFA is used, it might have a different number of NFA
+        // states, so we need to make sure our sparse sets have the appropriate
+        // size.
+        self.cache.sparses.resize(self.dfa.nfa.len());
+        self.cache.clear_count = 0;
+    }
+
+    /// Clear the cache used by this lazy DFA.
+    ///
+    /// If clearing the cache exceeds the minimum number of required cache
+    /// clearings, then this will return a cache error. In this case,
+    /// callers should bubble this up as the cache can't be used until it is
+    /// reset. Implementations of search should convert this error into a
+    /// `MatchError::GaveUp`.
+    ///
+    /// If 'self.state_saver' is set to save a state, then this state is
+    /// persisted through cache clearing. Otherwise, the cache is returned to
+    /// its state after initialization with two exceptions: its clear count
+    /// is incremented and some of its memory likely has additional capacity.
+    /// That is, clearing a cache does _not_ release memory.
+    ///
+    /// Otherwise, any lazy state ID generated by the cache prior to resetting
+    /// it is invalid after the reset.
+    fn clear_cache(&mut self) {
+        self.cache.trans.clear();
+        self.cache.starts.clear();
+        self.cache.states.clear();
+        self.cache.states_to_id.clear();
+        self.cache.memory_usage_state = 0;
+        self.cache.clear_count += 1;
+        trace!(
+            "lazy DFA cache has been cleared (count: {})",
+            self.cache.clear_count
+        );
+        self.init_cache();
+        // If the state we want to save is one of the sentinel
+        // (unknown/dead/quit) states, then 'init_cache' adds those back, and
+        // their identifier values remains invariant. So there's no need to add
+        // it again. (And indeed, doing so would be incorrect!)
+        if let Some((old_id, state)) = self.cache.state_saver.take_to_save() {
+            // If the state is one of the special sentinel states, then it is
+            // automatically added by cache initialization and its ID always
+            // remains the same. With that said, this should never occur since
+            // the sentinel states are all loop states back to themselves. So
+            // we should never be in a position where we're attempting to save
+            // a sentinel state since we never compute transitions out of a
+            // sentinel state.
+            assert!(
+                !self.as_ref().is_sentinel(old_id),
+                "cannot save sentinel state"
+            );
+            let new_id = self
+                .add_state(state, |id| {
+                    // Preserve the "start" tag on the saved state's new ID.
+                    if old_id.is_start() {
+                        id.to_start()
+                    } else {
+                        id
+                    }
+                })
+                // The unwrap here is OK because lazy DFA creation ensures that
+                // we have room in the cache to add MIN_STATES states. Since
+                // 'init_cache' above adds 3, this adds a 4th.
+                .expect("adding one state after cache clear must work");
+            self.cache.state_saver = StateSaver::Saved(new_id);
+        }
+    }
+
+    /// Initialize this cache from emptiness to a place where it can be used
+    /// for search.
+    ///
+    /// This is called both at cache creation time and after the cache has been
+    /// cleared.
+    ///
+    /// Primarily, this adds the three sentinel states and allocates some
+    /// initial memory.
+    fn init_cache(&mut self) {
+        let mut starts_len = Start::count();
+        if self.dfa.starts_for_each_pattern {
+            starts_len += Start::count() * self.dfa.pattern_count();
+        }
+        self.cache
+            .starts
+            .extend(iter::repeat(self.as_ref().unknown_id()).take(starts_len));
+        // This is the set of NFA states that corresponds to each of our three
+        // sentinel states: the empty set.
+        let dead = State::dead();
+        // This sets up some states that we use as sentinels that are present
+        // in every DFA. While it would be technically possible to implement
+        // this DFA without explicitly putting these states in the transition
+        // table, this is convenient to do to make `next_state` correct for all
+        // valid state IDs without needing explicit conditionals to special
+        // case these sentinel states.
+        //
+        // All three of these states are "dead" states. That is, all of
+        // them transition only to themselves. So once you enter one of
+        // these states, it's impossible to leave them. Thus, any correct
+        // search routine must explicitly check for these state types. (Sans
+        // `unknown`, since that is only used internally to represent missing
+        // states.)
+        let unk_id =
+            self.add_state(dead.clone(), |id| id.to_unknown()).unwrap();
+        let dead_id = self.add_state(dead.clone(), |id| id.to_dead()).unwrap();
+        let quit_id = self.add_state(dead.clone(), |id| id.to_quit()).unwrap();
+        assert_eq!(unk_id, self.as_ref().unknown_id());
+        assert_eq!(dead_id, self.as_ref().dead_id());
+        assert_eq!(quit_id, self.as_ref().quit_id());
+        // The idea here is that if you start in an unknown/dead/quit state and
+        // try to transition on them, then you should end up where you started.
+        self.set_all_transitions(unk_id, unk_id);
+        self.set_all_transitions(dead_id, dead_id);
+        self.set_all_transitions(quit_id, quit_id);
+        // All of these states are technically equivalent from the FSM
+        // perspective, so putting all three of them in the cache isn't
+        // possible. (They are distinct merely because we use their
+        // identifiers as sentinels to mean something, as indicated by the
+        // names.) Moreover, we wouldn't want to do that. Unknown and quit
+        // states are special in that they are artificial constructions
+        // this implementation. But dead states are a natural part of
+        // determinization. When you reach a point in the NFA where you cannot
+        // go anywhere else, a dead state will naturally arise and we MUST
+        // reuse the canonical dead state that we've created here. Why? Because
+        // it is the state ID that tells the search routine whether a state is
+        // dead or not, and thus, whether to stop the search. Having a bunch of
+        // distinct dead states would be quite wasteful!
+        self.cache.states_to_id.insert(dead, dead_id);
+    }
+
+    /// Save the state corresponding to the ID given such that the state
+    /// persists through a cache clearing.
+    ///
+    /// While the state may persist, the ID may not. In order to discover the
+    /// new state ID, one must call 'saved_state_id' after a cache clearing.
+    fn save_state(&mut self, id: LazyStateID) {
+        let state = self.as_ref().get_cached_state(id).clone();
+        self.cache.state_saver = StateSaver::ToSave { id, state };
+    }
+
+    /// Returns the updated lazy state ID for a state that was persisted
+    /// through a cache clearing.
+    ///
+    /// It is only correct to call this routine when both a state has been
+    /// saved and the cache has just been cleared. Otherwise, this panics.
+    fn saved_state_id(&mut self) -> LazyStateID {
+        self.cache
+            .state_saver
+            .take_saved()
+            .expect("state saver does not have saved state ID")
+    }
+
+    /// Set all transitions on the state 'from' to 'to'.
+    fn set_all_transitions(&mut self, from: LazyStateID, to: LazyStateID) {
+        for unit in self.dfa.classes.representatives() {
+            self.set_transition(from, unit, to);
+        }
+    }
+
+    /// Set the transition on 'from' for 'unit' to 'to'.
+    ///
+    /// This panics if either 'from' or 'to' is invalid.
+    ///
+    /// All unit values are OK.
+    fn set_transition(
+        &mut self,
+        from: LazyStateID,
+        unit: alphabet::Unit,
+        to: LazyStateID,
+    ) {
+        assert!(self.as_ref().is_valid(from), "invalid 'from' id: {:?}", from);
+        assert!(self.as_ref().is_valid(to), "invalid 'to' id: {:?}", to);
+        // State IDs are premultiplied, so the transition slot is found by
+        // adding the input's equivalence class to the untagged 'from' ID.
+        let offset =
+            from.as_usize_untagged() + self.dfa.classes.get_by_unit(unit);
+        self.cache.trans[offset] = to;
+    }
+
+    /// Set the start ID for the given pattern ID (if given) and starting
+    /// configuration to the ID given.
+    ///
+    /// This panics if 'id' is not valid or if a pattern ID is given and
+    /// 'starts_for_each_pattern' is not enabled.
+    fn set_start_state(
+        &mut self,
+        pattern_id: Option<PatternID>,
+        start: Start,
+        id: LazyStateID,
+    ) {
+        assert!(self.as_ref().is_valid(id));
+        let start_index = start.as_usize();
+        // Per-pattern start states live after the first Start::count()
+        // entries, in groups of Start::count() per pattern.
+        let index = match pattern_id {
+            None => start_index,
+            Some(pid) => {
+                assert!(
+                    self.dfa.starts_for_each_pattern,
+                    "attempted to search for a specific pattern \
+                     without enabling starts_for_each_pattern",
+                );
+                let pid = pid.as_usize();
+                Start::count() + (Start::count() * pid) + start_index
+            }
+        };
+        self.cache.starts[index] = id;
+    }
+
+    /// Returns a state builder from this DFA that might have existing
+    /// capacity. This helps avoid allocs in cases where a state is built that
+    /// turns out to already be cached.
+    ///
+    /// Callers must put the state builder back with 'put_state_builder',
+    /// otherwise the allocation reuse won't work.
+    fn get_state_builder(&mut self) -> StateBuilderEmpty {
+        core::mem::replace(
+            &mut self.cache.scratch_state_builder,
+            StateBuilderEmpty::new(),
+        )
+    }
+
+    /// Puts the given state builder back into this DFA for reuse.
+    ///
+    /// Note that building a 'State' from a builder always creates a new alloc,
+    /// so callers should always put the builder back.
+    fn put_state_builder(&mut self, builder: StateBuilderNFA) {
+        let _ = core::mem::replace(
+            &mut self.cache.scratch_state_builder,
+            builder.clear(),
+        );
+    }
+}
+
+/// A type that groups methods that require the base NFA/DFA and read-only
+/// access to the cache.
+///
+/// This is the shared-borrow counterpart to 'Lazy': it can look up cached
+/// states and IDs, but cannot add new ones.
+#[derive(Debug)]
+struct LazyRef<'i, 'c> {
+    dfa: &'i DFA,
+    cache: &'c Cache,
+}
+
+impl<'i, 'c> LazyRef<'i, 'c> {
+    /// Creates a new 'Lazy' wrapper for a DFA and its corresponding cache.
+    fn new(dfa: &'i DFA, cache: &'c Cache) -> LazyRef<'i, 'c> {
+        LazyRef { dfa, cache }
+    }
+
+    /// Return the ID of the start state for the given configuration.
+    ///
+    /// If the start state has not yet been computed, then this returns an
+    /// unknown lazy state ID.
+    fn get_cached_start_id(
+        &self,
+        pattern_id: Option<PatternID>,
+        start: Start,
+    ) -> LazyStateID {
+        let start_index = start.as_usize();
+        let index = match pattern_id {
+            None => start_index,
+            Some(pid) => {
+                let pid = pid.as_usize();
+                assert!(
+                    pid < self.dfa.pattern_count(),
+                    "invalid pattern ID: {:?}",
+                    pid
+                );
+                // The first 'Start::count()' slots hold the any-pattern
+                // start states; per-pattern start states follow in groups
+                // of 'Start::count()'.
+                Start::count() + (Start::count() * pid) + start_index
+            }
+        };
+        self.cache.starts[index]
+    }
+
+    /// Return the cached NFA/DFA powerset state for the given ID.
+    ///
+    /// This panics if the given ID does not address a valid state.
+    fn get_cached_state(&self, sid: LazyStateID) -> &State {
+        // The untagged portion of a lazy state ID is an offset into the
+        // transition table, so shifting right by 'stride2' (i.e., dividing
+        // by the stride) recovers the state's index.
+        let index = sid.as_usize_untagged() >> self.dfa.stride2();
+        &self.cache.states[index]
+    }
+
+    /// Returns true if and only if the given ID corresponds to a "sentinel"
+    /// state.
+    ///
+    /// A sentinel state is a state that signifies a special condition of
+    /// search, and where every transition maps back to itself. See LazyStateID
+    /// for more details. Note that start and match states are _not_ sentinels
+    /// since they may otherwise be real states with non-trivial transitions.
+    /// The purpose of sentinel states is purely to indicate something. Their
+    /// transitions are not meant to be followed.
+    fn is_sentinel(&self, id: LazyStateID) -> bool {
+        id == self.unknown_id() || id == self.dead_id() || id == self.quit_id()
+    }
+
+    /// Returns the ID of the unknown state for this lazy DFA.
+    fn unknown_id(&self) -> LazyStateID {
+        // This unwrap is OK since 0 is always a valid state ID.
+        LazyStateID::new(0).unwrap().to_unknown()
+    }
+
+    /// Returns the ID of the dead state for this lazy DFA.
+    fn dead_id(&self) -> LazyStateID {
+        // This unwrap is OK since the maximum value here is 1 * 512 = 512,
+        // which is <= 2047 (the maximum state ID on 16-bit systems). Where
+        // 512 is the worst case for our equivalence classes (every byte is a
+        // distinct class).
+        LazyStateID::new(1 << self.dfa.stride2()).unwrap().to_dead()
+    }
+
+    /// Returns the ID of the quit state for this lazy DFA.
+    fn quit_id(&self) -> LazyStateID {
+        // This unwrap is OK since the maximum value here is 2 * 512 = 1024,
+        // which is <= 2047 (the maximum state ID on 16-bit systems). Where
+        // 512 is the worst case for our equivalence classes (every byte is a
+        // distinct class).
+        LazyStateID::new(2 << self.dfa.stride2()).unwrap().to_quit()
+    }
+
+    /// Returns true if and only if the given ID is valid.
+    ///
+    /// An ID is valid if it is both a valid index into the transition table
+    /// and is a multiple of the DFA's stride.
+    fn is_valid(&self, id: LazyStateID) -> bool {
+        let id = id.as_usize_untagged();
+        id < self.cache.trans.len() && id % self.dfa.stride() == 0
+    }
+
+    /// Returns true if adding the state given would fit in this cache.
+    fn state_fits_in_cache(&self, state: &State) -> bool {
+        let needed = self.cache.memory_usage()
+            + self.memory_usage_for_one_more_state(state.memory_usage());
+        needed <= self.dfa.cache_capacity
+    }
+
+    /// Returns true if adding the state to be built by the given builder would
+    /// fit in this cache.
+    fn state_builder_fits_in_cache(&self, state: &StateBuilderNFA) -> bool {
+        let needed = self.cache.memory_usage()
+            + self.memory_usage_for_one_more_state(state.as_bytes().len());
+        needed <= self.dfa.cache_capacity
+    }
+
+    /// Returns the additional memory usage, in bytes, required to add one more
+    /// state to this cache. The given size should be the heap size, in bytes,
+    /// that would be used by the new state being added.
+    fn memory_usage_for_one_more_state(
+        &self,
+        state_heap_size: usize,
+    ) -> usize {
+        const ID_SIZE: usize = size_of::<LazyStateID>();
+        const STATE_SIZE: usize = size_of::<State>();
+
+        self.dfa.stride() * ID_SIZE // additional space needed in trans table
+            + STATE_SIZE // space in cache.states
+            + (STATE_SIZE + ID_SIZE) // space in cache.states_to_id
+            + state_heap_size // heap memory used by state itself
+    }
+}
+
+/// A simple type that encapsulates the saving of a state ID through a cache
+/// clearing.
+///
+/// A state ID can be marked for saving with ToSave, while a state ID can be
+/// saved itself with Saved.
+#[derive(Clone, Debug)]
+enum StateSaver {
+    /// An empty state saver. In this case, no states (other than the special
+    /// sentinel states) are preserved after clearing the cache.
+    None,
+    /// An ID of a state (and the state itself) that should be preserved after
+    /// the lazy DFA's cache has been cleared. After clearing, the updated ID
+    /// is stored in 'Saved' since it may have changed.
+    ToSave { id: LazyStateID, state: State },
+    /// An ID of a state that has been persisted through a lazy DFA
+    /// cache clearing. The ID recorded here corresponds to an ID that was
+    /// once marked as ToSave. The IDs are likely not equivalent even though
+    /// the states they point to are.
+    Saved(LazyStateID),
+}
+
+impl StateSaver {
+    /// Create an empty state saver.
+    fn none() -> StateSaver {
+        StateSaver::None
+    }
+
+    /// Replace this state saver with an empty saver, and if this saver is a
+    /// request to save a state, return that request.
+    fn take_to_save(&mut self) -> Option<(LazyStateID, State)> {
+        match core::mem::replace(self, StateSaver::None) {
+            StateSaver::None | StateSaver::Saved(_) => None,
+            StateSaver::ToSave { id, state } => Some((id, state)),
+        }
+    }
+
+    /// Replace this state saver with an empty saver, and if this saver is a
+    /// saved state (or a request to save a state), return that state's ID.
+    ///
+    /// The idea here is that a request to save a state isn't necessarily
+    /// honored because it might not be needed. E.g., some higher-level code
+    /// might request a state to be saved on the off chance that the cache gets
+    /// cleared when a new state is added at a lower level. But if that new
+    /// state is never added, then the cache is never cleared and the state and
+    /// its ID remain unchanged.
+    fn take_saved(&mut self) -> Option<LazyStateID> {
+        match core::mem::replace(self, StateSaver::None) {
+            StateSaver::None => None,
+            StateSaver::Saved(id) | StateSaver::ToSave { id, .. } => Some(id),
+        }
+    }
+}
+
+/// The configuration used for building a lazy DFA.
+///
+/// As a convenience, [`DFA::config`] is an alias for [`Config::new`]. The
+/// advantage of the former is that it often lets you avoid importing the
+/// `Config` type directly.
+///
+/// A lazy DFA configuration is a simple data object that is typically used
+/// with [`Builder::configure`].
+///
+/// The default configuration guarantees that a search will _never_ return
+/// a [`MatchError`] for any haystack or pattern. Setting a quit byte with
+/// [`Config::quit`], enabling heuristic support for Unicode word boundaries
+/// with [`Config::unicode_word_boundary`], or setting a minimum cache clear
+/// count with [`Config::minimum_cache_clear_count`] can in turn cause a search
+/// to return an error. See the corresponding configuration options for more
+/// details on when those error conditions arise.
+#[derive(Clone, Copy, Debug, Default)]
+pub struct Config {
+    // As with other configuration types in this crate, we put all our knobs
+    // in options so that we can distinguish between "default" and "not set."
+    // This makes it possible to easily combine multiple configurations
+    // without default values overwriting explicitly specified values. See the
+    // 'overwrite' method.
+    //
+    // For docs on the fields below, see the corresponding method setters.
+    anchored: Option<bool>,
+    match_kind: Option<MatchKind>,
+    starts_for_each_pattern: Option<bool>,
+    byte_classes: Option<bool>,
+    unicode_word_boundary: Option<bool>,
+    quitset: Option<ByteSet>,
+    cache_capacity: Option<usize>,
+    skip_cache_capacity_check: Option<bool>,
+    // NOTE(review): doubly-optional — the outer Option means "not set";
+    // the inner one presumably allows explicitly disabling the minimum.
+    // Confirm against the 'minimum_cache_clear_count' setter.
+    minimum_cache_clear_count: Option<Option<usize>>,
+}
+
+impl Config {
+    /// Return a new default lazy DFA builder configuration.
+    pub fn new() -> Config {
+        // Every knob starts as 'None', i.e., "not set". See the comment on
+        // the struct fields.
+        Config::default()
+    }
+
+ /// Set whether matching must be anchored at the beginning of the input.
+ ///
+ /// When enabled, a match must begin at the start of a search. When
+ /// disabled (the default), the lazy DFA will act as if the pattern started
+ /// with a `(?s:.)*?`, which enables a match to appear anywhere.
+ ///
+ /// Note that if you want to run both anchored and unanchored
+ /// searches without building multiple automatons, you can enable the
+ /// [`Config::starts_for_each_pattern`] configuration instead. This will
+ /// permit unanchored any-pattern searches and pattern-specific anchored
+ /// searches. See the documentation for that configuration for an example.
+ ///
+ /// By default this is disabled.
+ ///
+ /// **WARNING:** this is subtly different than using a `^` at the start of
+ /// your regex. A `^` forces a regex to match exclusively at the start of
+ /// input, regardless of where you begin your search. In contrast, enabling
+ /// this option will allow your regex to match anywhere in your input,
+ /// but the match must start at the beginning of a search. (Most of the
+ /// higher level convenience search routines make "start of input" and
+ /// "start of search" equivalent, but some routines allow treating these as
+ /// orthogonal.)
+ ///
+ /// For example, consider the haystack `aba` and the following searches:
+ ///
+ /// 1. The regex `^a` is compiled with `anchored=false` and searches
+ /// `aba` starting at position `2`. Since `^` requires the match to
+ /// start at the beginning of the input and `2 > 0`, no match is found.
+ /// 2. The regex `a` is compiled with `anchored=true` and searches `aba`
+ /// starting at position `2`. This reports a match at `[2, 3]` since
+ /// the match starts where the search started. Since there is no `^`,
+ /// there is no requirement for the match to start at the beginning of
+ /// the input.
+ /// 3. The regex `a` is compiled with `anchored=true` and searches `aba`
+ /// starting at position `1`. Since `b` corresponds to position `1` and
+ /// since the regex is anchored, it finds no match.
+ /// 4. The regex `a` is compiled with `anchored=false` and searches `aba`
+    ///    starting at position `1`. Since the regex is neither anchored nor
+ /// starts with `^`, the regex is compiled with an implicit `(?s:.)*?`
+ /// prefix that permits it to match anywhere. Thus, it reports a match
+ /// at `[2, 3]`.
+ ///
+ /// # Example
+ ///
+ /// This demonstrates the differences between an anchored search and
+ /// a pattern that begins with `^` (as described in the above warning
+ /// message).
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch};
+ ///
+ /// let haystack = "aba".as_bytes();
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().anchored(false)) // default
+ /// .build(r"^a")?;
+ /// let mut cache = dfa.create_cache();
+ /// let got = dfa.find_leftmost_fwd_at(
+ /// &mut cache, None, None, haystack, 2, 3,
+ /// )?;
+ /// // No match is found because 2 is not the beginning of the haystack,
+ /// // which is what ^ requires.
+ /// let expected = None;
+ /// assert_eq!(expected, got);
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().anchored(true))
+ /// .build(r"a")?;
+ /// let mut cache = dfa.create_cache();
+ /// let got = dfa.find_leftmost_fwd_at(
+ /// &mut cache, None, None, haystack, 2, 3,
+ /// )?;
+ /// // An anchored search can still match anywhere in the haystack, it just
+ /// // must begin at the start of the search which is '2' in this case.
+ /// let expected = Some(HalfMatch::must(0, 3));
+ /// assert_eq!(expected, got);
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().anchored(true))
+ /// .build(r"a")?;
+ /// let mut cache = dfa.create_cache();
+ /// let got = dfa.find_leftmost_fwd_at(
+ /// &mut cache, None, None, haystack, 1, 3,
+ /// )?;
+ /// // No match is found since we start searching at offset 1 which
+ /// // corresponds to 'b'. Since there is no '(?s:.)*?' prefix, no match
+ /// // is found.
+ /// let expected = None;
+ /// assert_eq!(expected, got);
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().anchored(false))
+ /// .build(r"a")?;
+ /// let mut cache = dfa.create_cache();
+ /// let got = dfa.find_leftmost_fwd_at(
+ /// &mut cache, None, None, haystack, 1, 3,
+ /// )?;
+ /// // Since anchored=false, an implicit '(?s:.)*?' prefix was added to the
+ /// // pattern. Even though the search starts at 'b', the 'match anything'
+ /// // prefix allows the search to match 'a'.
+ /// let expected = Some(HalfMatch::must(0, 3));
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+    pub fn anchored(mut self, yes: bool) -> Config {
+        // 'Some' marks the knob as explicitly set, so it survives when
+        // configurations are combined (see the struct field comments).
+        self.anchored = Some(yes);
+        self
+    }
+
+ /// Set the desired match semantics.
+ ///
+ /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the
+ /// match semantics of Perl-like regex engines. That is, when multiple
+ /// patterns would match at the same leftmost position, the pattern that
+ /// appears first in the concrete syntax is chosen.
+ ///
+ /// Currently, the only other kind of match semantics supported is
+ /// [`MatchKind::All`]. This corresponds to classical DFA construction
+ /// where all possible matches are added to the lazy DFA.
+ ///
+ /// Typically, `All` is used when one wants to execute an overlapping
+ /// search and `LeftmostFirst` otherwise. In particular, it rarely makes
+ /// sense to use `All` with the various "leftmost" find routines, since the
+ /// leftmost routines depend on the `LeftmostFirst` automata construction
+ /// strategy. Specifically, `LeftmostFirst` adds dead states to the
+ /// lazy DFA as a way to terminate the search and report a match.
+ /// `LeftmostFirst` also supports non-greedy matches using this strategy
+    /// whereas `All` does not.
+ ///
+ /// # Example: overlapping search
+ ///
+ /// This example shows the typical use of `MatchKind::All`, which is to
+ /// report overlapping matches.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// hybrid::{dfa::DFA, OverlappingState},
+ /// HalfMatch, MatchKind,
+ /// };
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().match_kind(MatchKind::All))
+ /// .build_many(&[r"\w+$", r"\S+$"])?;
+ /// let mut cache = dfa.create_cache();
+ /// let haystack = "@foo".as_bytes();
+ /// let mut state = OverlappingState::start();
+ ///
+ /// let expected = Some(HalfMatch::must(1, 4));
+ /// let got = dfa.find_overlapping_fwd(&mut cache, haystack, &mut state)?;
+ /// assert_eq!(expected, got);
+ ///
+ /// // The first pattern also matches at the same position, so re-running
+ /// // the search will yield another match. Notice also that the first
+ /// // pattern is returned after the second. This is because the second
+ /// // pattern begins its match before the first, is therefore an earlier
+ /// // match and is thus reported first.
+ /// let expected = Some(HalfMatch::must(0, 4));
+ /// let got = dfa.find_overlapping_fwd(&mut cache, haystack, &mut state)?;
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// # Example: reverse automaton to find start of match
+ ///
+ /// Another example for using `MatchKind::All` is for constructing a
+ /// reverse automaton to find the start of a match. `All` semantics are
+ /// used for this in order to find the longest possible match, which
+ /// corresponds to the leftmost starting position.
+ ///
+ /// Note that if you need the starting position then
+ /// [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) will handle this
+ /// for you, so it's usually not necessary to do this yourself.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchKind};
+ ///
+ /// let haystack = "123foobar456".as_bytes();
+ /// let pattern = r"[a-z]+";
+ ///
+ /// let dfa_fwd = DFA::new(pattern)?;
+ /// let dfa_rev = DFA::builder()
+ /// .configure(DFA::config()
+ /// .anchored(true)
+ /// .match_kind(MatchKind::All)
+ /// )
+ /// .build(pattern)?;
+ /// let mut cache_fwd = dfa_fwd.create_cache();
+ /// let mut cache_rev = dfa_rev.create_cache();
+ ///
+ /// let expected_fwd = HalfMatch::must(0, 9);
+ /// let expected_rev = HalfMatch::must(0, 3);
+ /// let got_fwd = dfa_fwd.find_leftmost_fwd(
+ /// &mut cache_fwd, haystack,
+ /// )?.unwrap();
+ /// // Here we don't specify the pattern to search for since there's only
+ /// // one pattern and we're doing a leftmost search. But if this were an
+ /// // overlapping search, you'd need to specify the pattern that matched
+ /// // in the forward direction. (Otherwise, you might wind up finding the
+ /// // starting position of a match of some other pattern.) That in turn
+ /// // requires building the reverse automaton with starts_for_each_pattern
+ /// // enabled. Indeed, this is what Regex does internally.
+ /// let got_rev = dfa_rev.find_leftmost_rev_at(
+ /// &mut cache_rev, None, haystack, 0, got_fwd.offset(),
+ /// )?.unwrap();
+ /// assert_eq!(expected_fwd, got_fwd);
+ /// assert_eq!(expected_rev, got_rev);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+    pub fn match_kind(mut self, kind: MatchKind) -> Config {
+        // 'Some' marks the knob as explicitly set, so it survives when
+        // configurations are combined (see the struct field comments).
+        self.match_kind = Some(kind);
+        self
+    }
+
+ /// Whether to compile a separate start state for each pattern in the
+ /// lazy DFA.
+ ///
+ /// When enabled, a separate **anchored** start state is added for each
+ /// pattern in the lazy DFA. When this start state is used, then the DFA
+ /// will only search for matches for the pattern specified, even if there
+ /// are other patterns in the DFA.
+ ///
+ /// The main downside of this option is that it can potentially increase
+ /// the size of the DFA and/or increase the time it takes to build the
+ /// DFA at search time. However, since this is configuration for a lazy
+ /// DFA, these states aren't actually built unless they're used. Enabling
+ /// this isn't necessarily free, however, as it may result in higher cache
+ /// usage.
+ ///
+ /// There are a few reasons one might want to enable this (it's disabled
+ /// by default):
+ ///
+ /// 1. When looking for the start of an overlapping match (using a reverse
+ /// DFA), doing it correctly requires starting the reverse search using the
+ /// starting state of the pattern that matched in the forward direction.
+ /// Indeed, when building a [`Regex`](crate::hybrid::regex::Regex), it
+ /// will automatically enable this option when building the reverse DFA
+ /// internally.
+ /// 2. When you want to use a DFA with multiple patterns to both search
+ /// for matches of any pattern or to search for anchored matches of one
+ /// particular pattern while using the same DFA. (Otherwise, you would need
+ /// to compile a new DFA for each pattern.)
+ /// 3. Since the start states added for each pattern are anchored, if you
+ /// compile an unanchored DFA with one pattern while also enabling this
+ /// option, then you can use the same DFA to perform anchored or unanchored
+ /// searches. The latter you get with the standard search APIs. The former
+    /// you get from the various `_at` search methods that allow you to specify a
+ /// pattern ID to search for.
+ ///
+ /// By default this is disabled.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to use this option to permit the same lazy DFA
+ /// to run both anchored and unanchored searches for a single pattern.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, PatternID};
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().starts_for_each_pattern(true))
+ /// .build(r"foo[0-9]+")?;
+ /// let mut cache = dfa.create_cache();
+ /// let haystack = b"quux foo123";
+ ///
+ /// // Here's a normal unanchored search. Notice that we use 'None' for the
+ /// // pattern ID. Since the DFA was built as an unanchored machine, it
+ /// // uses its default unanchored starting state.
+ /// let expected = HalfMatch::must(0, 11);
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd_at(
+ /// &mut cache, None, None, haystack, 0, haystack.len(),
+ /// )?);
+ /// // But now if we explicitly specify the pattern to search ('0' being
+ /// // the only pattern in the DFA), then it will use the starting state
+ /// // for that specific pattern which is always anchored. Since the
+ /// // pattern doesn't have a match at the beginning of the haystack, we
+ /// // find nothing.
+ /// assert_eq!(None, dfa.find_leftmost_fwd_at(
+ /// &mut cache, None, Some(PatternID::must(0)), haystack, 0, haystack.len(),
+ /// )?);
+ /// // And finally, an anchored search is not the same as putting a '^' at
+ /// // beginning of the pattern. An anchored search can only match at the
+ /// // beginning of the *search*, which we can change:
+ /// assert_eq!(Some(expected), dfa.find_leftmost_fwd_at(
+ /// &mut cache, None, Some(PatternID::must(0)), haystack, 5, haystack.len(),
+ /// )?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+    pub fn starts_for_each_pattern(mut self, yes: bool) -> Config {
+        // 'Some' marks the knob as explicitly set, so it survives when
+        // configurations are combined (see the struct field comments).
+        self.starts_for_each_pattern = Some(yes);
+        self
+    }
+
+ /// Whether to attempt to shrink the size of the lazy DFA's alphabet or
+ /// not.
+ ///
+ /// This option is enabled by default and should never be disabled unless
+ /// one is debugging the lazy DFA.
+ ///
+ /// When enabled, the lazy DFA will use a map from all possible bytes
+ /// to their corresponding equivalence class. Each equivalence class
+ /// represents a set of bytes that does not discriminate between a match
+ /// and a non-match in the DFA. For example, the pattern `[ab]+` has at
+ /// least two equivalence classes: a set containing `a` and `b` and a set
+ /// containing every byte except for `a` and `b`. `a` and `b` are in the
+ /// same equivalence classes because they never discriminate between a
+ /// match and a non-match.
+ ///
+ /// The advantage of this map is that the size of the transition table
+ /// can be reduced drastically from `#states * 256 * sizeof(LazyStateID)`
+ /// to `#states * k * sizeof(LazyStateID)` where `k` is the number of
+ /// equivalence classes (rounded up to the nearest power of 2). As a
+ /// result, total space usage can decrease substantially. Moreover, since a
+ /// smaller alphabet is used, DFA compilation during search becomes faster
+ /// as well since it will potentially be able to reuse a single transition
+ /// for multiple bytes.
+ ///
+ /// **WARNING:** This is only useful for debugging lazy DFAs. Disabling
+ /// this does not yield any speed advantages. Namely, even when this is
+ /// disabled, a byte class map is still used while searching. The only
+ /// difference is that every byte will be forced into its own distinct
+ /// equivalence class. This is useful for debugging the actual generated
+ /// transitions because it lets one see the transitions defined on actual
+ /// bytes instead of the equivalence classes.
+    pub fn byte_classes(mut self, yes: bool) -> Config {
+        // 'Some' marks the knob as explicitly set, so it survives when
+        // configurations are combined (see the struct field comments).
+        self.byte_classes = Some(yes);
+        self
+    }
+
+ /// Heuristically enable Unicode word boundaries.
+ ///
+ /// When set, this will attempt to implement Unicode word boundaries as if
+ /// they were ASCII word boundaries. This only works when the search input
+ /// is ASCII only. If a non-ASCII byte is observed while searching, then a
+ /// [`MatchError::Quit`](crate::MatchError::Quit) error is returned.
+ ///
+ /// A possible alternative to enabling this option is to simply use an
+ /// ASCII word boundary, e.g., via `(?-u:\b)`. The main reason to use this
+ /// option is if you absolutely need Unicode support. This option lets one
+ /// use a fast search implementation (a DFA) for some potentially very
+ /// common cases, while providing the option to fall back to some other
+ /// regex engine to handle the general case when an error is returned.
+ ///
+ /// If the pattern provided has no Unicode word boundary in it, then this
+ /// option has no effect. (That is, quitting on a non-ASCII byte only
+ /// occurs when this option is enabled _and_ a Unicode word boundary is
+ /// present in the pattern.)
+ ///
+ /// This is almost equivalent to setting all non-ASCII bytes to be quit
+ /// bytes. The only difference is that this will cause non-ASCII bytes to
+ /// be quit bytes _only_ when a Unicode word boundary is present in the
+ /// pattern.
+ ///
+ /// When enabling this option, callers _must_ be prepared to handle
+ /// a [`MatchError`](crate::MatchError) error during search.
+ /// When using a [`Regex`](crate::hybrid::regex::Regex), this
+ /// corresponds to using the `try_` suite of methods. Alternatively,
+ /// if callers can guarantee that their input is ASCII only, then a
+ /// [`MatchError::Quit`](crate::MatchError::Quit) error will never be
+ /// returned while searching.
+ ///
+ /// This is disabled by default.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to heuristically enable Unicode word boundaries
+ /// in a pattern. It also shows what happens when a search comes across a
+ /// non-ASCII byte.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// hybrid::dfa::DFA,
+ /// HalfMatch, MatchError, MatchKind,
+ /// };
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().unicode_word_boundary(true))
+ /// .build(r"\b[0-9]+\b")?;
+ /// let mut cache = dfa.create_cache();
+ ///
+ /// // The match occurs before the search ever observes the snowman
+ /// // character, so no error occurs.
+ /// let haystack = "foo 123 ☃".as_bytes();
+ /// let expected = Some(HalfMatch::must(0, 7));
+ /// let got = dfa.find_leftmost_fwd(&mut cache, haystack)?;
+ /// assert_eq!(expected, got);
+ ///
+ /// // Notice that this search fails, even though the snowman character
+ /// // occurs after the ending match offset. This is because search
+ /// // routines read one byte past the end of the search to account for
+ /// // look-around, and indeed, this is required here to determine whether
+ /// // the trailing \b matches.
+ /// let haystack = "foo 123☃".as_bytes();
+ /// let expected = MatchError::Quit { byte: 0xE2, offset: 7 };
+ /// let got = dfa.find_leftmost_fwd(&mut cache, haystack);
+ /// assert_eq!(Err(expected), got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+    pub fn unicode_word_boundary(mut self, yes: bool) -> Config {
+        // We have a separate option for this instead of just setting the
+        // appropriate quit bytes here because we don't want to set quit bytes
+        // for every regex. We only want to set them when the regex contains a
+        // Unicode word boundary. (Per the docs above, the non-ASCII quit
+        // bytes are presumably added later, at build time, when a Unicode
+        // word boundary is actually detected in the pattern.)
+        self.unicode_word_boundary = Some(yes);
+        self
+    }
+
+ /// Add a "quit" byte to the lazy DFA.
+ ///
+ /// When a quit byte is seen during search time, then search will return
+ /// a [`MatchError::Quit`](crate::MatchError::Quit) error indicating the
+ /// offset at which the search stopped.
+ ///
+ /// A quit byte will always overrule any other aspects of a regex. For
+ /// example, if the `x` byte is added as a quit byte and the regex `\w` is
+ /// used, then observing `x` will cause the search to quit immediately
+ /// despite the fact that `x` is in the `\w` class.
+ ///
+ /// This mechanism is primarily useful for heuristically enabling certain
+ /// features like Unicode word boundaries in a DFA. Namely, if the input
+ /// to search is ASCII, then a Unicode word boundary can be implemented
+ /// via an ASCII word boundary with no change in semantics. Thus, a DFA
+ /// can attempt to match a Unicode word boundary but give up as soon as it
+ /// observes a non-ASCII byte. Indeed, if callers set all non-ASCII bytes
+ /// to be quit bytes, then Unicode word boundaries will be permitted when
+ /// building lazy DFAs. Of course, callers should enable
+ /// [`Config::unicode_word_boundary`] if they want this behavior instead.
+ /// (The advantage being that non-ASCII quit bytes will only be added if a
+ /// Unicode word boundary is in the pattern.)
+ ///
+ /// When enabling this option, callers _must_ be prepared to handle a
+ /// [`MatchError`](crate::MatchError) error during search. When using a
+ /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the
+ /// `try_` suite of methods.
+ ///
+ /// By default, there are no quit bytes set.
+ ///
+ /// # Panics
+ ///
+ /// This panics if heuristic Unicode word boundaries are enabled and any
+ /// non-ASCII byte is removed from the set of quit bytes. Namely, enabling
+ /// Unicode word boundaries requires setting every non-ASCII byte to a quit
+ /// byte. So if the caller attempts to undo any of that, then this will
+ /// panic.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to cause a search to terminate if it sees a
+ /// `\n` byte. This could be useful if, for example, you wanted to prevent
+ /// a user supplied pattern from matching across a line boundary.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchError};
+ ///
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().quit(b'\n', true))
+ /// .build(r"foo\p{any}+bar")?;
+ /// let mut cache = dfa.create_cache();
+ ///
+ /// let haystack = "foo\nbar".as_bytes();
+ /// // Normally this would produce a match, since \p{any} contains '\n'.
+ /// // But since we instructed the automaton to enter a quit state if a
+ /// // '\n' is observed, this produces a match error instead.
+ /// let expected = MatchError::Quit { byte: 0x0A, offset: 3 };
+ /// let got = dfa.find_leftmost_fwd(&mut cache, haystack).unwrap_err();
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+    pub fn quit(mut self, byte: u8, yes: bool) -> Config {
+        // Removing a non-ASCII quit byte would break heuristic Unicode word
+        // boundary support, which relies on every non-ASCII byte being a
+        // quit byte. See the "Panics" section in the docs above.
+        if self.get_unicode_word_boundary() && !byte.is_ascii() && !yes {
+            panic!(
+                "cannot set non-ASCII byte to be non-quit when \
+                 Unicode word boundaries are enabled"
+            );
+        }
+        // Lazily initialize the quit set the first time any byte is set.
+        if self.quitset.is_none() {
+            self.quitset = Some(ByteSet::empty());
+        }
+        if yes {
+            self.quitset.as_mut().unwrap().add(byte);
+        } else {
+            self.quitset.as_mut().unwrap().remove(byte);
+        }
+        self
+    }
+
+ /// Sets the maximum amount of heap memory, in bytes, to allocate to the
+ /// cache for use during a lazy DFA search. If the lazy DFA would otherwise
+ /// use more heap memory, then, depending on other configuration knobs,
+ /// either stop the search and return an error or clear the cache and
+ /// continue the search.
+ ///
+ /// The default cache capacity is some "reasonable" number that will
+ /// accommodate most regular expressions. You may find that if you need
+ /// to build a large DFA then it may be necessary to increase the cache
+ /// capacity.
+ ///
+ /// Note that while building a lazy DFA will do a "minimum" check to ensure
+ /// the capacity is big enough, this is more or less about correctness.
+ /// If the cache is bigger than the minimum but still too small, then the
+ /// lazy DFA could wind up spending a lot of time clearing the cache and
+ /// recomputing transitions, thus negating the performance benefits of a
+ /// lazy DFA. Thus, setting the cache capacity is mostly an experimental
+ /// endeavor. For most common patterns, however, the default should be
+ /// sufficient.
+ ///
+ /// For more details on how the lazy DFA's cache is used, see the
+ /// documentation for [`Cache`].
+ ///
+ /// # Example
+ ///
+ /// This example shows what happens if the configured cache capacity is
+ /// too small. In such cases, one can override the cache capacity to make
+ /// it bigger. Alternatively, one might want to use less memory by setting
+ /// a smaller cache capacity.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchError};
+ ///
+ /// let pattern = r"\p{L}{1000}";
+ ///
+ /// // The default cache capacity is likely too small to deal with regexes
+ /// // that are very large. Large repetitions of large Unicode character
+ /// // classes are a common way to make very large regexes.
+ /// let _ = DFA::new(pattern).unwrap_err();
+ /// // Bump up the capacity to something bigger.
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().cache_capacity(100 * (1<<20))) // 100 MB
+ /// .build(pattern)?;
+ /// let mut cache = dfa.create_cache();
+ ///
+ /// let haystack = "ͰͲͶͿΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙ".repeat(50);
+ /// let expected = Some(HalfMatch::must(0, 2000));
+ /// let got = dfa.find_leftmost_fwd(&mut cache, haystack.as_bytes())?;
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn cache_capacity(mut self, bytes: usize) -> Config {
+ self.cache_capacity = Some(bytes);
+ self
+ }
+
+ /// Configures construction of a lazy DFA to use the minimum cache capacity
+ /// if the configured capacity is otherwise too small for the provided NFA.
+ ///
+ /// This is useful if you never want lazy DFA construction to fail because
+ /// of a capacity that is too small.
+ ///
+ /// In general, this option is typically not a good idea. In particular,
+ /// while a minimum cache capacity does permit the lazy DFA to function
+ /// where it otherwise couldn't, it's plausible that it may not function
+ /// well if it's constantly running out of room. In that case, the speed
+ /// advantages of the lazy DFA may be negated.
+ ///
+ /// This is disabled by default.
+ ///
+ /// # Example
+ ///
+ /// This example shows what happens if the configured cache capacity is
+ /// too small. In such cases, one could override the capacity explicitly.
+ /// An alternative, demonstrated here, lets us force construction to use
+ /// the minimum cache capacity if the configured capacity is otherwise
+ /// too small.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchError};
+ ///
+ /// let pattern = r"\p{L}{1000}";
+ ///
+ /// // The default cache capacity is likely too small to deal with regexes
+ /// // that are very large. Large repetitions of large Unicode character
+ /// // classes are a common way to make very large regexes.
+ /// let _ = DFA::new(pattern).unwrap_err();
+ /// // Configure construction such that it automatically selects the minimum
+ /// // cache capacity if it would otherwise be too small.
+ /// let dfa = DFA::builder()
+ /// .configure(DFA::config().skip_cache_capacity_check(true))
+ /// .build(pattern)?;
+ /// let mut cache = dfa.create_cache();
+ ///
+ /// let haystack = "ͰͲͶͿΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙ".repeat(50);
+ /// let expected = Some(HalfMatch::must(0, 2000));
+ /// let got = dfa.find_leftmost_fwd(&mut cache, haystack.as_bytes())?;
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn skip_cache_capacity_check(mut self, yes: bool) -> Config {
+ self.skip_cache_capacity_check = Some(yes);
+ self
+ }
+
+ /// Configure a lazy DFA search to quit after a certain number of cache
+ /// clearings.
+ ///
+ /// When a minimum is set, then a lazy DFA search will "give up" after
+ /// the minimum number of cache clearings has occurred. This is typically
+ /// useful in scenarios where callers want to detect whether the lazy DFA
+ /// search is "efficient" or not. If the cache is cleared too many times,
+ /// this is a good indicator that it is not efficient, and thus, the caller
+ /// may wish to use some other regex engine.
+ ///
+ /// Note that the number of times a cache is cleared is a property of
+ /// the cache itself. Thus, if a cache is used in a subsequent search
+ /// with a similarly configured lazy DFA, then it would cause the
+ /// search to "give up" if the cache needed to be cleared. The cache
+ /// clear count can only be reset to `0` via [`DFA::reset_cache`] (or
+ /// [`Regex::reset_cache`](crate::hybrid::regex::Regex::reset_cache) if
+ /// you're using the `Regex` API).
+ ///
+ /// By default, no minimum is configured. Thus, a lazy DFA search will
+ /// never give up due to cache clearings.
+ ///
+ /// # Example
+ ///
+ /// This example uses a somewhat pathological configuration to demonstrate
+ /// the _possible_ behavior of cache clearing and how it might result
+ /// in a search that returns an error.
+ ///
+ /// It is important to note that the precise mechanics of how and when
+ /// a cache gets cleared is an implementation detail. Thus, the asserts
+ /// in the tests below with respect to the particular offsets at which a
+ /// search gave up should be viewed strictly as a demonstration. They are
+ /// not part of any API guarantees offered by this crate.
+ ///
+ /// ```
+ /// use regex_automata::{hybrid::dfa::DFA, MatchError};
+ ///
+ /// // This is a carefully chosen regex. The idea is to pick one
+ /// // that requires some decent number of states (hence the bounded
+ /// // repetition). But we specifically choose to create a class with an
+ /// // ASCII letter and a non-ASCII letter so that we can check that no new
+ /// // states are created once the cache is full. Namely, if we fill up the
+ /// // cache on a haystack of 'a's, then in order to match one 'β', a new
+ /// // state will need to be created since a 'β' is encoded with multiple
+ /// // bytes. Since there's no room for this state, the search should quit
+ /// // at the very first position.
+ /// let pattern = r"[aβ]{100}";
+ /// let dfa = DFA::builder()
+ /// .configure(
+ /// // Configure it so that we have the minimum cache capacity
+ /// // possible. And that if any clearings occur, the search quits.
+ /// DFA::config()
+ /// .skip_cache_capacity_check(true)
+ /// .cache_capacity(0)
+ /// .minimum_cache_clear_count(Some(0)),
+ /// )
+ /// .build(pattern)?;
+ /// let mut cache = dfa.create_cache();
+ ///
+ /// let haystack = "a".repeat(101).into_bytes();
+ /// assert_eq!(
+ /// dfa.find_leftmost_fwd(&mut cache, &haystack),
+ /// Err(MatchError::GaveUp { offset: 25 }),
+ /// );
+ ///
+ /// // Now that we know the cache is full, if we search a haystack that we
+ /// // know will require creating at least one new state, it should not
+ /// // be able to make any progress.
+ /// let haystack = "β".repeat(101).into_bytes();
+ /// assert_eq!(
+ /// dfa.find_leftmost_fwd(&mut cache, &haystack),
+ /// Err(MatchError::GaveUp { offset: 0 }),
+ /// );
+ ///
+ /// // If we reset the cache, then we should be able to create more states
+ /// // and make more progress with searching for betas.
+ /// cache.reset(&dfa);
+ /// let haystack = "β".repeat(101).into_bytes();
+ /// assert_eq!(
+ /// dfa.find_earliest_fwd(&mut cache, &haystack),
+ /// Err(MatchError::GaveUp { offset: 26 }),
+ /// );
+ ///
+ /// // ... switching back to ASCII still makes progress since it just needs
+ /// // to set transitions on existing states!
+ /// let haystack = "a".repeat(101).into_bytes();
+ /// assert_eq!(
+ /// dfa.find_earliest_fwd(&mut cache, &haystack),
+ /// Err(MatchError::GaveUp { offset: 13 }),
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn minimum_cache_clear_count(mut self, min: Option<usize>) -> Config {
+ self.minimum_cache_clear_count = Some(min);
+ self
+ }
+
+ /// Returns whether this configuration has enabled anchored searches.
+ pub fn get_anchored(&self) -> bool {
+ self.anchored.unwrap_or(false)
+ }
+
+ /// Returns the match semantics set in this configuration.
+ pub fn get_match_kind(&self) -> MatchKind {
+ self.match_kind.unwrap_or(MatchKind::LeftmostFirst)
+ }
+
+ /// Returns whether this configuration has enabled anchored starting states
+ /// for every pattern in the DFA.
+ pub fn get_starts_for_each_pattern(&self) -> bool {
+ self.starts_for_each_pattern.unwrap_or(false)
+ }
+
+ /// Returns whether this configuration has enabled byte classes or not.
+ /// This is typically a debugging oriented option, as disabling it confers
+ /// no speed benefit.
+ pub fn get_byte_classes(&self) -> bool {
+ self.byte_classes.unwrap_or(true)
+ }
+
+ /// Returns whether this configuration has enabled heuristic Unicode word
+ /// boundary support. When enabled, it is possible for a search to return
+ /// an error.
+ pub fn get_unicode_word_boundary(&self) -> bool {
+ self.unicode_word_boundary.unwrap_or(false)
+ }
+
+ /// Returns whether this configuration will instruct the DFA to enter a
+ /// quit state whenever the given byte is seen during a search. When at
+ /// least one byte has this enabled, it is possible for a search to return
+ /// an error.
+ pub fn get_quit(&self, byte: u8) -> bool {
+ self.quitset.map_or(false, |q| q.contains(byte))
+ }
+
+ /// Returns the cache capacity set on this configuration.
+ pub fn get_cache_capacity(&self) -> usize {
+ self.cache_capacity.unwrap_or(2 * (1 << 20))
+ }
+
+ /// Returns whether the cache capacity check should be skipped.
+ pub fn get_skip_cache_capacity_check(&self) -> bool {
+ self.skip_cache_capacity_check.unwrap_or(false)
+ }
+
+ /// Returns, if set, the minimum number of times the cache must be cleared
+ /// before a lazy DFA search can give up. When no minimum is set, then a
+ /// search will never quit and will always clear the cache whenever it
+ /// fills up.
+ pub fn get_minimum_cache_clear_count(&self) -> Option<usize> {
+ self.minimum_cache_clear_count.unwrap_or(None)
+ }
+
+ /// Returns the minimum lazy DFA cache capacity required for the given NFA.
+ ///
+ /// The cache capacity required for a particular NFA may change without
+ /// notice. Callers should not rely on it being stable.
+ ///
+ /// This is useful for informational purposes, but can also be useful for
+ /// other reasons. For example, if one wants to check the minimum cache
+ /// capacity themselves or if one wants to set the capacity based on the
+ /// minimum.
+ ///
+ /// This may return an error if this configuration does not support all of
+ /// the instructions used in the given NFA. For example, if the NFA has a
+ /// Unicode word boundary but this configuration does not enable heuristic
+ /// support for Unicode word boundaries.
+ pub fn get_minimum_cache_capacity(
+ &self,
+ nfa: &thompson::NFA,
+ ) -> Result<usize, BuildError> {
+ let quitset = self.quit_set_from_nfa(nfa)?;
+ let classes = self.byte_classes_from_nfa(nfa, &quitset);
+ let starts = self.get_starts_for_each_pattern();
+ Ok(minimum_cache_capacity(nfa, &classes, starts))
+ }
+
+ /// Returns the byte class map used during search from the given NFA.
+ ///
+ /// If byte classes are disabled on this configuration, then a map is
+ /// returned that puts each byte in its own equivalent class.
+ fn byte_classes_from_nfa(
+ &self,
+ nfa: &thompson::NFA,
+ quit: &ByteSet,
+ ) -> ByteClasses {
+ if !self.get_byte_classes() {
+ // The lazy DFA will always use the equivalence class map, but
+ // enabling this option is useful for debugging. Namely, this will
+ // cause all transitions to be defined over their actual bytes
+ // instead of an opaque equivalence class identifier. The former is
+ // much easier to grok as a human.
+ ByteClasses::singletons()
+ } else {
+ let mut set = nfa.byte_class_set().clone();
+ // It is important to distinguish any "quit" bytes from all other
+ // bytes. Otherwise, a non-quit byte may end up in the same class
+ // as a quit byte, and thus cause the DFA to stop when it shouldn't.
+ if !quit.is_empty() {
+ set.add_set(&quit);
+ }
+ set.byte_classes()
+ }
+ }
+
+ /// Return the quit set for this configuration and the given NFA.
+ ///
+ /// This may return an error if the NFA is incompatible with this
+ /// configuration's quit set. For example, if the NFA has a Unicode word
+ /// boundary and the quit set doesn't include non-ASCII bytes.
+ fn quit_set_from_nfa(
+ &self,
+ nfa: &thompson::NFA,
+ ) -> Result<ByteSet, BuildError> {
+ let mut quit = self.quitset.unwrap_or(ByteSet::empty());
+ if nfa.has_word_boundary_unicode() {
+ if self.get_unicode_word_boundary() {
+ for b in 0x80..=0xFF {
+ quit.add(b);
+ }
+ } else {
+ // If heuristic support for Unicode word boundaries wasn't
+ // enabled, then we can still check if our quit set is correct.
+ // If the caller set their quit bytes in a way that causes the
+ // DFA to quit on at least all non-ASCII bytes, then that's all
+ // we need for heuristic support to work.
+ if !quit.contains_range(0x80, 0xFF) {
+ return Err(
+ BuildError::unsupported_dfa_word_boundary_unicode(),
+ );
+ }
+ }
+ }
+ Ok(quit)
+ }
+
+ /// Overwrite the default configuration such that the options in `o` are
+ /// always used. If an option in `o` is not set, then the corresponding
+ /// option in `self` is used. If it's not set in `self` either, then it
+ /// remains not set.
+ fn overwrite(self, o: Config) -> Config {
+ Config {
+ anchored: o.anchored.or(self.anchored),
+ match_kind: o.match_kind.or(self.match_kind),
+ starts_for_each_pattern: o
+ .starts_for_each_pattern
+ .or(self.starts_for_each_pattern),
+ byte_classes: o.byte_classes.or(self.byte_classes),
+ unicode_word_boundary: o
+ .unicode_word_boundary
+ .or(self.unicode_word_boundary),
+ quitset: o.quitset.or(self.quitset),
+ cache_capacity: o.cache_capacity.or(self.cache_capacity),
+ skip_cache_capacity_check: o
+ .skip_cache_capacity_check
+ .or(self.skip_cache_capacity_check),
+ minimum_cache_clear_count: o
+ .minimum_cache_clear_count
+ .or(self.minimum_cache_clear_count),
+ }
+ }
+}
+
+/// A builder for constructing a lazy deterministic finite automaton from
+/// regular expressions.
+///
+/// As a convenience, [`DFA::builder`] is an alias for [`Builder::new`]. The
+/// advantage of the former is that it often lets you avoid importing the
+/// `Builder` type directly.
+///
+/// This builder provides two main things:
+///
+/// 1. It provides a few different `build` routines for actually constructing
+/// a DFA from different kinds of inputs. The most convenient is
+/// [`Builder::build`], which builds a DFA directly from a pattern string. The
+/// most flexible is [`Builder::build_from_nfa`], which builds a DFA straight
+/// from an NFA.
+/// 2. The builder permits configuring a number of things.
+/// [`Builder::configure`] is used with [`Config`] to configure aspects of
+/// the DFA and the construction process itself. [`Builder::syntax`] and
+/// [`Builder::thompson`] permit configuring the regex parser and Thompson NFA
+/// construction, respectively. The syntax and thompson configurations only
+/// apply when building from a pattern string.
+///
+/// This builder always constructs a *single* lazy DFA. As such, this builder
+/// can only be used to construct regexes that either detect the presence
+/// of a match or find the end location of a match. A single DFA cannot
+/// produce both the start and end of a match. For that information, use a
+/// [`Regex`](crate::hybrid::regex::Regex), which can be similarly configured
+/// using [`regex::Builder`](crate::hybrid::regex::Builder). The main reason
+/// to use a DFA directly is if the end location of a match is enough for your
+/// use case. Namely, a `Regex` will construct two lazy DFAs instead of one,
+/// since a second reverse DFA is needed to find the start of a match.
+///
+/// # Example
+///
+/// This example shows how to build a lazy DFA that uses a tiny cache capacity
+/// and completely disables Unicode. That is:
+///
+/// * Things such as `\w`, `.` and `\b` are no longer Unicode-aware. `\w`
+/// and `\b` are ASCII-only while `.` matches any byte except for `\n`
+/// (instead of any UTF-8 encoding of a Unicode scalar value except for
+/// `\n`). Things that are Unicode only, such as `\pL`, are not allowed.
+/// * The pattern itself is permitted to match invalid UTF-8. For example,
+/// things like `[^a]` that match any byte except for `a` are permitted.
+/// * Unanchored patterns can search through invalid UTF-8. That is, for
+/// unanchored patterns, the implicit prefix is `(?s-u:.)*?` instead of
+/// `(?s:.)*?`.
+///
+/// ```
+/// use regex_automata::{
+/// hybrid::dfa::DFA,
+/// nfa::thompson,
+/// HalfMatch, SyntaxConfig,
+/// };
+///
+/// let dfa = DFA::builder()
+/// .configure(DFA::config().cache_capacity(5_000))
+/// .syntax(SyntaxConfig::new().unicode(false).utf8(false))
+/// .thompson(thompson::Config::new().utf8(false))
+/// .build(r"foo[^b]ar.*")?;
+/// let mut cache = dfa.create_cache();
+///
+/// let haystack = b"\xFEfoo\xFFar\xE2\x98\xFF\n";
+/// let expected = Some(HalfMatch::must(0, 10));
+/// let got = dfa.find_leftmost_fwd(&mut cache, haystack)?;
+/// assert_eq!(expected, got);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone, Debug)]
+pub struct Builder {
+ config: Config,
+ thompson: thompson::Builder,
+}
+
+impl Builder {
+ /// Create a new lazy DFA builder with the default configuration.
+ pub fn new() -> Builder {
+ Builder {
+ config: Config::default(),
+ thompson: thompson::Builder::new(),
+ }
+ }
+
+ /// Build a lazy DFA from the given pattern.
+ ///
+ /// If there was a problem parsing or compiling the pattern, then an error
+ /// is returned.
+ pub fn build(&self, pattern: &str) -> Result<DFA, BuildError> {
+ self.build_many(&[pattern])
+ }
+
+ /// Build a lazy DFA from the given patterns.
+ ///
+ /// When matches are returned, the pattern ID corresponds to the index of
+ /// the pattern in the slice given.
+ pub fn build_many<P: AsRef<str>>(
+ &self,
+ patterns: &[P],
+ ) -> Result<DFA, BuildError> {
+ let nfa =
+ self.thompson.build_many(patterns).map_err(BuildError::nfa)?;
+ self.build_from_nfa(Arc::new(nfa))
+ }
+
+ /// Build a DFA from the given NFA.
+ ///
+ /// Note that this requires an `Arc<thompson::NFA>` instead of a
+ /// `&thompson::NFA` because the lazy DFA builds itself from the NFA at
+ /// search time. This means that the lazy DFA must hold on to its source
+ /// NFA for the entirety of its lifetime. An `Arc` is used so that callers
+ /// aren't forced to clone the NFA if it is needed elsewhere.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to build a lazy DFA if you already have an NFA
+ /// in hand.
+ ///
+ /// ```
+ /// use std::sync::Arc;
+ /// use regex_automata::{hybrid::dfa::DFA, nfa::thompson, HalfMatch};
+ ///
+ /// let haystack = "foo123bar".as_bytes();
+ ///
+ /// // This shows how to set non-default options for building an NFA.
+ /// let nfa = thompson::Builder::new()
+ /// .configure(thompson::Config::new().shrink(false))
+ /// .build(r"[0-9]+")?;
+ /// let dfa = DFA::builder().build_from_nfa(Arc::new(nfa))?;
+ /// let mut cache = dfa.create_cache();
+ /// let expected = Some(HalfMatch::must(0, 6));
+ /// let got = dfa.find_leftmost_fwd(&mut cache, haystack)?;
+ /// assert_eq!(expected, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn build_from_nfa(
+ &self,
+ nfa: Arc<thompson::NFA>,
+ ) -> Result<DFA, BuildError> {
+ let quitset = self.config.quit_set_from_nfa(&nfa)?;
+ let classes = self.config.byte_classes_from_nfa(&nfa, &quitset);
+ // Check that we can fit at least a few states into our cache,
+ // otherwise it's pretty senseless to use the lazy DFA. This does have
+ // a possible failure mode though. This assumes the maximum size of a
+ // state in powerset space (so, the total number of NFA states), which
+ // may never actually materialize, and could be quite a bit larger
+ // than the actual biggest state. If this turns out to be a problem,
+ // we could expose a knob that disables this check. But if so, we have
+ // to be careful not to panic in other areas of the code (the cache
+ // clearing and init code) that tend to assume some minimum useful
+ // cache capacity.
+ let min_cache = minimum_cache_capacity(
+ &nfa,
+ &classes,
+ self.config.get_starts_for_each_pattern(),
+ );
+ let mut cache_capacity = self.config.get_cache_capacity();
+ if cache_capacity < min_cache {
+ // When the caller has asked us to skip the cache capacity check,
+ // then we simply force the cache capacity to its minimum amount
+ // and mush on.
+ if self.config.get_skip_cache_capacity_check() {
+ trace!(
+ "given capacity ({}) is too small, \
+ since skip_cache_capacity_check is enabled, \
+ setting cache capacity to minimum ({})",
+ cache_capacity,
+ min_cache,
+ );
+ cache_capacity = min_cache;
+ } else {
+ return Err(BuildError::insufficient_cache_capacity(
+ min_cache,
+ cache_capacity,
+ ));
+ }
+ }
+ // We also need to check that we can fit at least some small number
+ // of states in our state ID space. This is unlikely to trigger in
+ // >=32-bit systems, but 16-bit systems have a pretty small state ID
+ // space since a number of bits are used up as sentinels.
+ if let Err(err) = minimum_lazy_state_id(&nfa, &classes) {
+ return Err(BuildError::insufficient_state_id_capacity(err));
+ }
+ let stride2 = classes.stride2();
+ Ok(DFA {
+ nfa,
+ stride2,
+ classes,
+ quitset,
+ anchored: self.config.get_anchored(),
+ match_kind: self.config.get_match_kind(),
+ starts_for_each_pattern: self.config.get_starts_for_each_pattern(),
+ cache_capacity,
+ minimum_cache_clear_count: self
+ .config
+ .get_minimum_cache_clear_count(),
+ })
+ }
+
+ /// Apply the given lazy DFA configuration options to this builder.
+ pub fn configure(&mut self, config: Config) -> &mut Builder {
+ self.config = self.config.overwrite(config);
+ self
+ }
+
+ /// Set the syntax configuration for this builder using
+ /// [`SyntaxConfig`](crate::SyntaxConfig).
+ ///
+ /// This permits setting things like case insensitivity, Unicode and multi
+ /// line mode.
+ ///
+ /// These settings only apply when constructing a lazy DFA directly from a
+ /// pattern.
+ pub fn syntax(
+ &mut self,
+ config: crate::util::syntax::SyntaxConfig,
+ ) -> &mut Builder {
+ self.thompson.syntax(config);
+ self
+ }
+
+ /// Set the Thompson NFA configuration for this builder using
+ /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
+ ///
+ /// This permits setting things like whether the DFA should match the regex
+ /// in reverse or if additional time should be spent shrinking the size of
+ /// the NFA.
+ ///
+ /// These settings only apply when constructing a DFA directly from a
+ /// pattern.
+ pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
+ self.thompson.configure(config);
+ self
+ }
+}
+
+/// Based on the minimum number of states required for a useful lazy DFA cache,
+/// this returns the minimum lazy state ID that must be representable.
+///
+/// It's likely not plausible for this to impose constraints on 32-bit systems
+/// (or higher), but on 16-bit systems, the lazy state ID space is quite
+/// constrained and thus may be insufficient for bigger regexes.
+fn minimum_lazy_state_id(
+ nfa: &thompson::NFA,
+ classes: &ByteClasses,
+) -> Result<LazyStateID, LazyStateIDError> {
+ let stride = 1 << classes.stride2();
+ let min_state_index = MIN_STATES.checked_sub(1).unwrap();
+ LazyStateID::new(min_state_index * stride)
+}
+
+/// Based on the minimum number of states required for a useful lazy DFA cache,
+/// this returns a heuristic minimum number of bytes of heap space required.
+///
+/// This is a "heuristic" because the minimum it returns is likely bigger than
+/// the true minimum. Namely, it assumes that each powerset NFA/DFA state uses
+/// the maximum number of NFA states (all of them). This is likely bigger
+/// than what is required in practice. Computing the true minimum effectively
+/// requires determinization, which is probably too much work to do for a
+/// simple check like this.
+fn minimum_cache_capacity(
+ nfa: &thompson::NFA,
+ classes: &ByteClasses,
+ starts_for_each_pattern: bool,
+) -> usize {
+ const ID_SIZE: usize = size_of::<LazyStateID>();
+ let stride = 1 << classes.stride2();
+
+ let sparses = 2 * nfa.len() * NFAStateID::SIZE;
+ let trans = MIN_STATES * stride * ID_SIZE;
+
+ let mut starts = Start::count() * ID_SIZE;
+ if starts_for_each_pattern {
+ starts += (Start::count() * nfa.pattern_len()) * ID_SIZE;
+ }
+
+ // Every `State` has three bytes for flags, 4 bytes (max) for the number
+ // of patterns, followed by 32-bit encodings of patterns and then delta
+ // varint encodings of NFA state IDs. We use the worst case (which isn't
+ // technically possible) of 5 bytes for each NFA state ID.
+ //
+ // HOWEVER, three of the states needed by a lazy DFA are just the sentinel
+ // unknown, dead and quit states. Those states have a known size and it is
+ // small.
+ assert!(MIN_STATES >= 3, "minimum number of states has to be at least 3");
+ let dead_state_size = State::dead().memory_usage();
+ let max_state_size = 3 + 4 + (nfa.pattern_len() * 4) + (nfa.len() * 5);
+ let states = (3 * (size_of::<State>() + dead_state_size))
+ + ((MIN_STATES - 3) * (size_of::<State>() + max_state_size));
+ let states_to_sid = states + (MIN_STATES * ID_SIZE);
+ let stack = nfa.len() * NFAStateID::SIZE;
+ let scratch_state_builder = max_state_size;
+
+ trans
+ + starts
+ + states
+ + states_to_sid
+ + sparses
+ + stack
+ + scratch_state_builder
+}
diff --git a/vendor/regex-automata/src/hybrid/error.rs b/vendor/regex-automata/src/hybrid/error.rs
new file mode 100644
index 000000000..715da39bd
--- /dev/null
+++ b/vendor/regex-automata/src/hybrid/error.rs
@@ -0,0 +1,130 @@
+use crate::{hybrid::id::LazyStateIDError, nfa};
+
+/// An error that occurs when initial construction of a lazy DFA fails.
+///
+/// A build error can occur when insufficient cache capacity is configured or
+/// if something about the NFA is unsupported. (For example, if one attempts
+/// to build a lazy DFA without heuristic Unicode support but with an NFA that
+/// contains a Unicode word boundary.)
+///
+/// When the `std` feature is enabled, this implements the `std::error::Error`
+/// trait.
+#[derive(Clone, Debug)]
+pub struct BuildError {
+ kind: BuildErrorKind,
+}
+
+#[derive(Clone, Debug)]
+enum BuildErrorKind {
+ NFA(nfa::thompson::Error),
+ InsufficientCacheCapacity { minimum: usize, given: usize },
+ InsufficientStateIDCapacity { err: LazyStateIDError },
+ Unsupported(&'static str),
+}
+
+impl BuildError {
+ fn kind(&self) -> &BuildErrorKind {
+ &self.kind
+ }
+
+ pub(crate) fn nfa(err: nfa::thompson::Error) -> BuildError {
+ BuildError { kind: BuildErrorKind::NFA(err) }
+ }
+
+ pub(crate) fn insufficient_cache_capacity(
+ minimum: usize,
+ given: usize,
+ ) -> BuildError {
+ BuildError {
+ kind: BuildErrorKind::InsufficientCacheCapacity { minimum, given },
+ }
+ }
+
+ pub(crate) fn insufficient_state_id_capacity(
+ err: LazyStateIDError,
+ ) -> BuildError {
+ BuildError {
+ kind: BuildErrorKind::InsufficientStateIDCapacity { err },
+ }
+ }
+
+ pub(crate) fn unsupported_dfa_word_boundary_unicode() -> BuildError {
+ let msg = "cannot build lazy DFAs for regexes with Unicode word \
+ boundaries; switch to ASCII word boundaries, or \
+ heuristically enable Unicode word boundaries or use a \
+ different regex engine";
+ BuildError { kind: BuildErrorKind::Unsupported(msg) }
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for BuildError {
+ fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+ match self.kind() {
+ BuildErrorKind::NFA(ref err) => Some(err),
+ BuildErrorKind::InsufficientCacheCapacity { .. } => None,
+ // LazyStateIDError is an implementation detail, don't expose it.
+ BuildErrorKind::InsufficientStateIDCapacity { .. } => None,
+ BuildErrorKind::Unsupported(_) => None,
+ }
+ }
+}
+
+impl core::fmt::Display for BuildError {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ match self.kind() {
+ BuildErrorKind::NFA(_) => write!(f, "error building NFA"),
+ BuildErrorKind::InsufficientCacheCapacity { minimum, given } => {
+ write!(
+ f,
+ "given cache capacity ({}) is smaller than \
+ minimum required ({})",
+ given, minimum,
+ )
+ }
+ BuildErrorKind::InsufficientStateIDCapacity { ref err } => {
+ err.fmt(f)
+ }
+ BuildErrorKind::Unsupported(ref msg) => {
+ write!(f, "unsupported regex feature for DFAs: {}", msg)
+ }
+ }
+ }
+}
+
+/// An error that occurs when cache usage has become inefficient.
+///
+/// One of the weaknesses of a lazy DFA is that it may need to clear its
+/// cache repeatedly if it's not big enough. If this happens too much, then it
+/// can slow searching down significantly. A mitigation to this is to use
+/// heuristics to detect whether the cache is being used efficiently or not.
+/// If not, then a lazy DFA can return a `CacheError`.
+///
+/// The default configuration of a lazy DFA in this crate is
+/// set such that a `CacheError` will never occur. Instead,
+/// callers must opt into this behavior with settings like
+/// [`dfa::Config::minimum_cache_clear_count`](crate::hybrid::dfa::Config::minimum_cache_clear_count).
+///
+/// When the `std` feature is enabled, this implements the `std::error::Error`
+/// trait.
+#[derive(Clone, Debug)]
+pub struct CacheError(());
+
+impl CacheError {
+ pub(crate) fn too_many_cache_clears() -> CacheError {
+ CacheError(())
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for CacheError {
+ fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+ None
+ }
+}
+
+impl core::fmt::Display for CacheError {
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+ write!(f, "lazy DFA cache has been cleared too many times")
+ }
+}
diff --git a/vendor/regex-automata/src/hybrid/id.rs b/vendor/regex-automata/src/hybrid/id.rs
new file mode 100644
index 000000000..a6fcde52e
--- /dev/null
+++ b/vendor/regex-automata/src/hybrid/id.rs
@@ -0,0 +1,415 @@
+/// A state identifier especially tailored for lazy DFAs.
+///
+/// A lazy state ID logically represents a pointer to a DFA state. In practice,
+/// by limiting the number of DFA states it can address, it reserves some
+/// bits of its representation to encode some additional information. That
+/// additional information is called a "tag." That tag is used to record
+/// whether the state it points to is an unknown, dead, quit, start or match
+/// state.
+///
+/// When implementing a low level search routine with a lazy DFA, it is
+/// necessary to query the type of the current state to know what to do:
+///
+/// * **Unknown** - The state has not yet been computed. The
+/// parameters used to get this state ID must be re-passed to
+/// [`DFA::next_state`](crate::hybrid::dfa::DFA::next_state), which will never
+/// return an unknown state ID.
+/// * **Dead** - A dead state only has transitions to itself. It indicates that
+/// the search cannot do anything else and should stop with whatever result it
+/// has.
+/// * **Quit** - A quit state indicates that the automaton could not answer
+/// whether a match exists or not. Correct search implementations must return a
+/// [`MatchError::Quit`](crate::MatchError::Quit).
+/// * **Start** - A start state indicates that the automaton will begin
+/// searching at a starting state. Branching on this isn't required for
+/// correctness, but a common optimization is to use this to more quickly look
+/// for a prefix.
+/// * **Match** - A match state indicates that a match has been found.
+/// Depending on the semantics of your search implementation, it may either
+/// continue until the end of the haystack or a dead state, or it might quit
+/// and return the match immediately.
+///
+/// As an optimization, the [`is_tagged`](LazyStateID::is_tagged) predicate
+/// can be used to determine if a tag exists at all. This is useful to avoid
+/// branching on all of the above types for every byte searched.
+///
+/// # Example
+///
+/// This example shows how `LazyStateID` can be used to implement a correct
+/// search routine with minimal branching. In particular, this search routine
+/// implements "leftmost" matching, which means that it doesn't immediately
+/// stop once a match is found. Instead, it continues until it reaches a dead
+/// state.
+///
+/// Notice also how a correct search implementation deals with
+/// [`CacheError`](crate::hybrid::CacheError)s returned by some of
+/// the lazy DFA routines. When a `CacheError` occurs, it returns
+/// [`MatchError::GaveUp`](crate::MatchError::GaveUp).
+///
+/// ```
+/// use regex_automata::{
+/// hybrid::dfa::{Cache, DFA},
+/// HalfMatch, MatchError, PatternID,
+/// };
+///
+/// fn find_leftmost_first(
+/// dfa: &DFA,
+/// cache: &mut Cache,
+/// haystack: &[u8],
+/// ) -> Result<Option<HalfMatch>, MatchError> {
+/// // The start state is determined by inspecting the position and the
+/// // initial bytes of the haystack. Note that start states can never
+/// // be match states (since DFAs in this crate delay matches by 1
+/// // byte), so we don't need to check if the start state is a match.
+/// let mut sid = dfa.start_state_forward(
+/// cache, None, haystack, 0, haystack.len(),
+/// ).map_err(|_| MatchError::GaveUp { offset: 0 })?;
+/// let mut last_match = None;
+/// // Walk all the bytes in the haystack. We can quit early if we see
+/// // a dead or a quit state. The former means the automaton will
+/// // never transition to any other state. The latter means that the
+/// // automaton entered a condition in which its search failed.
+/// for (i, &b) in haystack.iter().enumerate() {
+/// sid = dfa
+/// .next_state(cache, sid, b)
+/// .map_err(|_| MatchError::GaveUp { offset: i })?;
+/// if sid.is_tagged() {
+/// if sid.is_match() {
+/// last_match = Some(HalfMatch::new(
+/// dfa.match_pattern(cache, sid, 0),
+/// i,
+/// ));
+/// } else if sid.is_dead() {
+/// return Ok(last_match);
+/// } else if sid.is_quit() {
+/// // It is possible to enter into a quit state after
+/// // observing a match has occurred. In that case, we
+/// // should return the match instead of an error.
+/// if last_match.is_some() {
+/// return Ok(last_match);
+/// }
+/// return Err(MatchError::Quit { byte: b, offset: i });
+/// }
+/// // Implementors may also want to check for start states and
+/// // handle them differently for performance reasons. But it is
+/// // not necessary for correctness.
+/// }
+/// }
+/// // Matches are always delayed by 1 byte, so we must explicitly walk
+/// // the special "EOI" transition at the end of the search.
+/// sid = dfa
+/// .next_eoi_state(cache, sid)
+/// .map_err(|_| MatchError::GaveUp { offset: haystack.len() })?;
+/// if sid.is_match() {
+/// last_match = Some(HalfMatch::new(
+/// dfa.match_pattern(cache, sid, 0),
+/// haystack.len(),
+/// ));
+/// }
+/// Ok(last_match)
+/// }
+///
+/// // We use a greedy '+' operator to show how the search doesn't just stop
+/// // once a match is detected. It continues extending the match. Using
+/// // '[a-z]+?' would also work as expected and stop the search early.
+/// // Greediness is built into the automaton.
+/// let dfa = DFA::new(r"[a-z]+")?;
+/// let mut cache = dfa.create_cache();
+/// let haystack = "123 foobar 4567".as_bytes();
+/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap();
+/// assert_eq!(mat.pattern().as_usize(), 0);
+/// assert_eq!(mat.offset(), 10);
+///
+/// // Here's another example that tests our handling of the special
+/// // EOI transition. This will fail to find a match if we don't call
+/// // 'next_eoi_state' at the end of the search since the match isn't found
+/// // until the final byte in the haystack.
+/// let dfa = DFA::new(r"[0-9]{4}")?;
+/// let mut cache = dfa.create_cache();
+/// let haystack = "123 foobar 4567".as_bytes();
+/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap();
+/// assert_eq!(mat.pattern().as_usize(), 0);
+/// assert_eq!(mat.offset(), 15);
+///
+/// // And note that our search implementation above automatically works
+/// // with multi-DFAs. Namely, `dfa.match_pattern(match_state, 0)` selects
+/// // the appropriate pattern ID for us.
+/// let dfa = DFA::new_many(&[r"[a-z]+", r"[0-9]+"])?;
+/// let mut cache = dfa.create_cache();
+/// let haystack = "123 foobar 4567".as_bytes();
+/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap();
+/// assert_eq!(mat.pattern().as_usize(), 1);
+/// assert_eq!(mat.offset(), 3);
+/// let mat = find_leftmost_first(&dfa, &mut cache, &haystack[3..])?.unwrap();
+/// assert_eq!(mat.pattern().as_usize(), 0);
+/// assert_eq!(mat.offset(), 7);
+/// let mat = find_leftmost_first(&dfa, &mut cache, &haystack[10..])?.unwrap();
+/// assert_eq!(mat.pattern().as_usize(), 1);
+/// assert_eq!(mat.offset(), 5);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
#[derive(
    Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
)]
pub struct LazyStateID(u32);

impl LazyStateID {
    /// The highest bit available in the representation. On 16-bit targets,
    /// an ID must still fit into a `usize`, so fewer bits are available.
    #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
    const MAX_BIT: usize = 31;

    #[cfg(target_pointer_width = "16")]
    const MAX_BIT: usize = 15;

    /// The five highest bits carry the tag; everything strictly below
    /// `MASK_MATCH` is usable as an actual state identifier.
    const MASK_UNKNOWN: usize = 1 << (Self::MAX_BIT);
    const MASK_DEAD: usize = 1 << (Self::MAX_BIT - 1);
    const MASK_QUIT: usize = 1 << (Self::MAX_BIT - 2);
    const MASK_START: usize = 1 << (Self::MAX_BIT - 3);
    const MASK_MATCH: usize = 1 << (Self::MAX_BIT - 4);
    const MAX: usize = Self::MASK_MATCH - 1;

    /// Create a new lazy state ID.
    ///
    /// This fails with a `LazyStateIDError` whenever the given identifier
    /// exceeds [`LazyStateID::MAX`].
    #[inline]
    pub(crate) fn new(id: usize) -> Result<LazyStateID, LazyStateIDError> {
        if id <= Self::MAX {
            Ok(Self::new_unchecked(id))
        } else {
            Err(LazyStateIDError { attempted: id as u64 })
        }
    }

    /// Create a new lazy state ID without verifying that it is at most
    /// [`LazyStateID::MAX`].
    ///
    /// Providing an out-of-range value may produce a nonsense ID, but it can
    /// never violate memory safety.
    #[inline]
    const fn new_unchecked(id: usize) -> LazyStateID {
        LazyStateID(id as u32)
    }

    /// Return this lazy state ID as its raw value, but only when it carries
    /// no tag (i.e., it is not an unknown, dead, quit, start or match state
    /// ID). Otherwise, return `None`.
    #[inline]
    pub(crate) fn as_usize(&self) -> Option<usize> {
        if self.is_tagged() {
            return None;
        }
        Some(self.as_usize_unchecked())
    }

    /// Return this lazy state ID as a `usize` with any tag bits stripped.
    ///
    /// For an untagged ID, this is simply the ID itself.
    #[inline]
    pub(crate) fn as_usize_untagged(&self) -> usize {
        self.as_usize_unchecked() & Self::MAX
    }

    /// Return the raw internal `usize` value, tag bits included (and thus
    /// possibly greater than `LazyStateID::MAX`).
    #[inline]
    pub(crate) const fn as_usize_unchecked(&self) -> usize {
        self.0 as usize
    }

    /// Return a copy of this ID tagged as an "unknown" (not yet computed)
    /// state.
    #[inline]
    pub(crate) const fn to_unknown(&self) -> LazyStateID {
        Self::new_unchecked(self.as_usize_unchecked() | Self::MASK_UNKNOWN)
    }

    /// Return a copy of this ID tagged as a dead state.
    #[inline]
    pub(crate) const fn to_dead(&self) -> LazyStateID {
        Self::new_unchecked(self.as_usize_unchecked() | Self::MASK_DEAD)
    }

    /// Return a copy of this ID tagged as a quit state.
    #[inline]
    pub(crate) const fn to_quit(&self) -> LazyStateID {
        Self::new_unchecked(self.as_usize_unchecked() | Self::MASK_QUIT)
    }

    /// Return a copy of this ID tagged as a start state.
    #[inline]
    pub(crate) const fn to_start(&self) -> LazyStateID {
        Self::new_unchecked(self.as_usize_unchecked() | Self::MASK_START)
    }

    /// Return a copy of this ID tagged as a match state.
    #[inline]
    pub(crate) const fn to_match(&self) -> LazyStateID {
        Self::new_unchecked(self.as_usize_unchecked() | Self::MASK_MATCH)
    }

    /// Return true if and only if this lazy state ID carries any tag.
    ///
    /// A tagged ID is one of: match, start, dead, quit or unknown. Checking
    /// this first lets search loops skip the individual predicates for the
    /// common untagged case.
    #[inline]
    pub const fn is_tagged(&self) -> bool {
        self.as_usize_unchecked() > Self::MAX
    }

    /// Return true if and only if this ID refers to an "unknown" state,
    /// i.e., one that has not yet been computed. Seeing such an ID generally
    /// means a state must be computed before the search can proceed.
    #[inline]
    pub const fn is_unknown(&self) -> bool {
        self.as_usize_unchecked() & Self::MASK_UNKNOWN != 0
    }

    /// Return true if and only if this ID refers to a dead state. A dead
    /// state only ever transitions back to itself, so observing one
    /// generally means the search should stop.
    #[inline]
    pub const fn is_dead(&self) -> bool {
        self.as_usize_unchecked() & Self::MASK_DEAD != 0
    }

    /// Return true if and only if this ID refers to a quit state. A quit
    /// state behaves like a dead state representationally, but signals that
    /// the automaton can no longer determine whether a match exists. This
    /// generally indicates a search-time error that the caller must either
    /// propagate or work around with a different search technique.
    #[inline]
    pub const fn is_quit(&self) -> bool {
        self.as_usize_unchecked() & Self::MASK_QUIT != 0
    }

    /// Return true if and only if this ID is tagged as a start state.
    #[inline]
    pub const fn is_start(&self) -> bool {
        self.as_usize_unchecked() & Self::MASK_START != 0
    }

    /// Return true if and only if this ID is tagged as a match state.
    #[inline]
    pub const fn is_match(&self) -> bool {
        self.as_usize_unchecked() & Self::MASK_MATCH != 0
    }
}

/// This error occurs when a lazy state ID could not be constructed.
///
/// It is reported when the requested integer exceeds the maximum lazy state
/// ID value.
///
/// When the `std` feature is enabled, this implements the `Error` trait.
#[derive(Clone, Debug, Eq, PartialEq)]
pub(crate) struct LazyStateIDError {
    attempted: u64,
}

impl LazyStateIDError {
    /// Returns the out-of-range value that was rejected.
    pub(crate) fn attempted(&self) -> u64 {
        self.attempted
    }
}

#[cfg(feature = "std")]
impl std::error::Error for LazyStateIDError {}

impl core::fmt::Display for LazyStateIDError {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        write!(
            f,
            "failed to create LazyStateID from {:?}, which exceeds {:?}",
            self.attempted,
            LazyStateID::MAX,
        )
    }
}
+
+/// Represents the current state of an overlapping search.
+///
+/// This is used for overlapping searches since they need to know something
+/// about the previous search. For example, when multiple patterns match at the
+/// same position, this state tracks the last reported pattern so that the next
+/// search knows whether to report another matching pattern or continue with
+/// the search at the next position. Additionally, it also tracks which state
+/// the last search call terminated in.
+///
+/// This type provides no introspection capabilities. The only thing a caller
+/// can do is construct it and pass it around to permit search routines to use
+/// it to track state.
+///
+/// Callers should always provide a fresh state constructed via
+/// [`OverlappingState::start`] when starting a new search. Reusing state from
+/// a previous search may result in incorrect results.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct OverlappingState {
+ /// The state ID of the state at which the search was in when the call
+ /// terminated. When this is a match state, `last_match` must be set to a
+ /// non-None value.
+ ///
+ /// A `None` value indicates the start state of the corresponding
+ /// automaton. We cannot use the actual ID, since any one automaton may
+ /// have many start states, and which one is in use depends on several
+ /// search-time factors.
+ id: Option<LazyStateID>,
+ /// Information associated with a match when `id` corresponds to a match
+ /// state.
+ last_match: Option<StateMatch>,
+}
+
+/// Internal state about the last match that occurred. This records both the
+/// offset of the match and the match index.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub(crate) struct StateMatch {
+ /// The index into the matching patterns for the current match state.
+ pub(crate) match_index: usize,
+ /// The offset in the haystack at which the match occurred. This is used
+ /// when reporting multiple matches at the same offset. That is, when
+ /// an overlapping search runs, the first thing it checks is whether it's
+ /// already in a match state, and if so, whether there are more patterns
+ /// to report as matches in that state. If so, it increments `match_index`
+ /// and returns the pattern and this offset. Once `match_index` exceeds the
+ /// number of matching patterns in the current state, the search continues.
+ pub(crate) offset: usize,
+}
+
+impl OverlappingState {
+ /// Create a new overlapping state that begins at the start state of any
+ /// automaton.
+ pub fn start() -> OverlappingState {
+ OverlappingState { id: None, last_match: None }
+ }
+
+ pub(crate) fn id(&self) -> Option<LazyStateID> {
+ self.id
+ }
+
+ pub(crate) fn set_id(&mut self, id: LazyStateID) {
+ self.id = Some(id);
+ }
+
+ pub(crate) fn last_match(&mut self) -> Option<&mut StateMatch> {
+ self.last_match.as_mut()
+ }
+
+ pub(crate) fn set_last_match(&mut self, last_match: StateMatch) {
+ self.last_match = Some(last_match);
+ }
+}
diff --git a/vendor/regex-automata/src/hybrid/mod.rs b/vendor/regex-automata/src/hybrid/mod.rs
new file mode 100644
index 000000000..4c8ca7ebe
--- /dev/null
+++ b/vendor/regex-automata/src/hybrid/mod.rs
@@ -0,0 +1,179 @@
+/*!
+A module for building and searching with lazy deterministic finite automata
+(DFAs).
+
+Like other modules in this crate, lazy DFAs support a rich regex syntax with
+Unicode features. The key feature of a lazy DFA is that it builds itself
+incrementally during search, and never uses more than a configured capacity of
+memory. Thus, when searching with a lazy DFA, one must supply a mutable "cache"
+in which the actual DFA's transition table is stored.
+
+If you're looking for fully compiled DFAs, then please see the top-level
+[`dfa` module](crate::dfa).
+
+# Overview
+
+This section gives a brief overview of the primary types in this module:
+
+* A [`regex::Regex`] provides a way to search for matches of a regular
+expression using lazy DFAs. This includes iterating over matches with both the
+start and end positions of each match.
+* A [`dfa::DFA`] provides direct low level access to a lazy DFA.
+
+# Example: basic regex searching
+
+This example shows how to compile a regex using the default configuration
+and then use it to find matches in a byte string:
+
+```
+use regex_automata::{hybrid::regex::Regex, MultiMatch};
+
+let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?;
+let mut cache = re.create_cache();
+
+let text = b"2018-12-24 2016-10-08";
+let matches: Vec<MultiMatch> =
+ re.find_leftmost_iter(&mut cache, text).collect();
+assert_eq!(matches, vec![
+ MultiMatch::must(0, 0, 10),
+ MultiMatch::must(0, 11, 21),
+]);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+# Example: searching with regex sets
+
+The lazy DFAs in this module all fully support searching with multiple regexes
+simultaneously. You can use this support with standard leftmost-first style
+searching to find non-overlapping matches:
+
+```
+use regex_automata::{hybrid::regex::Regex, MultiMatch};
+
+let re = Regex::new_many(&[r"\w+", r"\S+"])?;
+let mut cache = re.create_cache();
+
+let text = b"@foo bar";
+let matches: Vec<MultiMatch> =
+ re.find_leftmost_iter(&mut cache, text).collect();
+assert_eq!(matches, vec![
+ MultiMatch::must(1, 0, 4),
+ MultiMatch::must(0, 5, 8),
+]);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+Or use overlapping style searches to find all possible occurrences:
+
+```
+use regex_automata::{hybrid::{dfa, regex::Regex}, MatchKind, MultiMatch};
+
+// N.B. For overlapping searches, we need the underlying lazy DFA to report all
+// possible matches.
+let re = Regex::builder()
+ .dfa(dfa::Config::new().match_kind(MatchKind::All))
+ .build_many(&[r"\w{3}", r"\S{3}"])?;
+let mut cache = re.create_cache();
+
+let text = b"@foo bar";
+let matches: Vec<MultiMatch> =
+ re.find_overlapping_iter(&mut cache, text).collect();
+assert_eq!(matches, vec![
+ MultiMatch::must(1, 0, 3),
+ MultiMatch::must(0, 1, 4),
+ MultiMatch::must(1, 1, 4),
+ MultiMatch::must(0, 5, 8),
+ MultiMatch::must(1, 5, 8),
+]);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+# When should I use this?
+
+Generally speaking, if you can abide the use of mutable state during search,
+and you don't need things like capturing groups or Unicode word boundary
+support in non-ASCII text, then a lazy DFA is likely a robust choice with
+respect to both search speed and memory usage. Note however that its speed
+may be worse than a general purpose regex engine if you don't select a good
+[prefilter](crate::util::prefilter).
+
+If you know ahead of time that your pattern would result in a very large DFA
+if it was fully compiled, it may be better to use an NFA simulation instead
+of a lazy DFA. Either that, or increase the cache capacity of your lazy DFA
+to something that is big enough to hold the state machine (likely through
+experimentation). The issue here is that if the cache is too small, then it
+could wind up being reset too frequently and this might decrease searching
+speed significantly.
+
+# Differences with fully compiled DFAs
+
+A [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) and a
+[`dfa::regex::Regex`](crate::dfa::regex::Regex) both have the same capabilities
+(and similarly for their underlying DFAs), but they achieve them through
+different means. The main difference is that a hybrid or "lazy" regex builds
+its DFA lazily during search, whereas a fully compiled regex will build its
+DFA at construction time. While building a DFA at search time might sound like
+it's slow, it tends to work out where most bytes seen during a search will
+reuse pre-built parts of the DFA and thus can be almost as fast as a fully
+compiled DFA. The main downside is that searching requires mutable space to
+store the DFA, and, in the worst case, a search can result in a new state being
+created for each byte seen, which would make searching quite a bit slower.
+
+A fully compiled DFA never has to worry about searches being slower once
+it's built. (Aside from, say, the transition table being so large that it
+is subject to harsh CPU cache effects.) However, of course, building a full
+DFA can be quite time consuming and memory hungry. Particularly when it's
+so easy to build large DFAs when Unicode mode is enabled.
+
+A lazy DFA strikes a nice balance _in practice_, particularly in the
+presence of Unicode mode, by only building what is needed. It avoids the
+worst case exponential time complexity of DFA compilation by guaranteeing that
+it will only build at most one state per byte searched. While the worst
+case here can lead to a very high constant, it will never be exponential.
+
+# Syntax
+
+This module supports the same syntax as the `regex` crate, since they share the
+same parser. You can find an exhaustive list of supported syntax in the
+[documentation for the `regex` crate](https://docs.rs/regex/1/regex/#syntax).
+
+There are two things that are not supported by the lazy DFAs in this module:
+
+* Capturing groups. The DFAs (and [`Regex`](regex::Regex)es built on top
+of them) can only find the offsets of an entire match, but cannot resolve
+the offsets of each capturing group. This is because DFAs do not have the
+expressive power necessary.
+* Unicode word boundaries. These present particularly difficult challenges for
+DFA construction and would result in an explosion in the number of states.
+One can enable [`dfa::Config::unicode_word_boundary`] though, which provides
+heuristic support for Unicode word boundaries that only works on ASCII text.
+Otherwise, one can use `(?-u:\b)` for an ASCII word boundary, which will work
+on any input.
+
+There are no plans to lift either of these limitations.
+
+Note that these restrictions are identical to the restrictions on fully
+compiled DFAs.
+
+# Support for `alloc`-only
+
+This crate comes with `alloc` and `std` features that are enabled by default.
+One can disable the `std` feature and still use the full API of a lazy DFA.
+(You should use `std` when possible, since it permits providing implementations
+of the `std::error::Error` trait, and does enable some minor internal
+optimizations.)
+
+This module does require at least the `alloc` feature though. It is not
+available in any capacity without `alloc`.
+*/
+
+pub use self::{
+ error::{BuildError, CacheError},
+ id::{LazyStateID, OverlappingState},
+};
+
+pub mod dfa;
+mod error;
+mod id;
+pub mod regex;
+mod search;
diff --git a/vendor/regex-automata/src/hybrid/regex.rs b/vendor/regex-automata/src/hybrid/regex.rs
new file mode 100644
index 000000000..7cc6b9064
--- /dev/null
+++ b/vendor/regex-automata/src/hybrid/regex.rs
@@ -0,0 +1,2124 @@
+/*!
+A lazy DFA backed `Regex`.
+
+This module provides [`Regex`] using lazy DFA. A `Regex` implements convenience
+routines you might have come to expect, such as finding a match and iterating
+over all non-overlapping matches. This `Regex` type is limited in its
+capabilities to what a lazy DFA can provide. Therefore, APIs involving
+capturing groups, for example, are not provided.
+
+Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that
+finds the end offset of a match, whereas the other is a "reverse" DFA that
+finds the start offset of a match.
+
+See the [parent module](crate::hybrid) for examples.
+*/
+
+use core::borrow::Borrow;
+
+use alloc::boxed::Box;
+
+use crate::{
+ hybrid::{
+ dfa::{self, DFA},
+ error::BuildError,
+ OverlappingState,
+ },
+ nfa::thompson,
+ util::{
+ matchtypes::{MatchError, MatchKind, MultiMatch},
+ prefilter::{self, Prefilter},
+ },
+};
+
+/// A regular expression that uses hybrid NFA/DFAs (also called "lazy DFAs")
+/// for searching.
+///
+/// A regular expression is comprised of two lazy DFAs, a "forward" DFA and a
+/// "reverse" DFA. The forward DFA is responsible for detecting the end of
+/// a match while the reverse DFA is responsible for detecting the start
+/// of a match. Thus, in order to find the bounds of any given match, a
+/// forward search must first be run followed by a reverse search. A match
+/// found by the forward DFA guarantees that the reverse DFA will also find
+/// a match.
+///
+/// A `Regex` can also have a prefilter set via the
+/// [`set_prefilter`](Regex::set_prefilter) method. By default, no prefilter is
+/// enabled.
+///
+/// # Earliest vs Leftmost vs Overlapping
+///
+/// The search routines exposed on a `Regex` reflect three different ways
+/// of searching:
+///
+/// * "earliest" means to stop as soon as a match has been detected.
+/// * "leftmost" means to continue matching until the underlying
+/// automaton cannot advance. This reflects "standard" searching you
+/// might be used to in other regex engines. e.g., This permits
+/// non-greedy and greedy searching to work as you would expect.
+/// * "overlapping" means to find all possible matches, even if they
+/// overlap.
+///
+/// Generally speaking, when doing an overlapping search, you'll want to
+/// build your regex lazy DFAs with [`MatchKind::All`] semantics. Using
+/// [`MatchKind::LeftmostFirst`] semantics with overlapping searches is
+/// likely to lead to odd behavior since `LeftmostFirst` specifically omits
+/// some matches that can never be reported due to its semantics.
+///
+/// The following example shows the differences between how these different
+/// types of searches impact looking for matches of `[a-z]+` in the
+/// haystack `abc`.
+///
+/// ```
+/// use regex_automata::{hybrid::{dfa, regex}, MatchKind, MultiMatch};
+///
+/// let pattern = r"[a-z]+";
+/// let haystack = "abc".as_bytes();
+///
+/// // With leftmost-first semantics, we test "earliest" and "leftmost".
+/// let re = regex::Builder::new()
+/// .dfa(dfa::Config::new().match_kind(MatchKind::LeftmostFirst))
+/// .build(pattern)?;
+/// let mut cache = re.create_cache();
+///
+/// // "earliest" searching isn't impacted by greediness
+/// let mut it = re.find_earliest_iter(&mut cache, haystack);
+/// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next());
+/// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next());
+/// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next());
+/// assert_eq!(None, it.next());
+///
+/// // "leftmost" searching supports greediness (and non-greediness)
+/// let mut it = re.find_leftmost_iter(&mut cache, haystack);
+/// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
+/// assert_eq!(None, it.next());
+///
+/// // For overlapping, we want "all" match kind semantics.
+/// let re = regex::Builder::new()
+/// .dfa(dfa::Config::new().match_kind(MatchKind::All))
+/// .build(pattern)?;
+/// let mut cache = re.create_cache();
+///
+/// // In the overlapping search, we find all three possible matches
+/// // starting at the beginning of the haystack.
+/// let mut it = re.find_overlapping_iter(&mut cache, haystack);
+/// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next());
+/// assert_eq!(Some(MultiMatch::must(0, 0, 2)), it.next());
+/// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
+/// assert_eq!(None, it.next());
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// # Fallibility
+///
+/// In non-default configurations, the lazy DFAs generated in this module may
+/// return an error during a search. (Currently, the only way this happens is
+/// if quit bytes are added, Unicode word boundaries are heuristically enabled,
+/// or if the cache is configured to "give up" on a search if it has been
+/// cleared too many times. All of these are turned off by default, which means
+/// a search can never fail in the default configuration.) For convenience,
+/// the main search routines, like [`find_leftmost`](Regex::find_leftmost),
+/// will panic if an error occurs. However, if you need to use DFAs which may
+/// produce an error at search time, then there are fallible equivalents of
+/// all search routines. For example, for `find_leftmost`, its fallible analog
+/// is [`try_find_leftmost`](Regex::try_find_leftmost). The routines prefixed
+/// with `try_` return `Result<Option<MultiMatch>, MatchError>`, whereas the
+/// infallible routines simply return `Option<MultiMatch>`.
+///
+/// # Example
+///
+/// This example shows how to cause a search to terminate if it sees a
+/// `\n` byte, and handle the error returned. This could be useful if, for
+/// example, you wanted to prevent a user supplied pattern from matching
+/// across a line boundary.
+///
+/// ```
+/// use regex_automata::{hybrid::{dfa, regex::Regex}, MatchError};
+///
+/// let re = Regex::builder()
+/// .dfa(dfa::Config::new().quit(b'\n', true))
+/// .build(r"foo\p{any}+bar")?;
+/// let mut cache = re.create_cache();
+///
+/// let haystack = "foo\nbar".as_bytes();
+/// // Normally this would produce a match, since \p{any} contains '\n'.
+/// // But since we instructed the automaton to enter a quit state if a
+/// // '\n' is observed, this produces a match error instead.
+/// let expected = MatchError::Quit { byte: 0x0A, offset: 3 };
+/// let got = re.try_find_leftmost(&mut cache, haystack).unwrap_err();
+/// assert_eq!(expected, got);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
#[derive(Debug)]
pub struct Regex {
    /// An optional prefilter that is passed down to the lazy DFA search
    /// routines when present. By default, no prefilter is set.
    pre: Option<Box<dyn Prefilter>>,
    /// The forward lazy DFA. This can only find the end of a match.
    forward: DFA,
    /// The reverse lazy DFA. This can only find the start of a match.
    ///
    /// This is built with 'all' match semantics (instead of leftmost-first)
    /// so that it always finds the longest possible match (which corresponds
    /// to the leftmost starting position). It is also compiled as an anchored
    /// matcher and has 'starts_for_each_pattern' enabled. Including starting
    /// states for each pattern is necessary to ensure that we only look for
    /// matches of a pattern that matched in the forward direction. Otherwise,
    /// we might wind up finding the "leftmost" starting position of a totally
    /// different pattern!
    reverse: DFA,
    /// Whether iterators on this type should advance by one codepoint or one
    /// byte when an empty match is seen.
    ///
    /// NOTE(review): presumably `true` selects codepoint-at-a-time stepping
    /// (UTF-8 mode) and `false` byte-at-a-time -- confirm against the
    /// iterator implementations.
    utf8: bool,
}
+
+/// Convenience routines for regex and cache construction.
+impl Regex {
+    /// Parse the given regular expression using the default configuration and
+    /// return the corresponding regex.
+    ///
+    /// If you want a non-default configuration, then use the [`Builder`] to
+    /// set your own configuration.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{MultiMatch, hybrid::regex::Regex};
+    ///
+    /// let re = Regex::new("foo[0-9]+bar")?;
+    /// let mut cache = re.create_cache();
+    /// assert_eq!(
+    ///     Some(MultiMatch::must(0, 3, 14)),
+    ///     re.find_leftmost(&mut cache, b"zzzfoo12345barzzz"),
+    /// );
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn new(pattern: &str) -> Result<Regex, BuildError> {
+        Regex::builder().build(pattern)
+    }
+
+    /// Like `new`, but parses multiple patterns into a single "regex set."
+    /// This similarly uses the default regex configuration.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{MultiMatch, hybrid::regex::Regex};
+    ///
+    /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?;
+    /// let mut cache = re.create_cache();
+    ///
+    /// let mut it = re.find_leftmost_iter(
+    ///     &mut cache,
+    ///     b"abc 1 foo 4567 0 quux",
+    /// );
+    /// assert_eq!(Some(MultiMatch::must(0, 0, 3)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(1, 4, 5)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 6, 9)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(1, 10, 14)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(1, 15, 16)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 17, 21)), it.next());
+    /// assert_eq!(None, it.next());
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn new_many<P: AsRef<str>>(
+        patterns: &[P],
+    ) -> Result<Regex, BuildError> {
+        Regex::builder().build_many(patterns)
+    }
+
+    /// Return a default configuration for a `Regex`.
+    ///
+    /// This is a convenience routine to avoid needing to import the `Config`
+    /// type when customizing the construction of a regex.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to disable UTF-8 mode for `Regex` iteration.
+    /// When UTF-8 mode is disabled, the position immediately following an
+    /// empty match is where the next search begins, instead of the next
+    /// position of a UTF-8 encoded codepoint.
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::regex::Regex, MultiMatch};
+    ///
+    /// let re = Regex::builder()
+    ///     .configure(Regex::config().utf8(false))
+    ///     .build(r"")?;
+    /// let mut cache = re.create_cache();
+    ///
+    /// let haystack = "a☃z".as_bytes();
+    /// let mut it = re.find_leftmost_iter(&mut cache, haystack);
+    /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next());
+    /// assert_eq!(None, it.next());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn config() -> Config {
+        Config::new()
+    }
+
+    /// Return a builder for configuring the construction of a `Regex`.
+    ///
+    /// This is a convenience routine to avoid needing to import the
+    /// [`Builder`] type in common cases.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to use the builder to disable UTF-8 mode
+    /// everywhere.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     hybrid::regex::Regex,
+    ///     nfa::thompson,
+    ///     MultiMatch, SyntaxConfig,
+    /// };
+    ///
+    /// let re = Regex::builder()
+    ///     .configure(Regex::config().utf8(false))
+    ///     .syntax(SyntaxConfig::new().utf8(false))
+    ///     .thompson(thompson::Config::new().utf8(false))
+    ///     .build(r"foo(?-u:[^b])ar.*")?;
+    /// let mut cache = re.create_cache();
+    ///
+    /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
+    /// let expected = Some(MultiMatch::must(0, 1, 9));
+    /// let got = re.find_leftmost(&mut cache, haystack);
+    /// assert_eq!(expected, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn builder() -> Builder {
+        Builder::new()
+    }
+
+    /// Create a new cache for this `Regex`.
+    ///
+    /// The cache returned should only be used for searches for this
+    /// `Regex`. If you want to reuse the cache for another `Regex`, then
+    /// you must call [`Cache::reset`] with that `Regex` (or, equivalently,
+    /// [`Regex::reset_cache`]).
+    pub fn create_cache(&self) -> Cache {
+        Cache::new(self)
+    }
+
+    /// Reset the given cache such that it can be used for searching with
+    /// this `Regex` (and only this `Regex`).
+    ///
+    /// A cache reset permits reusing memory already allocated in this cache
+    /// with a different `Regex`.
+    ///
+    /// Resetting a cache sets its "clear count" to 0. This is relevant if the
+    /// `Regex` has been configured to "give up" after it has cleared the cache
+    /// a certain number of times.
+    ///
+    /// # Example
+    ///
+    /// This shows how to re-purpose a cache for use with a different `Regex`.
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::regex::Regex, MultiMatch};
+    ///
+    /// let re1 = Regex::new(r"\w")?;
+    /// let re2 = Regex::new(r"\W")?;
+    ///
+    /// let mut cache = re1.create_cache();
+    /// assert_eq!(
+    ///     Some(MultiMatch::must(0, 0, 2)),
+    ///     re1.find_leftmost(&mut cache, "Δ".as_bytes()),
+    /// );
+    ///
+    /// // Using 'cache' with re2 is not allowed. It may result in panics or
+    /// // incorrect results. In order to re-purpose the cache, we must reset
+    /// // it with the Regex we'd like to use it with.
+    /// //
+    /// // Similarly, after this reset, using the cache with 're1' is also not
+    /// // allowed.
+    /// re2.reset_cache(&mut cache);
+    /// assert_eq!(
+    ///     Some(MultiMatch::must(0, 0, 3)),
+    ///     re2.find_leftmost(&mut cache, "☃".as_bytes()),
+    /// );
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn reset_cache(&self, cache: &mut Cache) {
+        self.forward().reset_cache(&mut cache.forward);
+        self.reverse().reset_cache(&mut cache.reverse);
+    }
+}
+
+/// Standard infallible search routines for finding and iterating over matches.
+impl Regex {
+    /// Returns true if and only if this regex matches the given haystack.
+    ///
+    /// This routine may short circuit if it knows that scanning future input
+    /// will never lead to a different result. In particular, if the underlying
+    /// DFA enters a match state or a dead state, then this routine will return
+    /// `true` or `false`, respectively, without inspecting any future input.
+    ///
+    /// # Panics
+    ///
+    /// If the underlying lazy DFAs return an error, then this routine panics.
+    /// This only occurs in non-default configurations where quit bytes are
+    /// used, Unicode word boundaries are heuristically enabled or limits are
+    /// set on the number of times the lazy DFA's cache may be cleared.
+    ///
+    /// The fallible version of this routine is
+    /// [`try_is_match`](Regex::try_is_match).
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::hybrid::regex::Regex;
+    ///
+    /// let re = Regex::new("foo[0-9]+bar")?;
+    /// let mut cache = re.create_cache();
+    ///
+    /// assert_eq!(true, re.is_match(&mut cache, b"foo12345bar"));
+    /// assert_eq!(false, re.is_match(&mut cache, b"foobar"));
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn is_match(&self, cache: &mut Cache, haystack: &[u8]) -> bool {
+        self.try_is_match(cache, haystack).unwrap()
+    }
+
+    /// Returns the first position at which a match is found.
+    ///
+    /// This routine stops scanning input in precisely the same circumstances
+    /// as `is_match`. The key difference is that this routine returns the
+    /// position at which it stopped scanning input if and only if a match
+    /// was found. If no match is found, then `None` is returned.
+    ///
+    /// # Panics
+    ///
+    /// If the underlying lazy DFAs return an error, then this routine panics.
+    /// This only occurs in non-default configurations where quit bytes are
+    /// used, Unicode word boundaries are heuristically enabled or limits are
+    /// set on the number of times the lazy DFA's cache may be cleared.
+    ///
+    /// The fallible version of this routine is
+    /// [`try_find_earliest`](Regex::try_find_earliest).
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{MultiMatch, hybrid::regex::Regex};
+    ///
+    /// // Normally, the leftmost first match would greedily consume as many
+    /// // decimal digits as it could. But a match is detected as soon as one
+    /// // digit is seen.
+    /// let re = Regex::new("foo[0-9]+")?;
+    /// let mut cache = re.create_cache();
+    /// assert_eq!(
+    ///     Some(MultiMatch::must(0, 0, 4)),
+    ///     re.find_earliest(&mut cache, b"foo12345"),
+    /// );
+    ///
+    /// // Normally, the end of the leftmost first match here would be 3,
+    /// // but the "earliest" match semantics detect a match earlier.
+    /// let re = Regex::new("abc|a")?;
+    /// let mut cache = re.create_cache();
+    /// assert_eq!(
+    ///     Some(MultiMatch::must(0, 0, 1)),
+    ///     re.find_earliest(&mut cache, b"abc"),
+    /// );
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn find_earliest(
+        &self,
+        cache: &mut Cache,
+        haystack: &[u8],
+    ) -> Option<MultiMatch> {
+        self.try_find_earliest(cache, haystack).unwrap()
+    }
+
+    /// Returns the start and end offset of the leftmost match. If no match
+    /// exists, then `None` is returned.
+    ///
+    /// # Panics
+    ///
+    /// If the underlying lazy DFAs return an error, then this routine panics.
+    /// This only occurs in non-default configurations where quit bytes are
+    /// used, Unicode word boundaries are heuristically enabled or limits are
+    /// set on the number of times the lazy DFA's cache may be cleared.
+    ///
+    /// The fallible version of this routine is
+    /// [`try_find_leftmost`](Regex::try_find_leftmost).
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{MultiMatch, hybrid::regex::Regex};
+    ///
+    /// // Greediness is applied appropriately when compared to find_earliest.
+    /// let re = Regex::new("foo[0-9]+")?;
+    /// let mut cache = re.create_cache();
+    /// assert_eq!(
+    ///     Some(MultiMatch::must(0, 3, 11)),
+    ///     re.find_leftmost(&mut cache, b"zzzfoo12345zzz"),
+    /// );
+    ///
+    /// // Even though a match is found after reading the first byte (`a`),
+    /// // the default leftmost-first match semantics demand that we find the
+    /// // earliest match that prefers earlier parts of the pattern over later
+    /// // parts.
+    /// let re = Regex::new("abc|a")?;
+    /// let mut cache = re.create_cache();
+    /// assert_eq!(
+    ///     Some(MultiMatch::must(0, 0, 3)),
+    ///     re.find_leftmost(&mut cache, b"abc"),
+    /// );
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn find_leftmost(
+        &self,
+        cache: &mut Cache,
+        haystack: &[u8],
+    ) -> Option<MultiMatch> {
+        self.try_find_leftmost(cache, haystack).unwrap()
+    }
+
+    /// Search for the first overlapping match in `haystack`.
+    ///
+    /// This routine is principally useful when searching for multiple patterns
+    /// on inputs where multiple patterns may match the same regions of text.
+    /// In particular, callers must preserve the automaton's search state from
+    /// prior calls so that the implementation knows where the last match
+    /// occurred and which pattern was reported.
+    ///
+    /// # Panics
+    ///
+    /// If the underlying lazy DFAs return an error, then this routine panics.
+    /// This only occurs in non-default configurations where quit bytes are
+    /// used, Unicode word boundaries are heuristically enabled or limits are
+    /// set on the number of times the lazy DFA's cache may be cleared.
+    ///
+    /// The fallible version of this routine is
+    /// [`try_find_overlapping`](Regex::try_find_overlapping).
+    ///
+    /// # Example
+    ///
+    /// This example shows how to run an overlapping search with multiple
+    /// regexes.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     hybrid::{dfa::DFA, regex::Regex, OverlappingState},
+    ///     MatchKind,
+    ///     MultiMatch,
+    /// };
+    ///
+    /// let re = Regex::builder()
+    ///     .dfa(DFA::config().match_kind(MatchKind::All))
+    ///     .build_many(&[r"\w+$", r"\S+$"])?;
+    /// let mut cache = re.create_cache();
+    ///
+    /// let haystack = "@foo".as_bytes();
+    /// let mut state = OverlappingState::start();
+    ///
+    /// let expected = Some(MultiMatch::must(1, 0, 4));
+    /// let got = re.find_overlapping(&mut cache, haystack, &mut state);
+    /// assert_eq!(expected, got);
+    ///
+    /// // The first pattern also matches at the same position, so re-running
+    /// // the search will yield another match. Notice also that the first
+    /// // pattern is returned after the second. This is because the second
+    /// // pattern begins its match before the first, is therefore an earlier
+    /// // match and is thus reported first.
+    /// let expected = Some(MultiMatch::must(0, 1, 4));
+    /// let got = re.find_overlapping(&mut cache, haystack, &mut state);
+    /// assert_eq!(expected, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn find_overlapping(
+        &self,
+        cache: &mut Cache,
+        haystack: &[u8],
+        state: &mut OverlappingState,
+    ) -> Option<MultiMatch> {
+        self.try_find_overlapping(cache, haystack, state).unwrap()
+    }
+
+    /// Returns an iterator over all non-overlapping "earliest" matches.
+    ///
+    /// Match positions are reported as soon as a match is known to occur, even
+    /// if the standard leftmost match would be longer.
+    ///
+    /// # Panics
+    ///
+    /// If the underlying lazy DFAs return an error, then this routine panics.
+    /// This only occurs in non-default configurations where quit bytes are
+    /// used, Unicode word boundaries are heuristically enabled or limits are
+    /// set on the number of times the lazy DFA's cache may be cleared.
+    ///
+    /// The fallible version of this routine is
+    /// [`try_find_earliest_iter`](Regex::try_find_earliest_iter).
+    ///
+    /// # Example
+    ///
+    /// This example shows how to run an "earliest" iterator.
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::regex::Regex, MultiMatch};
+    ///
+    /// let re = Regex::new("[0-9]+")?;
+    /// let mut cache = re.create_cache();
+    /// let haystack = "123".as_bytes();
+    ///
+    /// // Normally, a standard leftmost iterator would return a single
+    /// // match, but since "earliest" detects matches earlier, we get
+    /// // three matches.
+    /// let mut it = re.find_earliest_iter(&mut cache, haystack);
+    /// assert_eq!(Some(MultiMatch::must(0, 0, 1)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 1, 2)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 2, 3)), it.next());
+    /// assert_eq!(None, it.next());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn find_earliest_iter<'r, 'c, 't>(
+        &'r self,
+        cache: &'c mut Cache,
+        haystack: &'t [u8],
+    ) -> FindEarliestMatches<'r, 'c, 't> {
+        FindEarliestMatches::new(self, cache, haystack)
+    }
+
+    /// Returns an iterator over all non-overlapping leftmost matches in the
+    /// given bytes. If no match exists, then the iterator yields no elements.
+    ///
+    /// This corresponds to the "standard" regex search iterator.
+    ///
+    /// # Panics
+    ///
+    /// If the underlying lazy DFAs return an error, then this routine panics.
+    /// This only occurs in non-default configurations where quit bytes are
+    /// used, Unicode word boundaries are heuristically enabled or limits are
+    /// set on the number of times the lazy DFA's cache may be cleared.
+    ///
+    /// The fallible version of this routine is
+    /// [`try_find_leftmost_iter`](Regex::try_find_leftmost_iter).
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{MultiMatch, hybrid::regex::Regex};
+    ///
+    /// let re = Regex::new("foo[0-9]+")?;
+    /// let mut cache = re.create_cache();
+    ///
+    /// let text = b"foo1 foo12 foo123";
+    /// let matches: Vec<MultiMatch> = re
+    ///     .find_leftmost_iter(&mut cache, text)
+    ///     .collect();
+    /// assert_eq!(matches, vec![
+    ///     MultiMatch::must(0, 0, 4),
+    ///     MultiMatch::must(0, 5, 10),
+    ///     MultiMatch::must(0, 11, 17),
+    /// ]);
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn find_leftmost_iter<'r, 'c, 't>(
+        &'r self,
+        cache: &'c mut Cache,
+        haystack: &'t [u8],
+    ) -> FindLeftmostMatches<'r, 'c, 't> {
+        FindLeftmostMatches::new(self, cache, haystack)
+    }
+
+    /// Returns an iterator over all overlapping matches in the given haystack.
+    ///
+    /// This routine is principally useful when searching for multiple patterns
+    /// on inputs where multiple patterns may match the same regions of text.
+    /// The iterator takes care of handling the overlapping state that must be
+    /// threaded through every search.
+    ///
+    /// # Panics
+    ///
+    /// If the underlying lazy DFAs return an error, then this routine panics.
+    /// This only occurs in non-default configurations where quit bytes are
+    /// used, Unicode word boundaries are heuristically enabled or limits are
+    /// set on the number of times the lazy DFA's cache may be cleared.
+    ///
+    /// The fallible version of this routine is
+    /// [`try_find_overlapping_iter`](Regex::try_find_overlapping_iter).
+    ///
+    /// # Example
+    ///
+    /// This example shows how to run an overlapping search with multiple
+    /// regexes.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     hybrid::{dfa::DFA, regex::Regex},
+    ///     MatchKind,
+    ///     MultiMatch,
+    /// };
+    ///
+    /// let re = Regex::builder()
+    ///     .dfa(DFA::config().match_kind(MatchKind::All))
+    ///     .build_many(&[r"\w+$", r"\S+$"])?;
+    /// let mut cache = re.create_cache();
+    /// let haystack = "@foo".as_bytes();
+    ///
+    /// let mut it = re.find_overlapping_iter(&mut cache, haystack);
+    /// assert_eq!(Some(MultiMatch::must(1, 0, 4)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 1, 4)), it.next());
+    /// assert_eq!(None, it.next());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn find_overlapping_iter<'r, 'c, 't>(
+        &'r self,
+        cache: &'c mut Cache,
+        haystack: &'t [u8],
+    ) -> FindOverlappingMatches<'r, 'c, 't> {
+        FindOverlappingMatches::new(self, cache, haystack)
+    }
+}
+
+/// Lower level infallible search routines that permit controlling where
+/// the search starts and ends in a particular sequence. This is useful for
+/// executing searches that need to take surrounding context into account. This
+/// is required for correctly implementing iteration because of look-around
+/// operators (`^`, `$`, `\b`).
+impl Regex {
+    /// Returns true if and only if this regex matches the given haystack.
+    ///
+    /// This routine may short circuit if it knows that scanning future input
+    /// will never lead to a different result. In particular, if the underlying
+    /// DFA enters a match state or a dead state, then this routine will return
+    /// `true` or `false`, respectively, without inspecting any future input.
+    ///
+    /// # Searching a substring of the haystack
+    ///
+    /// Being an "at" search routine, this permits callers to search a
+    /// substring of `haystack` by specifying a range in `haystack`.
+    /// Why expose this as an API instead of just asking callers to use
+    /// `&haystack[start..end]`? The reason is that regex matching often wants
+    /// to take the surrounding context into account in order to handle
+    /// look-around (`^`, `$` and `\b`).
+    ///
+    /// # Panics
+    ///
+    /// If the underlying lazy DFAs return an error, then this routine panics.
+    /// This only occurs in non-default configurations where quit bytes are
+    /// used, Unicode word boundaries are heuristically enabled or limits are
+    /// set on the number of times the lazy DFA's cache may be cleared.
+    ///
+    /// The fallible version of this routine is
+    /// [`try_is_match_at`](Regex::try_is_match_at).
+    pub fn is_match_at(
+        &self,
+        cache: &mut Cache,
+        haystack: &[u8],
+        start: usize,
+        end: usize,
+    ) -> bool {
+        self.try_is_match_at(cache, haystack, start, end).unwrap()
+    }
+
+    /// Returns the first position at which a match is found.
+    ///
+    /// This routine stops scanning input in precisely the same circumstances
+    /// as `is_match`. The key difference is that this routine returns the
+    /// position at which it stopped scanning input if and only if a match
+    /// was found. If no match is found, then `None` is returned.
+    ///
+    /// # Searching a substring of the haystack
+    ///
+    /// Being an "at" search routine, this permits callers to search a
+    /// substring of `haystack` by specifying a range in `haystack`.
+    /// Why expose this as an API instead of just asking callers to use
+    /// `&haystack[start..end]`? The reason is that regex matching often wants
+    /// to take the surrounding context into account in order to handle
+    /// look-around (`^`, `$` and `\b`).
+    ///
+    /// This is useful when implementing an iterator over matches
+    /// within the same haystack, which cannot be done correctly by simply
+    /// providing a subslice of `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// If the underlying lazy DFAs return an error, then this routine panics.
+    /// This only occurs in non-default configurations where quit bytes are
+    /// used, Unicode word boundaries are heuristically enabled or limits are
+    /// set on the number of times the lazy DFA's cache may be cleared.
+    ///
+    /// The fallible version of this routine is
+    /// [`try_find_earliest_at`](Regex::try_find_earliest_at).
+    pub fn find_earliest_at(
+        &self,
+        cache: &mut Cache,
+        haystack: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Option<MultiMatch> {
+        self.try_find_earliest_at(cache, haystack, start, end).unwrap()
+    }
+
+    /// Returns the same as `find_leftmost`, but starts the search at the given
+    /// offset.
+    ///
+    /// # Searching a substring of the haystack
+    ///
+    /// Being an "at" search routine, this permits callers to search a
+    /// substring of `haystack` by specifying a range in `haystack`.
+    /// Why expose this as an API instead of just asking callers to use
+    /// `&haystack[start..end]`? The reason is that regex matching often wants
+    /// to take the surrounding context into account in order to handle
+    /// look-around (`^`, `$` and `\b`).
+    ///
+    /// This is useful when implementing an iterator over matches within the
+    /// same haystack, which cannot be done correctly by simply providing a
+    /// subslice of `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// If the underlying lazy DFAs return an error, then this routine panics.
+    /// This only occurs in non-default configurations where quit bytes are
+    /// used, Unicode word boundaries are heuristically enabled or limits are
+    /// set on the number of times the lazy DFA's cache may be cleared.
+    ///
+    /// The fallible version of this routine is
+    /// [`try_find_leftmost_at`](Regex::try_find_leftmost_at).
+    pub fn find_leftmost_at(
+        &self,
+        cache: &mut Cache,
+        haystack: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Option<MultiMatch> {
+        self.try_find_leftmost_at(cache, haystack, start, end).unwrap()
+    }
+
+    /// Search for the first overlapping match within a given range of
+    /// `haystack`.
+    ///
+    /// This routine is principally useful when searching for multiple patterns
+    /// on inputs where multiple patterns may match the same regions of text.
+    /// In particular, callers must preserve the automaton's search state from
+    /// prior calls so that the implementation knows where the last match
+    /// occurred and which pattern was reported.
+    ///
+    /// # Searching a substring of the haystack
+    ///
+    /// Being an "at" search routine, this permits callers to search a
+    /// substring of `haystack` by specifying a range in `haystack`.
+    /// Why expose this as an API instead of just asking callers to use
+    /// `&haystack[start..end]`? The reason is that regex matching often wants
+    /// to take the surrounding context into account in order to handle
+    /// look-around (`^`, `$` and `\b`).
+    ///
+    /// This is useful when implementing an iterator over matches
+    /// within the same haystack, which cannot be done correctly by simply
+    /// providing a subslice of `haystack`.
+    ///
+    /// # Panics
+    ///
+    /// If the underlying lazy DFAs return an error, then this routine panics.
+    /// This only occurs in non-default configurations where quit bytes are
+    /// used, Unicode word boundaries are heuristically enabled or limits are
+    /// set on the number of times the lazy DFA's cache may be cleared.
+    ///
+    /// The fallible version of this routine is
+    /// [`try_find_overlapping_at`](Regex::try_find_overlapping_at).
+    pub fn find_overlapping_at(
+        &self,
+        cache: &mut Cache,
+        haystack: &[u8],
+        start: usize,
+        end: usize,
+        state: &mut OverlappingState,
+    ) -> Option<MultiMatch> {
+        self.try_find_overlapping_at(cache, haystack, start, end, state)
+            .unwrap()
+    }
+}
+
+/// Fallible search routines. These may return an error when the underlying
+/// lazy DFAs have been configured in a way that permits them to fail during a
+/// search.
+///
+/// Errors during search only occur when the lazy DFA has been explicitly
+/// configured to do so, usually by specifying one or more "quit" bytes, by
+/// heuristically enabling Unicode word boundaries or by limiting the number
+/// of times the lazy DFA's cache may be cleared.
+///
+/// Errors will never be returned using the default configuration. So these
+/// fallible routines are only needed for particular configurations.
+impl Regex {
+    /// Returns true if and only if this regex matches the given haystack.
+    ///
+    /// This routine may short circuit if it knows that scanning future input
+    /// will never lead to a different result. In particular, if the underlying
+    /// DFA enters a match state or a dead state, then this routine will return
+    /// `true` or `false`, respectively, without inspecting any future input.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFA-based regexes, this only occurs in a non-default configuration
+    /// where quit bytes are used, Unicode word boundaries are heuristically
+    /// enabled or limits are set on the number of times the lazy DFA's cache
+    /// may be cleared.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// The infallible (panics on error) version of this routine is
+    /// [`is_match`](Regex::is_match).
+    pub fn try_is_match(
+        &self,
+        cache: &mut Cache,
+        haystack: &[u8],
+    ) -> Result<bool, MatchError> {
+        self.try_is_match_at(cache, haystack, 0, haystack.len())
+    }
+
+    /// Returns the first position at which a match is found.
+    ///
+    /// This routine stops scanning input in precisely the same circumstances
+    /// as `is_match`. The key difference is that this routine returns the
+    /// position at which it stopped scanning input if and only if a match
+    /// was found. If no match is found, then `None` is returned.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFA-based regexes, this only occurs in a non-default configuration
+    /// where quit bytes are used, Unicode word boundaries are heuristically
+    /// enabled or limits are set on the number of times the lazy DFA's cache
+    /// may be cleared.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// The infallible (panics on error) version of this routine is
+    /// [`find_earliest`](Regex::find_earliest).
+    pub fn try_find_earliest(
+        &self,
+        cache: &mut Cache,
+        haystack: &[u8],
+    ) -> Result<Option<MultiMatch>, MatchError> {
+        self.try_find_earliest_at(cache, haystack, 0, haystack.len())
+    }
+
+    /// Returns the start and end offset of the leftmost match. If no match
+    /// exists, then `None` is returned.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFA-based regexes, this only occurs in a non-default configuration
+    /// where quit bytes are used, Unicode word boundaries are heuristically
+    /// enabled or limits are set on the number of times the lazy DFA's cache
+    /// may be cleared.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// The infallible (panics on error) version of this routine is
+    /// [`find_leftmost`](Regex::find_leftmost).
+    pub fn try_find_leftmost(
+        &self,
+        cache: &mut Cache,
+        haystack: &[u8],
+    ) -> Result<Option<MultiMatch>, MatchError> {
+        self.try_find_leftmost_at(cache, haystack, 0, haystack.len())
+    }
+
+    /// Search for the first overlapping match in `haystack`.
+    ///
+    /// This routine is principally useful when searching for multiple patterns
+    /// on inputs where multiple patterns may match the same regions of text.
+    /// In particular, callers must preserve the automaton's search state from
+    /// prior calls so that the implementation knows where the last match
+    /// occurred and which pattern was reported.
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFA-based regexes, this only occurs in a non-default configuration
+    /// where quit bytes are used, Unicode word boundaries are heuristically
+    /// enabled or limits are set on the number of times the lazy DFA's cache
+    /// may be cleared.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// The infallible (panics on error) version of this routine is
+    /// [`find_overlapping`](Regex::find_overlapping).
+    pub fn try_find_overlapping(
+        &self,
+        cache: &mut Cache,
+        haystack: &[u8],
+        state: &mut OverlappingState,
+    ) -> Result<Option<MultiMatch>, MatchError> {
+        self.try_find_overlapping_at(cache, haystack, 0, haystack.len(), state)
+    }
+
+    /// Returns an iterator over all non-overlapping "earliest" matches.
+    ///
+    /// Match positions are reported as soon as a match is known to occur, even
+    /// if the standard leftmost match would be longer.
+    ///
+    /// # Errors
+    ///
+    /// This iterator only yields errors if the search could not complete. For
+    /// DFA-based regexes, this only occurs in a non-default configuration
+    /// where quit bytes are used, Unicode word boundaries are heuristically
+    /// enabled or limits are set on the number of times the lazy DFA's cache
+    /// may be cleared.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// The infallible (panics on error) version of this routine is
+    /// [`find_earliest_iter`](Regex::find_earliest_iter).
+    pub fn try_find_earliest_iter<'r, 'c, 't>(
+        &'r self,
+        cache: &'c mut Cache,
+        haystack: &'t [u8],
+    ) -> TryFindEarliestMatches<'r, 'c, 't> {
+        TryFindEarliestMatches::new(self, cache, haystack)
+    }
+
+    /// Returns an iterator over all non-overlapping leftmost matches in the
+    /// given bytes. If no match exists, then the iterator yields no elements.
+    ///
+    /// This corresponds to the "standard" regex search iterator.
+    ///
+    /// # Errors
+    ///
+    /// This iterator only yields errors if the search could not complete. For
+    /// DFA-based regexes, this only occurs in a non-default configuration
+    /// where quit bytes are used, Unicode word boundaries are heuristically
+    /// enabled or limits are set on the number of times the lazy DFA's cache
+    /// may be cleared.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// The infallible (panics on error) version of this routine is
+    /// [`find_leftmost_iter`](Regex::find_leftmost_iter).
+    pub fn try_find_leftmost_iter<'r, 'c, 't>(
+        &'r self,
+        cache: &'c mut Cache,
+        haystack: &'t [u8],
+    ) -> TryFindLeftmostMatches<'r, 'c, 't> {
+        TryFindLeftmostMatches::new(self, cache, haystack)
+    }
+
+    /// Returns an iterator over all overlapping matches in the given haystack.
+    ///
+    /// This routine is principally useful when searching for multiple patterns
+    /// on inputs where multiple patterns may match the same regions of text.
+    /// The iterator takes care of handling the overlapping state that must be
+    /// threaded through every search.
+    ///
+    /// # Errors
+    ///
+    /// This iterator only yields errors if the search could not complete. For
+    /// DFA-based regexes, this only occurs in a non-default configuration
+    /// where quit bytes are used, Unicode word boundaries are heuristically
+    /// enabled or limits are set on the number of times the lazy DFA's cache
+    /// may be cleared.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// The infallible (panics on error) version of this routine is
+    /// [`find_overlapping_iter`](Regex::find_overlapping_iter).
+    pub fn try_find_overlapping_iter<'r, 'c, 't>(
+        &'r self,
+        cache: &'c mut Cache,
+        haystack: &'t [u8],
+    ) -> TryFindOverlappingMatches<'r, 'c, 't> {
+        TryFindOverlappingMatches::new(self, cache, haystack)
+    }
+}
+
+/// Lower level fallible search routines that permit controlling where the
+/// search starts and ends in a particular sequence.
+impl Regex {
+    /// Returns true if and only if this regex matches the given haystack.
+    ///
+    /// This routine may short circuit if it knows that scanning future input
+    /// will never lead to a different result. In particular, if the underlying
+    /// DFA enters a match state or a dead state, then this routine will return
+    /// `true` or `false`, respectively, without inspecting any future input.
+    ///
+    /// # Searching a substring of the haystack
+    ///
+    /// Being an "at" search routine, this permits callers to search a
+    /// substring of `haystack` by specifying a range in `haystack`.
+    /// Why expose this as an API instead of just asking callers to use
+    /// `&haystack[start..end]`? The reason is that regex matching often wants
+    /// to take the surrounding context into account in order to handle
+    /// look-around (`^`, `$` and `\b`).
+    ///
+    /// # Errors
+    ///
+    /// This routine only errors if the search could not complete. For
+    /// DFA-based regexes, this only occurs in a non-default configuration
+    /// where quit bytes are used, Unicode word boundaries are heuristically
+    /// enabled or limits are set on the number of times the lazy DFA's cache
+    /// may be cleared.
+    ///
+    /// When a search cannot complete, callers cannot know whether a match
+    /// exists or not.
+    ///
+    /// The infallible (panics on error) version of this routine is
+    /// [`is_match_at`](Regex::is_match_at).
+    pub fn try_is_match_at(
+        &self,
+        cache: &mut Cache,
+        haystack: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Result<bool, MatchError> {
+        // NOTE(review): the docs above promise short-circuiting as soon as a
+        // match state is entered, but this delegates to the *leftmost*
+        // forward search, which keeps scanning past the first match state
+        // (see the `find_earliest` examples above). The boolean result is the
+        // same either way; confirm against upstream whether
+        // `find_earliest_fwd_at` was intended here.
+        self.forward()
+            .find_leftmost_fwd_at(
+                &mut cache.forward,
+                self.scanner().as_mut(),
+                None,
+                haystack,
+                start,
+                end,
+            )
+            .map(|x| x.is_some())
+    }
+
+ /// Returns the first position at which a match is found.
+ ///
+ /// This routine stops scanning input in precisely the same circumstances
+ /// as `is_match`. The key difference is that this routine returns the
+ /// position at which it stopped scanning input if and only if a match
+ /// was found. If no match is found, then `None` is returned.
+ ///
+ /// # Searching a substring of the haystack
+ ///
+ /// Being an "at" search routine, this permits callers to search a
+ /// substring of `haystack` by specifying a range in `haystack`.
+ /// Why expose this as an API instead of just asking callers to use
+ /// `&input[start..end]`? The reason is that regex matching often wants
+ /// to take the surrounding context into account in order to handle
+ /// look-around (`^`, `$` and `\b`).
+ ///
+ /// This is useful when implementing an iterator over matches
+ /// within the same haystack, which cannot be done correctly by simply
+ /// providing a subslice of `haystack`.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// DFA-based regexes, this only occurs in a non-default configuration
+ /// where quit bytes are used, Unicode word boundaries are heuristically
+ /// enabled or limits are set on the number of times the lazy DFA's cache
+ /// may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// The infallible (panics on error) version of this routine is
+ /// [`find_earliest_at`](Regex::find_earliest_at).
+ pub fn try_find_earliest_at(
+ &self,
+ cache: &mut Cache,
+ haystack: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Result<Option<MultiMatch>, MatchError> {
+ self.try_find_earliest_at_imp(
+ self.scanner().as_mut(),
+ cache,
+ haystack,
+ start,
+ end,
+ )
+ }
+
+ /// Returns the start and end offset of the leftmost match. If no match
+ /// exists, then `None` is returned.
+ ///
+ /// # Searching a substring of the haystack
+ ///
+ /// Being an "at" search routine, this permits callers to search a
+ /// substring of `haystack` by specifying a range in `haystack`.
+ /// Why expose this as an API instead of just asking callers to use
+ /// `&input[start..end]`? The reason is that regex matching often wants
+ /// to take the surrounding context into account in order to handle
+ /// look-around (`^`, `$` and `\b`).
+ ///
+ /// This is useful when implementing an iterator over matches
+ /// within the same haystack, which cannot be done correctly by simply
+ /// providing a subslice of `haystack`.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// DFA-based regexes, this only occurs in a non-default configuration
+ /// where quit bytes are used, Unicode word boundaries are heuristically
+ /// enabled or limits are set on the number of times the lazy DFA's cache
+ /// may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// The infallible (panics on error) version of this routine is
+ /// [`find_leftmost_at`](Regex::find_leftmost_at).
+ pub fn try_find_leftmost_at(
+ &self,
+ cache: &mut Cache,
+ haystack: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Result<Option<MultiMatch>, MatchError> {
+ self.try_find_leftmost_at_imp(
+ self.scanner().as_mut(),
+ cache,
+ haystack,
+ start,
+ end,
+ )
+ }
+
+ /// Search for the first overlapping match within a given range of
+ /// `haystack`.
+ ///
+ /// This routine is principally useful when searching for multiple patterns
+ /// on inputs where multiple patterns may match the same regions of text.
+ /// In particular, callers must preserve the automaton's search state from
+ /// prior calls so that the implementation knows where the last match
+ /// occurred and which pattern was reported.
+ ///
+ /// # Searching a substring of the haystack
+ ///
+ /// Being an "at" search routine, this permits callers to search a
+ /// substring of `haystack` by specifying a range in `haystack`.
+ /// Why expose this as an API instead of just asking callers to use
+ /// `&input[start..end]`? The reason is that regex matching often wants
+ /// to take the surrounding context into account in order to handle
+ /// look-around (`^`, `$` and `\b`).
+ ///
+ /// This is useful when implementing an iterator over matches
+ /// within the same haystack, which cannot be done correctly by simply
+ /// providing a subslice of `haystack`.
+ ///
+ /// # Errors
+ ///
+ /// This routine only errors if the search could not complete. For
+ /// DFA-based regexes, this only occurs in a non-default configuration
+ /// where quit bytes are used, Unicode word boundaries are heuristically
+ /// enabled or limits are set on the number of times the lazy DFA's cache
+ /// may be cleared.
+ ///
+ /// When a search cannot complete, callers cannot know whether a match
+ /// exists or not.
+ ///
+ /// The infallible (panics on error) version of this routine is
+ /// [`find_overlapping_at`](Regex::find_overlapping_at).
+ pub fn try_find_overlapping_at(
+ &self,
+ cache: &mut Cache,
+ haystack: &[u8],
+ start: usize,
+ end: usize,
+ state: &mut OverlappingState,
+ ) -> Result<Option<MultiMatch>, MatchError> {
+ self.try_find_overlapping_at_imp(
+ self.scanner().as_mut(),
+ cache,
+ haystack,
+ start,
+ end,
+ state,
+ )
+ }
+}
+
+/// Shared implementations of the fallible "at" search routines. Each takes
+/// an explicit prefilter scanner so that the iterator types can reuse a
+/// single scanner across many searches.
+impl Regex {
+    /// Implementation of the "earliest" search: the forward DFA finds the
+    /// end of a match, then the reverse DFA finds its start.
+    #[inline(always)]
+    fn try_find_earliest_at_imp(
+        &self,
+        pre: Option<&mut prefilter::Scanner>,
+        cache: &mut Cache,
+        haystack: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Result<Option<MultiMatch>, MatchError> {
+        let (fdfa, rdfa) = (self.forward(), self.reverse());
+        let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse);
+        let end = match fdfa
+            .find_earliest_fwd_at(fcache, pre, None, haystack, start, end)?
+        {
+            None => return Ok(None),
+            Some(end) => end,
+        };
+        // N.B. The only time we need to tell the reverse searcher the pattern
+        // to match is in the overlapping case, since it's ambiguous. In the
+        // earliest case, I have tentatively convinced myself that it isn't
+        // necessary and the reverse search will always find the same pattern
+        // to match as the forward search. But I lack a rigorous proof. Why not
+        // just provide the pattern anyway? Well, if it is needed, then leaving
+        // it out gives us a chance to find a witness.
+        let start = rdfa
+            .find_earliest_rev_at(rcache, None, haystack, start, end.offset())?
+            .expect("reverse search must match if forward search does");
+        assert_eq!(
+            start.pattern(),
+            end.pattern(),
+            "forward and reverse search must match same pattern",
+        );
+        assert!(start.offset() <= end.offset());
+        Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset())))
+    }
+
+    /// Implementation of the leftmost search, structured identically to the
+    /// "earliest" case but built on the leftmost forward/reverse routines.
+    #[inline(always)]
+    fn try_find_leftmost_at_imp(
+        &self,
+        pre: Option<&mut prefilter::Scanner>,
+        cache: &mut Cache,
+        haystack: &[u8],
+        start: usize,
+        end: usize,
+    ) -> Result<Option<MultiMatch>, MatchError> {
+        let (fdfa, rdfa) = (self.forward(), self.reverse());
+        let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse);
+        let end = match fdfa
+            .find_leftmost_fwd_at(fcache, pre, None, haystack, start, end)?
+        {
+            None => return Ok(None),
+            Some(end) => end,
+        };
+        // N.B. The only time we need to tell the reverse searcher the pattern
+        // to match is in the overlapping case, since it's ambiguous. In the
+        // leftmost case, I have tentatively convinced myself that it isn't
+        // necessary and the reverse search will always find the same pattern
+        // to match as the forward search. But I lack a rigorous proof. Why not
+        // just provide the pattern anyway? Well, if it is needed, then leaving
+        // it out gives us a chance to find a witness.
+        let start = rdfa
+            .find_leftmost_rev_at(rcache, None, haystack, start, end.offset())?
+            .expect("reverse search must match if forward search does");
+        assert_eq!(
+            start.pattern(),
+            end.pattern(),
+            "forward and reverse search must match same pattern",
+        );
+        assert!(start.offset() <= end.offset());
+        Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset())))
+    }
+
+    /// Implementation of the overlapping search. Unlike the routines above,
+    /// the caller-provided `state` carries search progress across calls, and
+    /// the reverse search is told which pattern the forward search matched.
+    #[inline(always)]
+    fn try_find_overlapping_at_imp(
+        &self,
+        pre: Option<&mut prefilter::Scanner>,
+        cache: &mut Cache,
+        haystack: &[u8],
+        start: usize,
+        end: usize,
+        state: &mut OverlappingState,
+    ) -> Result<Option<MultiMatch>, MatchError> {
+        let (fdfa, rdfa) = (self.forward(), self.reverse());
+        let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse);
+        let end = match fdfa.find_overlapping_fwd_at(
+            fcache, pre, None, haystack, start, end, state,
+        )? {
+            None => return Ok(None),
+            Some(end) => end,
+        };
+        // Unlike the leftmost cases, the reverse overlapping search may match
+        // a different pattern than the forward search. See test failures when
+        // using `None` instead of `Some(end.pattern())` below. Thus, we must
+        // run our reverse search using the pattern that matched in the forward
+        // direction.
+        //
+        // Also note: the reverse scan below begins at offset 0 rather than
+        // `start`, so the reported match can begin before `start`.
+        let start = rdfa
+            .find_leftmost_rev_at(
+                rcache,
+                Some(end.pattern()),
+                haystack,
+                0,
+                end.offset(),
+            )?
+            .expect("reverse search must match if forward search does");
+        assert_eq!(
+            start.pattern(),
+            end.pattern(),
+            "forward and reverse search must match same pattern",
+        );
+        assert!(start.offset() <= end.offset());
+        Ok(Some(MultiMatch::new(end.pattern(), start.offset(), end.offset())))
+    }
+}
+
+/// Non-search APIs for querying information about the regex and setting a
+/// prefilter.
+impl Regex {
+    /// Return the underlying lazy DFA responsible for forward matching.
+    ///
+    /// This is useful for accessing the underlying lazy DFA and using it
+    /// directly if the situation calls for it.
+    pub fn forward(&self) -> &DFA {
+        &self.forward
+    }
+
+    /// Return the underlying lazy DFA responsible for reverse matching.
+    ///
+    /// This is useful for accessing the underlying lazy DFA and using it
+    /// directly if the situation calls for it.
+    pub fn reverse(&self) -> &DFA {
+        &self.reverse
+    }
+
+    /// Returns the total number of patterns matched by this regex.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{MultiMatch, hybrid::regex::Regex};
+    ///
+    /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?;
+    /// assert_eq!(3, re.pattern_count());
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn pattern_count(&self) -> usize {
+        // Both DFAs are built from the same pattern set (see
+        // `Builder::build_many`), so their counts must agree.
+        assert_eq!(
+            self.forward().pattern_count(),
+            self.reverse().pattern_count()
+        );
+        self.forward().pattern_count()
+    }
+
+    /// Convenience function for returning this regex's prefilter as a trait
+    /// object.
+    ///
+    /// If this regex doesn't have a prefilter, then `None` is returned.
+    pub fn prefilter(&self) -> Option<&dyn Prefilter> {
+        self.pre.as_ref().map(|x| &**x)
+    }
+
+    /// Attach the given prefilter to this regex.
+    ///
+    /// Passing `None` removes any prefilter previously attached.
+    pub fn set_prefilter(&mut self, pre: Option<Box<dyn Prefilter>>) {
+        self.pre = pre;
+    }
+
+    /// Convenience function for returning a prefilter scanner.
+    ///
+    /// Returns `None` when no prefilter is attached.
+    fn scanner(&self) -> Option<prefilter::Scanner> {
+        self.prefilter().map(prefilter::Scanner::new)
+    }
+}
+
+/// An iterator over all non-overlapping earliest matches for a particular
+/// infallible search.
+///
+/// The iterator yields a [`MultiMatch`] value until no more matches could be
+/// found. If the underlying search returns an error, then this panics.
+///
+/// The lifetime variables are as follows:
+///
+/// * `'r` is the lifetime of the regular expression itself.
+/// * `'c` is the lifetime of the mutable cache used during search.
+/// * `'t` is the lifetime of the text being searched.
+#[derive(Debug)]
+pub struct FindEarliestMatches<'r, 'c, 't>(TryFindEarliestMatches<'r, 'c, 't>);
+
+impl<'r, 'c, 't> FindEarliestMatches<'r, 'c, 't> {
+    /// Create a new infallible iterator by wrapping the fallible one.
+    fn new(
+        re: &'r Regex,
+        cache: &'c mut Cache,
+        text: &'t [u8],
+    ) -> FindEarliestMatches<'r, 'c, 't> {
+        FindEarliestMatches(TryFindEarliestMatches::new(re, cache, text))
+    }
+}
+
+impl<'r, 'c, 't> Iterator for FindEarliestMatches<'r, 'c, 't> {
+    type Item = MultiMatch;
+
+    fn next(&mut self) -> Option<MultiMatch> {
+        // Delegate to the fallible iterator; `next_unwrap` panics on error.
+        next_unwrap(self.0.next())
+    }
+}
+
+/// An iterator over all non-overlapping leftmost matches for a particular
+/// infallible search.
+///
+/// The iterator yields a [`MultiMatch`] value until no more matches could be
+/// found. If the underlying search returns an error, then this panics.
+///
+/// The lifetime variables are as follows:
+///
+/// * `'r` is the lifetime of the regular expression itself.
+/// * `'c` is the lifetime of the mutable cache used during search.
+/// * `'t` is the lifetime of the text being searched.
+#[derive(Debug)]
+pub struct FindLeftmostMatches<'r, 'c, 't>(TryFindLeftmostMatches<'r, 'c, 't>);
+
+impl<'r, 'c, 't> FindLeftmostMatches<'r, 'c, 't> {
+    /// Create a new infallible iterator by wrapping the fallible one.
+    fn new(
+        re: &'r Regex,
+        cache: &'c mut Cache,
+        text: &'t [u8],
+    ) -> FindLeftmostMatches<'r, 'c, 't> {
+        FindLeftmostMatches(TryFindLeftmostMatches::new(re, cache, text))
+    }
+}
+
+impl<'r, 'c, 't> Iterator for FindLeftmostMatches<'r, 'c, 't> {
+    type Item = MultiMatch;
+
+    fn next(&mut self) -> Option<MultiMatch> {
+        // Delegate to the fallible iterator; `next_unwrap` panics on error.
+        next_unwrap(self.0.next())
+    }
+}
+
+/// An iterator over all overlapping matches for a particular infallible
+/// search.
+///
+/// The iterator yields a [`MultiMatch`] value until no more matches could be
+/// found. If the underlying search returns an error, then this panics.
+///
+/// The lifetime variables are as follows:
+///
+/// * `'r` is the lifetime of the regular expression itself.
+/// * `'c` is the lifetime of the mutable cache used during search.
+/// * `'t` is the lifetime of the text being searched.
+#[derive(Debug)]
+pub struct FindOverlappingMatches<'r, 'c, 't>(
+    TryFindOverlappingMatches<'r, 'c, 't>,
+);
+
+impl<'r, 'c, 't> FindOverlappingMatches<'r, 'c, 't> {
+    /// Create a new infallible iterator by wrapping the fallible one.
+    fn new(
+        re: &'r Regex,
+        cache: &'c mut Cache,
+        text: &'t [u8],
+    ) -> FindOverlappingMatches<'r, 'c, 't> {
+        FindOverlappingMatches(TryFindOverlappingMatches::new(re, cache, text))
+    }
+}
+
+impl<'r, 'c, 't> Iterator for FindOverlappingMatches<'r, 'c, 't> {
+    type Item = MultiMatch;
+
+    fn next(&mut self) -> Option<MultiMatch> {
+        // Delegate to the fallible iterator; `next_unwrap` panics on error.
+        next_unwrap(self.0.next())
+    }
+}
+
+/// An iterator over all non-overlapping earliest matches for a particular
+/// fallible search.
+///
+/// The iterator yields a [`MultiMatch`] value until no more matches could be
+/// found.
+///
+/// The lifetime variables are as follows:
+///
+/// * `'r` is the lifetime of the regular expression itself.
+/// * `'c` is the lifetime of the mutable cache used during search.
+/// * `'t` is the lifetime of the text being searched.
+#[derive(Debug)]
+pub struct TryFindEarliestMatches<'r, 'c, 't> {
+    re: &'r Regex,
+    cache: &'c mut Cache,
+    // Prefilter scanner created once and reused across every search.
+    scanner: Option<prefilter::Scanner<'r>>,
+    text: &'t [u8],
+    // Offset at which the next search begins. May exceed `text.len()` after
+    // an empty match at the end of the haystack, which ends iteration.
+    last_end: usize,
+    // End offset of the most recently yielded match, used to suppress an
+    // empty match immediately following a non-empty one.
+    last_match: Option<usize>,
+}
+
+impl<'r, 'c, 't> TryFindEarliestMatches<'r, 'c, 't> {
+    /// Create a new fallible iterator over earliest matches in `text`.
+    fn new(
+        re: &'r Regex,
+        cache: &'c mut Cache,
+        text: &'t [u8],
+    ) -> TryFindEarliestMatches<'r, 'c, 't> {
+        let scanner = re.scanner();
+        TryFindEarliestMatches {
+            re,
+            cache,
+            scanner,
+            text,
+            last_end: 0,
+            last_match: None,
+        }
+    }
+}
+
+impl<'r, 'c, 't> Iterator for TryFindEarliestMatches<'r, 'c, 't> {
+    type Item = Result<MultiMatch, MatchError>;
+
+    fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> {
+        if self.last_end > self.text.len() {
+            return None;
+        }
+        let result = self.re.try_find_earliest_at_imp(
+            self.scanner.as_mut(),
+            self.cache,
+            self.text,
+            self.last_end,
+            self.text.len(),
+        );
+        let m = match result {
+            Err(err) => return Some(Err(err)),
+            Ok(None) => return None,
+            Ok(Some(m)) => m,
+        };
+        if m.is_empty() {
+            // This is an empty match. To ensure we make progress, start
+            // the next search at the smallest possible starting position
+            // of the next match following this one.
+            self.last_end = if self.re.utf8 {
+                crate::util::next_utf8(self.text, m.end())
+            } else {
+                m.end() + 1
+            };
+            // Don't accept empty matches immediately following a match.
+            // Just move on to the next match.
+            if Some(m.end()) == self.last_match {
+                return self.next();
+            }
+        } else {
+            self.last_end = m.end();
+        }
+        self.last_match = Some(m.end());
+        Some(Ok(m))
+    }
+}
+
+/// An iterator over all non-overlapping leftmost matches for a particular
+/// fallible search.
+///
+/// The iterator yields a [`MultiMatch`] value until no more matches could be
+/// found.
+///
+/// The lifetime variables are as follows:
+///
+/// * `'r` is the lifetime of the regular expression itself.
+/// * `'c` is the lifetime of the mutable cache used during search.
+/// * `'t` is the lifetime of the text being searched.
+#[derive(Debug)]
+pub struct TryFindLeftmostMatches<'r, 'c, 't> {
+    re: &'r Regex,
+    cache: &'c mut Cache,
+    // Prefilter scanner created once and reused across every search.
+    scanner: Option<prefilter::Scanner<'r>>,
+    text: &'t [u8],
+    // Offset at which the next search begins. May exceed `text.len()` after
+    // an empty match at the end of the haystack, which ends iteration.
+    last_end: usize,
+    // End offset of the most recently yielded match, used to suppress an
+    // empty match immediately following a non-empty one.
+    last_match: Option<usize>,
+}
+
+impl<'r, 'c, 't> TryFindLeftmostMatches<'r, 'c, 't> {
+    /// Create a new fallible iterator over leftmost matches in `text`.
+    fn new(
+        re: &'r Regex,
+        cache: &'c mut Cache,
+        text: &'t [u8],
+    ) -> TryFindLeftmostMatches<'r, 'c, 't> {
+        let scanner = re.scanner();
+        TryFindLeftmostMatches {
+            re,
+            cache,
+            scanner,
+            text,
+            last_end: 0,
+            last_match: None,
+        }
+    }
+}
+
+impl<'r, 'c, 't> Iterator for TryFindLeftmostMatches<'r, 'c, 't> {
+    type Item = Result<MultiMatch, MatchError>;
+
+    fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> {
+        if self.last_end > self.text.len() {
+            return None;
+        }
+        let result = self.re.try_find_leftmost_at_imp(
+            self.scanner.as_mut(),
+            self.cache,
+            self.text,
+            self.last_end,
+            self.text.len(),
+        );
+        let m = match result {
+            Err(err) => return Some(Err(err)),
+            Ok(None) => return None,
+            Ok(Some(m)) => m,
+        };
+        if m.is_empty() {
+            // This is an empty match. To ensure we make progress, start
+            // the next search at the smallest possible starting position
+            // of the next match following this one.
+            self.last_end = if self.re.utf8 {
+                crate::util::next_utf8(self.text, m.end())
+            } else {
+                m.end() + 1
+            };
+            // Don't accept empty matches immediately following a match.
+            // Just move on to the next match.
+            if Some(m.end()) == self.last_match {
+                return self.next();
+            }
+        } else {
+            self.last_end = m.end();
+        }
+        self.last_match = Some(m.end());
+        Some(Ok(m))
+    }
+}
+
+/// An iterator over all overlapping matches for a particular fallible search.
+///
+/// The iterator yields a [`MultiMatch`] value until no more matches could be
+/// found.
+///
+/// The lifetime variables are as follows:
+///
+/// * `'r` is the lifetime of the regular expression itself.
+/// * `'c` is the lifetime of the mutable cache used during search.
+/// * `'t` is the lifetime of the text being searched.
+#[derive(Debug)]
+pub struct TryFindOverlappingMatches<'r, 'c, 't> {
+    re: &'r Regex,
+    cache: &'c mut Cache,
+    // Prefilter scanner created once and reused across every search.
+    scanner: Option<prefilter::Scanner<'r>>,
+    text: &'t [u8],
+    // Offset at which the next search begins.
+    last_end: usize,
+    // Overlapping search state threaded through every underlying search.
+    state: OverlappingState,
+}
+
+impl<'r, 'c, 't> TryFindOverlappingMatches<'r, 'c, 't> {
+    /// Create a new fallible iterator over overlapping matches in `text`.
+    fn new(
+        re: &'r Regex,
+        cache: &'c mut Cache,
+        text: &'t [u8],
+    ) -> TryFindOverlappingMatches<'r, 'c, 't> {
+        let scanner = re.scanner();
+        TryFindOverlappingMatches {
+            re,
+            cache,
+            scanner,
+            text,
+            last_end: 0,
+            state: OverlappingState::start(),
+        }
+    }
+}
+
+impl<'r, 'c, 't> Iterator for TryFindOverlappingMatches<'r, 'c, 't> {
+    type Item = Result<MultiMatch, MatchError>;
+
+    fn next(&mut self) -> Option<Result<MultiMatch, MatchError>> {
+        if self.last_end > self.text.len() {
+            return None;
+        }
+        let result = self.re.try_find_overlapping_at_imp(
+            self.scanner.as_mut(),
+            self.cache,
+            self.text,
+            self.last_end,
+            self.text.len(),
+            &mut self.state,
+        );
+        let m = match result {
+            Err(err) => return Some(Err(err)),
+            Ok(None) => return None,
+            Ok(Some(m)) => m,
+        };
+        // Unlike the non-overlapping case, we're OK with empty matches at this
+        // level. In particular, the overlapping search algorithm is itself
+        // responsible for ensuring that progress is always made.
+        self.last_end = m.end();
+        Some(Ok(m))
+    }
+}
+
+/// A cache represents a partially computed forward and reverse DFA.
+///
+/// A cache is the key component that differentiates a classical DFA and a
+/// hybrid NFA/DFA (also called a "lazy DFA"). Where a classical DFA builds a
+/// complete transition table that can handle all possible inputs, a hybrid
+/// NFA/DFA starts with an empty transition table and builds only the parts
+/// required during search. The parts that are built are stored in a cache. For
+/// this reason, a cache is a required parameter for nearly every operation on
+/// a [`Regex`].
+///
+/// Caches can be created from their corresponding `Regex` via
+/// [`Regex::create_cache`]. A cache can only be used with either the `Regex`
+/// that created it, or the `Regex` that was most recently used to reset it
+/// with [`Cache::reset`]. Using a cache with any other `Regex` may result in
+/// panics or incorrect results.
+#[derive(Debug, Clone)]
+pub struct Cache {
+    // Lazy-DFA caches for the forward and reverse DFAs, respectively.
+    forward: dfa::Cache,
+    reverse: dfa::Cache,
+}
+
+impl Cache {
+    /// Create a new cache for the given `Regex`.
+    ///
+    /// The cache returned should only be used for searches for the given
+    /// `Regex`. If you want to reuse the cache for another `Regex`, then you
+    /// must call [`Cache::reset`] with that `Regex`.
+    pub fn new(re: &Regex) -> Cache {
+        let forward = dfa::Cache::new(re.forward());
+        let reverse = dfa::Cache::new(re.reverse());
+        Cache { forward, reverse }
+    }
+
+    /// Reset this cache such that it can be used for searching with the given
+    /// `Regex` (and only that `Regex`).
+    ///
+    /// A cache reset permits reusing memory already allocated in this cache
+    /// with a different `Regex`.
+    ///
+    /// Resetting a cache sets its "clear count" to 0. This is relevant if the
+    /// `Regex` has been configured to "give up" after it has cleared the cache
+    /// a certain number of times.
+    ///
+    /// # Example
+    ///
+    /// This shows how to re-purpose a cache for use with a different `Regex`.
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::regex::Regex, MultiMatch};
+    ///
+    /// let re1 = Regex::new(r"\w")?;
+    /// let re2 = Regex::new(r"\W")?;
+    ///
+    /// let mut cache = re1.create_cache();
+    /// assert_eq!(
+    ///     Some(MultiMatch::must(0, 0, 2)),
+    ///     re1.find_leftmost(&mut cache, "Δ".as_bytes()),
+    /// );
+    ///
+    /// // Using 'cache' with re2 is not allowed. It may result in panics or
+    /// // incorrect results. In order to re-purpose the cache, we must reset
+    /// // it with the Regex we'd like to use it with.
+    /// //
+    /// // Similarly, after this reset, using the cache with 're1' is also not
+    /// // allowed.
+    /// cache.reset(&re2);
+    /// assert_eq!(
+    ///     Some(MultiMatch::must(0, 0, 3)),
+    ///     re2.find_leftmost(&mut cache, "☃".as_bytes()),
+    /// );
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn reset(&mut self, re: &Regex) {
+        self.forward.reset(re.forward());
+        self.reverse.reset(re.reverse());
+    }
+
+    /// Returns the heap memory usage, in bytes, as a sum of the forward and
+    /// reverse lazy DFA caches.
+    ///
+    /// This does **not** include the stack size used up by this cache. To
+    /// compute that, use `std::mem::size_of::<Cache>()`.
+    pub fn memory_usage(&self) -> usize {
+        self.forward.memory_usage() + self.reverse.memory_usage()
+    }
+
+    /// Return references to the forward and reverse caches, respectively.
+    pub fn as_parts(&self) -> (&dfa::Cache, &dfa::Cache) {
+        (&self.forward, &self.reverse)
+    }
+
+    /// Return mutable references to the forward and reverse caches,
+    /// respectively.
+    pub fn as_parts_mut(&mut self) -> (&mut dfa::Cache, &mut dfa::Cache) {
+        (&mut self.forward, &mut self.reverse)
+    }
+}
+
+/// The configuration used for compiling a hybrid NFA/DFA regex.
+///
+/// A regex configuration is a simple data object that is typically used with
+/// [`Builder::configure`].
+#[derive(Clone, Copy, Debug, Default)]
+pub struct Config {
+    // `None` means "not explicitly set"; see `Config::overwrite`.
+    utf8: Option<bool>,
+}
+
+impl Config {
+    /// Return a new default regex compiler configuration.
+    pub fn new() -> Config {
+        Config::default()
+    }
+
+    /// Whether to enable UTF-8 mode or not.
+    ///
+    /// When UTF-8 mode is enabled (the default) and an empty match is seen,
+    /// the iterators on [`Regex`] will always start the next search at the
+    /// next UTF-8 encoded codepoint when searching valid UTF-8. When UTF-8
+    /// mode is disabled, such searches are begun at the next byte offset.
+    ///
+    /// If this mode is enabled and invalid UTF-8 is given to search, then
+    /// behavior is unspecified.
+    ///
+    /// Generally speaking, one should enable this when
+    /// [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8)
+    /// and
+    /// [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8)
+    /// are enabled, and disable it otherwise.
+    ///
+    /// # Example
+    ///
+    /// This example demonstrates the differences between when this option is
+    /// enabled and disabled. The differences only arise when the regex can
+    /// return matches of length zero.
+    ///
+    /// In this first snippet, we show the results when UTF-8 mode is disabled.
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::regex::Regex, MultiMatch};
+    ///
+    /// let re = Regex::builder()
+    ///     .configure(Regex::config().utf8(false))
+    ///     .build(r"")?;
+    /// let mut cache = re.create_cache();
+    ///
+    /// let haystack = "a☃z".as_bytes();
+    /// let mut it = re.find_leftmost_iter(&mut cache, haystack);
+    /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 2, 2)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 3, 3)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next());
+    /// assert_eq!(None, it.next());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    ///
+    /// And in this snippet, we execute the same search on the same haystack,
+    /// but with UTF-8 mode enabled. Notice that byte offsets that would
+    /// otherwise split the encoding of `☃` are not returned.
+    ///
+    /// ```
+    /// use regex_automata::{hybrid::regex::Regex, MultiMatch};
+    ///
+    /// let re = Regex::builder()
+    ///     .configure(Regex::config().utf8(true))
+    ///     .build(r"")?;
+    /// let mut cache = re.create_cache();
+    ///
+    /// let haystack = "a☃z".as_bytes();
+    /// let mut it = re.find_leftmost_iter(&mut cache, haystack);
+    /// assert_eq!(Some(MultiMatch::must(0, 0, 0)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 1, 1)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 4, 4)), it.next());
+    /// assert_eq!(Some(MultiMatch::must(0, 5, 5)), it.next());
+    /// assert_eq!(None, it.next());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn utf8(mut self, yes: bool) -> Config {
+        self.utf8 = Some(yes);
+        self
+    }
+
+    /// Returns true if and only if this configuration has UTF-8 mode enabled.
+    ///
+    /// When UTF-8 mode is enabled and an empty match is seen, the iterators on
+    /// [`Regex`] will always start the next search at the next UTF-8 encoded
+    /// codepoint. When UTF-8 mode is disabled, such searches are begun at the
+    /// next byte offset.
+    ///
+    /// If [`Config::utf8`] was never called, this defaults to `true`.
+    pub fn get_utf8(&self) -> bool {
+        self.utf8.unwrap_or(true)
+    }
+
+    /// Overwrite the default configuration such that the options in `o` are
+    /// always used. If an option in `o` is not set, then the corresponding
+    /// option in `self` is used. If it's not set in `self` either, then it
+    /// remains not set.
+    pub(crate) fn overwrite(self, o: Config) -> Config {
+        Config { utf8: o.utf8.or(self.utf8) }
+    }
+}
+
+/// A builder for a regex based on a hybrid NFA/DFA.
+///
+/// This builder permits configuring options for the syntax of a pattern, the
+/// NFA construction, the lazy DFA construction and finally the regex searching
+/// itself. This builder is different from a general purpose regex builder
+/// in that it permits fine grain configuration of the construction process.
+/// The trade off for this is complexity, and the possibility of setting a
+/// configuration that might not make sense. For example, there are three
+/// different UTF-8 modes:
+///
+/// * [`SyntaxConfig::utf8`](crate::SyntaxConfig::utf8) controls whether the
+/// pattern itself can contain sub-expressions that match invalid UTF-8.
+/// * [`nfa::thompson::Config::utf8`](crate::nfa::thompson::Config::utf8)
+/// controls whether the implicit unanchored prefix added to the NFA can
+/// match through invalid UTF-8 or not.
+/// * [`Config::utf8`] controls how the regex iterators themselves advance
+/// the starting position of the next search when a match with zero length is
+/// found.
+///
+/// Generally speaking, callers will want to either enable all of these or
+/// disable all of these.
+///
+/// Internally, building a regex requires building two hybrid NFA/DFAs,
+/// where one is responsible for finding the end of a match and the other is
+/// responsible for finding the start of a match. If you only need to detect
+/// whether something matched, or only the end of a match, then you should use
+/// a [`dfa::Builder`] to construct a single hybrid NFA/DFA, which is cheaper
+/// than building two of them.
+///
+/// # Example
+///
+/// This example shows how to disable UTF-8 mode in the syntax, the NFA and
+/// the regex itself. This is generally what you want for matching on
+/// arbitrary bytes.
+///
+/// ```
+/// use regex_automata::{
+///     hybrid::regex::Regex, nfa::thompson, MultiMatch, SyntaxConfig
+/// };
+///
+/// let re = Regex::builder()
+///     .configure(Regex::config().utf8(false))
+///     .syntax(SyntaxConfig::new().utf8(false))
+///     .thompson(thompson::Config::new().utf8(false))
+///     .build(r"foo(?-u:[^b])ar.*")?;
+/// let mut cache = re.create_cache();
+///
+/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
+/// let expected = Some(MultiMatch::must(0, 1, 9));
+/// let got = re.find_leftmost(&mut cache, haystack);
+/// assert_eq!(expected, got);
+/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
+/// // but the subsequent `.*` does not! Disabling UTF-8
+/// // on the syntax permits this. Notice also that the
+/// // search was unanchored and skipped over invalid UTF-8.
+/// // Disabling UTF-8 on the Thompson NFA permits this.
+/// //
+/// // N.B. This example does not show the impact of
+/// // disabling UTF-8 mode on Config, since that
+/// // only impacts regexes that can produce matches of
+/// // length 0.
+/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone, Debug)]
+pub struct Builder {
+    config: Config,
+    // Builder for the underlying lazy DFAs; cloned and reconfigured when
+    // building the reverse DFA.
+    dfa: dfa::Builder,
+}
+
+impl Builder {
+    /// Create a new regex builder with the default configuration.
+    pub fn new() -> Builder {
+        Builder { config: Config::default(), dfa: DFA::builder() }
+    }
+
+    /// Build a regex from the given pattern.
+    ///
+    /// If there was a problem parsing or compiling the pattern, then an error
+    /// is returned.
+    pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> {
+        self.build_many(&[pattern])
+    }
+
+    /// Build a regex from the given patterns.
+    pub fn build_many<P: AsRef<str>>(
+        &self,
+        patterns: &[P],
+    ) -> Result<Regex, BuildError> {
+        let forward = self.dfa.build_many(patterns)?;
+        // The reverse DFA is always anchored, reports all matches and keeps
+        // per-pattern start states, so that a reverse search can be limited
+        // to the pattern found by the forward search.
+        let reverse = self
+            .dfa
+            .clone()
+            .configure(
+                DFA::config()
+                    .anchored(true)
+                    .match_kind(MatchKind::All)
+                    .starts_for_each_pattern(true),
+            )
+            .thompson(thompson::Config::new().reverse(true))
+            .build_many(patterns)?;
+        Ok(self.build_from_dfas(forward, reverse))
+    }
+
+    /// Build a regex from its component forward and reverse hybrid NFA/DFAs.
+    fn build_from_dfas(&self, forward: DFA, reverse: DFA) -> Regex {
+        // The congruous method on DFA-backed regexes is exposed, but it's
+        // not clear this builder is useful here since lazy DFAs can't be
+        // serialized and there is only one type of them.
+        let utf8 = self.config.get_utf8();
+        Regex { pre: None, forward, reverse, utf8 }
+    }
+
+    /// Apply the given regex configuration options to this builder.
+    pub fn configure(&mut self, config: Config) -> &mut Builder {
+        self.config = self.config.overwrite(config);
+        self
+    }
+
+    /// Set the syntax configuration for this builder using
+    /// [`SyntaxConfig`](crate::SyntaxConfig).
+    ///
+    /// This permits setting things like case insensitivity, Unicode and multi
+    /// line mode.
+    pub fn syntax(
+        &mut self,
+        config: crate::util::syntax::SyntaxConfig,
+    ) -> &mut Builder {
+        self.dfa.syntax(config);
+        self
+    }
+
+    /// Set the Thompson NFA configuration for this builder using
+    /// [`nfa::thompson::Config`](thompson::Config).
+    ///
+    /// This permits setting things like whether additional time should be
+    /// spent shrinking the size of the NFA.
+    pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
+        self.dfa.thompson(config);
+        self
+    }
+
+    /// Set the lazy DFA compilation configuration for this builder using
+    /// [`dfa::Config`](dfa::Config).
+    ///
+    /// This permits setting things like whether Unicode word boundaries should
+    /// be heuristically supported or settings how the behavior of the cache.
+    pub fn dfa(&mut self, config: dfa::Config) -> &mut Builder {
+        self.dfa.configure(config);
+        self
+    }
+}
+
+impl Default for Builder {
+    fn default() -> Builder {
+        Builder::new()
+    }
+}
+
+/// Unwrap one item from a fallible match iterator, panicking if the
+/// underlying search returned an error. The panic message directs callers
+/// toward the `try_` APIs for error handling.
+#[inline(always)]
+fn next_unwrap(
+    item: Option<Result<MultiMatch, MatchError>>,
+) -> Option<MultiMatch> {
+    match item {
+        None => None,
+        Some(Ok(m)) => Some(m),
+        Some(Err(err)) => panic!(
+            "unexpected regex search error: {}\n\
+             to handle search errors, use try_ methods",
+            err,
+        ),
+    }
+}
diff --git a/vendor/regex-automata/src/hybrid/search.rs b/vendor/regex-automata/src/hybrid/search.rs
new file mode 100644
index 000000000..92760cee2
--- /dev/null
+++ b/vendor/regex-automata/src/hybrid/search.rs
@@ -0,0 +1,663 @@
+use crate::{
+ hybrid::{
+ dfa::{Cache, DFA},
+ id::{LazyStateID, OverlappingState, StateMatch},
+ },
+ nfa::thompson,
+ util::{
+ id::PatternID,
+ matchtypes::{HalfMatch, MatchError},
+ prefilter, MATCH_OFFSET,
+ },
+};
+
+/// Run a forward search that stops at the first match state encountered,
+/// optionally accelerated by a prefilter scanner.
+#[inline(never)]
+pub(crate) fn find_earliest_fwd(
+    pre: Option<&mut prefilter::Scanner>,
+    dfa: &DFA,
+    cache: &mut Cache,
+    pattern_id: Option<PatternID>,
+    bytes: &[u8],
+    start: usize,
+    end: usize,
+) -> Result<Option<HalfMatch>, MatchError> {
+    // Searching with a pattern ID is always anchored, and an anchored search
+    // never benefits from a prefilter, so drop the scanner in that case.
+    // (When 'pre' is already 'None', passing it through unchanged is
+    // equivalent to passing 'None'.)
+    let scanner = if pattern_id.is_none() { pre } else { None };
+    find_fwd(scanner, true, dfa, cache, pattern_id, bytes, start, end)
+}
+
+/// Run a forward search for the leftmost match, optionally accelerated by a
+/// prefilter scanner.
+#[inline(never)]
+pub(crate) fn find_leftmost_fwd(
+    pre: Option<&mut prefilter::Scanner>,
+    dfa: &DFA,
+    cache: &mut Cache,
+    pattern_id: Option<PatternID>,
+    bytes: &[u8],
+    start: usize,
+    end: usize,
+) -> Result<Option<HalfMatch>, MatchError> {
+    // Searching with a pattern ID is always anchored, and an anchored search
+    // never benefits from a prefilter, so drop the scanner in that case.
+    // (When 'pre' is already 'None', passing it through unchanged is
+    // equivalent to passing 'None'.)
+    let scanner = if pattern_id.is_none() { pre } else { None };
+    find_fwd(scanner, false, dfa, cache, pattern_id, bytes, start, end)
+}
+
+/// Generic forward lazy-DFA search over `haystack[start..end]`.
+///
+/// This is the shared implementation behind 'find_earliest_fwd' and
+/// 'find_leftmost_fwd': when 'earliest' is true, the search returns as soon
+/// as the first match state is entered; otherwise it keeps scanning and
+/// remembers the most recent match seen ('last_match'), returning it when a
+/// dead state or the end of the haystack is reached.
+///
+/// Errors are returned when the DFA quits on a byte ('MatchError::Quit') or
+/// when the cache rejects a transition/start state (see 'gave_up').
+#[inline(always)]
+fn find_fwd(
+    mut pre: Option<&mut prefilter::Scanner>,
+    earliest: bool,
+    dfa: &DFA,
+    cache: &mut Cache,
+    pattern_id: Option<PatternID>,
+    haystack: &[u8],
+    start: usize,
+    end: usize,
+) -> Result<Option<HalfMatch>, MatchError> {
+    assert!(start <= end);
+    assert!(start <= haystack.len());
+    assert!(end <= haystack.len());
+
+    // Why do this? This lets 'bytes[at]' work without bounds checks below.
+    // It seems the assert on 'end <= haystack.len()' above is otherwise
+    // not enough. Why not just make 'bytes' scoped this way anyway? Well,
+    // 'eoi_fwd' (below) might actually want to try to access the byte at 'end'
+    // for resolving look-ahead.
+    let bytes = &haystack[..end];
+
+    let mut sid = init_fwd(dfa, cache, pattern_id, haystack, start, end)?;
+    let mut last_match = None;
+    let mut at = start;
+    if let Some(ref mut pre) = pre {
+        // If a prefilter doesn't report false positives, then we don't need to
+        // touch the DFA at all. However, since all matches include the pattern
+        // ID, and the prefilter infrastructure doesn't report pattern IDs, we
+        // limit this optimization to cases where there is exactly one pattern.
+        // In that case, any match must be the 0th pattern.
+        if dfa.pattern_count() == 1 && !pre.reports_false_positives() {
+            return Ok(pre.next_candidate(bytes, at).into_option().map(
+                |offset| HalfMatch { pattern: PatternID::ZERO, offset },
+            ));
+        } else if pre.is_effective(at) {
+            match pre.next_candidate(bytes, at).into_option() {
+                None => return Ok(None),
+                Some(i) => {
+                    at = i;
+                }
+            }
+        }
+    }
+    // Main transition loop. Untagged states take the unchecked fast path in
+    // the 'else' branch; tagged states (start/match/dead/quit/unknown) are
+    // dispatched at the bottom of the loop body.
+    while at < end {
+        if sid.is_tagged() {
+            sid = dfa
+                .next_state(cache, sid, bytes[at])
+                .map_err(|_| gave_up(at))?;
+            at += 1;
+        } else {
+            // SAFETY: There are two safety invariants we need to uphold
+            // here in the loop below: that 'sid' is a valid state ID for
+            // this DFA, and that 'at' is a valid index into 'bytes'. For
+            // the former, we rely on the invariant that next_state* and
+            // start_state_forward always returns a valid state ID (given a
+            // valid state ID in the former case), and that we are only at this
+            // place in the code if 'sid' is untagged. Moreover, every call to
+            // next_state_untagged_unchecked below is guarded by a check that
+            // sid is untagged. For the latter safety invariant, we always
+            // guard unchecked access with a check that 'at' is less than
+            // 'end', where 'end == bytes.len()'.
+            //
+            // For justification, this gives us a ~10% bump in search time.
+            // This was used for a benchmark:
+            //
+            //     regex-cli find hybrid regex @/some/big/file '(?m)^.+$' -UBb
+            //
+            // With bounds checked: ~881.4ms. Without: ~775ms. For input, I
+            // used OpenSubtitles2018.raw.sample.medium.en.
+            let mut prev_sid = sid;
+            while at < end {
+                prev_sid = sid;
+                sid = unsafe {
+                    dfa.next_state_untagged_unchecked(
+                        cache,
+                        sid,
+                        *bytes.get_unchecked(at),
+                    )
+                };
+                at += 1;
+                if sid.is_tagged() {
+                    break;
+                }
+                // SAFETY: we make four unguarded accesses to 'bytes[at]'
+                // below, and each are safe because we know that 'at + 4' is
+                // in bounds. Moreover, while we don't check whether 'sid' is
+                // untagged directly, we know it is because of the check above.
+                // And the unrolled loop below quits when the next state is not
+                // equal to the previous state.
+                //
+                // PERF: For justification for eliminating bounds checks,
+                // see above. For justification for the unrolling, we use
+                // two tests. The one above with regex '(?m)^.+$', and also
+                // '(?m)^.{40}$'. The former is kinda the best case for
+                // unrolling, and gives a 1.67 boost primarily because the DFA
+                // spends most of its time munching through the input in the
+                // same state. But the latter pattern rarely spends time in the
+                // same state through subsequent transitions, so unrolling is
+                // pretty much always ineffective in that it craps out on the
+                // first 'sid != next' check below. However, without unrolling,
+                // search is only 1.03 times faster than with unrolling on the
+                // latter pattern, which we deem to be an acceptable loss in
+                // favor of optimizing the more common case of having a "hot"
+                // state somewhere in the DFA.
+                while at + 4 < end {
+                    let next = unsafe {
+                        dfa.next_state_untagged_unchecked(
+                            cache,
+                            sid,
+                            *bytes.get_unchecked(at),
+                        )
+                    };
+                    if sid != next {
+                        break;
+                    }
+                    at += 1;
+                    let next = unsafe {
+                        dfa.next_state_untagged_unchecked(
+                            cache,
+                            sid,
+                            *bytes.get_unchecked(at),
+                        )
+                    };
+                    if sid != next {
+                        break;
+                    }
+                    at += 1;
+                    let next = unsafe {
+                        dfa.next_state_untagged_unchecked(
+                            cache,
+                            sid,
+                            *bytes.get_unchecked(at),
+                        )
+                    };
+                    if sid != next {
+                        break;
+                    }
+                    at += 1;
+                    let next = unsafe {
+                        dfa.next_state_untagged_unchecked(
+                            cache,
+                            sid,
+                            *bytes.get_unchecked(at),
+                        )
+                    };
+                    if sid != next {
+                        break;
+                    }
+                    at += 1;
+                }
+            }
+            // The unchecked transitions above can land on the sentinel
+            // "unknown" state; re-execute the last transition through the
+            // fallible API, which can report that the cache gave up.
+            if sid.is_unknown() {
+                sid = dfa
+                    .next_state(cache, prev_sid, bytes[at - 1])
+                    .map_err(|_| gave_up(at - 1))?;
+            }
+        }
+        if sid.is_tagged() {
+            if sid.is_start() {
+                if let Some(ref mut pre) = pre {
+                    if pre.is_effective(at) {
+                        match pre.next_candidate(bytes, at).into_option() {
+                            None => return Ok(None),
+                            Some(i) => {
+                                at = i;
+                            }
+                        }
+                    }
+                }
+            } else if sid.is_match() {
+                // Matches are delayed by MATCH_OFFSET bytes (see the long
+                // comment in 'find_overlapping_fwd_imp'), so subtract it to
+                // recover the true end offset.
+                last_match = Some(HalfMatch {
+                    pattern: dfa.match_pattern(cache, sid, 0),
+                    offset: at - MATCH_OFFSET,
+                });
+                if earliest {
+                    return Ok(last_match);
+                }
+            } else if sid.is_dead() {
+                return Ok(last_match);
+            } else if sid.is_quit() {
+                if last_match.is_some() {
+                    return Ok(last_match);
+                }
+                let offset = at - 1;
+                return Err(MatchError::Quit { byte: bytes[offset], offset });
+            } else {
+                debug_assert!(sid.is_unknown());
+                unreachable!("sid being unknown is a bug");
+            }
+        }
+    }
+    // We are careful to use 'haystack' here, which contains the full context
+    // that we might want to inspect.
+    Ok(eoi_fwd(dfa, cache, haystack, end, &mut sid)?.or(last_match))
+}
+
+/// Run a reverse search that stops at the first match state encountered
+/// (i.e., 'find_rev' with 'earliest' set). Reverse searches take no
+/// prefilter, since prefilters only scan forwards.
+#[inline(never)]
+pub(crate) fn find_earliest_rev(
+    dfa: &DFA,
+    cache: &mut Cache,
+    pattern_id: Option<PatternID>,
+    bytes: &[u8],
+    start: usize,
+    end: usize,
+) -> Result<Option<HalfMatch>, MatchError> {
+    find_rev(true, dfa, cache, pattern_id, bytes, start, end)
+}
+
+/// Run a reverse search for the leftmost match (i.e., 'find_rev' with
+/// 'earliest' unset). Reverse searches take no prefilter, since prefilters
+/// only scan forwards.
+#[inline(never)]
+pub(crate) fn find_leftmost_rev(
+    dfa: &DFA,
+    cache: &mut Cache,
+    pattern_id: Option<PatternID>,
+    bytes: &[u8],
+    start: usize,
+    end: usize,
+) -> Result<Option<HalfMatch>, MatchError> {
+    find_rev(false, dfa, cache, pattern_id, bytes, start, end)
+}
+
+/// Generic reverse lazy-DFA search over `haystack[start..end]`, scanning
+/// from 'end' down towards 'start'.
+///
+/// This is the shared implementation behind 'find_earliest_rev' and
+/// 'find_leftmost_rev'. The returned 'HalfMatch' offset is an absolute
+/// haystack offset ('start + at + MATCH_OFFSET' below). Errors are returned
+/// when the DFA quits on a byte or when the cache gives up.
+#[inline(always)]
+fn find_rev(
+    earliest: bool,
+    dfa: &DFA,
+    cache: &mut Cache,
+    pattern_id: Option<PatternID>,
+    haystack: &[u8],
+    start: usize,
+    end: usize,
+) -> Result<Option<HalfMatch>, MatchError> {
+    assert!(start <= end);
+    assert!(start <= haystack.len());
+    assert!(end <= haystack.len());
+
+    // Why do this? This lets 'bytes[at]' work without bounds checks below,
+    // with 'at' being an index relative to 'start'. Why not just search
+    // 'haystack' directly? Well, 'eoi_rev' (below) might actually want to
+    // try to access the byte just before 'start' for resolving look-behind,
+    // which is why it is handed the full 'haystack' instead.
+    let bytes = &haystack[start..];
+
+    let mut sid = init_rev(dfa, cache, pattern_id, haystack, start, end)?;
+    let mut last_match = None;
+    let mut at = end - start;
+    while at > 0 {
+        if sid.is_tagged() {
+            at -= 1;
+            sid = dfa
+                .next_state(cache, sid, bytes[at])
+                .map_err(|_| gave_up(at))?;
+        } else {
+            // SAFETY: See comments in 'find_fwd' for both a safety argument
+            // and a justification from a performance perspective as to 1) why
+            // we elide bounds checks and 2) why we do a specialized version of
+            // unrolling below.
+            let mut prev_sid = sid;
+            while at > 0 && !sid.is_tagged() {
+                prev_sid = sid;
+                at -= 1;
+                while at > 3 {
+                    let next = unsafe {
+                        dfa.next_state_untagged_unchecked(
+                            cache,
+                            sid,
+                            *bytes.get_unchecked(at),
+                        )
+                    };
+                    if sid != next {
+                        break;
+                    }
+                    at -= 1;
+                    let next = unsafe {
+                        dfa.next_state_untagged_unchecked(
+                            cache,
+                            sid,
+                            *bytes.get_unchecked(at),
+                        )
+                    };
+                    if sid != next {
+                        break;
+                    }
+                    at -= 1;
+                    let next = unsafe {
+                        dfa.next_state_untagged_unchecked(
+                            cache,
+                            sid,
+                            *bytes.get_unchecked(at),
+                        )
+                    };
+                    if sid != next {
+                        break;
+                    }
+                    at -= 1;
+                    let next = unsafe {
+                        dfa.next_state_untagged_unchecked(
+                            cache,
+                            sid,
+                            *bytes.get_unchecked(at),
+                        )
+                    };
+                    if sid != next {
+                        break;
+                    }
+                    at -= 1;
+                }
+                sid = unsafe {
+                    dfa.next_state_untagged_unchecked(
+                        cache,
+                        sid,
+                        *bytes.get_unchecked(at),
+                    )
+                };
+            }
+            // The unchecked transitions above can land on the sentinel
+            // "unknown" state; re-execute the last transition through the
+            // fallible API, which can report that the cache gave up.
+            if sid.is_unknown() {
+                sid = dfa
+                    .next_state(cache, prev_sid, bytes[at])
+                    .map_err(|_| gave_up(at))?;
+            }
+        }
+        if sid.is_tagged() {
+            if sid.is_start() {
+                continue;
+            } else if sid.is_match() {
+                last_match = Some(HalfMatch {
+                    pattern: dfa.match_pattern(cache, sid, 0),
+                    offset: start + at + MATCH_OFFSET,
+                });
+                if earliest {
+                    return Ok(last_match);
+                }
+            } else if sid.is_dead() {
+                return Ok(last_match);
+            } else {
+                debug_assert!(sid.is_quit());
+                if last_match.is_some() {
+                    return Ok(last_match);
+                }
+                // NOTE(review): 'at' here is relative to the 'bytes' slice
+                // (the absolute haystack offset would be 'start + at'), while
+                // the match offsets above do add 'start'. Confirm whether the
+                // slice-relative offset is intended for quit errors.
+                return Err(MatchError::Quit { byte: bytes[at], offset: at });
+            }
+        }
+    }
+    Ok(eoi_rev(dfa, cache, haystack, start, sid)?.or(last_match))
+}
+
+/// Run an overlapping forward search, resuming from and updating
+/// 'caller_state' so that repeated calls can report successive matches.
+#[inline(never)]
+pub(crate) fn find_overlapping_fwd(
+    pre: Option<&mut prefilter::Scanner>,
+    dfa: &DFA,
+    cache: &mut Cache,
+    pattern_id: Option<PatternID>,
+    bytes: &[u8],
+    start: usize,
+    end: usize,
+    caller_state: &mut OverlappingState,
+) -> Result<Option<HalfMatch>, MatchError> {
+    // Searching with a pattern ID is always anchored, and an anchored search
+    // never benefits from a prefilter, so drop the scanner in that case.
+    // (When 'pre' is already 'None', passing it through unchanged is
+    // equivalent to passing 'None'.)
+    let scanner = if pattern_id.is_none() { pre } else { None };
+    find_overlapping_fwd_imp(
+        scanner,
+        dfa,
+        cache,
+        pattern_id,
+        bytes,
+        start,
+        end,
+        caller_state,
+    )
+}
+
+/// The implementation of overlapping forward search.
+///
+/// 'caller_state' carries the DFA state and last-match bookkeeping between
+/// calls: each call either drains another pattern index from the previous
+/// match state, or resumes the scan from where the last call stopped. A
+/// 'None' return means no further matches were found in 'bytes[..end]'.
+#[inline(always)]
+fn find_overlapping_fwd_imp(
+    mut pre: Option<&mut prefilter::Scanner>,
+    dfa: &DFA,
+    cache: &mut Cache,
+    pattern_id: Option<PatternID>,
+    bytes: &[u8],
+    mut start: usize,
+    end: usize,
+    caller_state: &mut OverlappingState,
+) -> Result<Option<HalfMatch>, MatchError> {
+    assert!(start <= end);
+    assert!(start <= bytes.len());
+    assert!(end <= bytes.len());
+
+    let mut sid = match caller_state.id() {
+        None => init_fwd(dfa, cache, pattern_id, bytes, start, end)?,
+        Some(sid) => {
+            // If the previous call stopped in a state matching several
+            // patterns, report the next pattern at the same offset before
+            // advancing the scan.
+            if let Some(last) = caller_state.last_match() {
+                let match_count = dfa.match_count(cache, sid);
+                if last.match_index < match_count {
+                    let m = HalfMatch {
+                        pattern: dfa.match_pattern(
+                            cache,
+                            sid,
+                            last.match_index,
+                        ),
+                        offset: last.offset,
+                    };
+                    last.match_index += 1;
+                    return Ok(Some(m));
+                }
+            }
+
+            // This is a subtle but critical detail. If the caller provides a
+            // non-None state ID, then it must be the case that the state ID
+            // corresponds to one set by this function. The state ID therefore
+            // corresponds to a match state, a dead state or some other state.
+            // However, "some other" state _only_ occurs when the input has
+            // been exhausted because the only way to stop before then is to
+            // see a match or a dead/quit state.
+            //
+            // If the input is exhausted or if it's a dead state, then
+            // incrementing the starting position has no relevance on
+            // correctness, since the loop below will either not execute
+            // at all or will immediately stop due to being in a dead state.
+            // (Once in a dead state it is impossible to leave it.)
+            //
+            // Therefore, the only case we need to consider is when
+            // caller_state is a match state. In this case, since our machines
+            // support the ability to delay a match by a certain number of
+            // bytes (to support look-around), it follows that we actually
+            // consumed that many additional bytes on our previous search. When
+            // the caller resumes their search to find subsequent matches, they
+            // will use the ending location from the previous match as the next
+            // starting point, which is `match_offset` bytes PRIOR to where
+            // we scanned to on the previous search. Therefore, we need to
+            // compensate by bumping `start` up by `MATCH_OFFSET` bytes.
+            //
+            // Incidentally, since MATCH_OFFSET is non-zero, this also makes
+            // dealing with empty matches convenient. Namely, callers needn't
+            // special case them when implementing an iterator. Instead, this
+            // ensures that forward progress is always made.
+            start += MATCH_OFFSET;
+            sid
+        }
+    };
+
+    let mut at = start;
+    while at < end {
+        let byte = bytes[at];
+        sid = dfa.next_state(cache, sid, byte).map_err(|_| gave_up(at))?;
+        at += 1;
+        if sid.is_tagged() {
+            caller_state.set_id(sid);
+            if sid.is_start() {
+                if let Some(ref mut pre) = pre {
+                    if pre.is_effective(at) {
+                        match pre.next_candidate(bytes, at).into_option() {
+                            None => return Ok(None),
+                            Some(i) => {
+                                at = i;
+                            }
+                        }
+                    }
+                }
+            } else if sid.is_match() {
+                let offset = at - MATCH_OFFSET;
+                caller_state
+                    .set_last_match(StateMatch { match_index: 1, offset });
+                return Ok(Some(HalfMatch {
+                    pattern: dfa.match_pattern(cache, sid, 0),
+                    offset,
+                }));
+            } else if sid.is_dead() {
+                return Ok(None);
+            } else {
+                debug_assert!(sid.is_quit());
+                return Err(MatchError::Quit { byte, offset: at - 1 });
+            }
+        }
+    }
+
+    // Input exhausted: resolve any end-of-input transition, then persist the
+    // final state (and match bookkeeping, if one was found) for resumption.
+    let result = eoi_fwd(dfa, cache, bytes, end, &mut sid);
+    caller_state.set_id(sid);
+    if let Ok(Some(ref last_match)) = result {
+        caller_state.set_last_match(StateMatch {
+            // '1' is always correct here since if we get to this point, this
+            // always corresponds to the first (index '0') match discovered at
+            // this position. So the next match to report at this position (if
+            // it exists) is at index '1'.
+            match_index: 1,
+            offset: last_match.offset(),
+        });
+    }
+    result
+}
+
+/// Compute the forward start state for a search of 'bytes[start..end]'.
+///
+/// A failure to compute the start state (e.g., the cache rejecting it) is
+/// reported as a "gave up" error at 'start'.
+#[inline(always)]
+fn init_fwd(
+    dfa: &DFA,
+    cache: &mut Cache,
+    pattern_id: Option<PatternID>,
+    bytes: &[u8],
+    start: usize,
+    end: usize,
+) -> Result<LazyStateID, MatchError> {
+    let sid = match dfa.start_state_forward(cache, pattern_id, bytes, start, end)
+    {
+        Ok(sid) => sid,
+        Err(_) => return Err(gave_up(start)),
+    };
+    // Start states can never be match states, since all matches are delayed
+    // by 1 byte.
+    assert!(!sid.is_match());
+    Ok(sid)
+}
+
+/// Compute the reverse start state for a search of 'bytes[start..end]'.
+///
+/// A failure to compute the start state (e.g., the cache rejecting it) is
+/// reported as a "gave up" error at 'end', where a reverse scan begins.
+#[inline(always)]
+fn init_rev(
+    dfa: &DFA,
+    cache: &mut Cache,
+    pattern_id: Option<PatternID>,
+    bytes: &[u8],
+    start: usize,
+    end: usize,
+) -> Result<LazyStateID, MatchError> {
+    let sid = match dfa.start_state_reverse(cache, pattern_id, bytes, start, end)
+    {
+        Ok(sid) => sid,
+        Err(_) => return Err(gave_up(end)),
+    };
+    // Start states can never be match states, since all matches are delayed
+    // by 1 byte.
+    assert!(!sid.is_match());
+    Ok(sid)
+}
+
+/// Resolve the end of a forward search at position 'end'.
+///
+/// Feeds the look-ahead byte at 'end' when one exists, or the special
+/// end-of-input transition otherwise, updating '*sid' in place. Returns a
+/// match if the resulting state is a match state.
+#[inline(always)]
+fn eoi_fwd(
+    dfa: &DFA,
+    cache: &mut Cache,
+    bytes: &[u8],
+    end: usize,
+    sid: &mut LazyStateID,
+) -> Result<Option<HalfMatch>, MatchError> {
+    // Take one more transition, tracking the offset a match would have.
+    let offset = match bytes.get(end) {
+        Some(&b) => {
+            *sid = dfa.next_state(cache, *sid, b).map_err(|_| gave_up(end))?;
+            end
+        }
+        None => {
+            *sid = dfa
+                .next_eoi_state(cache, *sid)
+                .map_err(|_| gave_up(bytes.len()))?;
+            bytes.len()
+        }
+    };
+    if sid.is_match() {
+        let pattern = dfa.match_pattern(cache, *sid, 0);
+        Ok(Some(HalfMatch { pattern, offset }))
+    } else {
+        Ok(None)
+    }
+}
+
+/// Resolve the end of a reverse search at position 'start'.
+///
+/// Feeds the look-behind byte just before 'start' when one exists, or the
+/// special end-of-input transition when the scan reached the beginning of
+/// the haystack. Returns a match (anchored at 'start') if the resulting
+/// state is a match state.
+#[inline(always)]
+fn eoi_rev(
+    dfa: &DFA,
+    cache: &mut Cache,
+    bytes: &[u8],
+    start: usize,
+    state: LazyStateID,
+) -> Result<Option<HalfMatch>, MatchError> {
+    // Take one more transition. Note that when 'start == 0', the reported
+    // match offset below is 'start' itself, i.e., 0.
+    let sid = if start > 0 {
+        dfa.next_state(cache, state, bytes[start - 1])
+            .map_err(|_| gave_up(start))?
+    } else {
+        dfa.next_eoi_state(cache, state).map_err(|_| gave_up(start))?
+    };
+    if sid.is_match() {
+        let pattern = dfa.match_pattern(cache, sid, 0);
+        Ok(Some(HalfMatch { pattern, offset: start }))
+    } else {
+        Ok(None)
+    }
+}
+
+/// A convenience routine for constructing a "gave up" match error.
+///
+/// Used throughout this module whenever the lazy DFA's cache rejects a
+/// transition or start-state computation; 'offset' is the haystack position
+/// at which the search stopped making progress.
+#[inline(always)]
+fn gave_up(offset: usize) -> MatchError {
+    MatchError::GaveUp { offset }
+}