diff options
Diffstat (limited to 'vendor/regex-automata/src/nfa/thompson/pikevm.rs')
-rw-r--r-- | vendor/regex-automata/src/nfa/thompson/pikevm.rs | 2409 |
1 files changed, 2107 insertions, 302 deletions
diff --git a/vendor/regex-automata/src/nfa/thompson/pikevm.rs b/vendor/regex-automata/src/nfa/thompson/pikevm.rs index 7572f9f10..0128c151a 100644 --- a/vendor/regex-automata/src/nfa/thompson/pikevm.rs +++ b/vendor/regex-automata/src/nfa/thompson/pikevm.rs @@ -1,18 +1,71 @@ -use alloc::{sync::Arc, vec, vec::Vec}; +/*! +An NFA backed Pike VM for executing regex searches with capturing groups. + +This module provides a [`PikeVM`] that works by simulating an NFA and +resolving all spans of capturing groups that participate in a match. +*/ + +#[cfg(feature = "internal-instrument-pikevm")] +use core::cell::RefCell; + +use alloc::{vec, vec::Vec}; use crate::{ - nfa::thompson::{self, Error, State, NFA}, + nfa::thompson::{self, BuildError, State, NFA}, util::{ - id::{PatternID, StateID}, - matchtypes::MultiMatch, + captures::Captures, + empty, iter, + prefilter::Prefilter, + primitives::{NonMaxUsize, PatternID, SmallIndex, StateID}, + search::{ + Anchored, HalfMatch, Input, Match, MatchKind, PatternSet, Span, + }, sparse_set::SparseSet, }, }; -#[derive(Clone, Copy, Debug, Default)] +/// A simple macro for conditionally executing instrumentation logic when +/// the 'trace' log level is enabled. This is a compile-time no-op when the +/// 'internal-instrument-pikevm' feature isn't enabled. The intent here is that +/// this makes it easier to avoid doing extra work when instrumentation isn't +/// enabled. +/// +/// This macro accepts a closure of type `|&mut Counters|`. The closure can +/// then increment counters (or whatever) in accordance with what one wants +/// to track. +macro_rules! instrument { + ($fun:expr) => { + #[cfg(feature = "internal-instrument-pikevm")] + { + let fun: &mut dyn FnMut(&mut Counters) = &mut $fun; + COUNTERS.with(|c: &RefCell<Counters>| fun(&mut *c.borrow_mut())); + } + }; +} + +#[cfg(feature = "internal-instrument-pikevm")] +std::thread_local! { + /// Effectively global state used to keep track of instrumentation + /// counters. The "proper" way to do this is to thread it through the + /// PikeVM, but it makes the code quite icky. Since this is just a + /// debugging feature, we're content to relegate it to thread local + /// state. When instrumentation is enabled, the counters are reset at the + /// beginning of every search and printed (with the 'trace' log level) at + /// the end of every search. + static COUNTERS: RefCell<Counters> = RefCell::new(Counters::empty()); +} + +/// The configuration used for building a [`PikeVM`]. +/// +/// A PikeVM configuration is a simple data object that is typically used with +/// [`Builder::configure`]. It can be cheaply cloned. +/// +/// A default configuration can be created either with `Config::new`, or +/// perhaps more conveniently, with [`PikeVM::config`]. +#[derive(Clone, Debug, Default)] pub struct Config { - anchored: Option<bool>, - utf8: Option<bool>, + match_kind: Option<MatchKind>, + pre: Option<Option<Prefilter>>, } impl Config { @@ -21,37 +74,172 @@ impl Config { Config::default() } - pub fn anchored(mut self, yes: bool) -> Config { - self.anchored = Some(yes); + /// Set the desired match semantics. + /// + /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the + /// match semantics of Perl-like regex engines. That is, when multiple + /// patterns would match at the same leftmost position, the pattern that + /// appears first in the concrete syntax is chosen. + /// + /// Currently, the only other kind of match semantics supported is + /// [`MatchKind::All`]. This corresponds to "classical DFA" construction + /// where all possible matches are visited in the NFA by the `PikeVM`. + /// + /// Typically, `All` is used when one wants to execute an overlapping + /// search and `LeftmostFirst` otherwise. In particular, it rarely makes + /// sense to use `All` with the various "leftmost" find routines, since the + /// leftmost routines depend on the `LeftmostFirst` automata construction + /// strategy. Specifically, `LeftmostFirst` results in the `PikeVM` + /// simulating dead states as a way to terminate the search and report a + /// match. `LeftmostFirst` also supports non-greedy matches using this + /// strategy where as `All` does not. + pub fn match_kind(mut self, kind: MatchKind) -> Config { + self.match_kind = Some(kind); self } - pub fn utf8(mut self, yes: bool) -> Config { - self.utf8 = Some(yes); + /// Set a prefilter to be used whenever a start state is entered. + /// + /// A [`Prefilter`] in this context is meant to accelerate searches by + /// looking for literal prefixes that every match for the corresponding + /// pattern (or patterns) must start with. Once a prefilter produces a + /// match, the underlying search routine continues on to try and confirm + /// the match. + /// + /// Be warned that setting a prefilter does not guarantee that the search + /// will be faster. While it's usually a good bet, if the prefilter + /// produces a lot of false positive candidates (i.e., positions matched + /// by the prefilter but not by the regex), then the overall result can + /// be slower than if you had just executed the regex engine without any + /// prefilters. + /// + /// By default no prefilter is set. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// util::prefilter::Prefilter, + /// Input, Match, MatchKind, + /// }; + /// + /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]); + /// let re = PikeVM::builder() + /// .configure(PikeVM::config().prefilter(pre)) + /// .build(r"(foo|bar)[a-z]+")?; + /// let mut cache = re.create_cache(); + /// let input = Input::new("foo1 barfox bar"); + /// assert_eq!(Some(Match::must(0, 5..11)), re.find(&mut cache, input)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Be warned though that an incorrect prefilter can lead to incorrect + /// results! + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// util::prefilter::Prefilter, + /// Input, HalfMatch, MatchKind, + /// }; + /// + /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]); + /// let re = PikeVM::builder() + /// .configure(PikeVM::config().prefilter(pre)) + /// .build(r"(foo|bar)[a-z]+")?; + /// let mut cache = re.create_cache(); + /// let input = Input::new("foo1 barfox bar"); + /// // No match reported even though there clearly is one! + /// assert_eq!(None, re.find(&mut cache, input)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config { + self.pre = Some(pre); self } - pub fn get_anchored(&self) -> bool { - self.anchored.unwrap_or(false) + /// Returns the match semantics set in this configuration. + pub fn get_match_kind(&self) -> MatchKind { + self.match_kind.unwrap_or(MatchKind::LeftmostFirst) } - pub fn get_utf8(&self) -> bool { - self.utf8.unwrap_or(true) + /// Returns the prefilter set in this configuration, if one at all. + pub fn get_prefilter(&self) -> Option<&Prefilter> { + self.pre.as_ref().unwrap_or(&None).as_ref() } - pub(crate) fn overwrite(self, o: Config) -> Config { + /// Overwrite the default configuration such that the options in `o` are + /// always used. If an option in `o` is not set, then the corresponding + /// option in `self` is used. If it's not set in `self` either, then it + /// remains not set. + pub(crate) fn overwrite(&self, o: Config) -> Config { Config { - anchored: o.anchored.or(self.anchored), - utf8: o.utf8.or(self.utf8), + match_kind: o.match_kind.or(self.match_kind), + pre: o.pre.or_else(|| self.pre.clone()), } } } -/// A builder for a PikeVM. +/// A builder for a `PikeVM`. +/// +/// This builder permits configuring options for the syntax of a pattern, +/// the NFA construction and the `PikeVM` construction. This builder is +/// different from a general purpose regex builder in that it permits fine +/// grain configuration of the construction process. The trade off for this is +/// complexity, and the possibility of setting a configuration that might not +/// make sense. For example, there are two different UTF-8 modes: +/// +/// * [`util::syntax::Config::utf8`](crate::util::syntax::Config::utf8) +/// controls whether the pattern itself can contain sub-expressions that match +/// invalid UTF-8. +/// * [`thompson::Config::utf8`] controls whether empty matches that split a +/// Unicode codepoint are reported or not. +/// +/// Generally speaking, callers will want to either enable all of these or +/// disable all of these. +/// +/// # Example +/// +/// This example shows how to disable UTF-8 mode in the syntax and the regex +/// itself. This is generally what you want for matching on arbitrary bytes. +/// +/// ``` +/// use regex_automata::{ +/// nfa::thompson::{self, pikevm::PikeVM}, +/// util::syntax, +/// Match, +/// }; +/// +/// let re = PikeVM::builder() +/// .syntax(syntax::Config::new().utf8(false)) +/// .thompson(thompson::Config::new().utf8(false)) +/// .build(r"foo(?-u:[^b])ar.*")?; +/// let mut cache = re.create_cache(); +/// +/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; +/// let expected = Some(Match::must(0, 1..9)); +/// let got = re.find_iter(&mut cache, haystack).next(); +/// assert_eq!(expected, got); +/// // Notice that `(?-u:[^b])` matches invalid UTF-8, +/// // but the subsequent `.*` does not! Disabling UTF-8 +/// // on the syntax permits this. +/// // +/// // N.B. This example does not show the impact of +/// // disabling UTF-8 mode on a PikeVM Config, since that +/// // only impacts regexes that can produce matches of +/// // length 0. +/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` #[derive(Clone, Debug)] pub struct Builder { config: Config, - thompson: thompson::Builder, + #[cfg(feature = "syntax")] + thompson: thompson::Compiler, } impl Builder { @@ -59,53 +247,58 @@ impl Builder { pub fn new() -> Builder { Builder { config: Config::default(), - thompson: thompson::Builder::new(), + #[cfg(feature = "syntax")] + thompson: thompson::Compiler::new(), } } - pub fn build(&self, pattern: &str) -> Result<PikeVM, Error> { + /// Build a `PikeVM` from the given pattern. + /// + /// If there was a problem parsing or compiling the pattern, then an error + /// is returned. + #[cfg(feature = "syntax")] + pub fn build(&self, pattern: &str) -> Result<PikeVM, BuildError> { self.build_many(&[pattern]) } + /// Build a `PikeVM` from the given patterns. + #[cfg(feature = "syntax")] pub fn build_many<P: AsRef<str>>( &self, patterns: &[P], - ) -> Result<PikeVM, Error> { + ) -> Result<PikeVM, BuildError> { let nfa = self.thompson.build_many(patterns)?; - self.build_from_nfa(Arc::new(nfa)) - } - - pub fn build_from_nfa(&self, nfa: Arc<NFA>) -> Result<PikeVM, Error> { - // TODO: Check that this is correct. - // if !cfg!(all( - // feature = "dfa", - // feature = "syntax", - // feature = "unicode-perl" - // )) { - if !cfg!(feature = "syntax") { - if nfa.has_word_boundary_unicode() { - return Err(Error::unicode_word_unavailable()); - } - } - Ok(PikeVM { config: self.config, nfa }) + self.build_from_nfa(nfa) + } + + /// Build a `PikeVM` directly from its NFA. + /// + /// Note that when using this method, any configuration that applies to the + /// construction of the NFA itself will of course be ignored, since the NFA + /// given here is already built. + pub fn build_from_nfa(&self, nfa: NFA) -> Result<PikeVM, BuildError> { + nfa.look_set_any().available().map_err(BuildError::word)?; + Ok(PikeVM { config: self.config.clone(), nfa }) } + /// Apply the given `PikeVM` configuration options to this builder. pub fn configure(&mut self, config: Config) -> &mut Builder { self.config = self.config.overwrite(config); self } /// Set the syntax configuration for this builder using - /// [`SyntaxConfig`](crate::SyntaxConfig). + /// [`syntax::Config`](crate::util::syntax::Config). /// /// This permits setting things like case insensitivity, Unicode and multi /// line mode. /// /// These settings only apply when constructing a PikeVM directly from a /// pattern. + #[cfg(feature = "syntax")] pub fn syntax( &mut self, - config: crate::util::syntax::SyntaxConfig, + config: crate::util::syntax::Config, ) -> &mut Builder { self.thompson.syntax(config); self @@ -119,259 +312,1395 @@ impl Builder { /// /// These settings only apply when constructing a PikeVM directly from a /// pattern. + #[cfg(feature = "syntax")] pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { self.thompson.configure(config); self } } +/// A virtual machine for executing regex searches with capturing groups. +/// +/// # Infallible APIs +/// +/// Unlike most other regex engines in this crate, a `PikeVM` never returns an +/// error at search time. It supports all [`Anchored`] configurations, never +/// quits and works on haystacks of arbitrary length. +/// +/// There are two caveats to mention though: +/// +/// * If an invalid pattern ID is given to a search via [`Anchored::Pattern`], +/// then the PikeVM will report "no match." This is consistent with all other +/// regex engines in this crate. +/// * When using [`PikeVM::which_overlapping_matches`] with a [`PatternSet`] +/// that has insufficient capacity to store all valid pattern IDs, then if a +/// match occurs for a `PatternID` that cannot be inserted, it is silently +/// dropped as if it did not match. +/// +/// # Advice +/// +/// The `PikeVM` is generally the most "powerful" regex engine in this crate. +/// "Powerful" in this context means that it can handle any regular expression +/// that is parseable by `regex-syntax` and any size haystack. Regretably, +/// the `PikeVM` is also simultaneously often the _slowest_ regex engine in +/// practice. This results in an annoying situation where one generally tries +/// to pick any other regex engine (or perhaps none at all) before being +/// forced to fall back to a `PikeVM`. +/// +/// For example, a common strategy for dealing with capturing groups is to +/// actually look for the overall match of the regex using a faster regex +/// engine, like a [lazy DFA](crate::hybrid::regex::Regex). Once the overall +/// match is found, one can then run the `PikeVM` on just the match span to +/// find the spans of the capturing groups. In this way, the faster regex +/// engine does the majority of the work, while the `PikeVM` only lends its +/// power in a more limited role. +/// +/// Unfortunately, this isn't always possible because the faster regex engines +/// don't support all of the regex features in `regex-syntax`. This notably +/// includes (and is currently limited to) Unicode word boundaries. So if +/// your pattern has Unicode word boundaries, you typically can't use a +/// DFA-based regex engine at all (unless you [enable heuristic support for +/// it](crate::hybrid::dfa::Config::unicode_word_boundary)). (The [one-pass +/// DFA](crate::dfa::onepass::DFA) can handle Unicode word boundaries for +/// anchored searches only, but in a cruel sort of joke, many Unicode features +/// tend to result in making the regex _not_ one-pass.) +/// +/// # Example +/// +/// This example shows that the `PikeVM` implements Unicode word boundaries +/// correctly by default. +/// +/// ``` +/// # if cfg!(miri) { return Ok(()); } // miri takes too long +/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; +/// +/// let re = PikeVM::new(r"\b\w+\b")?; +/// let mut cache = re.create_cache(); +/// +/// let mut it = re.find_iter(&mut cache, "Шерлок Холмс"); +/// assert_eq!(Some(Match::must(0, 0..12)), it.next()); +/// assert_eq!(Some(Match::must(0, 13..23)), it.next()); +/// assert_eq!(None, it.next()); +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` #[derive(Clone, Debug)] pub struct PikeVM { config: Config, - nfa: Arc<NFA>, + nfa: NFA, } impl PikeVM { - pub fn new(pattern: &str) -> Result<PikeVM, Error> { + /// Parse the given regular expression using the default configuration and + /// return the corresponding `PikeVM`. + /// + /// If you want a non-default configuration, then use the [`Builder`] to + /// set your own configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; + /// + /// let re = PikeVM::new("foo[0-9]+bar")?; + /// let mut cache = re.create_cache(); + /// assert_eq!( + /// Some(Match::must(0, 3..14)), + /// re.find_iter(&mut cache, "zzzfoo12345barzzz").next(), + /// ); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "syntax")] + pub fn new(pattern: &str) -> Result<PikeVM, BuildError> { PikeVM::builder().build(pattern) } - pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<PikeVM, Error> { + /// Like `new`, but parses multiple patterns into a single "multi regex." + /// This similarly uses the default regex configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; + /// + /// let re = PikeVM::new_many(&["[a-z]+", "[0-9]+"])?; + /// let mut cache = re.create_cache(); + /// + /// let mut it = re.find_iter(&mut cache, "abc 1 foo 4567 0 quux"); + /// assert_eq!(Some(Match::must(0, 0..3)), it.next()); + /// assert_eq!(Some(Match::must(1, 4..5)), it.next()); + /// assert_eq!(Some(Match::must(0, 6..9)), it.next()); + /// assert_eq!(Some(Match::must(1, 10..14)), it.next()); + /// assert_eq!(Some(Match::must(1, 15..16)), it.next()); + /// assert_eq!(Some(Match::must(0, 17..21)), it.next()); + /// assert_eq!(None, it.next()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "syntax")] + pub fn new_many<P: AsRef<str>>( + patterns: &[P], + ) -> Result<PikeVM, BuildError> { PikeVM::builder().build_many(patterns) } + /// Like `new`, but builds a PikeVM directly from an NFA. This is useful + /// if you already have an NFA, or even if you hand-assembled the NFA. + /// + /// # Example + /// + /// This shows how to hand assemble a regular expression via its HIR, + /// compile an NFA from it and build a PikeVM from the NFA. + /// + /// ``` + /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; + /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange}; + /// + /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![ + /// ClassBytesRange::new(b'0', b'9'), + /// ClassBytesRange::new(b'A', b'Z'), + /// ClassBytesRange::new(b'_', b'_'), + /// ClassBytesRange::new(b'a', b'z'), + /// ]))); + /// + /// let config = NFA::config().nfa_size_limit(Some(1_000)); + /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?; + /// + /// let re = PikeVM::new_from_nfa(nfa)?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let expected = Some(Match::must(0, 3..4)); + /// re.captures(&mut cache, "!@#A#@!", &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn new_from_nfa(nfa: NFA) -> Result<PikeVM, BuildError> { + PikeVM::builder().build_from_nfa(nfa) + } + + /// Create a new `PikeVM` that matches every input. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; + /// + /// let re = PikeVM::always_match()?; + /// let mut cache = re.create_cache(); + /// + /// let expected = Match::must(0, 0..0); + /// assert_eq!(Some(expected), re.find_iter(&mut cache, "").next()); + /// assert_eq!(Some(expected), re.find_iter(&mut cache, "foo").next()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn always_match() -> Result<PikeVM, BuildError> { + let nfa = thompson::NFA::always_match(); + PikeVM::new_from_nfa(nfa) + } + + /// Create a new `PikeVM` that never matches any input. + /// + /// # Example + /// + /// ``` + /// use regex_automata::nfa::thompson::pikevm::PikeVM; + /// + /// let re = PikeVM::never_match()?; + /// let mut cache = re.create_cache(); + /// + /// assert_eq!(None, re.find_iter(&mut cache, "").next()); + /// assert_eq!(None, re.find_iter(&mut cache, "foo").next()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn never_match() -> Result<PikeVM, BuildError> { + let nfa = thompson::NFA::never_match(); + PikeVM::new_from_nfa(nfa) + } + + /// Return a default configuration for a `PikeVM`. + /// + /// This is a convenience routine to avoid needing to import the `Config` + /// type when customizing the construction of a `PikeVM`. + /// + /// # Example + /// + /// This example shows how to disable UTF-8 mode. When UTF-8 mode is + /// disabled, zero-width matches that split a codepoint are allowed. + /// Otherwise they are never reported. + /// + /// In the code below, notice that `""` is permitted to match positions + /// that split the encoding of a codepoint. + /// + /// ``` + /// use regex_automata::{nfa::thompson::{self, pikevm::PikeVM}, Match}; + /// + /// let re = PikeVM::builder() + /// .thompson(thompson::Config::new().utf8(false)) + /// .build(r"")?; + /// let mut cache = re.create_cache(); + /// + /// let haystack = "a☃z"; + /// let mut it = re.find_iter(&mut cache, haystack); + /// assert_eq!(Some(Match::must(0, 0..0)), it.next()); + /// assert_eq!(Some(Match::must(0, 1..1)), it.next()); + /// assert_eq!(Some(Match::must(0, 2..2)), it.next()); + /// assert_eq!(Some(Match::must(0, 3..3)), it.next()); + /// assert_eq!(Some(Match::must(0, 4..4)), it.next()); + /// assert_eq!(Some(Match::must(0, 5..5)), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` pub fn config() -> Config { Config::new() } + /// Return a builder for configuring the construction of a `PikeVM`. + /// + /// This is a convenience routine to avoid needing to import the + /// [`Builder`] type in common cases. + /// + /// # Example + /// + /// This example shows how to use the builder to disable UTF-8 mode + /// everywhere. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::{self, pikevm::PikeVM}, + /// util::syntax, + /// Match, + /// }; + /// + /// let re = PikeVM::builder() + /// .syntax(syntax::Config::new().utf8(false)) + /// .thompson(thompson::Config::new().utf8(false)) + /// .build(r"foo(?-u:[^b])ar.*")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; + /// let expected = Some(Match::must(0, 1..9)); + /// re.captures(&mut cache, haystack, &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` pub fn builder() -> Builder { Builder::new() } + /// Create a new empty set of capturing groups that is guaranteed to be + /// valid for the search APIs on this `PikeVM`. + /// + /// A `Captures` value created for a specific `PikeVM` cannot be used with + /// any other `PikeVM`. + /// + /// This is a convenience function for [`Captures::all`]. See the + /// [`Captures`] documentation for an explanation of its alternative + /// constructors that permit the `PikeVM` to do less work during a search, + /// and thus might make it faster. + pub fn create_captures(&self) -> Captures { + Captures::all(self.get_nfa().group_info().clone()) + } + + /// Create a new cache for this `PikeVM`. + /// + /// The cache returned should only be used for searches for this + /// `PikeVM`. If you want to reuse the cache for another `PikeVM`, then + /// you must call [`Cache::reset`] with that `PikeVM` (or, equivalently, + /// [`PikeVM::reset_cache`]). pub fn create_cache(&self) -> Cache { - Cache::new(self.nfa()) + Cache::new(self) } - pub fn create_captures(&self) -> Captures { - Captures::new(self.nfa()) + /// Reset the given cache such that it can be used for searching with the + /// this `PikeVM` (and only this `PikeVM`). + /// + /// A cache reset permits reusing memory already allocated in this cache + /// with a different `PikeVM`. + /// + /// # Example + /// + /// This shows how to re-purpose a cache for use with a different `PikeVM`. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; + /// + /// let re1 = PikeVM::new(r"\w")?; + /// let re2 = PikeVM::new(r"\W")?; + /// + /// let mut cache = re1.create_cache(); + /// assert_eq!( + /// Some(Match::must(0, 0..2)), + /// re1.find_iter(&mut cache, "Δ").next(), + /// ); + /// + /// // Using 'cache' with re2 is not allowed. It may result in panics or + /// // incorrect results. In order to re-purpose the cache, we must reset + /// // it with the PikeVM we'd like to use it with. + /// // + /// // Similarly, after this reset, using the cache with 're1' is also not + /// // allowed. + /// re2.reset_cache(&mut cache); + /// assert_eq!( + /// Some(Match::must(0, 0..3)), + /// re2.find_iter(&mut cache, "☃").next(), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn reset_cache(&self, cache: &mut Cache) { + cache.reset(self); } - pub fn nfa(&self) -> &Arc<NFA> { + /// Returns the total number of patterns compiled into this `PikeVM`. + /// + /// In the case of a `PikeVM` that contains no patterns, this returns `0`. + /// + /// # Example + /// + /// This example shows the pattern length for a `PikeVM` that never + /// matches: + /// + /// ``` + /// use regex_automata::nfa::thompson::pikevm::PikeVM; + /// + /// let re = PikeVM::never_match()?; + /// assert_eq!(re.pattern_len(), 0); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// And another example for a `PikeVM` that matches at every position: + /// + /// ``` + /// use regex_automata::nfa::thompson::pikevm::PikeVM; + /// + /// let re = PikeVM::always_match()?; + /// assert_eq!(re.pattern_len(), 1); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// And finally, a `PikeVM` that was constructed from multiple patterns: + /// + /// ``` + /// use regex_automata::nfa::thompson::pikevm::PikeVM; + /// + /// let re = PikeVM::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?; + /// assert_eq!(re.pattern_len(), 3); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn pattern_len(&self) -> usize { + self.nfa.pattern_len() + } + + /// Return the config for this `PikeVM`. + #[inline] + pub fn get_config(&self) -> &Config { + &self.config + } + + /// Returns a reference to the underlying NFA. + #[inline] + pub fn get_nfa(&self) -> &NFA { &self.nfa } +} - pub fn find_leftmost_iter<'r, 'c, 't>( +impl PikeVM { + /// Returns true if and only if this `PikeVM` matches the given haystack. + /// + /// This routine may short circuit if it knows that scanning future + /// input will never lead to a different result. In particular, if the + /// underlying NFA enters a match state, then this routine will return + /// `true` immediately without inspecting any future input. (Consider how + /// this might make a difference given the regex `a+` on the haystack + /// `aaaaaaaaaaaaaaa`. This routine can stop after it sees the first `a`, + /// but routines like `find` need to continue searching because `+` is + /// greedy by default.) + /// + /// # Example + /// + /// This shows basic usage: + /// + /// ``` + /// use regex_automata::nfa::thompson::pikevm::PikeVM; + /// + /// let re = PikeVM::new("foo[0-9]+bar")?; + /// let mut cache = re.create_cache(); + /// + /// assert!(re.is_match(&mut cache, "foo12345bar")); + /// assert!(!re.is_match(&mut cache, "foobar")); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: consistency with search APIs + /// + /// `is_match` is guaranteed to return `true` whenever `find` returns a + /// match. This includes searches that are executed entirely within a + /// codepoint: + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Input}; + /// + /// let re = PikeVM::new("a*")?; + /// let mut cache = re.create_cache(); + /// + /// assert!(!re.is_match(&mut cache, Input::new("☃").span(1..2))); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Notice that when UTF-8 mode is disabled, then the above reports a + /// match because the restriction against zero-width matches that split a + /// codepoint has been lifted: + /// + /// ``` + /// use regex_automata::{nfa::thompson::{pikevm::PikeVM, NFA}, Input}; + /// + /// let re = PikeVM::builder() + /// .thompson(NFA::config().utf8(false)) + /// .build("a*")?; + /// let mut cache = re.create_cache(); + /// + /// assert!(re.is_match(&mut cache, Input::new("☃").span(1..2))); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn is_match<'h, I: Into<Input<'h>>>( + &self, + cache: &mut Cache, + input: I, + ) -> bool { + let input = input.into().earliest(true); + self.search_slots(cache, &input, &mut []).is_some() + } + + /// Executes a leftmost forward search and returns a `Match` if one exists. + /// + /// This routine only includes the overall match span. To get access to the + /// individual spans of each capturing group, use [`PikeVM::captures`]. + /// + /// # Example + /// + /// Leftmost first match semantics corresponds to the match with the + /// smallest starting offset, but where the end offset is determined by + /// preferring earlier branches in the original regular expression. For + /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` + /// will match `Samwise` in `Samwise`. + /// + /// Generally speaking, the "leftmost first" match is how most backtracking + /// regular expressions tend to work. This is in contrast to POSIX-style + /// regular expressions that yield "leftmost longest" matches. Namely, + /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using + /// leftmost longest semantics. (This crate does not currently support + /// leftmost longest semantics.) + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; + /// + /// let re = PikeVM::new("foo[0-9]+")?; + /// let mut cache = re.create_cache(); + /// let expected = Match::must(0, 0..8); + /// assert_eq!(Some(expected), re.find(&mut cache, "foo12345")); + /// + /// // Even though a match is found after reading the first byte (`a`), + /// // the leftmost first match semantics demand that we find the earliest + /// // match that prefers earlier parts of the pattern over later parts. + /// let re = PikeVM::new("abc|a")?; + /// let mut cache = re.create_cache(); + /// let expected = Match::must(0, 0..3); + /// assert_eq!(Some(expected), re.find(&mut cache, "abc")); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn find<'h, I: Into<Input<'h>>>( + &self, + cache: &mut Cache, + input: I, + ) -> Option<Match> { + let input = input.into(); + if self.get_nfa().pattern_len() == 1 { + let mut slots = [None, None]; + let pid = self.search_slots(cache, &input, &mut slots)?; + let start = slots[0]?.get(); + let end = slots[1]?.get(); + return Some(Match::new(pid, Span { start, end })); + } + let ginfo = self.get_nfa().group_info(); + let slots_len = ginfo.implicit_slot_len(); + let mut slots = vec![None; slots_len]; + let pid = self.search_slots(cache, &input, &mut slots)?; + let start = slots[pid.as_usize() * 2]?.get(); + let end = slots[pid.as_usize() * 2 + 1]?.get(); + Some(Match::new(pid, Span { start, end })) + } + + /// Executes a leftmost forward search and writes the spans of capturing + /// groups that participated in a match into the provided [`Captures`] + /// value. If no match was found, then [`Captures::is_match`] is guaranteed + /// to return `false`. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; + /// + /// let re = PikeVM::new(r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "2010-03-14", &mut caps); + /// assert!(caps.is_match()); + /// assert_eq!(Some(Span::from(0..4)), caps.get_group(1)); + /// assert_eq!(Some(Span::from(5..7)), caps.get_group(2)); + /// assert_eq!(Some(Span::from(8..10)), caps.get_group(3)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn captures<'h, I: Into<Input<'h>>>( + &self, + cache: &mut Cache, + input: I, + caps: &mut Captures, + ) { + self.search(cache, &input.into(), caps) + } + + /// Returns an iterator over all non-overlapping leftmost matches in the + /// given bytes. If no match exists, then the iterator yields no elements. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; + /// + /// let re = PikeVM::new("foo[0-9]+")?; + /// let mut cache = re.create_cache(); + /// + /// let text = "foo1 foo12 foo123"; + /// let matches: Vec<Match> = re.find_iter(&mut cache, text).collect(); + /// assert_eq!(matches, vec![ + /// Match::must(0, 0..4), + /// Match::must(0, 5..10), + /// Match::must(0, 11..17), + /// ]); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn find_iter<'r, 'c, 'h, I: Into<Input<'h>>>( + &'r self, + cache: &'c mut Cache, + input: I, + ) -> FindMatches<'r, 'c, 'h> { + let caps = Captures::matches(self.get_nfa().group_info().clone()); + let it = iter::Searcher::new(input.into()); + FindMatches { re: self, cache, caps, it } + } + + /// Returns an iterator over all non-overlapping `Captures` values. If no + /// match exists, then the iterator yields no elements. + /// + /// This yields the same matches as [`PikeVM::find_iter`], but it includes + /// the spans of all capturing groups that participate in each match. + /// + /// **Tip:** See [`util::iter::Searcher`](crate::util::iter::Searcher) for + /// how to correctly iterate over all matches in a haystack while avoiding + /// the creation of a new `Captures` value for every match. (Which you are + /// forced to do with an `Iterator`.) + /// + /// # Example + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; + /// + /// let re = PikeVM::new("foo(?P<numbers>[0-9]+)")?; + /// let mut cache = re.create_cache(); + /// + /// let text = "foo1 foo12 foo123"; + /// let matches: Vec<Span> = re + /// .captures_iter(&mut cache, text) + /// // The unwrap is OK since 'numbers' matches if the pattern matches. + /// .map(|caps| caps.get_group_by_name("numbers").unwrap()) + /// .collect(); + /// assert_eq!(matches, vec![ + /// Span::from(3..4), + /// Span::from(8..10), + /// Span::from(14..17), + /// ]); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn captures_iter<'r, 'c, 'h, I: Into<Input<'h>>>( &'r self, cache: &'c mut Cache, - haystack: &'t [u8], - ) -> FindLeftmostMatches<'r, 'c, 't> { - FindLeftmostMatches::new(self, cache, haystack) - } - - // BREADCRUMBS: - // - // 1) Don't forget about prefilters. - // - // 2) Consider the case of using a PikeVM with an NFA that has Capture - // states, but where we don't want to track capturing groups (other than - // group 0). This potentially saves a lot of copying around and what not. I - // believe the current regex crate does this, for example. The interesting - // bit here is how to handle the case of multiple patterns... - // - // 3) Permit the caller to specify a pattern ID to run an anchored-only - // search on. - // - // 4) How to do overlapping? The way multi-regex support works in the regex - // crate currently is to run the PikeVM until either we reach the end of - // the haystack or when we know all regexes have matched. The latter case - // is probably quite rare, so the common case is likely that we're always - // searching the entire input. The question is: can we emulate that with - // our typical 'overlapping' APIs on DFAs? I believe we can. If so, then - // all we need to do is provide an overlapping API on the PikeVM that - // roughly matches the ones we provide on DFAs. For those APIs, the only - // thing they need over non-overlapping APIs is "caller state." For DFAs, - // the caller state is simple: it contains the last state visited and the - // last match reported. For the PikeVM (and NFAs in general), the "last - // state" is actually a *set* of NFA states. So I think what happens here - // is that we can just force the `Cache` to subsume this role. We'll still - // need some additional state to track the last match reported though. - // Because when two or more patterns match at the same location, we need a - // way to know to iterate over them. Although maybe it's not match index we - // need, but the state index of the last NFA state processed in the cache. - // Then we just pick up where we left off. There might be another match - // state, in which case, we report it. - - pub fn find_leftmost_at( + input: I, + ) -> CapturesMatches<'r, 'c, 'h> { + let caps = self.create_captures(); + let it = iter::Searcher::new(input.into()); + CapturesMatches { re: self, cache, caps, it } + } +} + +impl PikeVM { + /// Executes a leftmost forward search and writes the spans of capturing + /// groups that participated in a match into the provided [`Captures`] + /// value. If no match was found, then [`Captures::is_match`] is guaranteed + /// to return `false`. + /// + /// This is like [`PikeVM::captures`], but it accepts a concrete `&Input` + /// instead of an `Into<Input>`. + /// + /// # Example: specific pattern search + /// + /// This example shows how to build a multi-PikeVM that permits searching + /// for specific patterns. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// Anchored, Match, PatternID, Input, + /// }; + /// + /// let re = PikeVM::new_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let haystack = "foo123"; + /// + /// // Since we are using the default leftmost-first match and both + /// // patterns match at the same starting position, only the first pattern + /// // will be returned in this case when doing a search for any of the + /// // patterns. + /// let expected = Some(Match::must(0, 0..6)); + /// re.search(&mut cache, &Input::new(haystack), &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// // But if we want to check whether some other pattern matches, then we + /// // can provide its pattern ID. + /// let expected = Some(Match::must(1, 0..6)); + /// let input = Input::new(haystack) + /// .anchored(Anchored::Pattern(PatternID::must(1))); + /// re.search(&mut cache, &input, &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: specifying the bounds of a search + /// + /// This example shows how providing the bounds of a search can produce + /// different results than simply sub-slicing the haystack. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match, Input}; + /// + /// let re = PikeVM::new(r"\b[0-9]{3}\b")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let haystack = "foo123bar"; + /// + /// // Since we sub-slice the haystack, the search doesn't know about + /// // the larger context and assumes that `123` is surrounded by word + /// // boundaries. And of course, the match position is reported relative + /// // to the sub-slice as well, which means we get `0..3` instead of + /// // `3..6`. + /// let expected = Some(Match::must(0, 0..3)); + /// re.search(&mut cache, &Input::new(&haystack[3..6]), &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// // But if we provide the bounds of the search within the context of the + /// // entire haystack, then the search can take the surrounding context + /// // into account. (And if we did find a match, it would be reported + /// // as a valid offset into `haystack` instead of its sub-slice.) + /// let expected = None; + /// let input = Input::new(haystack).range(3..6); + /// re.search(&mut cache, &input, &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn search( &self, cache: &mut Cache, - haystack: &[u8], - start: usize, - end: usize, + input: &Input<'_>, caps: &mut Captures, - ) -> Option<MultiMatch> { - let anchored = - self.config.get_anchored() || self.nfa.is_always_start_anchored(); - let mut at = start; - let mut matched_pid = None; - cache.clear(); - 'LOOP: loop { - if cache.clist.set.is_empty() { - if matched_pid.is_some() || (anchored && at > start) { - break 'LOOP; + ) { + caps.set_pattern(None); + let pid = self.search_slots(cache, input, caps.slots_mut()); + caps.set_pattern(pid); + } + + /// Executes a leftmost forward search and writes the spans of capturing + /// groups that participated in a match into the provided `slots`, and + /// returns the matching pattern ID. The contents of the slots for patterns + /// other than the matching pattern are unspecified. If no match was found, + /// then `None` is returned and the contents of `slots` is unspecified. + /// + /// This is like [`PikeVM::search`], but it accepts a raw slots slice + /// instead of a `Captures` value. This is useful in contexts where you + /// don't want or need to allocate a `Captures`. + /// + /// It is legal to pass _any_ number of slots to this routine. If the regex + /// engine would otherwise write a slot offset that doesn't fit in the + /// provided slice, then it is simply skipped. In general though, there are + /// usually three slice lengths you might want to use: + /// + /// * An empty slice, if you only care about which pattern matched. + /// * A slice with + /// [`pattern_len() * 2`](crate::nfa::thompson::NFA::pattern_len) + /// slots, if you only care about the overall match spans for each matching + /// pattern. + /// * A slice with + /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which + /// permits recording match offsets for every capturing group in every + /// pattern. + /// + /// # Example + /// + /// This example shows how to find the overall match offsets in a + /// multi-pattern search without allocating a `Captures` value. Indeed, we + /// can put our slots right on the stack. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID, Input}; + /// + /// let re = PikeVM::new_many(&[ + /// r"\pL+", + /// r"\d+", + /// ])?; + /// let mut cache = re.create_cache(); + /// let input = Input::new("!@#123"); + /// + /// // We only care about the overall match offsets here, so we just + /// // allocate two slots for each pattern. Each slot records the start + /// // and end of the match. + /// let mut slots = [None; 4]; + /// let pid = re.search_slots(&mut cache, &input, &mut slots); + /// assert_eq!(Some(PatternID::must(1)), pid); + /// + /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'. + /// // See 'GroupInfo' for more details on the mapping between groups and + /// // slot indices. + /// let slot_start = pid.unwrap().as_usize() * 2; + /// let slot_end = slot_start + 1; + /// assert_eq!(Some(3), slots[slot_start].map(|s| s.get())); + /// assert_eq!(Some(6), slots[slot_end].map(|s| s.get())); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn search_slots( + &self, + cache: &mut Cache, + input: &Input<'_>, + slots: &mut [Option<NonMaxUsize>], + ) -> Option<PatternID> { + let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); + if !utf8empty { + let hm = self.search_slots_imp(cache, input, slots)?; + return Some(hm.pattern()); + } + // There is an unfortunate special case where if the regex can + // match the empty string and UTF-8 mode is enabled, the search + // implementation requires that the slots have at least as much space + // to report the bounds of any match. This is so zero-width matches + // that split a codepoint can be filtered out. + // + // Note that if utf8empty is true, we specialize the case for when + // the number of patterns is 1. In that case, we can just use a stack + // allocation. Otherwise we resort to a heap allocation, which we + // convince ourselves we're fine with due to the pathological nature of + // this case. + let min = self.get_nfa().group_info().implicit_slot_len(); + if slots.len() >= min { + let hm = self.search_slots_imp(cache, input, slots)?; + return Some(hm.pattern()); + } + if self.get_nfa().pattern_len() == 1 { + let mut enough = [None, None]; + let got = self.search_slots_imp(cache, input, &mut enough); + // This is OK because we know `enough` is strictly bigger than + // `slots`, otherwise this special case isn't reached. + slots.copy_from_slice(&enough[..slots.len()]); + return got.map(|hm| hm.pattern()); + } + let mut enough = vec![None; min]; + let got = self.search_slots_imp(cache, input, &mut enough); + // This is OK because we know `enough` is strictly bigger than `slots`, + // otherwise this special case isn't reached. + slots.copy_from_slice(&enough[..slots.len()]); + got.map(|hm| hm.pattern()) + } + + /// This is the actual implementation of `search_slots_imp` that + /// doesn't account for the special case when 1) the NFA has UTF-8 mode + /// enabled, 2) the NFA can match the empty string and 3) the caller has + /// provided an insufficient number of slots to record match offsets. + #[inline(never)] + fn search_slots_imp( + &self, + cache: &mut Cache, + input: &Input<'_>, + slots: &mut [Option<NonMaxUsize>], + ) -> Option<HalfMatch> { + let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); + let hm = match self.search_imp(cache, input, slots) { + None => return None, + Some(hm) if !utf8empty => return Some(hm), + Some(hm) => hm, + }; + empty::skip_splits_fwd(input, hm, hm.offset(), |input| { + Ok(self + .search_imp(cache, input, slots) + .map(|hm| (hm, hm.offset()))) + }) + // OK because the PikeVM never errors. + .unwrap() + } + + /// Writes the set of patterns that match anywhere in the given search + /// configuration to `patset`. If multiple patterns match at the same + /// position and this `PikeVM` was configured with [`MatchKind::All`] + /// semantics, then all matching patterns are written to the given set. + /// + /// Unless all of the patterns in this `PikeVM` are anchored, then + /// generally speaking, this will visit every byte in the haystack. + /// + /// This search routine *does not* clear the pattern set. This gives some + /// flexibility to the caller (e.g., running multiple searches with the + /// same pattern set), but does make the API bug-prone if you're reusing + /// the same pattern set for multiple searches but intended them to be + /// independent. + /// + /// If a pattern ID matched but the given `PatternSet` does not have + /// sufficient capacity to store it, then it is not inserted and silently + /// dropped. + /// + /// # Example + /// + /// This example shows how to find all matching patterns in a haystack, + /// even when some patterns match at the same position as other patterns. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// Input, MatchKind, PatternSet, + /// }; + /// + /// let patterns = &[ + /// r"\w+", r"\d+", r"\pL+", r"foo", r"bar", r"barfoo", r"foobar", + /// ]; + /// let re = PikeVM::builder() + /// .configure(PikeVM::config().match_kind(MatchKind::All)) + /// .build_many(patterns)?; + /// let mut cache = re.create_cache(); + /// + /// let input = Input::new("foobar"); + /// let mut patset = PatternSet::new(re.pattern_len()); + /// re.which_overlapping_matches(&mut cache, &input, &mut patset); + /// let expected = vec![0, 2, 3, 4, 6]; + /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect(); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn which_overlapping_matches( + &self, + cache: &mut Cache, + input: &Input<'_>, + patset: &mut PatternSet, + ) { + self.which_overlapping_imp(cache, input, patset) + } +} + +impl PikeVM { + /// The implementation of standard leftmost search. + /// + /// Capturing group spans are written to `slots`, but only if requested. + /// `slots` can be any length. Any slot in the NFA that is activated but + /// which is out of bounds for the given `slots` is ignored. + fn search_imp( + &self, + cache: &mut Cache, + input: &Input<'_>, + slots: &mut [Option<NonMaxUsize>], + ) -> Option<HalfMatch> { + cache.setup_search(slots.len()); + if input.is_done() { + return None; + } + // Why do we even care about this? Well, in our 'Captures' + // representation, we use usize::MAX as a sentinel to indicate "no + // match." This isn't problematic so long as our haystack doesn't have + // a maximal length. Byte slices are guaranteed by Rust to have a + // length that fits into isize, and so this assert should always pass. + // But we put it here to make our assumption explicit. + assert!( + input.haystack().len() < core::usize::MAX, + "byte slice lengths must be less than usize MAX", + ); + instrument!(|c| c.reset(&self.nfa)); + + // Whether we want to visit all match states instead of emulating the + // 'leftmost' semantics of typical backtracking regex engines. + let allmatches = + self.config.get_match_kind().continue_past_first_match(); + let (anchored, start_id) = match self.start_config(input) { + None => return None, + Some(config) => config, + }; + + let pre = + if anchored { None } else { self.get_config().get_prefilter() }; + let Cache { ref mut stack, ref mut curr, ref mut next } = cache; + let mut hm = None; + // Yes, our search doesn't end at input.end(), but includes it. This + // is necessary because matches are delayed by one byte, just like + // how the DFA engines work. The delay is used to handle look-behind + // assertions. In the case of the PikeVM, the delay is implemented + // by not considering a match to exist until it is visited in + // 'steps'. Technically, we know a match exists in the previous + // iteration via 'epsilon_closure'. (It's the same thing in NFA-to-DFA + // determinization. We don't mark a DFA state as a match state if it + // contains an NFA match state, but rather, whether the DFA state was + // generated by a transition from a DFA state that contains an NFA + // match state.) + let mut at = input.start(); + while at <= input.end() { + // If we have no states left to visit, then there are some cases + // where we know we can quit early or even skip ahead. + if curr.set.is_empty() { + // We have a match and we haven't been instructed to continue + // on even after finding a match, so we can quit. + if hm.is_some() && !allmatches { + break; + } + // If we're running an anchored search and we've advanced + // beyond the start position with no other states to try, then + // we will never observe a match and thus can stop. + if anchored && at > input.start() { + break; + } + // If there no states left to explore at this position and we + // know we can't terminate early, then we are effectively at + // the starting state of the NFA. If we fell through here, + // we'd end up adding our '(?s-u:.)*?' prefix and it would be + // the only thing in 'curr'. So we might as well just skip + // ahead until we find something that we know might advance us + // forward. + if let Some(ref pre) = pre { + let span = Span::from(at..input.end()); + match pre.find(input.haystack(), span) { + None => break, + Some(ref span) => at = span.start, + } } - // TODO: prefilter } - if (!anchored && matched_pid.is_none()) - || cache.clist.set.is_empty() + // Instead of using the NFA's unanchored start state, we actually + // always use its anchored starting state. As a result, when doing + // an unanchored search, we need to simulate our own '(?s-u:.)*?' + // prefix, to permit a match to appear anywhere. + // + // Now, we don't *have* to do things this way. We could use the + // NFA's unanchored starting state and do one 'epsilon_closure' + // call from that starting state before the main loop here. And + // that is just as correct. However, it turns out to be slower + // than our approach here because it slightly increases the cost + // of processing each byte by requiring us to visit more NFA + // states to deal with the additional NFA states in the unanchored + // prefix. By simulating it explicitly here, we lower those costs + // substantially. The cost is itself small, but it adds up for + // large haystacks. + // + // In order to simulate the '(?s-u:.)*?' prefix---which is not + // greedy---we are careful not to perform an epsilon closure on + // the start state if we already have a match. Namely, if we + // did otherwise, we would never reach a terminating condition + // because there would always be additional states to process. + // In effect, the exclusion of running 'epsilon_closure' when + // we have a match corresponds to the "dead" states we have in + // our DFA regex engines. Namely, in a DFA, match states merely + // instruct the search execution to record the current offset as + // the most recently seen match. It is the dead state that actually + // indicates when to stop the search (other than EOF or quit + // states). + // + // However, when 'allmatches' is true, the caller has asked us to + // leave in every possible match state. This tends not to make a + // whole lot of sense in unanchored searches, because it means the + // search really cannot terminate until EOF. And often, in that + // case, you wind up skipping over a bunch of matches and are left + // with the "last" match. Arguably, it just doesn't make a lot of + // sense to run a 'leftmost' search (which is what this routine is) + // with 'allmatches' set to true. But the DFAs support it and this + // matches their behavior. (Generally, 'allmatches' is useful for + // overlapping searches or leftmost anchored searches to find the + // longest possible match by ignoring match priority.) + // + // Additionally, when we're running an anchored search, this + // epsilon closure should only be computed at the beginning of the + // search. If we re-computed it at every position, we would be + // simulating an unanchored search when we were tasked to perform + // an anchored search. + if (!hm.is_some() || allmatches) + && (!anchored || at == input.start()) { - self.epsilon_closure( - &mut cache.clist, - &mut caps.slots, - &mut cache.stack, - self.nfa.start_anchored(), - haystack, - at, - ); + // Since we are adding to the 'curr' active states and since + // this is for the start ID, we use a slots slice that is + // guaranteed to have the right length but where every element + // is absent. This is exactly what we want, because this + // epsilon closure is responsible for simulating an unanchored + // '(?s:.)*?' prefix. It is specifically outside of any + // capturing groups, and thus, using slots that are always + // absent is correct. + // + // Note though that we can't just use '&mut []' here, since + // this epsilon closure may traverse through 'Captures' epsilon + // transitions, and thus must be able to write offsets to the + // slots given which are later copied to slot values in 'curr'. + let slots = next.slot_table.all_absent(); + self.epsilon_closure(stack, slots, curr, input, at, start_id); } - for i in 0..cache.clist.set.len() { - let sid = cache.clist.set.get(i); - let pid = match self.step( - &mut cache.nlist, - &mut caps.slots, - cache.clist.caps(sid), - &mut cache.stack, - sid, - haystack, - at, - ) { - None => continue, - Some(pid) => pid, - }; - matched_pid = Some(pid); - break; + if let Some(pid) = self.nexts(stack, curr, next, input, at, slots) + { + hm = Some(HalfMatch::new(pid, at)); } - if at >= end { + // Unless the caller asked us to return early, we need to mush on + // to see if we can extend our match. (But note that 'nexts' will + // quit right after seeing a match when match_kind==LeftmostFirst, + // as is consistent with leftmost-first match priority.) + if input.get_earliest() && hm.is_some() { break; } + core::mem::swap(curr, next); + next.set.clear(); at += 1; - cache.swap(); - cache.nlist.set.clear(); } - matched_pid.map(|pid| { - let slots = self.nfa.pattern_slots(pid); - let (start, end) = (slots.start, slots.start + 1); - MultiMatch::new( - pid, - caps.slots[start].unwrap(), - caps.slots[end].unwrap(), - ) - }) + instrument!(|c| c.eprint(&self.nfa)); + hm + } + + /// The implementation for the 'which_overlapping_matches' API. Basically, + /// we do a single scan through the entire haystack (unless our regex + /// or search is anchored) and record every pattern that matched. In + /// particular, when MatchKind::All is used, this supports overlapping + /// matches. So if we have the regexes 'sam' and 'samwise', they will + /// *both* be reported in the pattern set when searching the haystack + /// 'samwise'. + fn which_overlapping_imp( + &self, + cache: &mut Cache, + input: &Input<'_>, + patset: &mut PatternSet, + ) { + // NOTE: This is effectively a copy of 'search_imp' above, but with no + // captures support and instead writes patterns that matched directly + // to 'patset'. See that routine for better commentary about what's + // going on in this routine. We probably could unify the routines using + // generics or more helper routines, but I'm not sure it's worth it. + // + // NOTE: We somewhat go out of our way here to support things like + // 'input.get_earliest()' and 'leftmost-first' match semantics. Neither + // of those seem particularly relevant to this routine, but they are + // both supported by the DFA analogs of this routine by construction + // and composition, so it seems like good sense to have the PikeVM + // match that behavior. + + cache.setup_search(0); + if input.is_done() { + return; + } + assert!( + input.haystack().len() < core::usize::MAX, + "byte slice lengths must be less than usize MAX", + ); + instrument!(|c| c.reset(&self.nfa)); + + let allmatches = + self.config.get_match_kind().continue_past_first_match(); + let (anchored, start_id) = match self.start_config(input) { + None => return, + Some(config) => config, + }; + + let Cache { ref mut stack, ref mut curr, ref mut next } = cache; + for at in input.start()..=input.end() { + let any_matches = !patset.is_empty(); + if curr.set.is_empty() { + if any_matches && !allmatches { + break; + } + if anchored && at > input.start() { + break; + } + } + if !any_matches || allmatches { + let slots = &mut []; + self.epsilon_closure(stack, slots, curr, input, at, start_id); + } + self.nexts_overlapping(stack, curr, next, input, at, patset); + // If we found a match and filled our set, then there is no more + // additional info that we can provide. Thus, we can quit. We also + // quit if the caller asked us to stop at the earliest point that + // we know a match exists. + if patset.is_full() || input.get_earliest() { + break; + } + core::mem::swap(curr, next); + next.set.clear(); + } + instrument!(|c| c.eprint(&self.nfa)); + } + + /// Process the active states in 'curr' to find the states (written to + /// 'next') we should process for the next byte in the haystack. + /// + /// 'stack' is used to perform a depth first traversal of the NFA when + /// computing an epsilon closure. + /// + /// When a match is found, the slots for that match state (in 'curr') are + /// copied to 'caps'. Moreover, once a match is seen, processing for 'curr' + /// stops (unless the PikeVM was configured with MatchKind::All semantics). + #[cfg_attr(feature = "perf-inline", inline(always))] + fn nexts( + &self, + stack: &mut Vec<FollowEpsilon>, + curr: &mut ActiveStates, + next: &mut ActiveStates, + input: &Input<'_>, + at: usize, + slots: &mut [Option<NonMaxUsize>], + ) -> Option<PatternID> { + instrument!(|c| c.record_state_set(&curr.set)); + let mut pid = None; + let ActiveStates { ref set, ref mut slot_table } = *curr; + for sid in set.iter() { + pid = match self.next(stack, slot_table, next, input, at, sid) { + None => continue, + Some(pid) => Some(pid), + }; + slots.copy_from_slice(slot_table.for_state(sid)); + if !self.config.get_match_kind().continue_past_first_match() { + break; + } + } + pid } - #[inline(always)] - fn step( + /// Like 'nexts', but for the overlapping case. This doesn't write any + /// slots, and instead just writes which pattern matched in 'patset'. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn nexts_overlapping( &self, - nlist: &mut Threads, - slots: &mut [Slot], - thread_caps: &mut [Slot], stack: &mut Vec<FollowEpsilon>, - sid: StateID, - haystack: &[u8], + curr: &mut ActiveStates, + next: &mut ActiveStates, + input: &Input<'_>, at: usize, + patset: &mut PatternSet, + ) { + instrument!(|c| c.record_state_set(&curr.set)); + let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); + let ActiveStates { ref set, ref mut slot_table } = *curr; + for sid in set.iter() { + let pid = match self.next(stack, slot_table, next, input, at, sid) + { + None => continue, + Some(pid) => pid, + }; + // This handles the case of finding a zero-width match that splits + // a codepoint. Namely, if we're in UTF-8 mode AND we know we can + // match the empty string, then the only valid way of getting to + // this point with an offset that splits a codepoint is when we + // have an empty match. Such matches, in UTF-8 mode, must not be + // reported. So we just skip them here and pretend as if we did + // not see a match. + if utf8empty && !input.is_char_boundary(at) { + continue; + } + let _ = patset.try_insert(pid); + if !self.config.get_match_kind().continue_past_first_match() { + break; + } + } + } + + /// Starting from 'sid', if the position 'at' in the 'input' haystack has a + /// transition defined out of 'sid', then add the state transitioned to and + /// its epsilon closure to the 'next' set of states to explore. + /// + /// 'stack' is used by the epsilon closure computation to perform a depth + /// first traversal of the NFA. + /// + /// 'curr_slot_table' should be the table of slots for the current set of + /// states being explored. If there is a transition out of 'sid', then + /// sid's row in the slot table is used to perform the epsilon closure. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn next( + &self, + stack: &mut Vec<FollowEpsilon>, + curr_slot_table: &mut SlotTable, + next: &mut ActiveStates, + input: &Input<'_>, + at: usize, + sid: StateID, ) -> Option<PatternID> { + instrument!(|c| c.record_step(sid)); match *self.nfa.state(sid) { State::Fail | State::Look { .. } | State::Union { .. } + | State::BinaryUnion { .. } | State::Capture { .. } => None, - State::Range { ref range } => { - if range.matches(haystack, at) { + State::ByteRange { ref trans } => { + if trans.matches(input.haystack(), at) { + let slots = curr_slot_table.for_state(sid); + // OK because 'at <= haystack.len() < usize::MAX', so + // adding 1 will never wrap. + let at = at.wrapping_add(1); self.epsilon_closure( - nlist, - thread_caps, - stack, - range.next, - haystack, - at + 1, + stack, slots, next, input, at, trans.next, ); } None } State::Sparse(ref sparse) => { - if let Some(next) = sparse.matches(haystack, at) { + if let Some(next_sid) = sparse.matches(input.haystack(), at) { + let slots = curr_slot_table.for_state(sid); + // OK because 'at <= haystack.len() < usize::MAX', so + // adding 1 will never wrap. + let at = at.wrapping_add(1); self.epsilon_closure( - nlist, - thread_caps, - stack, - next, - haystack, - at + 1, + stack, slots, next, input, at, next_sid, ); } None } - State::Match { id } => { - slots.copy_from_slice(thread_caps); - Some(id) + State::Dense(ref dense) => { + if let Some(next_sid) = dense.matches(input.haystack(), at) { + let slots = curr_slot_table.for_state(sid); + // OK because 'at <= haystack.len() < usize::MAX', so + // adding 1 will never wrap. + let at = at.wrapping_add(1); + self.epsilon_closure( + stack, slots, next, input, at, next_sid, + ); + } + None } + State::Match { pattern_id } => Some(pattern_id), } } - #[inline(always)] + /// Compute the epsilon closure of 'sid', writing the closure into 'next' + /// while copying slot values from 'curr_slots' into corresponding states + /// in 'next'. 'curr_slots' should be the slot values corresponding to + /// 'sid'. + /// + /// The given 'stack' is used to perform a depth first traversal of the + /// NFA by recursively following all epsilon transitions out of 'sid'. + /// Conditional epsilon transitions are followed if and only if they are + /// satisfied for the position 'at' in the 'input' haystack. + /// + /// While this routine may write to 'curr_slots', once it returns, any + /// writes are undone and the original values (even if absent) are + /// restored. + #[cfg_attr(feature = "perf-inline", inline(always))] fn epsilon_closure( &self, - nlist: &mut Threads, - thread_caps: &mut [Slot], stack: &mut Vec<FollowEpsilon>, - sid: StateID, - haystack: &[u8], + curr_slots: &mut [Option<NonMaxUsize>], + next: &mut ActiveStates, + input: &Input<'_>, at: usize, + sid: StateID, ) { - stack.push(FollowEpsilon::StateID(sid)); + instrument!(|c| { + c.record_closure(sid); + c.record_stack_push(sid); + }); + stack.push(FollowEpsilon::Explore(sid)); while let Some(frame) = stack.pop() { match frame { - FollowEpsilon::StateID(sid) => { - self.epsilon_closure_step( - nlist, - thread_caps, - stack, - sid, - haystack, - at, - ); + FollowEpsilon::RestoreCapture { slot, offset: pos } => { + curr_slots[slot] = pos; } - FollowEpsilon::Capture { slot, pos } => { - thread_caps[slot] = pos; + FollowEpsilon::Explore(sid) => { + self.epsilon_closure_explore( + stack, curr_slots, next, input, at, sid, + ); } } } } - #[inline(always)] - fn epsilon_closure_step( + /// Explore all of the epsilon transitions out of 'sid'. This is mostly + /// split out from 'epsilon_closure' in order to clearly delineate + /// the actual work of computing an epsilon closure from the stack + /// book-keeping. + /// + /// This will push any additional explorations needed on to 'stack'. + /// + /// 'curr_slots' should refer to the slots for the currently active NFA + /// state. That is, the current state we are stepping through. These + /// slots are mutated in place as new 'Captures' states are traversed + /// during epsilon closure, but the slots are restored to their original + /// values once the full epsilon closure is completed. The ultimate use of + /// 'curr_slots' is to copy them to the corresponding 'next_slots', so that + /// the capturing group spans are forwarded from the currently active state + /// to the next. + /// + /// 'next' refers to the next set of active states. Computing an epsilon + /// closure may increase the next set of active states. + /// + /// 'input' refers to the caller's input configuration and 'at' refers to + /// the current position in the haystack. These are used to check whether + /// conditional epsilon transitions (like look-around) are satisfied at + /// the current position. If they aren't, then the epsilon closure won't + /// include them. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn epsilon_closure_explore( &self, - nlist: &mut Threads, - thread_caps: &mut [Slot], stack: &mut Vec<FollowEpsilon>, - mut sid: StateID, - haystack: &[u8], + curr_slots: &mut [Option<NonMaxUsize>], + next: &mut ActiveStates, + input: &Input<'_>, at: usize, + mut sid: StateID, ) { + // We can avoid pushing some state IDs on to our stack in precisely + // the cases where a 'push(x)' would be immediately followed by a 'x + // = pop()'. This is achieved by this outer-loop. We simply set 'sid' + // to be the next state ID we want to explore once we're done with + // our initial exploration. In practice, this avoids a lot of stack + // thrashing. loop { - if !nlist.set.insert(sid) { + instrument!(|c| c.record_set_insert(sid)); + // Record this state as part of our next set of active states. If + // we've already explored it, then no need to do it again. + if !next.set.insert(sid) { return; } match *self.nfa.state(sid) { State::Fail - | State::Range { .. } + | State::Match { .. } + | State::ByteRange { .. } | State::Sparse { .. } - | State::Match { .. } => { - let t = &mut nlist.caps(sid); - t.copy_from_slice(thread_caps); + | State::Dense { .. } => { + next.slot_table.for_state(sid).copy_from_slice(curr_slots); return; } State::Look { look, next } => { - if !look.matches(haystack, at) { + // OK because we don't permit building a searcher with a + // Unicode word boundary if the requisite Unicode data is + // unavailable. + if !self.nfa.look_matcher().matches_inline( + look, + input.haystack(), + at, + ) { return; } sid = next; @@ -381,174 +1710,650 @@ impl PikeVM { None => return, Some(&sid) => sid, }; + instrument!(|c| { + for &alt in &alternates[1..] { + c.record_stack_push(alt); + } + }); stack.extend( alternates[1..] .iter() .copied() .rev() - .map(FollowEpsilon::StateID), + .map(FollowEpsilon::Explore), ); } - State::Capture { next, slot } => { - if slot < thread_caps.len() { - stack.push(FollowEpsilon::Capture { + State::BinaryUnion { alt1, alt2 } => { + sid = alt1; + instrument!(|c| c.record_stack_push(sid)); + stack.push(FollowEpsilon::Explore(alt2)); + } + State::Capture { next, slot, .. } => { + // There's no need to do anything with slots that + // ultimately won't be copied into the caller-provided + // 'Captures' value. So we just skip dealing with them at + // all. + if slot.as_usize() < curr_slots.len() { + instrument!(|c| c.record_stack_push(sid)); + stack.push(FollowEpsilon::RestoreCapture { slot, - pos: thread_caps[slot], + offset: curr_slots[slot], }); - thread_caps[slot] = Some(at); + // OK because length of a slice must fit into an isize. + curr_slots[slot] = Some(NonMaxUsize::new(at).unwrap()); } sid = next; } } } } + + /// Return the starting configuration of a PikeVM search. + /// + /// The "start config" is basically whether the search should be anchored + /// or not and the NFA state ID at which to begin the search. The state ID + /// returned always corresponds to an anchored starting state even when the + /// search is unanchored. This is because the PikeVM search loop deals with + /// unanchored searches with an explicit epsilon closure out of the start + /// state. + /// + /// This routine accounts for both the caller's `Input` configuration + /// and the pattern itself. For example, even if the caller asks for an + /// unanchored search, if the pattern itself is anchored, then this will + /// always return 'true' because implementing an unanchored search in that + /// case would be incorrect. + /// + /// Similarly, if the caller requests an anchored search for a particular + /// pattern, then the starting state ID returned will reflect that. + /// + /// If a pattern ID is given in the input configuration that is not in + /// this regex, then `None` is returned. + fn start_config(&self, input: &Input<'_>) -> Option<(bool, StateID)> { + match input.get_anchored() { + // Only way we're unanchored is if both the caller asked for an + // unanchored search *and* the pattern is itself not anchored. + Anchored::No => Some(( + self.nfa.is_always_start_anchored(), + self.nfa.start_anchored(), + )), + Anchored::Yes => Some((true, self.nfa.start_anchored())), + Anchored::Pattern(pid) => { + Some((true, self.nfa.start_pattern(pid)?)) + } + } + } } -/// An iterator over all non-overlapping leftmost matches for a particular -/// infallible search. +/// An iterator over all non-overlapping matches for a particular search. /// -/// The iterator yields a [`MultiMatch`] value until no more matches could be -/// found. If the underlying search returns an error, then this panics. +/// The iterator yields a [`Match`] value until no more matches could be found. /// -/// The lifetime variables are as follows: +/// The lifetime parameters are as follows: /// -/// * `'r` is the lifetime of the regular expression itself. -/// * `'c` is the lifetime of the mutable cache used during search. -/// * `'t` is the lifetime of the text being searched. +/// * `'r` represents the lifetime of the PikeVM. +/// * `'c` represents the lifetime of the PikeVM's cache. +/// * `'h` represents the lifetime of the haystack being searched. +/// +/// This iterator can be created with the [`PikeVM::find_iter`] method. #[derive(Debug)] -pub struct FindLeftmostMatches<'r, 'c, 't> { - vm: &'r PikeVM, +pub struct FindMatches<'r, 'c, 'h> { + re: &'r PikeVM, cache: &'c mut Cache, - // scanner: Option<prefilter::Scanner<'r>>, - text: &'t [u8], - last_end: usize, - last_match: Option<usize>, + caps: Captures, + it: iter::Searcher<'h>, } -impl<'r, 'c, 't> FindLeftmostMatches<'r, 'c, 't> { - fn new( - vm: &'r PikeVM, - cache: &'c mut Cache, - text: &'t [u8], - ) -> FindLeftmostMatches<'r, 'c, 't> { - FindLeftmostMatches { vm, cache, text, last_end: 0, last_match: None } +impl<'r, 'c, 'h> Iterator for FindMatches<'r, 'c, 'h> { + type Item = Match; + + #[inline] + fn next(&mut self) -> Option<Match> { + // Splitting 'self' apart seems necessary to appease borrowck. + let FindMatches { re, ref mut cache, ref mut caps, ref mut it } = + *self; + // 'advance' converts errors into panics, which is OK here because + // the PikeVM can never return an error. + it.advance(|input| { + re.search(cache, input, caps); + Ok(caps.get_match()) + }) } } -impl<'r, 'c, 't> Iterator for FindLeftmostMatches<'r, 'c, 't> { - // type Item = Captures; - type Item = MultiMatch; +/// An iterator over all non-overlapping leftmost matches, with their capturing +/// groups, for a particular search. +/// +/// The iterator yields a [`Captures`] value until no more matches could be +/// found. +/// +/// The lifetime parameters are as follows: +/// +/// * `'r` represents the lifetime of the PikeVM. +/// * `'c` represents the lifetime of the PikeVM's cache. +/// * `'h` represents the lifetime of the haystack being searched. +/// +/// This iterator can be created with the [`PikeVM::captures_iter`] method. +#[derive(Debug)] +pub struct CapturesMatches<'r, 'c, 'h> { + re: &'r PikeVM, + cache: &'c mut Cache, + caps: Captures, + it: iter::Searcher<'h>, +} - // fn next(&mut self) -> Option<Captures> { - fn next(&mut self) -> Option<MultiMatch> { - if self.last_end > self.text.len() { - return None; - } - let mut caps = self.vm.create_captures(); - let m = self.vm.find_leftmost_at( - self.cache, - self.text, - self.last_end, - self.text.len(), - &mut caps, - )?; - if m.is_empty() { - // This is an empty match. To ensure we make progress, start - // the next search at the smallest possible starting position - // of the next match following this one. - self.last_end = if self.vm.config.get_utf8() { - crate::util::next_utf8(self.text, m.end()) - } else { - m.end() + 1 - }; - // Don't accept empty matches immediately following a match. - // Just move on to the next match. - if Some(m.end()) == self.last_match { - return self.next(); - } +impl<'r, 'c, 'h> Iterator for CapturesMatches<'r, 'c, 'h> { + type Item = Captures; + + #[inline] + fn next(&mut self) -> Option<Captures> { + // Splitting 'self' apart seems necessary to appease borrowck. + let CapturesMatches { re, ref mut cache, ref mut caps, ref mut it } = + *self; + // 'advance' converts errors into panics, which is OK here because + // the PikeVM can never return an error. + it.advance(|input| { + re.search(cache, input, caps); + Ok(caps.get_match()) + }); + if caps.is_match() { + Some(caps.clone()) } else { - self.last_end = m.end(); + None } - self.last_match = Some(m.end()); - Some(m) } } +/// A cache represents mutable state that a [`PikeVM`] requires during a +/// search. +/// +/// For a given [`PikeVM`], its corresponding cache may be created either via +/// [`PikeVM::create_cache`], or via [`Cache::new`]. They are equivalent in +/// every way, except the former does not require explicitly importing `Cache`. +/// +/// A particular `Cache` is coupled with the [`PikeVM`] from which it +/// was created. It may only be used with that `PikeVM`. A cache and its +/// allocations may be re-purposed via [`Cache::reset`], in which case, it can +/// only be used with the new `PikeVM` (and not the old one). #[derive(Clone, Debug)] -pub struct Captures { - slots: Vec<Slot>, +pub struct Cache { + /// Stack used while computing epsilon closure. This effectively lets us + /// move what is more naturally expressed through recursion to a stack + /// on the heap. + stack: Vec<FollowEpsilon>, + /// The current active states being explored for the current byte in the + /// haystack. + curr: ActiveStates, + /// The next set of states we're building that will be explored for the + /// next byte in the haystack. + next: ActiveStates, } -impl Captures { - pub fn new(nfa: &NFA) -> Captures { - Captures { slots: vec![None; nfa.capture_slot_len()] } +impl Cache { + /// Create a new [`PikeVM`] cache. + /// + /// A potentially more convenient routine to create a cache is + /// [`PikeVM::create_cache`], as it does not require also importing the + /// `Cache` type. + /// + /// If you want to reuse the returned `Cache` with some other `PikeVM`, + /// then you must call [`Cache::reset`] with the desired `PikeVM`. + pub fn new(re: &PikeVM) -> Cache { + Cache { + stack: vec![], + curr: ActiveStates::new(re), + next: ActiveStates::new(re), + } + } + + /// Reset this cache such that it can be used for searching with a + /// different [`PikeVM`]. + /// + /// A cache reset permits reusing memory already allocated in this cache + /// with a different `PikeVM`. + /// + /// # Example + /// + /// This shows how to re-purpose a cache for use with a different `PikeVM`. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; + /// + /// let re1 = PikeVM::new(r"\w")?; + /// let re2 = PikeVM::new(r"\W")?; + /// + /// let mut cache = re1.create_cache(); + /// assert_eq!( + /// Some(Match::must(0, 0..2)), + /// re1.find_iter(&mut cache, "Δ").next(), + /// ); + /// + /// // Using 'cache' with re2 is not allowed. It may result in panics or + /// // incorrect results. In order to re-purpose the cache, we must reset + /// // it with the PikeVM we'd like to use it with. + /// // + /// // Similarly, after this reset, using the cache with 're1' is also not + /// // allowed. + /// cache.reset(&re2); + /// assert_eq!( + /// Some(Match::must(0, 0..3)), + /// re2.find_iter(&mut cache, "☃").next(), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn reset(&mut self, re: &PikeVM) { + self.curr.reset(re); + self.next.reset(re); + } + + /// Returns the heap memory usage, in bytes, of this cache. + /// + /// This does **not** include the stack size used up by this cache. To + /// compute that, use `std::mem::size_of::<Cache>()`. + pub fn memory_usage(&self) -> usize { + use core::mem::size_of; + (self.stack.len() * size_of::<FollowEpsilon>()) + + self.curr.memory_usage() + + self.next.memory_usage() + } + + /// Clears this cache. This should be called at the start of every search + /// to ensure we start with a clean slate. + /// + /// This also sets the length of the capturing groups used in the current + /// search. This permits an optimization where by 'SlotTable::for_state' + /// only returns the number of slots equivalent to the number of slots + /// given in the 'Captures' value. This may be less than the total number + /// of possible slots, e.g., when one only wants to track overall match + /// offsets. This in turn permits less copying of capturing group spans + /// in the PikeVM. + fn setup_search(&mut self, captures_slot_len: usize) { + self.stack.clear(); + self.curr.setup_search(captures_slot_len); + self.next.setup_search(captures_slot_len); } } +/// A set of active states used to "simulate" the execution of an NFA via the +/// PikeVM. +/// +/// There are two sets of these used during NFA simulation. One set corresponds +/// to the "current" set of states being traversed for the current position +/// in a haystack. The other set corresponds to the "next" set of states being +/// built, which will become the new "current" set for the next position in the +/// haystack. These two sets correspond to CLIST and NLIST in Thompson's +/// original paper regexes: https://dl.acm.org/doi/pdf/10.1145/363347.363387 +/// +/// In addition to representing a set of NFA states, this also maintains slot +/// values for each state. These slot values are what turn the NFA simulation +/// into the "Pike VM." Namely, they track capturing group values for each +/// state. During the computation of epsilon closure, we copy slot values from +/// states in the "current" set to the "next" set. Eventually, once a match +/// is found, the slot values for that match state are what we write to the +/// caller provided 'Captures' value. #[derive(Clone, Debug)] -pub struct Cache { - stack: Vec<FollowEpsilon>, - clist: Threads, - nlist: Threads, +struct ActiveStates { + /// The set of active NFA states. This set preserves insertion order, which + /// is critical for simulating the match semantics of backtracking regex + /// engines. + set: SparseSet, + /// The slots for every NFA state, where each slot stores a (possibly + /// absent) offset. Every capturing group has two slots. One for a start + /// offset and one for an end offset. + slot_table: SlotTable, } -type Slot = Option<usize>; +impl ActiveStates { + /// Create a new set of active states for the given PikeVM. The active + /// states returned may only be used with the given PikeVM. (Use 'reset' + /// to re-purpose the allocation for a different PikeVM.) + fn new(re: &PikeVM) -> ActiveStates { + let mut active = ActiveStates { + set: SparseSet::new(0), + slot_table: SlotTable::new(), + }; + active.reset(re); + active + } + + /// Reset this set of active states such that it can be used with the given + /// PikeVM (and only that PikeVM). + fn reset(&mut self, re: &PikeVM) { + self.set.resize(re.get_nfa().states().len()); + self.slot_table.reset(re); + } + + /// Return the heap memory usage, in bytes, used by this set of active + /// states. + /// + /// This does not include the stack size of this value. + fn memory_usage(&self) -> usize { + self.set.memory_usage() + self.slot_table.memory_usage() + } + /// Setup this set of active states for a new search. The given slot + /// length should be the number of slots in a caller provided 'Captures' + /// (and may be zero). + fn setup_search(&mut self, captures_slot_len: usize) { + self.set.clear(); + self.slot_table.setup_search(captures_slot_len); + } +} + +/// A table of slots, where each row represent a state in an NFA. Thus, the +/// table has room for storing slots for every single state in an NFA. +/// +/// This table is represented with a single contiguous allocation. In general, +/// the notion of "capturing group" doesn't really exist at this level of +/// abstraction, hence the name "slot" instead. (Indeed, every capturing group +/// maps to a pair of slots, one for the start offset and one for the end +/// offset.) Slots are indexed by the 'Captures' NFA state. +/// +/// N.B. Not every state actually needs a row of slots. Namely, states that +/// only have epsilon transitions currently never have anything written to +/// their rows in this table. Thus, the table is somewhat wasteful in its heap +/// usage. However, it is important to maintain fast random access by state +/// ID, which means one giant table tends to work well. RE2 takes a different +/// approach here and allocates each row as its own reference counted thing. +/// I explored such a strategy at one point here, but couldn't get it to work +/// well using entirely safe code. (To the ambitious reader: I encourage you to +/// re-litigate that experiment.) I very much wanted to stick to safe code, but +/// could be convinced otherwise if there was a solid argument and the safety +/// was encapsulated well. #[derive(Clone, Debug)] -struct Threads { - set: SparseSet, - caps: Vec<Slot>, - slots_per_thread: usize, +struct SlotTable { + /// The actual table of offsets. + table: Vec<Option<NonMaxUsize>>, + /// The number of slots per state, i.e., the table's stride or the length + /// of each row. + slots_per_state: usize, + /// The number of slots in the caller-provided 'Captures' value for the + /// current search. Setting this to 'slots_per_state' is always correct, + /// but may be wasteful. + slots_for_captures: usize, +} + +impl SlotTable { + /// Create a new slot table. + /// + /// One should call 'reset' with the corresponding PikeVM before use. + fn new() -> SlotTable { + SlotTable { table: vec![], slots_for_captures: 0, slots_per_state: 0 } + } + + /// Reset this slot table such that it can be used with the given PikeVM + /// (and only that PikeVM). + fn reset(&mut self, re: &PikeVM) { + let nfa = re.get_nfa(); + self.slots_per_state = nfa.group_info().slot_len(); + // This is always correct, but may be reduced for a particular search + // if a 'Captures' has fewer slots, e.g., none at all or only slots + // for tracking the overall match instead of all slots for every + // group. + self.slots_for_captures = core::cmp::max( + self.slots_per_state, + nfa.pattern_len().checked_mul(2).unwrap(), + ); + let len = nfa + .states() + .len() + .checked_mul(self.slots_per_state) + // Add space to account for scratch space used during a search. + .and_then(|x| x.checked_add(self.slots_for_captures)) + // It seems like this could actually panic on legitimate inputs on + // 32-bit targets, and very likely to panic on 16-bit. Should we + // somehow convert this to an error? What about something similar + // for the lazy DFA cache? If you're tripping this assert, please + // file a bug. + .expect("slot table length doesn't overflow"); + // This happens about as often as a regex is compiled, so it probably + // should be at debug level, but I found it quite distracting and not + // particularly useful. + trace!( + "resizing PikeVM active states table to {} entries \ + (slots_per_state={})", + len, + self.slots_per_state, + ); + self.table.resize(len, None); + } + + /// Return the heap memory usage, in bytes, used by this slot table. + /// + /// This does not include the stack size of this value. + fn memory_usage(&self) -> usize { + self.table.len() * core::mem::size_of::<Option<NonMaxUsize>>() + } + + /// Perform any per-search setup for this slot table. + /// + /// In particular, this sets the length of the number of slots used in the + /// 'Captures' given by the caller (if any at all). This number may be + /// smaller than the total number of slots available, e.g., when the caller + /// is only interested in tracking the overall match and not the spans of + /// every matching capturing group. Only tracking the overall match can + /// save a substantial amount of time copying capturing spans during a + /// search. + fn setup_search(&mut self, captures_slot_len: usize) { + self.slots_for_captures = captures_slot_len; + } + + /// Return a mutable slice of the slots for the given state. + /// + /// Note that the length of the slice returned may be less than the total + /// number of slots available for this state. In particular, the length + /// always matches the number of slots indicated via 'setup_search'. + fn for_state(&mut self, sid: StateID) -> &mut [Option<NonMaxUsize>] { + let i = sid.as_usize() * self.slots_per_state; + &mut self.table[i..i + self.slots_for_captures] + } + + /// Return a slice of slots of appropriate length where every slot offset + /// is guaranteed to be absent. This is useful in cases where you need to + /// compute an epsilon closure outside of the user supplied regex, and thus + /// never want it to have any capturing slots set. + fn all_absent(&mut self) -> &mut [Option<NonMaxUsize>] { + let i = self.table.len() - self.slots_for_captures; + &mut self.table[i..i + self.slots_for_captures] + } } +/// Represents a stack frame for use while computing an epsilon closure. +/// +/// (An "epsilon closure" refers to the set of reachable NFA states from a +/// single state without consuming any input. That is, the set of all epsilon +/// transitions not only from that single state, but from every other state +/// reachable by an epsilon transition as well. This is why it's called a +/// "closure." Computing an epsilon closure is also done during DFA +/// determinization! Compare and contrast the epsilon closure here in this +/// PikeVM and the one used for determinization in crate::util::determinize.) +/// +/// Computing the epsilon closure in a Thompson NFA proceeds via a depth +/// first traversal over all epsilon transitions from a particular state. +/// (A depth first traversal is important because it emulates the same priority +/// of matches that is typically found in backtracking regex engines.) This +/// depth first traversal is naturally expressed using recursion, but to avoid +/// a call stack size proportional to the size of a regex, we put our stack on +/// the heap instead. +/// +/// This stack thus consists of call frames. The typical call frame is +/// `Explore`, which instructs epsilon closure to explore the epsilon +/// transitions from that state. (Subsequent epsilon transitions are then +/// pushed on to the stack as more `Explore` frames.) If the state ID being +/// explored has no epsilon transitions, then the capturing group slots are +/// copied from the original state that sparked the epsilon closure (from the +/// 'step' routine) to the state ID being explored. This way, capturing group +/// slots are forwarded from the previous state to the next. +/// +/// The other stack frame, `RestoreCaptures`, instructs the epsilon closure to +/// set the position for a particular slot back to some particular offset. This +/// frame is pushed when `Explore` sees a `Capture` transition. `Explore` will +/// set the offset of the slot indicated in `Capture` to the current offset, +/// and then push the old offset on to the stack as a `RestoreCapture` frame. +/// Thus, the new offset is only used until the epsilon closure reverts back to +/// the `RestoreCapture` frame. In effect, this gives the `Capture` epsilon +/// transition its "scope" to only states that come "after" it during depth +/// first traversal. #[derive(Clone, Debug)] enum FollowEpsilon { - StateID(StateID), - Capture { slot: usize, pos: Slot }, + /// Explore the epsilon transitions from a state ID. + Explore(StateID), + /// Reset the given `slot` to the given `offset` (which might be `None`). + RestoreCapture { slot: SmallIndex, offset: Option<NonMaxUsize> }, } -impl Cache { - pub fn new(nfa: &NFA) -> Cache { - Cache { - stack: vec![], - clist: Threads::new(nfa), - nlist: Threads::new(nfa), +/// A set of counters that "instruments" a PikeVM search. To enable this, you +/// must enable the 'internal-instrument-pikevm' feature. Then run your Rust +/// program with RUST_LOG=regex_automata::nfa::thompson::pikevm=trace set in +/// the environment. The metrics collected will be dumped automatically for +/// every search executed by the PikeVM. +/// +/// NOTE: When 'internal-instrument-pikevm' is enabled, it will likely cause an +/// absolute decrease in wall-clock performance, even if the 'trace' log level +/// isn't enabled. (Although, we do try to avoid extra costs when 'trace' isn't +/// enabled.) The main point of instrumentation is to get counts of various +/// events that occur during the PikeVM's execution. +/// +/// This is a somewhat hacked together collection of metrics that are useful +/// to gather from a PikeVM search. In particular, it lets us scrutinize the +/// performance profile of a search beyond what general purpose profiling tools +/// give us. Namely, we orient the profiling data around the specific states of +/// the NFA. +/// +/// In other words, this lets us see which parts of the NFA graph are most +/// frequently activated. This then provides direction for optimization +/// opportunities. +/// +/// The really sad part about this is that it absolutely clutters up the PikeVM +/// implementation. :'( Another approach would be to just manually add this +/// code in whenever I want this kind of profiling data, but it's complicated +/// and tedious enough that I went with this approach... for now. +/// +/// When instrumentation is enabled (which also turns on 'logging'), then a +/// `Counters` is initialized for every search and `trace`'d just before the +/// search returns to the caller. +/// +/// Tip: When debugging performance problems with the PikeVM, it's best to try +/// to work with an NFA that is as small as possible. Otherwise the state graph +/// is likely to be too big to digest. +#[cfg(feature = "internal-instrument-pikevm")] +#[derive(Clone, Debug)] +struct Counters { + /// The number of times the NFA is in a particular permutation of states. + state_sets: alloc::collections::BTreeMap<Vec<StateID>, u64>, + /// The number of times 'step' is called for a particular state ID (which + /// indexes this array). + steps: Vec<u64>, + /// The number of times an epsilon closure was computed for a state. + closures: Vec<u64>, + /// The number of times a particular state ID is pushed on to a stack while + /// computing an epsilon closure. + stack_pushes: Vec<u64>, + /// The number of times a particular state ID is inserted into a sparse set + /// while computing an epsilon closure. + set_inserts: Vec<u64>, +} + +#[cfg(feature = "internal-instrument-pikevm")] +impl Counters { + fn empty() -> Counters { + Counters { + state_sets: alloc::collections::BTreeMap::new(), + steps: vec![], + closures: vec![], + stack_pushes: vec![], + set_inserts: vec![], } } - fn clear(&mut self) { - self.stack.clear(); - self.clist.set.clear(); - self.nlist.set.clear(); + fn reset(&mut self, nfa: &NFA) { + let len = nfa.states().len(); + + self.state_sets.clear(); + + self.steps.clear(); + self.steps.resize(len, 0); + + self.closures.clear(); + self.closures.resize(len, 0); + + self.stack_pushes.clear(); + self.stack_pushes.resize(len, 0); + + self.set_inserts.clear(); + self.set_inserts.resize(len, 0); + } + + fn eprint(&self, nfa: &NFA) { + trace!("===== START PikeVM Instrumentation Output ====="); + // We take the top-K most occurring state sets. Otherwise the output + // is likely to be overwhelming. And we probably only care about the + // most frequently occurring ones anyway. + const LIMIT: usize = 20; + let mut set_counts = + self.state_sets.iter().collect::<Vec<(&Vec<StateID>, &u64)>>(); + set_counts.sort_by_key(|(_, &count)| core::cmp::Reverse(count)); + trace!("## PikeVM frequency of state sets (top {})", LIMIT); + for (set, count) in set_counts.iter().take(LIMIT) { + trace!("{:?}: {}", set, count); + } + if set_counts.len() > LIMIT { + trace!( + "... {} sets omitted (out of {} total)", + set_counts.len() - LIMIT, + set_counts.len(), + ); + } + + trace!(""); + trace!("## PikeVM total frequency of events"); + trace!( + "steps: {}, closures: {}, stack-pushes: {}, set-inserts: {}", + self.steps.iter().copied().sum::<u64>(), + self.closures.iter().copied().sum::<u64>(), + self.stack_pushes.iter().copied().sum::<u64>(), + self.set_inserts.iter().copied().sum::<u64>(), + ); + + trace!(""); + trace!("## PikeVM frequency of events broken down by state"); + for sid in 0..self.steps.len() { + trace!( + "{:06}: steps: {}, closures: {}, \ + stack-pushes: {}, set-inserts: {}", + sid, + self.steps[sid], + self.closures[sid], + self.stack_pushes[sid], + self.set_inserts[sid], + ); + } + + trace!(""); + trace!("## NFA debug display"); + trace!("{:?}", nfa); + trace!("===== END PikeVM Instrumentation Output ====="); } - fn swap(&mut self) { - core::mem::swap(&mut self.clist, &mut self.nlist); + fn record_state_set(&mut self, set: &SparseSet) { + let set = set.iter().collect::<Vec<StateID>>(); + *self.state_sets.entry(set).or_insert(0) += 1; } -} -impl Threads { - fn new(nfa: &NFA) -> Threads { - let mut threads = Threads { - set: SparseSet::new(0), - caps: vec![], - slots_per_thread: 0, - }; - threads.resize(nfa); - threads + fn record_step(&mut self, sid: StateID) { + self.steps[sid] += 1; } - fn resize(&mut self, nfa: &NFA) { - if nfa.states().len() == self.set.capacity() { - return; - } - self.slots_per_thread = nfa.capture_slot_len(); - self.set.resize(nfa.states().len()); - self.caps.resize(self.slots_per_thread * nfa.states().len(), None); + fn record_closure(&mut self, sid: StateID) { + self.closures[sid] += 1; + } + + fn record_stack_push(&mut self, sid: StateID) { + self.stack_pushes[sid] += 1; } - fn caps(&mut self, sid: StateID) -> &mut [Slot] { - let i = sid.as_usize() * self.slots_per_thread; - &mut self.caps[i..i + self.slots_per_thread] + fn record_set_insert(&mut self, sid: StateID) { + self.set_inserts[sid] += 1; } } |