/*! An NFA backed Pike VM for executing regex searches with capturing groups. This module provides a [`PikeVM`] that works by simulating an NFA and resolving all spans of capturing groups that participate in a match. */ #[cfg(feature = "internal-instrument-pikevm")] use core::cell::RefCell; use alloc::{vec, vec::Vec}; use crate::{ nfa::thompson::{self, BuildError, State, NFA}, util::{ captures::Captures, empty, iter, prefilter::Prefilter, primitives::{NonMaxUsize, PatternID, SmallIndex, StateID}, search::{ Anchored, HalfMatch, Input, Match, MatchKind, PatternSet, Span, }, sparse_set::SparseSet, }, }; /// A simple macro for conditionally executing instrumentation logic when /// the 'trace' log level is enabled. This is a compile-time no-op when the /// 'internal-instrument-pikevm' feature isn't enabled. The intent here is that /// this makes it easier to avoid doing extra work when instrumentation isn't /// enabled. /// /// This macro accepts a closure of type `|&mut Counters|`. The closure can /// then increment counters (or whatever) in accordance with what one wants /// to track. macro_rules! instrument { ($fun:expr) => { #[cfg(feature = "internal-instrument-pikevm")] { let fun: &mut dyn FnMut(&mut Counters) = &mut $fun; COUNTERS.with(|c: &RefCell| fun(&mut *c.borrow_mut())); } }; } #[cfg(feature = "internal-instrument-pikevm")] std::thread_local! { /// Effectively global state used to keep track of instrumentation /// counters. The "proper" way to do this is to thread it through the /// PikeVM, but it makes the code quite icky. Since this is just a /// debugging feature, we're content to relegate it to thread local /// state. When instrumentation is enabled, the counters are reset at the /// beginning of every search and printed (with the 'trace' log level) at /// the end of every search. static COUNTERS: RefCell = RefCell::new(Counters::empty()); } /// The configuration used for building a [`PikeVM`]. /// /// A PikeVM configuration is a simple data object that is typically used with /// [`Builder::configure`]. It can be cheaply cloned. /// /// A default configuration can be created either with `Config::new`, or /// perhaps more conveniently, with [`PikeVM::config`]. #[derive(Clone, Debug, Default)] pub struct Config { match_kind: Option, pre: Option>, } impl Config { /// Return a new default PikeVM configuration. pub fn new() -> Config { Config::default() } /// Set the desired match semantics. /// /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the /// match semantics of Perl-like regex engines. That is, when multiple /// patterns would match at the same leftmost position, the pattern that /// appears first in the concrete syntax is chosen. /// /// Currently, the only other kind of match semantics supported is /// [`MatchKind::All`]. This corresponds to "classical DFA" construction /// where all possible matches are visited in the NFA by the `PikeVM`. /// /// Typically, `All` is used when one wants to execute an overlapping /// search and `LeftmostFirst` otherwise. In particular, it rarely makes /// sense to use `All` with the various "leftmost" find routines, since the /// leftmost routines depend on the `LeftmostFirst` automata construction /// strategy. Specifically, `LeftmostFirst` results in the `PikeVM` /// simulating dead states as a way to terminate the search and report a /// match. `LeftmostFirst` also supports non-greedy matches using this /// strategy where as `All` does not. 
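///
/// # Example
///
/// A brief sketch of the difference (the patterns and haystack below are
/// illustrative, borrowed from the overlapping-search discussion later in
/// this module):
///
/// ```
/// use regex_automata::{
///     nfa::thompson::pikevm::PikeVM,
///     Input, MatchKind, PatternSet,
/// };
///
/// // With 'All' semantics, an overlapping search reports every pattern
/// // that matches, even when they start at the same position.
/// let re = PikeVM::builder()
///     .configure(PikeVM::config().match_kind(MatchKind::All))
///     .build_many(&["sam", "samwise"])?;
/// let mut cache = re.create_cache();
/// let input = Input::new("samwise");
/// let mut patset = PatternSet::new(re.pattern_len());
/// re.which_overlapping_matches(&mut cache, &input, &mut patset);
/// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect();
/// assert_eq!(vec![0, 1], got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```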
pub fn match_kind(mut self, kind: MatchKind) -> Config { self.match_kind = Some(kind); self } /// Set a prefilter to be used whenever a start state is entered. /// /// A [`Prefilter`] in this context is meant to accelerate searches by /// looking for literal prefixes that every match for the corresponding /// pattern (or patterns) must start with. Once a prefilter produces a /// match, the underlying search routine continues on to try and confirm /// the match. /// /// Be warned that setting a prefilter does not guarantee that the search /// will be faster. While it's usually a good bet, if the prefilter /// produces a lot of false positive candidates (i.e., positions matched /// by the prefilter but not by the regex), then the overall result can /// be slower than if you had just executed the regex engine without any /// prefilters. /// /// By default no prefilter is set. /// /// # Example /// /// ``` /// use regex_automata::{ /// nfa::thompson::pikevm::PikeVM, /// util::prefilter::Prefilter, /// Input, Match, MatchKind, /// }; /// /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]); /// let re = PikeVM::builder() /// .configure(PikeVM::config().prefilter(pre)) /// .build(r"(foo|bar)[a-z]+")?; /// let mut cache = re.create_cache(); /// let input = Input::new("foo1 barfox bar"); /// assert_eq!(Some(Match::must(0, 5..11)), re.find(&mut cache, input)); /// /// # Ok::<(), Box>(()) /// ``` /// /// Be warned though that an incorrect prefilter can lead to incorrect /// results! /// /// ``` /// use regex_automata::{ /// nfa::thompson::pikevm::PikeVM, /// util::prefilter::Prefilter, /// Input, HalfMatch, MatchKind, /// }; /// /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]); /// let re = PikeVM::builder() /// .configure(PikeVM::config().prefilter(pre)) /// .build(r"(foo|bar)[a-z]+")?; /// let mut cache = re.create_cache(); /// let input = Input::new("foo1 barfox bar"); /// // No match reported even though there clearly is one! /// assert_eq!(None, re.find(&mut cache, input)); /// /// # Ok::<(), Box>(()) /// ``` pub fn prefilter(mut self, pre: Option) -> Config { self.pre = Some(pre); self } /// Returns the match semantics set in this configuration. pub fn get_match_kind(&self) -> MatchKind { self.match_kind.unwrap_or(MatchKind::LeftmostFirst) } /// Returns the prefilter set in this configuration, if one at all. pub fn get_prefilter(&self) -> Option<&Prefilter> { self.pre.as_ref().unwrap_or(&None).as_ref() } /// Overwrite the default configuration such that the options in `o` are /// always used. If an option in `o` is not set, then the corresponding /// option in `self` is used. If it's not set in `self` either, then it /// remains not set. pub(crate) fn overwrite(&self, o: Config) -> Config { Config { match_kind: o.match_kind.or(self.match_kind), pre: o.pre.or_else(|| self.pre.clone()), } } } /// A builder for a `PikeVM`. /// /// This builder permits configuring options for the syntax of a pattern, /// the NFA construction and the `PikeVM` construction. This builder is /// different from a general purpose regex builder in that it permits fine /// grain configuration of the construction process. The trade off for this is /// complexity, and the possibility of setting a configuration that might not /// make sense. For example, there are two different UTF-8 modes: /// /// * [`util::syntax::Config::utf8`](crate::util::syntax::Config::utf8) /// controls whether the pattern itself can contain sub-expressions that match /// invalid UTF-8. 
/// * [`thompson::Config::utf8`] controls whether empty matches that split a /// Unicode codepoint are reported or not. /// /// Generally speaking, callers will want to either enable all of these or /// disable all of these. /// /// # Example /// /// This example shows how to disable UTF-8 mode in the syntax and the regex /// itself. This is generally what you want for matching on arbitrary bytes. /// /// ``` /// use regex_automata::{ /// nfa::thompson::{self, pikevm::PikeVM}, /// util::syntax, /// Match, /// }; /// /// let re = PikeVM::builder() /// .syntax(syntax::Config::new().utf8(false)) /// .thompson(thompson::Config::new().utf8(false)) /// .build(r"foo(?-u:[^b])ar.*")?; /// let mut cache = re.create_cache(); /// /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; /// let expected = Some(Match::must(0, 1..9)); /// let got = re.find_iter(&mut cache, haystack).next(); /// assert_eq!(expected, got); /// // Notice that `(?-u:[^b])` matches invalid UTF-8, /// // but the subsequent `.*` does not! Disabling UTF-8 /// // on the syntax permits this. /// // /// // N.B. This example does not show the impact of /// // disabling UTF-8 mode on a PikeVM Config, since that /// // only impacts regexes that can produce matches of /// // length 0. /// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]); /// /// # Ok::<(), Box>(()) /// ``` #[derive(Clone, Debug)] pub struct Builder { config: Config, #[cfg(feature = "syntax")] thompson: thompson::Compiler, } impl Builder { /// Create a new PikeVM builder with its default configuration. pub fn new() -> Builder { Builder { config: Config::default(), #[cfg(feature = "syntax")] thompson: thompson::Compiler::new(), } } /// Build a `PikeVM` from the given pattern. /// /// If there was a problem parsing or compiling the pattern, then an error /// is returned. #[cfg(feature = "syntax")] pub fn build(&self, pattern: &str) -> Result { self.build_many(&[pattern]) } /// Build a `PikeVM` from the given patterns. #[cfg(feature = "syntax")] pub fn build_many>( &self, patterns: &[P], ) -> Result { let nfa = self.thompson.build_many(patterns)?; self.build_from_nfa(nfa) } /// Build a `PikeVM` directly from its NFA. /// /// Note that when using this method, any configuration that applies to the /// construction of the NFA itself will of course be ignored, since the NFA /// given here is already built. pub fn build_from_nfa(&self, nfa: NFA) -> Result { nfa.look_set_any().available().map_err(BuildError::word)?; Ok(PikeVM { config: self.config.clone(), nfa }) } /// Apply the given `PikeVM` configuration options to this builder. pub fn configure(&mut self, config: Config) -> &mut Builder { self.config = self.config.overwrite(config); self } /// Set the syntax configuration for this builder using /// [`syntax::Config`](crate::util::syntax::Config). /// /// This permits setting things like case insensitivity, Unicode and multi /// line mode. /// /// These settings only apply when constructing a PikeVM directly from a /// pattern. #[cfg(feature = "syntax")] pub fn syntax( &mut self, config: crate::util::syntax::Config, ) -> &mut Builder { self.thompson.syntax(config); self } /// Set the Thompson NFA configuration for this builder using /// [`nfa::thompson::Config`](crate::nfa::thompson::Config). /// /// This permits setting things like if additional time should be spent /// shrinking the size of the NFA. /// /// These settings only apply when constructing a PikeVM directly from a /// pattern. 
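///
/// # Example
///
/// A small sketch of tuning NFA construction (the `shrink` option is one
/// example of "additional time spent shrinking the NFA"; treat the specific
/// knob as an assumption rather than a recommendation):
///
/// ```
/// use regex_automata::nfa::thompson::{self, pikevm::PikeVM};
///
/// let re = PikeVM::builder()
///     .thompson(thompson::Config::new().shrink(true))
///     .build(r"[a-z]+[0-9]?")?;
/// let mut cache = re.create_cache();
/// assert!(re.is_match(&mut cache, "abc1"));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```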
#[cfg(feature = "syntax")] pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { self.thompson.configure(config); self } } /// A virtual machine for executing regex searches with capturing groups. /// /// # Infallible APIs /// /// Unlike most other regex engines in this crate, a `PikeVM` never returns an /// error at search time. It supports all [`Anchored`] configurations, never /// quits and works on haystacks of arbitrary length. /// /// There are two caveats to mention though: /// /// * If an invalid pattern ID is given to a search via [`Anchored::Pattern`], /// then the PikeVM will report "no match." This is consistent with all other /// regex engines in this crate. /// * When using [`PikeVM::which_overlapping_matches`] with a [`PatternSet`] /// that has insufficient capacity to store all valid pattern IDs, then if a /// match occurs for a `PatternID` that cannot be inserted, it is silently /// dropped as if it did not match. /// /// # Advice /// /// The `PikeVM` is generally the most "powerful" regex engine in this crate. /// "Powerful" in this context means that it can handle any regular expression /// that is parseable by `regex-syntax` and any size haystack. Regretably, /// the `PikeVM` is also simultaneously often the _slowest_ regex engine in /// practice. This results in an annoying situation where one generally tries /// to pick any other regex engine (or perhaps none at all) before being /// forced to fall back to a `PikeVM`. /// /// For example, a common strategy for dealing with capturing groups is to /// actually look for the overall match of the regex using a faster regex /// engine, like a [lazy DFA](crate::hybrid::regex::Regex). Once the overall /// match is found, one can then run the `PikeVM` on just the match span to /// find the spans of the capturing groups. In this way, the faster regex /// engine does the majority of the work, while the `PikeVM` only lends its /// power in a more limited role. /// /// Unfortunately, this isn't always possible because the faster regex engines /// don't support all of the regex features in `regex-syntax`. This notably /// includes (and is currently limited to) Unicode word boundaries. So if /// your pattern has Unicode word boundaries, you typically can't use a /// DFA-based regex engine at all (unless you [enable heuristic support for /// it](crate::hybrid::dfa::Config::unicode_word_boundary)). (The [one-pass /// DFA](crate::dfa::onepass::DFA) can handle Unicode word boundaries for /// anchored searches only, but in a cruel sort of joke, many Unicode features /// tend to result in making the regex _not_ one-pass.) /// /// # Example /// /// This example shows that the `PikeVM` implements Unicode word boundaries /// correctly by default. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; /// /// let re = PikeVM::new(r"\b\w+\b")?; /// let mut cache = re.create_cache(); /// /// let mut it = re.find_iter(&mut cache, "Шерлок Холмс"); /// assert_eq!(Some(Match::must(0, 0..12)), it.next()); /// assert_eq!(Some(Match::must(0, 13..23)), it.next()); /// assert_eq!(None, it.next()); /// # Ok::<(), Box>(()) /// ``` #[derive(Clone, Debug)] pub struct PikeVM { config: Config, nfa: NFA, } impl PikeVM { /// Parse the given regular expression using the default configuration and /// return the corresponding `PikeVM`. /// /// If you want a non-default configuration, then use the [`Builder`] to /// set your own configuration. 
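///
/// For instance, here is a sketch of the builder route that enables case
/// insensitivity via the syntax configuration (the specific option is just
/// for illustration):
///
/// ```
/// use regex_automata::{nfa::thompson::pikevm::PikeVM, util::syntax, Match};
///
/// let re = PikeVM::builder()
///     .syntax(syntax::Config::new().case_insensitive(true))
///     .build("foo[0-9]+bar")?;
/// let mut cache = re.create_cache();
/// assert_eq!(
///     Some(Match::must(0, 3..14)),
///     re.find_iter(&mut cache, "zzzFOO12345BARzzz").next(),
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```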
/// /// # Example /// /// ``` /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; /// /// let re = PikeVM::new("foo[0-9]+bar")?; /// let mut cache = re.create_cache(); /// assert_eq!( /// Some(Match::must(0, 3..14)), /// re.find_iter(&mut cache, "zzzfoo12345barzzz").next(), /// ); /// # Ok::<(), Box>(()) /// ``` #[cfg(feature = "syntax")] pub fn new(pattern: &str) -> Result { PikeVM::builder().build(pattern) } /// Like `new`, but parses multiple patterns into a single "multi regex." /// This similarly uses the default regex configuration. /// /// # Example /// /// ``` /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; /// /// let re = PikeVM::new_many(&["[a-z]+", "[0-9]+"])?; /// let mut cache = re.create_cache(); /// /// let mut it = re.find_iter(&mut cache, "abc 1 foo 4567 0 quux"); /// assert_eq!(Some(Match::must(0, 0..3)), it.next()); /// assert_eq!(Some(Match::must(1, 4..5)), it.next()); /// assert_eq!(Some(Match::must(0, 6..9)), it.next()); /// assert_eq!(Some(Match::must(1, 10..14)), it.next()); /// assert_eq!(Some(Match::must(1, 15..16)), it.next()); /// assert_eq!(Some(Match::must(0, 17..21)), it.next()); /// assert_eq!(None, it.next()); /// # Ok::<(), Box>(()) /// ``` #[cfg(feature = "syntax")] pub fn new_many>( patterns: &[P], ) -> Result { PikeVM::builder().build_many(patterns) } /// Like `new`, but builds a PikeVM directly from an NFA. This is useful /// if you already have an NFA, or even if you hand-assembled the NFA. /// /// # Example /// /// This shows how to hand assemble a regular expression via its HIR, /// compile an NFA from it and build a PikeVM from the NFA. /// /// ``` /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange}; /// /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![ /// ClassBytesRange::new(b'0', b'9'), /// ClassBytesRange::new(b'A', b'Z'), /// ClassBytesRange::new(b'_', b'_'), /// ClassBytesRange::new(b'a', b'z'), /// ]))); /// /// let config = NFA::config().nfa_size_limit(Some(1_000)); /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?; /// /// let re = PikeVM::new_from_nfa(nfa)?; /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); /// let expected = Some(Match::must(0, 3..4)); /// re.captures(&mut cache, "!@#A#@!", &mut caps); /// assert_eq!(expected, caps.get_match()); /// /// # Ok::<(), Box>(()) /// ``` pub fn new_from_nfa(nfa: NFA) -> Result { PikeVM::builder().build_from_nfa(nfa) } /// Create a new `PikeVM` that matches every input. /// /// # Example /// /// ``` /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; /// /// let re = PikeVM::always_match()?; /// let mut cache = re.create_cache(); /// /// let expected = Match::must(0, 0..0); /// assert_eq!(Some(expected), re.find_iter(&mut cache, "").next()); /// assert_eq!(Some(expected), re.find_iter(&mut cache, "foo").next()); /// # Ok::<(), Box>(()) /// ``` pub fn always_match() -> Result { let nfa = thompson::NFA::always_match(); PikeVM::new_from_nfa(nfa) } /// Create a new `PikeVM` that never matches any input. 
/// /// # Example /// /// ``` /// use regex_automata::nfa::thompson::pikevm::PikeVM; /// /// let re = PikeVM::never_match()?; /// let mut cache = re.create_cache(); /// /// assert_eq!(None, re.find_iter(&mut cache, "").next()); /// assert_eq!(None, re.find_iter(&mut cache, "foo").next()); /// # Ok::<(), Box>(()) /// ``` pub fn never_match() -> Result { let nfa = thompson::NFA::never_match(); PikeVM::new_from_nfa(nfa) } /// Return a default configuration for a `PikeVM`. /// /// This is a convenience routine to avoid needing to import the `Config` /// type when customizing the construction of a `PikeVM`. /// /// # Example /// /// This example shows how to disable UTF-8 mode. When UTF-8 mode is /// disabled, zero-width matches that split a codepoint are allowed. /// Otherwise they are never reported. /// /// In the code below, notice that `""` is permitted to match positions /// that split the encoding of a codepoint. /// /// ``` /// use regex_automata::{nfa::thompson::{self, pikevm::PikeVM}, Match}; /// /// let re = PikeVM::builder() /// .thompson(thompson::Config::new().utf8(false)) /// .build(r"")?; /// let mut cache = re.create_cache(); /// /// let haystack = "a☃z"; /// let mut it = re.find_iter(&mut cache, haystack); /// assert_eq!(Some(Match::must(0, 0..0)), it.next()); /// assert_eq!(Some(Match::must(0, 1..1)), it.next()); /// assert_eq!(Some(Match::must(0, 2..2)), it.next()); /// assert_eq!(Some(Match::must(0, 3..3)), it.next()); /// assert_eq!(Some(Match::must(0, 4..4)), it.next()); /// assert_eq!(Some(Match::must(0, 5..5)), it.next()); /// assert_eq!(None, it.next()); /// /// # Ok::<(), Box>(()) /// ``` pub fn config() -> Config { Config::new() } /// Return a builder for configuring the construction of a `PikeVM`. /// /// This is a convenience routine to avoid needing to import the /// [`Builder`] type in common cases. /// /// # Example /// /// This example shows how to use the builder to disable UTF-8 mode /// everywhere. /// /// ``` /// use regex_automata::{ /// nfa::thompson::{self, pikevm::PikeVM}, /// util::syntax, /// Match, /// }; /// /// let re = PikeVM::builder() /// .syntax(syntax::Config::new().utf8(false)) /// .thompson(thompson::Config::new().utf8(false)) /// .build(r"foo(?-u:[^b])ar.*")?; /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); /// /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; /// let expected = Some(Match::must(0, 1..9)); /// re.captures(&mut cache, haystack, &mut caps); /// assert_eq!(expected, caps.get_match()); /// /// # Ok::<(), Box>(()) /// ``` pub fn builder() -> Builder { Builder::new() } /// Create a new empty set of capturing groups that is guaranteed to be /// valid for the search APIs on this `PikeVM`. /// /// A `Captures` value created for a specific `PikeVM` cannot be used with /// any other `PikeVM`. /// /// This is a convenience function for [`Captures::all`]. See the /// [`Captures`] documentation for an explanation of its alternative /// constructors that permit the `PikeVM` to do less work during a search, /// and thus might make it faster. pub fn create_captures(&self) -> Captures { Captures::all(self.get_nfa().group_info().clone()) } /// Create a new cache for this `PikeVM`. /// /// The cache returned should only be used for searches for this /// `PikeVM`. If you want to reuse the cache for another `PikeVM`, then /// you must call [`Cache::reset`] with that `PikeVM` (or, equivalently, /// [`PikeVM::reset_cache`]). 
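///
/// # Example
///
/// A minimal sketch of creating a cache and using it for a search:
///
/// ```
/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
///
/// let re = PikeVM::new(r"[0-9]+")?;
/// let mut cache = re.create_cache();
/// assert_eq!(
///     Some(Match::must(0, 3..6)),
///     re.find(&mut cache, "abc123"),
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```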
pub fn create_cache(&self) -> Cache { Cache::new(self) } /// Reset the given cache such that it can be used for searching with the /// this `PikeVM` (and only this `PikeVM`). /// /// A cache reset permits reusing memory already allocated in this cache /// with a different `PikeVM`. /// /// # Example /// /// This shows how to re-purpose a cache for use with a different `PikeVM`. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; /// /// let re1 = PikeVM::new(r"\w")?; /// let re2 = PikeVM::new(r"\W")?; /// /// let mut cache = re1.create_cache(); /// assert_eq!( /// Some(Match::must(0, 0..2)), /// re1.find_iter(&mut cache, "Δ").next(), /// ); /// /// // Using 'cache' with re2 is not allowed. It may result in panics or /// // incorrect results. In order to re-purpose the cache, we must reset /// // it with the PikeVM we'd like to use it with. /// // /// // Similarly, after this reset, using the cache with 're1' is also not /// // allowed. /// re2.reset_cache(&mut cache); /// assert_eq!( /// Some(Match::must(0, 0..3)), /// re2.find_iter(&mut cache, "☃").next(), /// ); /// /// # Ok::<(), Box>(()) /// ``` pub fn reset_cache(&self, cache: &mut Cache) { cache.reset(self); } /// Returns the total number of patterns compiled into this `PikeVM`. /// /// In the case of a `PikeVM` that contains no patterns, this returns `0`. /// /// # Example /// /// This example shows the pattern length for a `PikeVM` that never /// matches: /// /// ``` /// use regex_automata::nfa::thompson::pikevm::PikeVM; /// /// let re = PikeVM::never_match()?; /// assert_eq!(re.pattern_len(), 0); /// # Ok::<(), Box>(()) /// ``` /// /// And another example for a `PikeVM` that matches at every position: /// /// ``` /// use regex_automata::nfa::thompson::pikevm::PikeVM; /// /// let re = PikeVM::always_match()?; /// assert_eq!(re.pattern_len(), 1); /// # Ok::<(), Box>(()) /// ``` /// /// And finally, a `PikeVM` that was constructed from multiple patterns: /// /// ``` /// use regex_automata::nfa::thompson::pikevm::PikeVM; /// /// let re = PikeVM::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?; /// assert_eq!(re.pattern_len(), 3); /// # Ok::<(), Box>(()) /// ``` pub fn pattern_len(&self) -> usize { self.nfa.pattern_len() } /// Return the config for this `PikeVM`. #[inline] pub fn get_config(&self) -> &Config { &self.config } /// Returns a reference to the underlying NFA. #[inline] pub fn get_nfa(&self) -> &NFA { &self.nfa } } impl PikeVM { /// Returns true if and only if this `PikeVM` matches the given haystack. /// /// This routine may short circuit if it knows that scanning future /// input will never lead to a different result. In particular, if the /// underlying NFA enters a match state, then this routine will return /// `true` immediately without inspecting any future input. (Consider how /// this might make a difference given the regex `a+` on the haystack /// `aaaaaaaaaaaaaaa`. This routine can stop after it sees the first `a`, /// but routines like `find` need to continue searching because `+` is /// greedy by default.) 
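///
/// A small illustration of that point (both calls report a match, but only
/// `find` reports the full span demanded by the greedy `+`):
///
/// ```
/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
///
/// let re = PikeVM::new("a+")?;
/// let mut cache = re.create_cache();
/// let haystack = "aaaaaaaaaaaaaaa";
/// assert!(re.is_match(&mut cache, haystack));
/// assert_eq!(
///     Some(Match::must(0, 0..haystack.len())),
///     re.find(&mut cache, haystack),
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```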
/// /// # Example /// /// This shows basic usage: /// /// ``` /// use regex_automata::nfa::thompson::pikevm::PikeVM; /// /// let re = PikeVM::new("foo[0-9]+bar")?; /// let mut cache = re.create_cache(); /// /// assert!(re.is_match(&mut cache, "foo12345bar")); /// assert!(!re.is_match(&mut cache, "foobar")); /// # Ok::<(), Box>(()) /// ``` /// /// # Example: consistency with search APIs /// /// `is_match` is guaranteed to return `true` whenever `find` returns a /// match. This includes searches that are executed entirely within a /// codepoint: /// /// ``` /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Input}; /// /// let re = PikeVM::new("a*")?; /// let mut cache = re.create_cache(); /// /// assert!(!re.is_match(&mut cache, Input::new("☃").span(1..2))); /// # Ok::<(), Box>(()) /// ``` /// /// Notice that when UTF-8 mode is disabled, then the above reports a /// match because the restriction against zero-width matches that split a /// codepoint has been lifted: /// /// ``` /// use regex_automata::{nfa::thompson::{pikevm::PikeVM, NFA}, Input}; /// /// let re = PikeVM::builder() /// .thompson(NFA::config().utf8(false)) /// .build("a*")?; /// let mut cache = re.create_cache(); /// /// assert!(re.is_match(&mut cache, Input::new("☃").span(1..2))); /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn is_match<'h, I: Into>>( &self, cache: &mut Cache, input: I, ) -> bool { let input = input.into().earliest(true); self.search_slots(cache, &input, &mut []).is_some() } /// Executes a leftmost forward search and returns a `Match` if one exists. /// /// This routine only includes the overall match span. To get access to the /// individual spans of each capturing group, use [`PikeVM::captures`]. /// /// # Example /// /// Leftmost first match semantics corresponds to the match with the /// smallest starting offset, but where the end offset is determined by /// preferring earlier branches in the original regular expression. For /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` /// will match `Samwise` in `Samwise`. /// /// Generally speaking, the "leftmost first" match is how most backtracking /// regular expressions tend to work. This is in contrast to POSIX-style /// regular expressions that yield "leftmost longest" matches. Namely, /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using /// leftmost longest semantics. (This crate does not currently support /// leftmost longest semantics.) /// /// ``` /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; /// /// let re = PikeVM::new("foo[0-9]+")?; /// let mut cache = re.create_cache(); /// let expected = Match::must(0, 0..8); /// assert_eq!(Some(expected), re.find(&mut cache, "foo12345")); /// /// // Even though a match is found after reading the first byte (`a`), /// // the leftmost first match semantics demand that we find the earliest /// // match that prefers earlier parts of the pattern over later parts. 
/// let re = PikeVM::new("abc|a")?; /// let mut cache = re.create_cache(); /// let expected = Match::must(0, 0..3); /// assert_eq!(Some(expected), re.find(&mut cache, "abc")); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn find<'h, I: Into>>( &self, cache: &mut Cache, input: I, ) -> Option { let input = input.into(); if self.get_nfa().pattern_len() == 1 { let mut slots = [None, None]; let pid = self.search_slots(cache, &input, &mut slots)?; let start = slots[0]?.get(); let end = slots[1]?.get(); return Some(Match::new(pid, Span { start, end })); } let ginfo = self.get_nfa().group_info(); let slots_len = ginfo.implicit_slot_len(); let mut slots = vec![None; slots_len]; let pid = self.search_slots(cache, &input, &mut slots)?; let start = slots[pid.as_usize() * 2]?.get(); let end = slots[pid.as_usize() * 2 + 1]?.get(); Some(Match::new(pid, Span { start, end })) } /// Executes a leftmost forward search and writes the spans of capturing /// groups that participated in a match into the provided [`Captures`] /// value. If no match was found, then [`Captures::is_match`] is guaranteed /// to return `false`. /// /// # Example /// /// ``` /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; /// /// let re = PikeVM::new(r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$")?; /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); /// /// re.captures(&mut cache, "2010-03-14", &mut caps); /// assert!(caps.is_match()); /// assert_eq!(Some(Span::from(0..4)), caps.get_group(1)); /// assert_eq!(Some(Span::from(5..7)), caps.get_group(2)); /// assert_eq!(Some(Span::from(8..10)), caps.get_group(3)); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn captures<'h, I: Into>>( &self, cache: &mut Cache, input: I, caps: &mut Captures, ) { self.search(cache, &input.into(), caps) } /// Returns an iterator over all non-overlapping leftmost matches in the /// given bytes. If no match exists, then the iterator yields no elements. /// /// # Example /// /// ``` /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; /// /// let re = PikeVM::new("foo[0-9]+")?; /// let mut cache = re.create_cache(); /// /// let text = "foo1 foo12 foo123"; /// let matches: Vec = re.find_iter(&mut cache, text).collect(); /// assert_eq!(matches, vec![ /// Match::must(0, 0..4), /// Match::must(0, 5..10), /// Match::must(0, 11..17), /// ]); /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn find_iter<'r, 'c, 'h, I: Into>>( &'r self, cache: &'c mut Cache, input: I, ) -> FindMatches<'r, 'c, 'h> { let caps = Captures::matches(self.get_nfa().group_info().clone()); let it = iter::Searcher::new(input.into()); FindMatches { re: self, cache, caps, it } } /// Returns an iterator over all non-overlapping `Captures` values. If no /// match exists, then the iterator yields no elements. /// /// This yields the same matches as [`PikeVM::find_iter`], but it includes /// the spans of all capturing groups that participate in each match. /// /// **Tip:** See [`util::iter::Searcher`](crate::util::iter::Searcher) for /// how to correctly iterate over all matches in a haystack while avoiding /// the creation of a new `Captures` value for every match. (Which you are /// forced to do with an `Iterator`.) 
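///
/// A sketch of that approach, reusing a single `Captures` value by driving a
/// `Searcher` directly (the closure shape mirrors how the iterators in this
/// module are implemented; treat the details as illustrative):
///
/// ```
/// use regex_automata::{
///     nfa::thompson::pikevm::PikeVM,
///     util::iter::Searcher,
///     Input, Span,
/// };
///
/// let re = PikeVM::new(r"foo(?P<numbers>[0-9]+)")?;
/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
///
/// let mut searcher = Searcher::new(Input::new("foo1 foo12 foo123"));
/// let mut spans = vec![];
/// while searcher
///     .advance(|input| {
///         re.search(&mut cache, input, &mut caps);
///         Ok(caps.get_match())
///     })
///     .is_some()
/// {
///     // 'caps' is reused on every iteration instead of being cloned.
///     spans.push(caps.get_group_by_name("numbers").unwrap());
/// }
/// assert_eq!(spans, vec![
///     Span::from(3..4),
///     Span::from(8..10),
///     Span::from(14..17),
/// ]);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```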
/// /// # Example /// /// ``` /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; /// /// let re = PikeVM::new("foo(?P[0-9]+)")?; /// let mut cache = re.create_cache(); /// /// let text = "foo1 foo12 foo123"; /// let matches: Vec = re /// .captures_iter(&mut cache, text) /// // The unwrap is OK since 'numbers' matches if the pattern matches. /// .map(|caps| caps.get_group_by_name("numbers").unwrap()) /// .collect(); /// assert_eq!(matches, vec![ /// Span::from(3..4), /// Span::from(8..10), /// Span::from(14..17), /// ]); /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn captures_iter<'r, 'c, 'h, I: Into>>( &'r self, cache: &'c mut Cache, input: I, ) -> CapturesMatches<'r, 'c, 'h> { let caps = self.create_captures(); let it = iter::Searcher::new(input.into()); CapturesMatches { re: self, cache, caps, it } } } impl PikeVM { /// Executes a leftmost forward search and writes the spans of capturing /// groups that participated in a match into the provided [`Captures`] /// value. If no match was found, then [`Captures::is_match`] is guaranteed /// to return `false`. /// /// This is like [`PikeVM::captures`], but it accepts a concrete `&Input` /// instead of an `Into`. /// /// # Example: specific pattern search /// /// This example shows how to build a multi-PikeVM that permits searching /// for specific patterns. /// /// ``` /// use regex_automata::{ /// nfa::thompson::pikevm::PikeVM, /// Anchored, Match, PatternID, Input, /// }; /// /// let re = PikeVM::new_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?; /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); /// let haystack = "foo123"; /// /// // Since we are using the default leftmost-first match and both /// // patterns match at the same starting position, only the first pattern /// // will be returned in this case when doing a search for any of the /// // patterns. /// let expected = Some(Match::must(0, 0..6)); /// re.search(&mut cache, &Input::new(haystack), &mut caps); /// assert_eq!(expected, caps.get_match()); /// /// // But if we want to check whether some other pattern matches, then we /// // can provide its pattern ID. /// let expected = Some(Match::must(1, 0..6)); /// let input = Input::new(haystack) /// .anchored(Anchored::Pattern(PatternID::must(1))); /// re.search(&mut cache, &input, &mut caps); /// assert_eq!(expected, caps.get_match()); /// /// # Ok::<(), Box>(()) /// ``` /// /// # Example: specifying the bounds of a search /// /// This example shows how providing the bounds of a search can produce /// different results than simply sub-slicing the haystack. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match, Input}; /// /// let re = PikeVM::new(r"\b[0-9]{3}\b")?; /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); /// let haystack = "foo123bar"; /// /// // Since we sub-slice the haystack, the search doesn't know about /// // the larger context and assumes that `123` is surrounded by word /// // boundaries. And of course, the match position is reported relative /// // to the sub-slice as well, which means we get `0..3` instead of /// // `3..6`. /// let expected = Some(Match::must(0, 0..3)); /// re.search(&mut cache, &Input::new(&haystack[3..6]), &mut caps); /// assert_eq!(expected, caps.get_match()); /// /// // But if we provide the bounds of the search within the context of the /// // entire haystack, then the search can take the surrounding context /// // into account. 
(And if we did find a match, it would be reported /// // as a valid offset into `haystack` instead of its sub-slice.) /// let expected = None; /// let input = Input::new(haystack).range(3..6); /// re.search(&mut cache, &input, &mut caps); /// assert_eq!(expected, caps.get_match()); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn search( &self, cache: &mut Cache, input: &Input<'_>, caps: &mut Captures, ) { caps.set_pattern(None); let pid = self.search_slots(cache, input, caps.slots_mut()); caps.set_pattern(pid); } /// Executes a leftmost forward search and writes the spans of capturing /// groups that participated in a match into the provided `slots`, and /// returns the matching pattern ID. The contents of the slots for patterns /// other than the matching pattern are unspecified. If no match was found, /// then `None` is returned and the contents of `slots` is unspecified. /// /// This is like [`PikeVM::search`], but it accepts a raw slots slice /// instead of a `Captures` value. This is useful in contexts where you /// don't want or need to allocate a `Captures`. /// /// It is legal to pass _any_ number of slots to this routine. If the regex /// engine would otherwise write a slot offset that doesn't fit in the /// provided slice, then it is simply skipped. In general though, there are /// usually three slice lengths you might want to use: /// /// * An empty slice, if you only care about which pattern matched. /// * A slice with /// [`pattern_len() * 2`](crate::nfa::thompson::NFA::pattern_len) /// slots, if you only care about the overall match spans for each matching /// pattern. /// * A slice with /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which /// permits recording match offsets for every capturing group in every /// pattern. /// /// # Example /// /// This example shows how to find the overall match offsets in a /// multi-pattern search without allocating a `Captures` value. Indeed, we /// can put our slots right on the stack. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID, Input}; /// /// let re = PikeVM::new_many(&[ /// r"\pL+", /// r"\d+", /// ])?; /// let mut cache = re.create_cache(); /// let input = Input::new("!@#123"); /// /// // We only care about the overall match offsets here, so we just /// // allocate two slots for each pattern. Each slot records the start /// // and end of the match. /// let mut slots = [None; 4]; /// let pid = re.search_slots(&mut cache, &input, &mut slots); /// assert_eq!(Some(PatternID::must(1)), pid); /// /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'. /// // See 'GroupInfo' for more details on the mapping between groups and /// // slot indices. /// let slot_start = pid.unwrap().as_usize() * 2; /// let slot_end = slot_start + 1; /// assert_eq!(Some(3), slots[slot_start].map(|s| s.get())); /// assert_eq!(Some(6), slots[slot_end].map(|s| s.get())); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn search_slots( &self, cache: &mut Cache, input: &Input<'_>, slots: &mut [Option], ) -> Option { let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); if !utf8empty { let hm = self.search_slots_imp(cache, input, slots)?; return Some(hm.pattern()); } // There is an unfortunate special case where if the regex can // match the empty string and UTF-8 mode is enabled, the search // implementation requires that the slots have at least as much space // to report the bounds of any match. 
        // This is so zero-width matches that split a codepoint can be
        // filtered out.
        //
        // Note that if utf8empty is true, we specialize the case for when
        // the number of patterns is 1. In that case, we can just use a stack
        // allocation. Otherwise we resort to a heap allocation, which we
        // convince ourselves we're fine with due to the pathological nature
        // of this case.
        let min = self.get_nfa().group_info().implicit_slot_len();
        if slots.len() >= min {
            let hm = self.search_slots_imp(cache, input, slots)?;
            return Some(hm.pattern());
        }
        if self.get_nfa().pattern_len() == 1 {
            let mut enough = [None, None];
            let got = self.search_slots_imp(cache, input, &mut enough);
            // This is OK because we know `enough` is strictly bigger than
            // `slots`, otherwise this special case isn't reached.
            slots.copy_from_slice(&enough[..slots.len()]);
            return got.map(|hm| hm.pattern());
        }
        let mut enough = vec![None; min];
        let got = self.search_slots_imp(cache, input, &mut enough);
        // This is OK because we know `enough` is strictly bigger than
        // `slots`, otherwise this special case isn't reached.
        slots.copy_from_slice(&enough[..slots.len()]);
        got.map(|hm| hm.pattern())
    }

    /// This is the actual implementation of `search_slots`, but it doesn't
    /// account for the special case when 1) the NFA has UTF-8 mode enabled,
    /// 2) the NFA can match the empty string and 3) the caller has provided
    /// an insufficient number of slots to record match offsets.
    #[inline(never)]
    fn search_slots_imp(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        slots: &mut [Option<NonMaxUsize>],
    ) -> Option<HalfMatch> {
        let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
        let hm = match self.search_imp(cache, input, slots) {
            None => return None,
            Some(hm) if !utf8empty => return Some(hm),
            Some(hm) => hm,
        };
        empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
            Ok(self
                .search_imp(cache, input, slots)
                .map(|hm| (hm, hm.offset())))
        })
        // OK because the PikeVM never errors.
        .unwrap()
    }

    /// Writes the set of patterns that match anywhere in the given search
    /// configuration to `patset`. If multiple patterns match at the same
    /// position and this `PikeVM` was configured with [`MatchKind::All`]
    /// semantics, then all matching patterns are written to the given set.
    ///
    /// Unless all of the patterns in this `PikeVM` are anchored, this will,
    /// generally speaking, visit every byte in the haystack.
    ///
    /// This search routine *does not* clear the pattern set. This gives some
    /// flexibility to the caller (e.g., running multiple searches with the
    /// same pattern set), but does make the API bug-prone if you're reusing
    /// the same pattern set for multiple searches but intended them to be
    /// independent.
    ///
    /// If a pattern ID matched but the given `PatternSet` does not have
    /// sufficient capacity to store it, then it is not inserted and silently
    /// dropped.
    ///
    /// # Example
    ///
    /// This example shows how to find all matching patterns in a haystack,
    /// even when some patterns match at the same position as other patterns.
/// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{ /// nfa::thompson::pikevm::PikeVM, /// Input, MatchKind, PatternSet, /// }; /// /// let patterns = &[ /// r"\w+", r"\d+", r"\pL+", r"foo", r"bar", r"barfoo", r"foobar", /// ]; /// let re = PikeVM::builder() /// .configure(PikeVM::config().match_kind(MatchKind::All)) /// .build_many(patterns)?; /// let mut cache = re.create_cache(); /// /// let input = Input::new("foobar"); /// let mut patset = PatternSet::new(re.pattern_len()); /// re.which_overlapping_matches(&mut cache, &input, &mut patset); /// let expected = vec![0, 2, 3, 4, 6]; /// let got: Vec = patset.iter().map(|p| p.as_usize()).collect(); /// assert_eq!(expected, got); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn which_overlapping_matches( &self, cache: &mut Cache, input: &Input<'_>, patset: &mut PatternSet, ) { self.which_overlapping_imp(cache, input, patset) } } impl PikeVM { /// The implementation of standard leftmost search. /// /// Capturing group spans are written to `slots`, but only if requested. /// `slots` can be any length. Any slot in the NFA that is activated but /// which is out of bounds for the given `slots` is ignored. fn search_imp( &self, cache: &mut Cache, input: &Input<'_>, slots: &mut [Option], ) -> Option { cache.setup_search(slots.len()); if input.is_done() { return None; } // Why do we even care about this? Well, in our 'Captures' // representation, we use usize::MAX as a sentinel to indicate "no // match." This isn't problematic so long as our haystack doesn't have // a maximal length. Byte slices are guaranteed by Rust to have a // length that fits into isize, and so this assert should always pass. // But we put it here to make our assumption explicit. assert!( input.haystack().len() < core::usize::MAX, "byte slice lengths must be less than usize MAX", ); instrument!(|c| c.reset(&self.nfa)); // Whether we want to visit all match states instead of emulating the // 'leftmost' semantics of typical backtracking regex engines. let allmatches = self.config.get_match_kind().continue_past_first_match(); let (anchored, start_id) = match self.start_config(input) { None => return None, Some(config) => config, }; let pre = if anchored { None } else { self.get_config().get_prefilter() }; let Cache { ref mut stack, ref mut curr, ref mut next } = cache; let mut hm = None; // Yes, our search doesn't end at input.end(), but includes it. This // is necessary because matches are delayed by one byte, just like // how the DFA engines work. The delay is used to handle look-behind // assertions. In the case of the PikeVM, the delay is implemented // by not considering a match to exist until it is visited in // 'steps'. Technically, we know a match exists in the previous // iteration via 'epsilon_closure'. (It's the same thing in NFA-to-DFA // determinization. We don't mark a DFA state as a match state if it // contains an NFA match state, but rather, whether the DFA state was // generated by a transition from a DFA state that contains an NFA // match state.) let mut at = input.start(); while at <= input.end() { // If we have no states left to visit, then there are some cases // where we know we can quit early or even skip ahead. if curr.set.is_empty() { // We have a match and we haven't been instructed to continue // on even after finding a match, so we can quit. 
if hm.is_some() && !allmatches { break; } // If we're running an anchored search and we've advanced // beyond the start position with no other states to try, then // we will never observe a match and thus can stop. if anchored && at > input.start() { break; } // If there no states left to explore at this position and we // know we can't terminate early, then we are effectively at // the starting state of the NFA. If we fell through here, // we'd end up adding our '(?s-u:.)*?' prefix and it would be // the only thing in 'curr'. So we might as well just skip // ahead until we find something that we know might advance us // forward. if let Some(ref pre) = pre { let span = Span::from(at..input.end()); match pre.find(input.haystack(), span) { None => break, Some(ref span) => at = span.start, } } } // Instead of using the NFA's unanchored start state, we actually // always use its anchored starting state. As a result, when doing // an unanchored search, we need to simulate our own '(?s-u:.)*?' // prefix, to permit a match to appear anywhere. // // Now, we don't *have* to do things this way. We could use the // NFA's unanchored starting state and do one 'epsilon_closure' // call from that starting state before the main loop here. And // that is just as correct. However, it turns out to be slower // than our approach here because it slightly increases the cost // of processing each byte by requiring us to visit more NFA // states to deal with the additional NFA states in the unanchored // prefix. By simulating it explicitly here, we lower those costs // substantially. The cost is itself small, but it adds up for // large haystacks. // // In order to simulate the '(?s-u:.)*?' prefix---which is not // greedy---we are careful not to perform an epsilon closure on // the start state if we already have a match. Namely, if we // did otherwise, we would never reach a terminating condition // because there would always be additional states to process. // In effect, the exclusion of running 'epsilon_closure' when // we have a match corresponds to the "dead" states we have in // our DFA regex engines. Namely, in a DFA, match states merely // instruct the search execution to record the current offset as // the most recently seen match. It is the dead state that actually // indicates when to stop the search (other than EOF or quit // states). // // However, when 'allmatches' is true, the caller has asked us to // leave in every possible match state. This tends not to make a // whole lot of sense in unanchored searches, because it means the // search really cannot terminate until EOF. And often, in that // case, you wind up skipping over a bunch of matches and are left // with the "last" match. Arguably, it just doesn't make a lot of // sense to run a 'leftmost' search (which is what this routine is) // with 'allmatches' set to true. But the DFAs support it and this // matches their behavior. (Generally, 'allmatches' is useful for // overlapping searches or leftmost anchored searches to find the // longest possible match by ignoring match priority.) // // Additionally, when we're running an anchored search, this // epsilon closure should only be computed at the beginning of the // search. If we re-computed it at every position, we would be // simulating an unanchored search when we were tasked to perform // an anchored search. 
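            //
            // In short, seed the start state's epsilon closure only when
            // (1) we either have no match yet or have been asked for all
            // matches, and (2) the search is unanchored or we are still at
            // the starting position.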
if (!hm.is_some() || allmatches) && (!anchored || at == input.start()) { // Since we are adding to the 'curr' active states and since // this is for the start ID, we use a slots slice that is // guaranteed to have the right length but where every element // is absent. This is exactly what we want, because this // epsilon closure is responsible for simulating an unanchored // '(?s:.)*?' prefix. It is specifically outside of any // capturing groups, and thus, using slots that are always // absent is correct. // // Note though that we can't just use '&mut []' here, since // this epsilon closure may traverse through 'Captures' epsilon // transitions, and thus must be able to write offsets to the // slots given which are later copied to slot values in 'curr'. let slots = next.slot_table.all_absent(); self.epsilon_closure(stack, slots, curr, input, at, start_id); } if let Some(pid) = self.nexts(stack, curr, next, input, at, slots) { hm = Some(HalfMatch::new(pid, at)); } // Unless the caller asked us to return early, we need to mush on // to see if we can extend our match. (But note that 'nexts' will // quit right after seeing a match when match_kind==LeftmostFirst, // as is consistent with leftmost-first match priority.) if input.get_earliest() && hm.is_some() { break; } core::mem::swap(curr, next); next.set.clear(); at += 1; } instrument!(|c| c.eprint(&self.nfa)); hm } /// The implementation for the 'which_overlapping_matches' API. Basically, /// we do a single scan through the entire haystack (unless our regex /// or search is anchored) and record every pattern that matched. In /// particular, when MatchKind::All is used, this supports overlapping /// matches. So if we have the regexes 'sam' and 'samwise', they will /// *both* be reported in the pattern set when searching the haystack /// 'samwise'. fn which_overlapping_imp( &self, cache: &mut Cache, input: &Input<'_>, patset: &mut PatternSet, ) { // NOTE: This is effectively a copy of 'search_imp' above, but with no // captures support and instead writes patterns that matched directly // to 'patset'. See that routine for better commentary about what's // going on in this routine. We probably could unify the routines using // generics or more helper routines, but I'm not sure it's worth it. // // NOTE: We somewhat go out of our way here to support things like // 'input.get_earliest()' and 'leftmost-first' match semantics. Neither // of those seem particularly relevant to this routine, but they are // both supported by the DFA analogs of this routine by construction // and composition, so it seems like good sense to have the PikeVM // match that behavior. 
cache.setup_search(0); if input.is_done() { return; } assert!( input.haystack().len() < core::usize::MAX, "byte slice lengths must be less than usize MAX", ); instrument!(|c| c.reset(&self.nfa)); let allmatches = self.config.get_match_kind().continue_past_first_match(); let (anchored, start_id) = match self.start_config(input) { None => return, Some(config) => config, }; let Cache { ref mut stack, ref mut curr, ref mut next } = cache; for at in input.start()..=input.end() { let any_matches = !patset.is_empty(); if curr.set.is_empty() { if any_matches && !allmatches { break; } if anchored && at > input.start() { break; } } if !any_matches || allmatches { let slots = &mut []; self.epsilon_closure(stack, slots, curr, input, at, start_id); } self.nexts_overlapping(stack, curr, next, input, at, patset); // If we found a match and filled our set, then there is no more // additional info that we can provide. Thus, we can quit. We also // quit if the caller asked us to stop at the earliest point that // we know a match exists. if patset.is_full() || input.get_earliest() { break; } core::mem::swap(curr, next); next.set.clear(); } instrument!(|c| c.eprint(&self.nfa)); } /// Process the active states in 'curr' to find the states (written to /// 'next') we should process for the next byte in the haystack. /// /// 'stack' is used to perform a depth first traversal of the NFA when /// computing an epsilon closure. /// /// When a match is found, the slots for that match state (in 'curr') are /// copied to 'caps'. Moreover, once a match is seen, processing for 'curr' /// stops (unless the PikeVM was configured with MatchKind::All semantics). #[cfg_attr(feature = "perf-inline", inline(always))] fn nexts( &self, stack: &mut Vec, curr: &mut ActiveStates, next: &mut ActiveStates, input: &Input<'_>, at: usize, slots: &mut [Option], ) -> Option { instrument!(|c| c.record_state_set(&curr.set)); let mut pid = None; let ActiveStates { ref set, ref mut slot_table } = *curr; for sid in set.iter() { pid = match self.next(stack, slot_table, next, input, at, sid) { None => continue, Some(pid) => Some(pid), }; slots.copy_from_slice(slot_table.for_state(sid)); if !self.config.get_match_kind().continue_past_first_match() { break; } } pid } /// Like 'nexts', but for the overlapping case. This doesn't write any /// slots, and instead just writes which pattern matched in 'patset'. #[cfg_attr(feature = "perf-inline", inline(always))] fn nexts_overlapping( &self, stack: &mut Vec, curr: &mut ActiveStates, next: &mut ActiveStates, input: &Input<'_>, at: usize, patset: &mut PatternSet, ) { instrument!(|c| c.record_state_set(&curr.set)); let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); let ActiveStates { ref set, ref mut slot_table } = *curr; for sid in set.iter() { let pid = match self.next(stack, slot_table, next, input, at, sid) { None => continue, Some(pid) => pid, }; // This handles the case of finding a zero-width match that splits // a codepoint. Namely, if we're in UTF-8 mode AND we know we can // match the empty string, then the only valid way of getting to // this point with an offset that splits a codepoint is when we // have an empty match. Such matches, in UTF-8 mode, must not be // reported. So we just skip them here and pretend as if we did // not see a match. 
if utf8empty && !input.is_char_boundary(at) { continue; } let _ = patset.try_insert(pid); if !self.config.get_match_kind().continue_past_first_match() { break; } } } /// Starting from 'sid', if the position 'at' in the 'input' haystack has a /// transition defined out of 'sid', then add the state transitioned to and /// its epsilon closure to the 'next' set of states to explore. /// /// 'stack' is used by the epsilon closure computation to perform a depth /// first traversal of the NFA. /// /// 'curr_slot_table' should be the table of slots for the current set of /// states being explored. If there is a transition out of 'sid', then /// sid's row in the slot table is used to perform the epsilon closure. #[cfg_attr(feature = "perf-inline", inline(always))] fn next( &self, stack: &mut Vec, curr_slot_table: &mut SlotTable, next: &mut ActiveStates, input: &Input<'_>, at: usize, sid: StateID, ) -> Option { instrument!(|c| c.record_step(sid)); match *self.nfa.state(sid) { State::Fail | State::Look { .. } | State::Union { .. } | State::BinaryUnion { .. } | State::Capture { .. } => None, State::ByteRange { ref trans } => { if trans.matches(input.haystack(), at) { let slots = curr_slot_table.for_state(sid); // OK because 'at <= haystack.len() < usize::MAX', so // adding 1 will never wrap. let at = at.wrapping_add(1); self.epsilon_closure( stack, slots, next, input, at, trans.next, ); } None } State::Sparse(ref sparse) => { if let Some(next_sid) = sparse.matches(input.haystack(), at) { let slots = curr_slot_table.for_state(sid); // OK because 'at <= haystack.len() < usize::MAX', so // adding 1 will never wrap. let at = at.wrapping_add(1); self.epsilon_closure( stack, slots, next, input, at, next_sid, ); } None } State::Dense(ref dense) => { if let Some(next_sid) = dense.matches(input.haystack(), at) { let slots = curr_slot_table.for_state(sid); // OK because 'at <= haystack.len() < usize::MAX', so // adding 1 will never wrap. let at = at.wrapping_add(1); self.epsilon_closure( stack, slots, next, input, at, next_sid, ); } None } State::Match { pattern_id } => Some(pattern_id), } } /// Compute the epsilon closure of 'sid', writing the closure into 'next' /// while copying slot values from 'curr_slots' into corresponding states /// in 'next'. 'curr_slots' should be the slot values corresponding to /// 'sid'. /// /// The given 'stack' is used to perform a depth first traversal of the /// NFA by recursively following all epsilon transitions out of 'sid'. /// Conditional epsilon transitions are followed if and only if they are /// satisfied for the position 'at' in the 'input' haystack. /// /// While this routine may write to 'curr_slots', once it returns, any /// writes are undone and the original values (even if absent) are /// restored. #[cfg_attr(feature = "perf-inline", inline(always))] fn epsilon_closure( &self, stack: &mut Vec, curr_slots: &mut [Option], next: &mut ActiveStates, input: &Input<'_>, at: usize, sid: StateID, ) { instrument!(|c| { c.record_closure(sid); c.record_stack_push(sid); }); stack.push(FollowEpsilon::Explore(sid)); while let Some(frame) = stack.pop() { match frame { FollowEpsilon::RestoreCapture { slot, offset: pos } => { curr_slots[slot] = pos; } FollowEpsilon::Explore(sid) => { self.epsilon_closure_explore( stack, curr_slots, next, input, at, sid, ); } } } } /// Explore all of the epsilon transitions out of 'sid'. 
This is mostly /// split out from 'epsilon_closure' in order to clearly delineate /// the actual work of computing an epsilon closure from the stack /// book-keeping. /// /// This will push any additional explorations needed on to 'stack'. /// /// 'curr_slots' should refer to the slots for the currently active NFA /// state. That is, the current state we are stepping through. These /// slots are mutated in place as new 'Captures' states are traversed /// during epsilon closure, but the slots are restored to their original /// values once the full epsilon closure is completed. The ultimate use of /// 'curr_slots' is to copy them to the corresponding 'next_slots', so that /// the capturing group spans are forwarded from the currently active state /// to the next. /// /// 'next' refers to the next set of active states. Computing an epsilon /// closure may increase the next set of active states. /// /// 'input' refers to the caller's input configuration and 'at' refers to /// the current position in the haystack. These are used to check whether /// conditional epsilon transitions (like look-around) are satisfied at /// the current position. If they aren't, then the epsilon closure won't /// include them. #[cfg_attr(feature = "perf-inline", inline(always))] fn epsilon_closure_explore( &self, stack: &mut Vec, curr_slots: &mut [Option], next: &mut ActiveStates, input: &Input<'_>, at: usize, mut sid: StateID, ) { // We can avoid pushing some state IDs on to our stack in precisely // the cases where a 'push(x)' would be immediately followed by a 'x // = pop()'. This is achieved by this outer-loop. We simply set 'sid' // to be the next state ID we want to explore once we're done with // our initial exploration. In practice, this avoids a lot of stack // thrashing. loop { instrument!(|c| c.record_set_insert(sid)); // Record this state as part of our next set of active states. If // we've already explored it, then no need to do it again. if !next.set.insert(sid) { return; } match *self.nfa.state(sid) { State::Fail | State::Match { .. } | State::ByteRange { .. } | State::Sparse { .. } | State::Dense { .. } => { next.slot_table.for_state(sid).copy_from_slice(curr_slots); return; } State::Look { look, next } => { // OK because we don't permit building a searcher with a // Unicode word boundary if the requisite Unicode data is // unavailable. if !self.nfa.look_matcher().matches_inline( look, input.haystack(), at, ) { return; } sid = next; } State::Union { ref alternates } => { sid = match alternates.get(0) { None => return, Some(&sid) => sid, }; instrument!(|c| { for &alt in &alternates[1..] { c.record_stack_push(alt); } }); stack.extend( alternates[1..] .iter() .copied() .rev() .map(FollowEpsilon::Explore), ); } State::BinaryUnion { alt1, alt2 } => { sid = alt1; instrument!(|c| c.record_stack_push(sid)); stack.push(FollowEpsilon::Explore(alt2)); } State::Capture { next, slot, .. } => { // There's no need to do anything with slots that // ultimately won't be copied into the caller-provided // 'Captures' value. So we just skip dealing with them at // all. if slot.as_usize() < curr_slots.len() { instrument!(|c| c.record_stack_push(sid)); stack.push(FollowEpsilon::RestoreCapture { slot, offset: curr_slots[slot], }); // OK because length of a slice must fit into an isize. curr_slots[slot] = Some(NonMaxUsize::new(at).unwrap()); } sid = next; } } } } /// Return the starting configuration of a PikeVM search. 
/// /// The "start config" is basically whether the search should be anchored /// or not and the NFA state ID at which to begin the search. The state ID /// returned always corresponds to an anchored starting state even when the /// search is unanchored. This is because the PikeVM search loop deals with /// unanchored searches with an explicit epsilon closure out of the start /// state. /// /// This routine accounts for both the caller's `Input` configuration /// and the pattern itself. For example, even if the caller asks for an /// unanchored search, if the pattern itself is anchored, then this will /// always return 'true' because implementing an unanchored search in that /// case would be incorrect. /// /// Similarly, if the caller requests an anchored search for a particular /// pattern, then the starting state ID returned will reflect that. /// /// If a pattern ID is given in the input configuration that is not in /// this regex, then `None` is returned. fn start_config(&self, input: &Input<'_>) -> Option<(bool, StateID)> { match input.get_anchored() { // Only way we're unanchored is if both the caller asked for an // unanchored search *and* the pattern is itself not anchored. Anchored::No => Some(( self.nfa.is_always_start_anchored(), self.nfa.start_anchored(), )), Anchored::Yes => Some((true, self.nfa.start_anchored())), Anchored::Pattern(pid) => { Some((true, self.nfa.start_pattern(pid)?)) } } } } /// An iterator over all non-overlapping matches for a particular search. /// /// The iterator yields a [`Match`] value until no more matches could be found. /// /// The lifetime parameters are as follows: /// /// * `'r` represents the lifetime of the PikeVM. /// * `'c` represents the lifetime of the PikeVM's cache. /// * `'h` represents the lifetime of the haystack being searched. /// /// This iterator can be created with the [`PikeVM::find_iter`] method. #[derive(Debug)] pub struct FindMatches<'r, 'c, 'h> { re: &'r PikeVM, cache: &'c mut Cache, caps: Captures, it: iter::Searcher<'h>, } impl<'r, 'c, 'h> Iterator for FindMatches<'r, 'c, 'h> { type Item = Match; #[inline] fn next(&mut self) -> Option { // Splitting 'self' apart seems necessary to appease borrowck. let FindMatches { re, ref mut cache, ref mut caps, ref mut it } = *self; // 'advance' converts errors into panics, which is OK here because // the PikeVM can never return an error. it.advance(|input| { re.search(cache, input, caps); Ok(caps.get_match()) }) } } /// An iterator over all non-overlapping leftmost matches, with their capturing /// groups, for a particular search. /// /// The iterator yields a [`Captures`] value until no more matches could be /// found. /// /// The lifetime parameters are as follows: /// /// * `'r` represents the lifetime of the PikeVM. /// * `'c` represents the lifetime of the PikeVM's cache. /// * `'h` represents the lifetime of the haystack being searched. /// /// This iterator can be created with the [`PikeVM::captures_iter`] method. #[derive(Debug)] pub struct CapturesMatches<'r, 'c, 'h> { re: &'r PikeVM, cache: &'c mut Cache, caps: Captures, it: iter::Searcher<'h>, } impl<'r, 'c, 'h> Iterator for CapturesMatches<'r, 'c, 'h> { type Item = Captures; #[inline] fn next(&mut self) -> Option { // Splitting 'self' apart seems necessary to appease borrowck. let CapturesMatches { re, ref mut cache, ref mut caps, ref mut it } = *self; // 'advance' converts errors into panics, which is OK here because // the PikeVM can never return an error. 
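// Note: the closure passed to 'advance' only needs to return the overall
// match (via 'caps.get_match()'), since that is all the iterator machinery
// requires to compute where the next search should begin (including its
// special handling of empty matches). The full 'Captures' value filled in
// by 'search' is cloned below once a match is known to exist.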
it.advance(|input| { re.search(cache, input, caps); Ok(caps.get_match()) }); if caps.is_match() { Some(caps.clone()) } else { None } } } /// A cache represents mutable state that a [`PikeVM`] requires during a /// search. /// /// For a given [`PikeVM`], its corresponding cache may be created either via /// [`PikeVM::create_cache`], or via [`Cache::new`]. They are equivalent in /// every way, except the former does not require explicitly importing `Cache`. /// /// A particular `Cache` is coupled with the [`PikeVM`] from which it /// was created. It may only be used with that `PikeVM`. A cache and its /// allocations may be re-purposed via [`Cache::reset`], in which case, it can /// only be used with the new `PikeVM` (and not the old one). #[derive(Clone, Debug)] pub struct Cache { /// Stack used while computing epsilon closure. This effectively lets us /// move what is more naturally expressed through recursion to a stack /// on the heap. stack: Vec, /// The current active states being explored for the current byte in the /// haystack. curr: ActiveStates, /// The next set of states we're building that will be explored for the /// next byte in the haystack. next: ActiveStates, } impl Cache { /// Create a new [`PikeVM`] cache. /// /// A potentially more convenient routine to create a cache is /// [`PikeVM::create_cache`], as it does not require also importing the /// `Cache` type. /// /// If you want to reuse the returned `Cache` with some other `PikeVM`, /// then you must call [`Cache::reset`] with the desired `PikeVM`. pub fn new(re: &PikeVM) -> Cache { Cache { stack: vec![], curr: ActiveStates::new(re), next: ActiveStates::new(re), } } /// Reset this cache such that it can be used for searching with a /// different [`PikeVM`]. /// /// A cache reset permits reusing memory already allocated in this cache /// with a different `PikeVM`. /// /// # Example /// /// This shows how to re-purpose a cache for use with a different `PikeVM`. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; /// /// let re1 = PikeVM::new(r"\w")?; /// let re2 = PikeVM::new(r"\W")?; /// /// let mut cache = re1.create_cache(); /// assert_eq!( /// Some(Match::must(0, 0..2)), /// re1.find_iter(&mut cache, "Δ").next(), /// ); /// /// // Using 'cache' with re2 is not allowed. It may result in panics or /// // incorrect results. In order to re-purpose the cache, we must reset /// // it with the PikeVM we'd like to use it with. /// // /// // Similarly, after this reset, using the cache with 're1' is also not /// // allowed. /// cache.reset(&re2); /// assert_eq!( /// Some(Match::must(0, 0..3)), /// re2.find_iter(&mut cache, "☃").next(), /// ); /// /// # Ok::<(), Box>(()) /// ``` pub fn reset(&mut self, re: &PikeVM) { self.curr.reset(re); self.next.reset(re); } /// Returns the heap memory usage, in bytes, of this cache. /// /// This does **not** include the stack size used up by this cache. To /// compute that, use `std::mem::size_of::()`. pub fn memory_usage(&self) -> usize { use core::mem::size_of; (self.stack.len() * size_of::()) + self.curr.memory_usage() + self.next.memory_usage() } /// Clears this cache. This should be called at the start of every search /// to ensure we start with a clean slate. /// /// This also sets the length of the capturing groups used in the current /// search. 
This permits an optimization whereby 'SlotTable::for_state' /// only returns the number of slots equivalent to the number of slots /// given in the 'Captures' value. This may be less than the total number /// of possible slots, e.g., when one only wants to track overall match /// offsets. This in turn permits less copying of capturing group spans /// in the PikeVM. fn setup_search(&mut self, captures_slot_len: usize) { self.stack.clear(); self.curr.setup_search(captures_slot_len); self.next.setup_search(captures_slot_len); } } /// A set of active states used to "simulate" the execution of an NFA via the /// PikeVM. /// /// There are two sets of these used during NFA simulation. One set corresponds /// to the "current" set of states being traversed for the current position /// in a haystack. The other set corresponds to the "next" set of states being /// built, which will become the new "current" set for the next position in the /// haystack. These two sets correspond to CLIST and NLIST in Thompson's /// original paper on regular expression search: https://dl.acm.org/doi/pdf/10.1145/363347.363387 /// /// In addition to representing a set of NFA states, this also maintains slot /// values for each state. These slot values are what turn the NFA simulation /// into the "Pike VM." Namely, they track capturing group values for each /// state. During the computation of epsilon closure, we copy slot values from /// states in the "current" set to the "next" set. Eventually, once a match /// is found, the slot values for that match state are what we write to the /// caller-provided 'Captures' value. #[derive(Clone, Debug)] struct ActiveStates { /// The set of active NFA states. This set preserves insertion order, which /// is critical for simulating the match semantics of backtracking regex /// engines. set: SparseSet, /// The slots for every NFA state, where each slot stores a (possibly /// absent) offset. Every capturing group has two slots. One for a start /// offset and one for an end offset. slot_table: SlotTable, } impl ActiveStates { /// Create a new set of active states for the given PikeVM. The active /// states returned may only be used with the given PikeVM. (Use 'reset' /// to re-purpose the allocation for a different PikeVM.) fn new(re: &PikeVM) -> ActiveStates { let mut active = ActiveStates { set: SparseSet::new(0), slot_table: SlotTable::new(), }; active.reset(re); active } /// Reset this set of active states such that it can be used with the given /// PikeVM (and only that PikeVM). fn reset(&mut self, re: &PikeVM) { self.set.resize(re.get_nfa().states().len()); self.slot_table.reset(re); } /// Return the heap memory usage, in bytes, used by this set of active /// states. /// /// This does not include the stack size of this value. fn memory_usage(&self) -> usize { self.set.memory_usage() + self.slot_table.memory_usage() } /// Set up this set of active states for a new search. The given slot /// length should be the number of slots in a caller-provided 'Captures' /// (and may be zero). fn setup_search(&mut self, captures_slot_len: usize) { self.set.clear(); self.slot_table.setup_search(captures_slot_len); } } /// A table of slots, where each row represents a state in an NFA. Thus, the /// table has room for storing slots for every single state in an NFA. /// /// This table is represented with a single contiguous allocation. In general, /// the notion of "capturing group" doesn't really exist at this level of /// abstraction, hence the name "slot" instead.
(Indeed, every capturing group /// maps to a pair of slots, one for the start offset and one for the end /// offset.) Slots are indexed by the 'Capture' NFA state. /// /// N.B. Not every state actually needs a row of slots. Namely, states that /// only have epsilon transitions currently never have anything written to /// their rows in this table. Thus, the table is somewhat wasteful in its heap /// usage. However, it is important to maintain fast random access by state /// ID, which means one giant table tends to work well. RE2 takes a different /// approach here and allocates each row as its own reference counted thing. /// I explored such a strategy at one point here, but couldn't get it to work /// well using entirely safe code. (To the ambitious reader: I encourage you to /// re-litigate that experiment.) I very much wanted to stick to safe code, but /// could be convinced otherwise if there was a solid argument and the safety /// was encapsulated well. #[derive(Clone, Debug)] struct SlotTable { /// The actual table of offsets. table: Vec<Option<NonMaxUsize>>, /// The number of slots per state, i.e., the table's stride or the length /// of each row. slots_per_state: usize, /// The number of slots in the caller-provided 'Captures' value for the /// current search. Setting this to 'slots_per_state' is always correct, /// but may be wasteful. slots_for_captures: usize, } impl SlotTable { /// Create a new slot table. /// /// One should call 'reset' with the corresponding PikeVM before use. fn new() -> SlotTable { SlotTable { table: vec![], slots_for_captures: 0, slots_per_state: 0 } } /// Reset this slot table such that it can be used with the given PikeVM /// (and only that PikeVM). fn reset(&mut self, re: &PikeVM) { let nfa = re.get_nfa(); self.slots_per_state = nfa.group_info().slot_len(); // This is always correct, but may be reduced for a particular search // if a 'Captures' has fewer slots, e.g., none at all or only slots // for tracking the overall match instead of all slots for every // group. self.slots_for_captures = core::cmp::max( self.slots_per_state, nfa.pattern_len().checked_mul(2).unwrap(), ); let len = nfa .states() .len() .checked_mul(self.slots_per_state) // Add space to account for scratch space used during a search. .and_then(|x| x.checked_add(self.slots_for_captures)) // It seems like this could actually panic on legitimate inputs on // 32-bit targets, and very likely to panic on 16-bit. Should we // somehow convert this to an error? What about something similar // for the lazy DFA cache? If you're tripping this panic, please // file a bug. .expect("slot table length doesn't overflow"); // This happens about as often as a regex is compiled, so it probably // should be at debug level, but I found it quite distracting and not // particularly useful. trace!( "resizing PikeVM active states table to {} entries \ (slots_per_state={})", len, self.slots_per_state, ); self.table.resize(len, None); } /// Return the heap memory usage, in bytes, used by this slot table. /// /// This does not include the stack size of this value. fn memory_usage(&self) -> usize { self.table.len() * core::mem::size_of::<Option<NonMaxUsize>>() } /// Perform any per-search setup for this slot table. /// /// In particular, this sets the number of slots used in the /// 'Captures' given by the caller (if any at all). This number may be /// smaller than the total number of slots available, e.g., when the caller /// is only interested in tracking the overall match and not the spans of /// every matching capturing group.
Only tracking the overall match can /// save a substantial amount of time copying capturing spans during a /// search. fn setup_search(&mut self, captures_slot_len: usize) { self.slots_for_captures = captures_slot_len; } /// Return a mutable slice of the slots for the given state. /// /// Note that the length of the slice returned may be less than the total /// number of slots available for this state. In particular, the length /// always matches the number of slots indicated via 'setup_search'. fn for_state(&mut self, sid: StateID) -> &mut [Option<NonMaxUsize>] { let i = sid.as_usize() * self.slots_per_state; &mut self.table[i..i + self.slots_for_captures] } /// Return a slice of slots of appropriate length where every slot offset /// is guaranteed to be absent. This is useful in cases where you need to /// compute an epsilon closure outside of the user-supplied regex, and thus /// never want it to have any capturing slots set. fn all_absent(&mut self) -> &mut [Option<NonMaxUsize>] { let i = self.table.len() - self.slots_for_captures; &mut self.table[i..i + self.slots_for_captures] } } /// Represents a stack frame for use while computing an epsilon closure. /// /// (An "epsilon closure" refers to the set of reachable NFA states from a /// single state without consuming any input. That is, the set of all epsilon /// transitions not only from that single state, but from every other state /// reachable by an epsilon transition as well. This is why it's called a /// "closure." Computing an epsilon closure is also done during DFA /// determinization! Compare and contrast the epsilon closure here in this /// PikeVM and the one used for determinization in crate::util::determinize.) /// /// Computing the epsilon closure in a Thompson NFA proceeds via a depth /// first traversal over all epsilon transitions from a particular state. /// (A depth first traversal is important because it emulates the same priority /// of matches that is typically found in backtracking regex engines.) This /// depth first traversal is naturally expressed using recursion, but to avoid /// a call stack size proportional to the size of a regex, we put our stack on /// the heap instead. /// /// This stack thus consists of call frames. The typical call frame is /// `Explore`, which instructs epsilon closure to explore the epsilon /// transitions from that state. (Subsequent epsilon transitions are then /// pushed on to the stack as more `Explore` frames.) If the state ID being /// explored has no epsilon transitions, then the capturing group slots are /// copied from the original state that sparked the epsilon closure (from the /// 'next' routine) to the state ID being explored. This way, capturing group /// slots are forwarded from the previous state to the next. /// /// The other stack frame, `RestoreCapture`, instructs the epsilon closure to /// set the position for a particular slot back to some particular offset. This /// frame is pushed when `Explore` sees a `Capture` transition. `Explore` will /// set the offset of the slot indicated in `Capture` to the current offset, /// and then push the old offset on to the stack as a `RestoreCapture` frame. /// Thus, the new offset is only used until the epsilon closure reverts to /// the `RestoreCapture` frame. In effect, this gives the `Capture` epsilon /// transition its "scope" to only states that come "after" it during depth /// first traversal. #[derive(Clone, Debug)] enum FollowEpsilon { /// Explore the epsilon transitions from a state ID.
Explore(StateID), /// Reset the given `slot` to the given `offset` (which might be `None`). RestoreCapture { slot: SmallIndex, offset: Option<NonMaxUsize> }, } /// A set of counters that "instruments" a PikeVM search. To enable this, you /// must enable the 'internal-instrument-pikevm' feature. Then run your Rust /// program with RUST_LOG=regex_automata::nfa::thompson::pikevm=trace set in /// the environment. The metrics collected will be dumped automatically for /// every search executed by the PikeVM. /// /// NOTE: When 'internal-instrument-pikevm' is enabled, it will likely cause an /// absolute decrease in wall-clock performance, even if the 'trace' log level /// isn't enabled. (Although, we do try to avoid extra costs when 'trace' isn't /// enabled.) The main point of instrumentation is to get counts of various /// events that occur during the PikeVM's execution. /// /// This is a somewhat hacked-together collection of metrics that are useful /// to gather from a PikeVM search. In particular, it lets us scrutinize the /// performance profile of a search beyond what general-purpose profiling tools /// give us. Namely, we orient the profiling data around the specific states of /// the NFA. /// /// In other words, this lets us see which parts of the NFA graph are most /// frequently activated. This then provides direction for optimization /// opportunities. /// /// The really sad part about this is that it absolutely clutters up the PikeVM /// implementation. :'( Another approach would be to just manually add this /// code in whenever I want this kind of profiling data, but it's complicated /// and tedious enough that I went with this approach... for now. /// /// When instrumentation is enabled (which also turns on 'logging'), then a /// `Counters` is initialized for every search and `trace`'d just before the /// search returns to the caller. /// /// Tip: When debugging performance problems with the PikeVM, it's best to try /// to work with an NFA that is as small as possible. Otherwise the state graph /// is likely to be too big to digest. #[cfg(feature = "internal-instrument-pikevm")] #[derive(Clone, Debug)] struct Counters { /// The number of times the NFA is in a particular permutation of states. state_sets: alloc::collections::BTreeMap<Vec<StateID>, u64>, /// The number of times 'next' is called for a particular state ID (which /// indexes this array). steps: Vec<u64>, /// The number of times an epsilon closure was computed for a state. closures: Vec<u64>, /// The number of times a particular state ID is pushed on to a stack while /// computing an epsilon closure. stack_pushes: Vec<u64>, /// The number of times a particular state ID is inserted into a sparse set /// while computing an epsilon closure. set_inserts: Vec<u64>, } #[cfg(feature = "internal-instrument-pikevm")] impl Counters { fn empty() -> Counters { Counters { state_sets: alloc::collections::BTreeMap::new(), steps: vec![], closures: vec![], stack_pushes: vec![], set_inserts: vec![], } } fn reset(&mut self, nfa: &NFA) { let len = nfa.states().len(); self.state_sets.clear(); self.steps.clear(); self.steps.resize(len, 0); self.closures.clear(); self.closures.resize(len, 0); self.stack_pushes.clear(); self.stack_pushes.resize(len, 0); self.set_inserts.clear(); self.set_inserts.resize(len, 0); } fn eprint(&self, nfa: &NFA) { trace!("===== START PikeVM Instrumentation Output ====="); // We take the top-K most occurring state sets. Otherwise the output // is likely to be overwhelming. And we probably only care about the // most frequently occurring ones anyway.
const LIMIT: usize = 20; let mut set_counts = self.state_sets.iter().collect::<Vec<(&Vec<StateID>, &u64)>>(); set_counts.sort_by_key(|(_, &count)| core::cmp::Reverse(count)); trace!("## PikeVM frequency of state sets (top {})", LIMIT); for (set, count) in set_counts.iter().take(LIMIT) { trace!("{:?}: {}", set, count); } if set_counts.len() > LIMIT { trace!( "... {} sets omitted (out of {} total)", set_counts.len() - LIMIT, set_counts.len(), ); } trace!(""); trace!("## PikeVM total frequency of events"); trace!( "steps: {}, closures: {}, stack-pushes: {}, set-inserts: {}", self.steps.iter().copied().sum::<u64>(), self.closures.iter().copied().sum::<u64>(), self.stack_pushes.iter().copied().sum::<u64>(), self.set_inserts.iter().copied().sum::<u64>(), ); trace!(""); trace!("## PikeVM frequency of events broken down by state"); for sid in 0..self.steps.len() { trace!( "{:06}: steps: {}, closures: {}, \ stack-pushes: {}, set-inserts: {}", sid, self.steps[sid], self.closures[sid], self.stack_pushes[sid], self.set_inserts[sid], ); } trace!(""); trace!("## NFA debug display"); trace!("{:?}", nfa); trace!("===== END PikeVM Instrumentation Output ====="); } fn record_state_set(&mut self, set: &SparseSet) { let set = set.iter().collect::<Vec<StateID>>(); *self.state_sets.entry(set).or_insert(0) += 1; } fn record_step(&mut self, sid: StateID) { self.steps[sid] += 1; } fn record_closure(&mut self, sid: StateID) { self.closures[sid] += 1; } fn record_stack_push(&mut self, sid: StateID) { self.stack_pushes[sid] += 1; } fn record_set_insert(&mut self, sid: StateID) { self.set_inserts[sid] += 1; } }
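// What follows is a minimal usage sketch (not part of the upstream test
// suite) showing the end result of the slot machinery above: a PikeVM
// search that resolves capturing group spans. It assumes the crate's
// public 'Captures::get_group' accessor and the 'From<Range<usize>>'
// impl for 'Span'.
#[cfg(test)]
mod usage_sketch {
    use crate::{Match, Span};

    use super::*;

    #[test]
    fn captures_iter_reports_group_spans() {
        let re = PikeVM::new(r"([a-z]+)=([0-9]+)").unwrap();
        let mut cache = re.create_cache();
        // Take the first leftmost match and its capturing groups.
        let caps = re
            .captures_iter(&mut cache, "foo=123 bar=456")
            .next()
            .unwrap();
        // The overall match and each participating capturing group have
        // their spans resolved via the PikeVM's slot table.
        assert_eq!(Some(Match::must(0, 0..7)), caps.get_match());
        assert_eq!(Some(Span::from(0..3)), caps.get_group(1));
        assert_eq!(Some(Span::from(4..7)), caps.get_group(2));
    }
}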