Diffstat (limited to 'vendor/regex-automata/src/nfa')
-rw-r--r-- | vendor/regex-automata/src/nfa/mod.rs | 54
-rw-r--r-- | vendor/regex-automata/src/nfa/thompson/backtrack.rs | 1884
-rw-r--r-- | vendor/regex-automata/src/nfa/thompson/builder.rs | 1337
-rw-r--r-- | vendor/regex-automata/src/nfa/thompson/compiler.rs | 2106
-rw-r--r-- | vendor/regex-automata/src/nfa/thompson/error.rs | 132
-rw-r--r-- | vendor/regex-automata/src/nfa/thompson/literal_trie.rs | 528
-rw-r--r-- | vendor/regex-automata/src/nfa/thompson/map.rs | 38
-rw-r--r-- | vendor/regex-automata/src/nfa/thompson/mod.rs | 1624
-rw-r--r-- | vendor/regex-automata/src/nfa/thompson/nfa.rs | 2101
-rw-r--r-- | vendor/regex-automata/src/nfa/thompson/pikevm.rs | 2409
-rw-r--r-- | vendor/regex-automata/src/nfa/thompson/range_trie.rs | 342
11 files changed, 9692 insertions, 2863 deletions
diff --git a/vendor/regex-automata/src/nfa/mod.rs b/vendor/regex-automata/src/nfa/mod.rs index 61ce5ef47..0c36f598a 100644 --- a/vendor/regex-automata/src/nfa/mod.rs +++ b/vendor/regex-automata/src/nfa/mod.rs @@ -1 +1,55 @@ +/*! +Provides non-deterministic finite automata (NFA) and regex engines that use +them. + +While NFAs and DFAs (deterministic finite automata) have equivalent *theoretical* +power, their usage in practice tends to result in different engineering +trade-offs. While this isn't meant to be a comprehensive treatment of the topic, here +are a few key trade-offs that are, at minimum, true for this crate: + +* NFAs tend to be represented sparsely whereas DFAs are represented densely. +Sparse representations use less memory, but are slower to traverse. Conversely, +dense representations use more memory, but are faster to traverse. (Sometimes +these lines are blurred. For example, an `NFA` might choose to represent a +particular state in a dense fashion, and a DFA can be built using a sparse +representation via [`sparse::DFA`](crate::dfa::sparse::DFA).) +* NFAs have epsilon transitions and DFAs don't. In practice, this means that +handling a single byte in a haystack with an NFA at search time may require +visiting multiple NFA states. In a DFA, each byte only requires visiting +a single state. Stated differently, NFAs require a variable number of CPU +instructions to process one byte in a haystack whereas a DFA uses a constant +number of CPU instructions to process one byte. +* NFAs are generally easier to amend with secondary storage. For example, the +[`thompson::pikevm::PikeVM`] uses an NFA to match, but also uses additional +memory beyond the model of a finite state machine to track offsets for matching +capturing groups. Conversely, the most a DFA can do is report the offset (and +pattern ID) at which a match occurred. This is generally why we also compile +DFAs in reverse, so that we can run them after finding the end of a match to +also find the start of a match. +* NFAs take worst case linear time to build, but DFAs take worst case +exponential time to build. The [hybrid NFA/DFA](crate::hybrid) mitigates this +challenge for DFAs in many practical cases. + +There are likely other differences, but the bottom line is that NFAs tend to be +more memory efficient and give easier opportunities for increasing expressive +power, whereas DFAs are faster to search with. + +# Why only a Thompson NFA? + +Currently, the only kind of NFA we support in this crate is a [Thompson +NFA](https://en.wikipedia.org/wiki/Thompson%27s_construction). This refers +to a specific construction algorithm that takes the syntax of a regex +pattern and converts it to an NFA. Specifically, it makes gratuitous use of +epsilon transitions in order to keep its structure simple. In exchange, its +construction time is linear in the size of the regex. A Thompson NFA also makes +the guarantee that given any state and a character in a haystack, there is at +most one transition defined for it. (Although there may be many epsilon +transitions.) + +It is possible that other types of NFAs will be added in the future, such as a +[Glushkov NFA](https://en.wikipedia.org/wiki/Glushkov%27s_construction_algorithm). +But currently, this crate only provides a Thompson NFA. 
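As a concrete companion to the module docs above, here is a minimal sketch (assuming `regex-automata` as a dependency with its default features) that builds a Thompson NFA and inspects it; the exact state count is an internal detail and is deliberately not asserted:

```rust
use regex_automata::nfa::thompson::NFA;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Thompson's construction runs in time linear in the pattern size, but
    // leans on epsilon transitions, so even a tiny alternation compiles to
    // more states than the two "real" byte transitions it needs.
    let nfa = NFA::new(r"a|b")?;
    println!("patterns: {}", nfa.pattern_len());
    println!("states: {}", nfa.states().len());
    Ok(())
}
```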
+*/ + +#[cfg(feature = "nfa-thompson")] pub mod thompson; diff --git a/vendor/regex-automata/src/nfa/thompson/backtrack.rs b/vendor/regex-automata/src/nfa/thompson/backtrack.rs new file mode 100644 index 000000000..eba037c1d --- /dev/null +++ b/vendor/regex-automata/src/nfa/thompson/backtrack.rs @@ -0,0 +1,1884 @@ +/*! +An NFA-backed bounded backtracker for executing regex searches with capturing +groups. + +This module provides a [`BoundedBacktracker`] that works by simulating an NFA +using the classical backtracking algorithm with a twist: it avoids redoing +work that it has done before and thereby avoids worst case exponential time. +In exchange, it can only be used on "short" haystacks. Its advantage is that +it can be faster than the [`PikeVM`](thompson::pikevm::PikeVM) in many cases +because it does less book-keeping. +*/ + +use alloc::{vec, vec::Vec}; + +use crate::{ + nfa::thompson::{self, BuildError, State, NFA}, + util::{ + captures::Captures, + empty, iter, + prefilter::Prefilter, + primitives::{NonMaxUsize, PatternID, SmallIndex, StateID}, + search::{Anchored, HalfMatch, Input, Match, MatchError, Span}, + }, +}; + +/// Returns the minimum visited capacity for the given haystack. +/// +/// This function can be used as the argument to [`Config::visited_capacity`] +/// in order to guarantee that a backtracking search for the given `input` +/// won't return an error when using a [`BoundedBacktracker`] built from the +/// given `NFA`. +/// +/// This routine exists primarily as a way to test that the bounded backtracker +/// works correctly when its capacity is set to the smallest possible amount. +/// Still, it may be useful in cases where you know you want to use the bounded +/// backtracker for a specific input, and just need to know what visited +/// capacity to provide to make it work. +/// +/// Be warned that this number could be quite large as it is multiplicative in +/// the size of the given NFA and haystack. +pub fn min_visited_capacity(nfa: &NFA, input: &Input<'_>) -> usize { + div_ceil(nfa.states().len() * (input.get_span().len() + 1), 8) +} + +/// The configuration used for building a bounded backtracker. +/// +/// A bounded backtracker configuration is a simple data object that is +/// typically used with [`Builder::configure`]. +#[derive(Clone, Debug, Default)] +pub struct Config { + pre: Option<Option<Prefilter>>, + visited_capacity: Option<usize>, +} + +impl Config { + /// Return a new default regex configuration. + pub fn new() -> Config { + Config::default() + } + + /// Set a prefilter to be used whenever a start state is entered. + /// + /// A [`Prefilter`] in this context is meant to accelerate searches by + /// looking for literal prefixes that every match for the corresponding + /// pattern (or patterns) must start with. Once a prefilter produces a + /// match, the underlying search routine continues on to try and confirm + /// the match. + /// + /// Be warned that setting a prefilter does not guarantee that the search + /// will be faster. While it's usually a good bet, if the prefilter + /// produces a lot of false positive candidates (i.e., positions matched + /// by the prefilter but not by the regex), then the overall result can + /// be slower than if you had just executed the regex engine without any + /// prefilters. + /// + /// By default no prefilter is set. 
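Since `min_visited_capacity` above is documented as the way to size the visited set for a specific input, a short sketch of that pairing may be useful before the configuration knobs that follow; it uses only items defined in this file and the crate's `Input` type:

```rust
use regex_automata::{
    nfa::thompson::{backtrack::{self, BoundedBacktracker}, NFA},
    Input,
};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let nfa = NFA::new(r"foo[0-9]+")?;
    let input = Input::new("foo12345");
    // Size the visited set for exactly this NFA and haystack, so the
    // search below cannot fail with a "haystack too long" error.
    let cap = backtrack::min_visited_capacity(&nfa, &input);
    let re = BoundedBacktracker::builder()
        .configure(BoundedBacktracker::config().visited_capacity(cap))
        .build_from_nfa(nfa)?;
    let mut cache = re.create_cache();
    assert!(re.try_is_match(&mut cache, input)?);
    Ok(())
}
```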
+ /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::backtrack::BoundedBacktracker, + /// util::prefilter::Prefilter, + /// Input, Match, MatchKind, + /// }; + /// + /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]); + /// let re = BoundedBacktracker::builder() + /// .configure(BoundedBacktracker::config().prefilter(pre)) + /// .build(r"(foo|bar)[a-z]+")?; + /// let mut cache = re.create_cache(); + /// let input = Input::new("foo1 barfox bar"); + /// assert_eq!( + /// Some(Match::must(0, 5..11)), + /// re.try_find(&mut cache, input)?, + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Be warned though that an incorrect prefilter can lead to incorrect + /// results! + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::backtrack::BoundedBacktracker, + /// util::prefilter::Prefilter, + /// Input, HalfMatch, MatchKind, + /// }; + /// + /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]); + /// let re = BoundedBacktracker::builder() + /// .configure(BoundedBacktracker::config().prefilter(pre)) + /// .build(r"(foo|bar)[a-z]+")?; + /// let mut cache = re.create_cache(); + /// let input = Input::new("foo1 barfox bar"); + /// // No match reported even though there clearly is one! + /// assert_eq!(None, re.try_find(&mut cache, input)?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config { + self.pre = Some(pre); + self + } + + /// Set the visited capacity used to bound backtracking. + /// + /// The visited capacity represents the amount of heap memory (in bytes) to + /// allocate toward tracking which parts of the backtracking search have + /// been done before. The heap memory needed for any particular search is + /// proportional to `haystack.len() * nfa.states().len()`, which can be + /// quite large. Therefore, the bounded backtracker is typically only able + /// to run on shorter haystacks. + /// + /// For a given regex, increasing the visited capacity means that the + /// maximum haystack length that can be searched is increased. The + /// [`BoundedBacktracker::max_haystack_len`] method returns that maximum. + /// + /// The default capacity is a reasonable but empirically chosen size. + /// + /// # Example + /// + /// As with other regex engines, Unicode is what tends to make the bounded + /// backtracker less useful by making the maximum haystack length quite + /// small. If necessary, increasing the visited capacity using this routine + /// will increase the maximum haystack length at the cost of using more + /// memory. + /// + /// Note though that the specific maximum values here are not an API + /// guarantee. The default visited capacity is subject to change and not + /// covered by semver. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker; + /// + /// // Unicode inflates the size of the underlying NFA quite a bit, and + /// // thus means that the backtracker can only handle smaller haystacks, + /// // assuming that the visited capacity remains unchanged. + /// let re = BoundedBacktracker::new(r"\w+")?; + /// assert!(re.max_haystack_len() <= 7_000); + /// // But we can increase the visited capacity to handle bigger haystacks! 
+ /// let re = BoundedBacktracker::builder() + /// .configure(BoundedBacktracker::config().visited_capacity(1<<20)) + /// .build(r"\w+")?; + /// assert!(re.max_haystack_len() >= 25_000); + /// assert!(re.max_haystack_len() <= 28_000); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn visited_capacity(mut self, capacity: usize) -> Config { + self.visited_capacity = Some(capacity); + self + } + + /// Returns the prefilter set in this configuration, if one is set at all. + pub fn get_prefilter(&self) -> Option<&Prefilter> { + self.pre.as_ref().unwrap_or(&None).as_ref() + } + + /// Returns the configured visited capacity. + /// + /// Note that the actual capacity used may be slightly bigger than the + /// configured capacity. + pub fn get_visited_capacity(&self) -> usize { + const DEFAULT: usize = 256 * (1 << 10); // 256 KB + self.visited_capacity.unwrap_or(DEFAULT) + } + + /// Overwrite the default configuration such that the options in `o` are + /// always used. If an option in `o` is not set, then the corresponding + /// option in `self` is used. If it's not set in `self` either, then it + /// remains not set. + pub(crate) fn overwrite(&self, o: Config) -> Config { + Config { + pre: o.pre.or_else(|| self.pre.clone()), + visited_capacity: o.visited_capacity.or(self.visited_capacity), + } + } +} + +/// A builder for a bounded backtracker. +/// +/// This builder permits configuring options for the syntax of a pattern, the +/// NFA construction and the `BoundedBacktracker` construction. This builder +/// is different from a general-purpose regex builder in that it permits +/// fine-grained configuration of the construction process. The trade-off for +/// this is complexity, and the possibility of setting a configuration that +/// might not make sense. For example, there are two different UTF-8 modes: +/// +/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls +/// whether the pattern itself can contain sub-expressions that match invalid +/// UTF-8. +/// * [`thompson::Config::utf8`] controls how the regex iterators themselves +/// advance the starting position of the next search when a match with zero +/// length is found. +/// +/// Generally speaking, callers will want to either enable all of these or +/// disable all of these. +/// +/// # Example +/// +/// This example shows how to disable UTF-8 mode in the syntax and the regex +/// itself. This is generally what you want for matching on arbitrary bytes. +/// +/// ``` +/// use regex_automata::{ +/// nfa::thompson::{self, backtrack::BoundedBacktracker}, +/// util::syntax, +/// Match, +/// }; +/// +/// let re = BoundedBacktracker::builder() +/// .syntax(syntax::Config::new().utf8(false)) +/// .thompson(thompson::Config::new().utf8(false)) +/// .build(r"foo(?-u:[^b])ar.*")?; +/// let mut cache = re.create_cache(); +/// +/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; +/// let expected = Some(Ok(Match::must(0, 1..9))); +/// let got = re.try_find_iter(&mut cache, haystack).next(); +/// assert_eq!(expected, got); +/// // Notice that `(?-u:[^b])` matches invalid UTF-8, +/// // but the subsequent `.*` does not! Disabling UTF-8 +/// // on the syntax permits this. +/// // +/// // N.B. This example does not show the impact of +/// // disabling UTF-8 mode on a BoundedBacktracker Config, since that +/// // only impacts regexes that can produce matches of +/// // length 0. 
+/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap()?.range()]); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct Builder { + config: Config, + #[cfg(feature = "syntax")] + thompson: thompson::Compiler, +} + +impl Builder { + /// Create a new BoundedBacktracker builder with its default configuration. + pub fn new() -> Builder { + Builder { + config: Config::default(), + #[cfg(feature = "syntax")] + thompson: thompson::Compiler::new(), + } + } + + /// Build a `BoundedBacktracker` from the given pattern. + /// + /// If there was a problem parsing or compiling the pattern, then an error + /// is returned. + #[cfg(feature = "syntax")] + pub fn build( + &self, + pattern: &str, + ) -> Result<BoundedBacktracker, BuildError> { + self.build_many(&[pattern]) + } + + /// Build a `BoundedBacktracker` from the given patterns. + #[cfg(feature = "syntax")] + pub fn build_many<P: AsRef<str>>( + &self, + patterns: &[P], + ) -> Result<BoundedBacktracker, BuildError> { + let nfa = self.thompson.build_many(patterns)?; + self.build_from_nfa(nfa) + } + + /// Build a `BoundedBacktracker` directly from its NFA. + /// + /// Note that when using this method, any configuration that applies to the + /// construction of the NFA itself will of course be ignored, since the NFA + /// given here is already built. + pub fn build_from_nfa( + &self, + nfa: NFA, + ) -> Result<BoundedBacktracker, BuildError> { + nfa.look_set_any().available().map_err(BuildError::word)?; + Ok(BoundedBacktracker { config: self.config.clone(), nfa }) + } + + /// Apply the given `BoundedBacktracker` configuration options to this + /// builder. + pub fn configure(&mut self, config: Config) -> &mut Builder { + self.config = self.config.overwrite(config); + self + } + + /// Set the syntax configuration for this builder using + /// [`syntax::Config`](crate::util::syntax::Config). + /// + /// This permits setting things like case insensitivity, Unicode and multi + /// line mode. + /// + /// These settings only apply when constructing a `BoundedBacktracker` + /// directly from a pattern. + #[cfg(feature = "syntax")] + pub fn syntax( + &mut self, + config: crate::util::syntax::Config, + ) -> &mut Builder { + self.thompson.syntax(config); + self + } + + /// Set the Thompson NFA configuration for this builder using + /// [`nfa::thompson::Config`](crate::nfa::thompson::Config). + /// + /// This permits setting things like if additional time should be spent + /// shrinking the size of the NFA. + /// + /// These settings only apply when constructing a `BoundedBacktracker` + /// directly from a pattern. + #[cfg(feature = "syntax")] + pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { + self.thompson.configure(config); + self + } +} + +/// A backtracking regex engine that bounds its execution to avoid exponential +/// blow-up. +/// +/// This regex engine only implements leftmost-first match semantics and +/// only supports leftmost searches. It effectively does the same thing as a +/// [`PikeVM`](thompson::pikevm::PikeVM), but typically does it faster because +/// it doesn't have to worry about copying capturing group spans for most NFA +/// states. Instead, the backtracker can maintain one set of captures (provided +/// by the caller) and never needs to copy them. In exchange, the backtracker +/// bounds itself to ensure it doesn't exhibit worst case exponential time. +/// This results in the backtracker only being able to handle short haystacks +/// given reasonable memory usage. 
+/// +/// # Searches may return an error! +/// +/// By design, this backtracking regex engine is bounded. This bound is +/// implemented by not visiting any combination of NFA state ID and position +/// in a haystack more than once. Thus, the total memory required to bound +/// backtracking is proportional to `haystack.len() * nfa.states().len()`. +/// This can obviously get quite large, since large haystacks aren't terribly +/// uncommon. To avoid using exorbitant memory, the capacity is bounded by +/// a fixed limit set via [`Config::visited_capacity`]. Thus, if the total +/// capacity required for a particular regex and a haystack exceeds this +/// capacity, then the search routine will return an error. +/// +/// Unlike other regex engines that may return an error at search time (like +/// the DFA or the hybrid NFA/DFA), there is no way to guarantee that a bounded +/// backtracker will work for every haystack. Therefore, this regex engine +/// _only_ exposes fallible search routines to avoid the footgun of panicking +/// when running a search on a haystack that is too big. +/// +/// If one wants to use the fallible search APIs without handling the +/// error, the only way to guarantee an error won't occur from the +/// haystack length is to ensure the haystack length does not exceed +/// [`BoundedBacktracker::max_haystack_len`]. +/// +/// # Example: Unicode word boundaries +/// +/// This example shows that the bounded backtracker implements Unicode word +/// boundaries correctly by default. +/// +/// ``` +/// # if cfg!(miri) { return Ok(()); } // miri takes too long +/// use regex_automata::{nfa::thompson::backtrack::BoundedBacktracker, Match}; +/// +/// let re = BoundedBacktracker::new(r"\b\w+\b")?; +/// let mut cache = re.create_cache(); +/// +/// let mut it = re.try_find_iter(&mut cache, "Шерлок Холмс"); +/// assert_eq!(Some(Ok(Match::must(0, 0..12))), it.next()); +/// assert_eq!(Some(Ok(Match::must(0, 13..23))), it.next()); +/// assert_eq!(None, it.next()); +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// # Example: multiple regex patterns +/// +/// The bounded backtracker supports searching for multiple patterns +/// simultaneously, just like other regex engines. Note though that because it +/// uses a backtracking strategy, this regex engine is unlikely to scale well +/// as more patterns are added. But then again, as more patterns are added, the +/// maximum haystack length allowed will also shorten (assuming the visited +/// capacity remains invariant). +/// +/// ``` +/// use regex_automata::{nfa::thompson::backtrack::BoundedBacktracker, Match}; +/// +/// let re = BoundedBacktracker::new_many(&["[a-z]+", "[0-9]+"])?; +/// let mut cache = re.create_cache(); +/// +/// let mut it = re.try_find_iter(&mut cache, "abc 1 foo 4567 0 quux"); +/// assert_eq!(Some(Ok(Match::must(0, 0..3))), it.next()); +/// assert_eq!(Some(Ok(Match::must(1, 4..5))), it.next()); +/// assert_eq!(Some(Ok(Match::must(0, 6..9))), it.next()); +/// assert_eq!(Some(Ok(Match::must(1, 10..14))), it.next()); +/// assert_eq!(Some(Ok(Match::must(1, 15..16))), it.next()); +/// assert_eq!(Some(Ok(Match::must(0, 17..21))), it.next()); +/// assert_eq!(None, it.next()); +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +#[derive(Clone, Debug)] +pub struct BoundedBacktracker { + config: Config, + nfa: NFA, +} + +impl BoundedBacktracker { + /// Parse the given regular expression using the default configuration and + /// return the corresponding `BoundedBacktracker`. 
+ /// + /// If you want a non-default configuration, then use the [`Builder`] to + /// set your own configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::backtrack::BoundedBacktracker, + /// Match, + /// }; + /// + /// let re = BoundedBacktracker::new("foo[0-9]+bar")?; + /// let mut cache = re.create_cache(); + /// assert_eq!( + /// Some(Ok(Match::must(0, 3..14))), + /// re.try_find_iter(&mut cache, "zzzfoo12345barzzz").next(), + /// ); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "syntax")] + pub fn new(pattern: &str) -> Result<BoundedBacktracker, BuildError> { + BoundedBacktracker::builder().build(pattern) + } + + /// Like `new`, but parses multiple patterns into a single "multi regex." + /// This similarly uses the default regex configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::backtrack::BoundedBacktracker, + /// Match, + /// }; + /// + /// let re = BoundedBacktracker::new_many(&["[a-z]+", "[0-9]+"])?; + /// let mut cache = re.create_cache(); + /// + /// let mut it = re.try_find_iter(&mut cache, "abc 1 foo 4567 0 quux"); + /// assert_eq!(Some(Ok(Match::must(0, 0..3))), it.next()); + /// assert_eq!(Some(Ok(Match::must(1, 4..5))), it.next()); + /// assert_eq!(Some(Ok(Match::must(0, 6..9))), it.next()); + /// assert_eq!(Some(Ok(Match::must(1, 10..14))), it.next()); + /// assert_eq!(Some(Ok(Match::must(1, 15..16))), it.next()); + /// assert_eq!(Some(Ok(Match::must(0, 17..21))), it.next()); + /// assert_eq!(None, it.next()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "syntax")] + pub fn new_many<P: AsRef<str>>( + patterns: &[P], + ) -> Result<BoundedBacktracker, BuildError> { + BoundedBacktracker::builder().build_many(patterns) + } + + /// Like `new`, but builds a `BoundedBacktracker` directly from the given + /// NFA. + /// + /// # Example + /// + /// This shows how to hand-assemble a regular expression via its HIR, + /// compile an NFA from it and build a BoundedBacktracker from the NFA. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::{NFA, backtrack::BoundedBacktracker}, + /// Match, + /// }; + /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange}; + /// + /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![ + /// ClassBytesRange::new(b'0', b'9'), + /// ClassBytesRange::new(b'A', b'Z'), + /// ClassBytesRange::new(b'_', b'_'), + /// ClassBytesRange::new(b'a', b'z'), + /// ]))); + /// + /// let config = NFA::config().nfa_size_limit(Some(1_000)); + /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?; + /// + /// let re = BoundedBacktracker::new_from_nfa(nfa)?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let expected = Some(Match::must(0, 3..4)); + /// re.try_captures(&mut cache, "!@#A#@!", &mut caps)?; + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn new_from_nfa(nfa: NFA) -> Result<BoundedBacktracker, BuildError> { + BoundedBacktracker::builder().build_from_nfa(nfa) + } + + /// Create a new `BoundedBacktracker` that matches every input. 
+ /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::backtrack::BoundedBacktracker, + /// Match, + /// }; + /// + /// let re = BoundedBacktracker::always_match()?; + /// let mut cache = re.create_cache(); + /// + /// let expected = Some(Ok(Match::must(0, 0..0))); + /// assert_eq!(expected, re.try_find_iter(&mut cache, "").next()); + /// assert_eq!(expected, re.try_find_iter(&mut cache, "foo").next()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn always_match() -> Result<BoundedBacktracker, BuildError> { + let nfa = thompson::NFA::always_match(); + BoundedBacktracker::new_from_nfa(nfa) + } + + /// Create a new `BoundedBacktracker` that never matches any input. + /// + /// # Example + /// + /// ``` + /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker; + /// + /// let re = BoundedBacktracker::never_match()?; + /// let mut cache = re.create_cache(); + /// + /// assert_eq!(None, re.try_find_iter(&mut cache, "").next()); + /// assert_eq!(None, re.try_find_iter(&mut cache, "foo").next()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn never_match() -> Result<BoundedBacktracker, BuildError> { + let nfa = thompson::NFA::never_match(); + BoundedBacktracker::new_from_nfa(nfa) + } + + /// Return a default configuration for a `BoundedBacktracker`. + /// + /// This is a convenience routine to avoid needing to import the `Config` + /// type when customizing the construction of a `BoundedBacktracker`. + /// + /// # Example + /// + /// This example shows how to disable UTF-8 mode. When UTF-8 mode is + /// disabled, zero-width matches that split a codepoint are allowed. + /// Otherwise they are never reported. + /// + /// In the code below, notice that `""` is permitted to match positions + /// that split the encoding of a codepoint. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::{self, backtrack::BoundedBacktracker}, + /// Match, + /// }; + /// + /// let re = BoundedBacktracker::builder() + /// .thompson(thompson::Config::new().utf8(false)) + /// .build(r"")?; + /// let mut cache = re.create_cache(); + /// + /// let haystack = "a☃z"; + /// let mut it = re.try_find_iter(&mut cache, haystack); + /// assert_eq!(Some(Ok(Match::must(0, 0..0))), it.next()); + /// assert_eq!(Some(Ok(Match::must(0, 1..1))), it.next()); + /// assert_eq!(Some(Ok(Match::must(0, 2..2))), it.next()); + /// assert_eq!(Some(Ok(Match::must(0, 3..3))), it.next()); + /// assert_eq!(Some(Ok(Match::must(0, 4..4))), it.next()); + /// assert_eq!(Some(Ok(Match::must(0, 5..5))), it.next()); + /// assert_eq!(None, it.next()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn config() -> Config { + Config::new() + } + + /// Return a builder for configuring the construction of a + /// `BoundedBacktracker`. + /// + /// This is a convenience routine to avoid needing to import the + /// [`Builder`] type in common cases. + /// + /// # Example + /// + /// This example shows how to use the builder to disable UTF-8 mode + /// everywhere. 
+ /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{ + /// nfa::thompson::{self, backtrack::BoundedBacktracker}, + /// util::syntax, + /// Match, + /// }; + /// + /// let re = BoundedBacktracker::builder() + /// .syntax(syntax::Config::new().utf8(false)) + /// .thompson(thompson::Config::new().utf8(false)) + /// .build(r"foo(?-u:[^b])ar.*")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; + /// let expected = Some(Match::must(0, 1..9)); + /// re.try_captures(&mut cache, haystack, &mut caps)?; + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn builder() -> Builder { + Builder::new() + } + + /// Create a new cache for this regex. + /// + /// The cache returned should only be used for searches for this + /// regex. If you want to reuse the cache for another regex, then you + /// must call [`Cache::reset`] with that regex (or, equivalently, + /// [`BoundedBacktracker::reset_cache`]). + pub fn create_cache(&self) -> Cache { + Cache::new(self) + } + + /// Create a new empty set of capturing groups that is guaranteed to be + /// valid for the search APIs on this `BoundedBacktracker`. + /// + /// A `Captures` value created for a specific `BoundedBacktracker` cannot + /// be used with any other `BoundedBacktracker`. + /// + /// This is a convenience function for [`Captures::all`]. See the + /// [`Captures`] documentation for an explanation of its alternative + /// constructors that permit the `BoundedBacktracker` to do less work + /// during a search, and thus might make it faster. + pub fn create_captures(&self) -> Captures { + Captures::all(self.get_nfa().group_info().clone()) + } + + /// Reset the given cache such that it can be used for searching with + /// this `BoundedBacktracker` (and only this `BoundedBacktracker`). + /// + /// A cache reset permits reusing memory already allocated in this cache + /// with a different `BoundedBacktracker`. + /// + /// # Example + /// + /// This shows how to re-purpose a cache for use with a different + /// `BoundedBacktracker`. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{ + /// nfa::thompson::backtrack::BoundedBacktracker, + /// Match, + /// }; + /// + /// let re1 = BoundedBacktracker::new(r"\w")?; + /// let re2 = BoundedBacktracker::new(r"\W")?; + /// + /// let mut cache = re1.create_cache(); + /// assert_eq!( + /// Some(Ok(Match::must(0, 0..2))), + /// re1.try_find_iter(&mut cache, "Δ").next(), + /// ); + /// + /// // Using 'cache' with re2 is not allowed. It may result in panics or + /// // incorrect results. In order to re-purpose the cache, we must reset + /// // it with the BoundedBacktracker we'd like to use it with. + /// // + /// // Similarly, after this reset, using the cache with 're1' is also not + /// // allowed. + /// cache.reset(&re2); + /// assert_eq!( + /// Some(Ok(Match::must(0, 0..3))), + /// re2.try_find_iter(&mut cache, "☃").next(), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn reset_cache(&self, cache: &mut Cache) { + cache.reset(self); + } + + /// Returns the total number of patterns compiled into this + /// `BoundedBacktracker`. + /// + /// In the case of a `BoundedBacktracker` that contains no patterns, this + /// returns `0`. 
+ /// + /// # Example + /// + /// This example shows the pattern length for a `BoundedBacktracker` that + /// never matches: + /// + /// ``` + /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker; + /// + /// let re = BoundedBacktracker::never_match()?; + /// assert_eq!(re.pattern_len(), 0); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// And another example for a `BoundedBacktracker` that matches at every + /// position: + /// + /// ``` + /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker; + /// + /// let re = BoundedBacktracker::always_match()?; + /// assert_eq!(re.pattern_len(), 1); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// And finally, a `BoundedBacktracker` that was constructed from multiple + /// patterns: + /// + /// ``` + /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker; + /// + /// let re = BoundedBacktracker::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?; + /// assert_eq!(re.pattern_len(), 3); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn pattern_len(&self) -> usize { + self.nfa.pattern_len() + } + + /// Return the config for this `BoundedBacktracker`. + #[inline] + pub fn get_config(&self) -> &Config { + &self.config + } + + /// Returns a reference to the underlying NFA. + #[inline] + pub fn get_nfa(&self) -> &NFA { + &self.nfa + } + + /// Returns the maximum haystack length supported by this backtracker. + /// + /// This routine is a function of both [`Config::visited_capacity`] and the + /// internal size of the backtracker's NFA. + /// + /// # Example + /// + /// This example shows how the maximum haystack length can vary depending + /// on the size of the regex itself. Note though that the specific maximum + /// values here are not an API guarantee. The default visited capacity is + /// subject to change and not covered by semver. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{ + /// nfa::thompson::backtrack::BoundedBacktracker, + /// Match, MatchError, + /// }; + /// + /// // If you're only using ASCII, you get a big budget. + /// let re = BoundedBacktracker::new(r"(?-u)\w+")?; + /// let mut cache = re.create_cache(); + /// assert_eq!(re.max_haystack_len(), 299_592); + /// // Things work up to the max. + /// let mut haystack = "a".repeat(299_592); + /// let expected = Some(Ok(Match::must(0, 0..299_592))); + /// assert_eq!(expected, re.try_find_iter(&mut cache, &haystack).next()); + /// // But you'll get an error if you provide a haystack that's too big. + /// // Notice that we use the 'try_find_iter' routine instead, which + /// // yields Result<Match, MatchError> instead of Match. + /// haystack.push('a'); + /// let expected = Some(Err(MatchError::haystack_too_long(299_593))); + /// assert_eq!(expected, re.try_find_iter(&mut cache, &haystack).next()); + /// + /// // Unicode inflates the size of the underlying NFA quite a bit, and + /// // thus means that the backtracker can only handle smaller haystacks, + /// // assuming that the visited capacity remains unchanged. + /// let re = BoundedBacktracker::new(r"\w+")?; + /// assert!(re.max_haystack_len() <= 7_000); + /// // But we can increase the visited capacity to handle bigger haystacks! 
+ /// let re = BoundedBacktracker::builder() + /// .configure(BoundedBacktracker::config().visited_capacity(1<<20)) + /// .build(r"\w+")?; + /// assert!(re.max_haystack_len() >= 25_000); + /// assert!(re.max_haystack_len() <= 28_000); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn max_haystack_len(&self) -> usize { + // The capacity given in the config is "bytes of heap memory," but the + // capacity we use here is "number of bits." So convert the capacity in + // bytes to the capacity in bits. + let capacity = 8 * self.get_config().get_visited_capacity(); + let blocks = div_ceil(capacity, Visited::BLOCK_SIZE); + let real_capacity = blocks * Visited::BLOCK_SIZE; + (real_capacity / self.nfa.states().len()) - 1 + } +} + +impl BoundedBacktracker { + /// Returns true if and only if this regex matches the given haystack. + /// + /// In the case of a backtracking regex engine, and unlike most other + /// regex engines in this crate, short circuiting isn't practical. However, + /// this routine may still be faster because it instructs backtracking to + /// not keep track of any capturing groups. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For this + /// backtracking regex engine, this only occurs when the haystack length + /// exceeds [`BoundedBacktracker::max_haystack_len`]. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Example + /// + /// ``` + /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker; + /// + /// let re = BoundedBacktracker::new("foo[0-9]+bar")?; + /// let mut cache = re.create_cache(); + /// + /// assert!(re.try_is_match(&mut cache, "foo12345bar")?); + /// assert!(!re.try_is_match(&mut cache, "foobar")?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: consistency with search APIs + /// + /// `is_match` is guaranteed to return `true` whenever `find` returns a + /// match. This includes searches that are executed entirely within a + /// codepoint: + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::backtrack::BoundedBacktracker, + /// Input, + /// }; + /// + /// let re = BoundedBacktracker::new("a*")?; + /// let mut cache = re.create_cache(); + /// + /// assert!(!re.try_is_match(&mut cache, Input::new("☃").span(1..2))?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Notice that when UTF-8 mode is disabled, then the above reports a + /// match because the restriction against zero-width matches that split a + /// codepoint has been lifted: + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::{backtrack::BoundedBacktracker, NFA}, + /// Input, + /// }; + /// + /// let re = BoundedBacktracker::builder() + /// .thompson(NFA::config().utf8(false)) + /// .build("a*")?; + /// let mut cache = re.create_cache(); + /// + /// assert!(re.try_is_match(&mut cache, Input::new("☃").span(1..2))?); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn try_is_match<'h, I: Into<Input<'h>>>( + &self, + cache: &mut Cache, + input: I, + ) -> Result<bool, MatchError> { + let input = input.into().earliest(true); + self.try_search_slots(cache, &input, &mut []).map(|pid| pid.is_some()) + } + + /// Executes a leftmost forward search and returns a `Match` if one exists. + /// + /// This routine only includes the overall match span. 
To get + /// access to the individual spans of each capturing group, use + /// [`BoundedBacktracker::try_captures`]. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For this + /// backtracking regex engine, this only occurs when the haystack length + /// exceeds [`BoundedBacktracker::max_haystack_len`]. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::backtrack::BoundedBacktracker, + /// Match, + /// }; + /// + /// let re = BoundedBacktracker::new("foo[0-9]+")?; + /// let mut cache = re.create_cache(); + /// let expected = Match::must(0, 0..8); + /// assert_eq!(Some(expected), re.try_find(&mut cache, "foo12345")?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn try_find<'h, I: Into<Input<'h>>>( + &self, + cache: &mut Cache, + input: I, + ) -> Result<Option<Match>, MatchError> { + let input = input.into(); + if self.get_nfa().pattern_len() == 1 { + let mut slots = [None, None]; + let pid = match self.try_search_slots(cache, &input, &mut slots)? { + None => return Ok(None), + Some(pid) => pid, + }; + let start = match slots[0] { + None => return Ok(None), + Some(s) => s.get(), + }; + let end = match slots[1] { + None => return Ok(None), + Some(s) => s.get(), + }; + return Ok(Some(Match::new(pid, Span { start, end }))); + } + let ginfo = self.get_nfa().group_info(); + let slots_len = ginfo.implicit_slot_len(); + let mut slots = vec![None; slots_len]; + let pid = match self.try_search_slots(cache, &input, &mut slots)? { + None => return Ok(None), + Some(pid) => pid, + }; + let start = match slots[pid.as_usize() * 2] { + None => return Ok(None), + Some(s) => s.get(), + }; + let end = match slots[pid.as_usize() * 2 + 1] { + None => return Ok(None), + Some(s) => s.get(), + }; + Ok(Some(Match::new(pid, Span { start, end }))) + } + + /// Executes a leftmost forward search and writes the spans of capturing + /// groups that participated in a match into the provided [`Captures`] + /// value. If no match was found, then [`Captures::is_match`] is guaranteed + /// to return `false`. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For this + /// backtracking regex engine, this only occurs when the haystack length + /// exceeds [`BoundedBacktracker::max_haystack_len`]. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::backtrack::BoundedBacktracker, + /// Span, + /// }; + /// + /// let re = BoundedBacktracker::new( + /// r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$", + /// )?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.try_captures(&mut cache, "2010-03-14", &mut caps)?; + /// assert!(caps.is_match()); + /// assert_eq!(Some(Span::from(0..4)), caps.get_group(1)); + /// assert_eq!(Some(Span::from(5..7)), caps.get_group(2)); + /// assert_eq!(Some(Span::from(8..10)), caps.get_group(3)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn try_captures<'h, I: Into<Input<'h>>>( + &self, + cache: &mut Cache, + input: I, + caps: &mut Captures, + ) -> Result<(), MatchError> { + self.try_search(cache, &input.into(), caps) + } + + /// Returns an iterator over all non-overlapping leftmost matches in the + /// given bytes. 
If no match exists, then the iterator yields no elements. + /// + /// If the regex engine returns an error at any point, then the iterator + /// will yield that error. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::backtrack::BoundedBacktracker, + /// Match, MatchError, + /// }; + /// + /// let re = BoundedBacktracker::new("foo[0-9]+")?; + /// let mut cache = re.create_cache(); + /// + /// let text = "foo1 foo12 foo123"; + /// let result: Result<Vec<Match>, MatchError> = re + /// .try_find_iter(&mut cache, text) + /// .collect(); + /// let matches = result?; + /// assert_eq!(matches, vec![ + /// Match::must(0, 0..4), + /// Match::must(0, 5..10), + /// Match::must(0, 11..17), + /// ]); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn try_find_iter<'r, 'c, 'h, I: Into<Input<'h>>>( + &'r self, + cache: &'c mut Cache, + input: I, + ) -> TryFindMatches<'r, 'c, 'h> { + let caps = Captures::matches(self.get_nfa().group_info().clone()); + let it = iter::Searcher::new(input.into()); + TryFindMatches { re: self, cache, caps, it } + } + + /// Returns an iterator over all non-overlapping `Captures` values. If no + /// match exists, then the iterator yields no elements. + /// + /// This yields the same matches as [`BoundedBacktracker::try_find_iter`], + /// but it includes the spans of all capturing groups that participate in + /// each match. + /// + /// If the regex engine returns an error at any point, then the iterator + /// will yield that error. + /// + /// **Tip:** See [`util::iter::Searcher`](crate::util::iter::Searcher) for + /// how to correctly iterate over all matches in a haystack while avoiding + /// the creation of a new `Captures` value for every match. (Which you are + /// forced to do with an `Iterator`.) + /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::backtrack::BoundedBacktracker, + /// Span, + /// }; + /// + /// let re = BoundedBacktracker::new("foo(?P<numbers>[0-9]+)")?; + /// let mut cache = re.create_cache(); + /// + /// let text = "foo1 foo12 foo123"; + /// let mut spans = vec![]; + /// for result in re.try_captures_iter(&mut cache, text) { + /// let caps = result?; + /// // The unwrap is OK since 'numbers' matches if the pattern matches. + /// spans.push(caps.get_group_by_name("numbers").unwrap()); + /// } + /// assert_eq!(spans, vec![ + /// Span::from(3..4), + /// Span::from(8..10), + /// Span::from(14..17), + /// ]); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn try_captures_iter<'r, 'c, 'h, I: Into<Input<'h>>>( + &'r self, + cache: &'c mut Cache, + input: I, + ) -> TryCapturesMatches<'r, 'c, 'h> { + let caps = self.create_captures(); + let it = iter::Searcher::new(input.into()); + TryCapturesMatches { re: self, cache, caps, it } + } +} + +impl BoundedBacktracker { + /// Executes a leftmost forward search and writes the spans of capturing + /// groups that participated in a match into the provided [`Captures`] + /// value. If no match was found, then [`Captures::is_match`] is guaranteed + /// to return `false`. + /// + /// This is like [`BoundedBacktracker::try_captures`], but it accepts a + /// concrete `&Input` instead of an `Into<Input>`. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For this + /// backtracking regex engine, this only occurs when the haystack length + /// exceeds [`BoundedBacktracker::max_haystack_len`]. 
+ /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Example: specific pattern search + /// + /// This example shows how to build a multi bounded backtracker that + /// permits searching for specific patterns. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::backtrack::BoundedBacktracker, + /// Anchored, Input, Match, PatternID, + /// }; + /// + /// let re = BoundedBacktracker::new_many(&[ + /// "[a-z0-9]{6}", + /// "[a-z][a-z0-9]{5}", + /// ])?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let haystack = "foo123"; + /// + /// // Since we are using the default leftmost-first match and both + /// // patterns match at the same starting position, only the first pattern + /// // will be returned in this case when doing a search for any of the + /// // patterns. + /// let expected = Some(Match::must(0, 0..6)); + /// re.try_search(&mut cache, &Input::new(haystack), &mut caps)?; + /// assert_eq!(expected, caps.get_match()); + /// + /// // But if we want to check whether some other pattern matches, then we + /// // can provide its pattern ID. + /// let expected = Some(Match::must(1, 0..6)); + /// let input = Input::new(haystack) + /// .anchored(Anchored::Pattern(PatternID::must(1))); + /// re.try_search(&mut cache, &input, &mut caps)?; + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: specifying the bounds of a search + /// + /// This example shows how providing the bounds of a search can produce + /// different results than simply sub-slicing the haystack. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{ + /// nfa::thompson::backtrack::BoundedBacktracker, + /// Match, Input, + /// }; + /// + /// let re = BoundedBacktracker::new(r"\b[0-9]{3}\b")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let haystack = "foo123bar"; + /// + /// // Since we sub-slice the haystack, the search doesn't know about + /// // the larger context and assumes that `123` is surrounded by word + /// // boundaries. And of course, the match position is reported relative + /// // to the sub-slice as well, which means we get `0..3` instead of + /// // `3..6`. + /// let expected = Some(Match::must(0, 0..3)); + /// re.try_search(&mut cache, &Input::new(&haystack[3..6]), &mut caps)?; + /// assert_eq!(expected, caps.get_match()); + /// + /// // But if we provide the bounds of the search within the context of the + /// // entire haystack, then the search can take the surrounding context + /// // into account. (And if we did find a match, it would be reported + /// // as a valid offset into `haystack` instead of its sub-slice.) + /// let expected = None; + /// re.try_search( + /// &mut cache, &Input::new(haystack).range(3..6), &mut caps, + /// )?; + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn try_search( + &self, + cache: &mut Cache, + input: &Input<'_>, + caps: &mut Captures, + ) -> Result<(), MatchError> { + caps.set_pattern(None); + let pid = self.try_search_slots(cache, input, caps.slots_mut())?; + caps.set_pattern(pid); + Ok(()) + } + + /// Executes a leftmost forward search and writes the spans of capturing + /// groups that participated in a match into the provided `slots`, and + /// returns the matching pattern ID. 
The contents of the slots for patterns + /// other than the matching pattern are unspecified. If no match was found, + /// then `None` is returned and the contents of all `slots` is unspecified. + /// + /// This is like [`BoundedBacktracker::try_search`], but it accepts a raw + /// slots slice instead of a `Captures` value. This is useful in contexts + /// where you don't want or need to allocate a `Captures`. + /// + /// It is legal to pass _any_ number of slots to this routine. If the regex + /// engine would otherwise write a slot offset that doesn't fit in the + /// provided slice, then it is simply skipped. In general though, there are + /// usually three slice lengths you might want to use: + /// + /// * An empty slice, if you only care about which pattern matched. + /// * A slice with + /// [`pattern_len() * 2`](crate::nfa::thompson::NFA::pattern_len) + /// slots, if you only care about the overall match spans for each matching + /// pattern. + /// * A slice with + /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which + /// permits recording match offsets for every capturing group in every + /// pattern. + /// + /// # Errors + /// + /// This routine only errors if the search could not complete. For this + /// backtracking regex engine, this only occurs when the haystack length + /// exceeds [`BoundedBacktracker::max_haystack_len`]. + /// + /// When a search cannot complete, callers cannot know whether a match + /// exists or not. + /// + /// # Example + /// + /// This example shows how to find the overall match offsets in a + /// multi-pattern search without allocating a `Captures` value. Indeed, we + /// can put our slots right on the stack. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{ + /// nfa::thompson::backtrack::BoundedBacktracker, + /// PatternID, Input, + /// }; + /// + /// let re = BoundedBacktracker::new_many(&[ + /// r"\pL+", + /// r"\d+", + /// ])?; + /// let mut cache = re.create_cache(); + /// let input = Input::new("!@#123"); + /// + /// // We only care about the overall match offsets here, so we just + /// // allocate two slots for each pattern. Each slot records the start + /// // and end of the match. + /// let mut slots = [None; 4]; + /// let pid = re.try_search_slots(&mut cache, &input, &mut slots)?; + /// assert_eq!(Some(PatternID::must(1)), pid); + /// + /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'. + /// // See 'GroupInfo' for more details on the mapping between groups and + /// // slot indices. + /// let slot_start = pid.unwrap().as_usize() * 2; + /// let slot_end = slot_start + 1; + /// assert_eq!(Some(3), slots[slot_start].map(|s| s.get())); + /// assert_eq!(Some(6), slots[slot_end].map(|s| s.get())); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn try_search_slots( + &self, + cache: &mut Cache, + input: &Input<'_>, + slots: &mut [Option<NonMaxUsize>], + ) -> Result<Option<PatternID>, MatchError> { + let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); + if !utf8empty { + let maybe_hm = self.try_search_slots_imp(cache, input, slots)?; + return Ok(maybe_hm.map(|hm| hm.pattern())); + } + // See PikeVM::try_search_slots for why we do this. 
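+        // Specifically: when the NFA is in UTF-8 mode and can match the +        // empty string, at least the implicit slots (two per pattern) must +        // be recorded so that empty matches splitting a codepoint can be +        // detected and skipped. If the caller provided fewer slots, search +        // into a scratch buffer that is big enough and copy the prefix back.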
+ let min = self.get_nfa().group_info().implicit_slot_len(); + if slots.len() >= min { + let maybe_hm = self.try_search_slots_imp(cache, input, slots)?; + return Ok(maybe_hm.map(|hm| hm.pattern())); + } + if self.get_nfa().pattern_len() == 1 { + let mut enough = [None, None]; + let got = self.try_search_slots_imp(cache, input, &mut enough)?; + // This is OK because we know `enough` is strictly bigger + // than `slots`, otherwise this special case isn't reached. + slots.copy_from_slice(&enough[..slots.len()]); + return Ok(got.map(|hm| hm.pattern())); + } + let mut enough = vec![None; min]; + let got = self.try_search_slots_imp(cache, input, &mut enough)?; + // This is OK because we know `enough` is strictly bigger than + // `slots`, otherwise this special case isn't reached. + slots.copy_from_slice(&enough[..slots.len()]); + Ok(got.map(|hm| hm.pattern())) + } + + /// This is the actual implementation of `try_search_slots` that + /// doesn't account for the special case when 1) the NFA has UTF-8 mode + /// enabled, 2) the NFA can match the empty string and 3) the caller has + /// provided an insufficient number of slots to record match offsets. + #[inline(never)] + fn try_search_slots_imp( + &self, + cache: &mut Cache, + input: &Input<'_>, + slots: &mut [Option<NonMaxUsize>], + ) -> Result<Option<HalfMatch>, MatchError> { + let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); + let hm = match self.search_imp(cache, input, slots)? { + None => return Ok(None), + Some(hm) if !utf8empty => return Ok(Some(hm)), + Some(hm) => hm, + }; + empty::skip_splits_fwd(input, hm, hm.offset(), |input| { + Ok(self + .search_imp(cache, input, slots)? + .map(|hm| (hm, hm.offset()))) + }) + } + + /// The implementation of standard leftmost backtracking search. + /// + /// Capturing group spans are written to 'slots', but only if requested. + /// 'slots' can be one of three things: 1) totally empty, in which case we + /// only report the pattern that matched, 2) only has slots for recording + /// the overall match offsets for any pattern, or 3) has all slots available + /// for recording the spans of any groups participating in a match. + fn search_imp( + &self, + cache: &mut Cache, + input: &Input<'_>, + slots: &mut [Option<NonMaxUsize>], + ) -> Result<Option<HalfMatch>, MatchError> { + // Unlike in the PikeVM, we write our capturing group spans directly + // into the caller's capturing groups. So we have to make sure we're + // starting with a blank slate first. In the PikeVM, we avoid this + // by construction: the spans that are copied to every slot in the + // 'Captures' value already account for presence/absence. In this + // backtracker, we write directly into the caller-provided slots, + // whereas in the PikeVM, we write into scratch space first and only copy + // them to the caller-provided slots when a match is found. + for slot in slots.iter_mut() { + *slot = None; + } + cache.setup_search(&self, input)?; + if input.is_done() { + return Ok(None); + } + let (anchored, start_id) = match input.get_anchored() { + // Only way we're unanchored is if both the caller asked for an + // unanchored search *and* the pattern is itself not anchored. + Anchored::No => ( + self.nfa.is_always_start_anchored(), + // We always use the anchored starting state here, even if + // doing an unanchored search. The "unanchored" part of it is + // implemented in the loop below, by simply trying the next + // byte offset if the previous backtracking exploration failed. 
+ self.nfa.start_anchored(), + ), + Anchored::Yes => (true, self.nfa.start_anchored()), + Anchored::Pattern(pid) => match self.nfa.start_pattern(pid) { + None => return Ok(None), + Some(sid) => (true, sid), + }, + }; + if anchored { + let at = input.start(); + return Ok(self.backtrack(cache, input, at, start_id, slots)); + } + let pre = self.get_config().get_prefilter(); + let mut at = input.start(); + while at <= input.end() { + if let Some(ref pre) = pre { + let span = Span::from(at..input.end()); + match pre.find(input.haystack(), span) { + None => break, + Some(ref span) => at = span.start, + } + } + if let Some(hm) = self.backtrack(cache, input, at, start_id, slots) + { + return Ok(Some(hm)); + } + at += 1; + } + Ok(None) + } + + /// Look for a match starting at `at` in `input` and write the matching + /// pattern ID and group spans to `slots`. The search uses `start_id` as its + /// starting state in the underlying NFA. + /// + /// If no match was found, then the caller should increment `at` and try + /// at the next position. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn backtrack( + &self, + cache: &mut Cache, + input: &Input<'_>, + at: usize, + start_id: StateID, + slots: &mut [Option<NonMaxUsize>], + ) -> Option<HalfMatch> { + cache.stack.push(Frame::Step { sid: start_id, at }); + while let Some(frame) = cache.stack.pop() { + match frame { + Frame::Step { sid, at } => { + if let Some(hm) = self.step(cache, input, sid, at, slots) { + return Some(hm); + } + } + Frame::RestoreCapture { slot, offset } => { + slots[slot] = offset; + } + } + } + None + } + + // LAMENTATION: The actual backtracking search is implemented in about + // 75 lines below. Yet this file is over 2,000 lines long. What have I + // done? + + /// Execute a "step" in the backtracking algorithm. + /// + /// A "step" is somewhat of a misnomer, because this routine keeps going + /// until it either runs out of things to try or finds a match. In the + /// former case, it may have pushed some things on to the backtracking + /// stack, in which case, those will be tried next as part of the + /// 'backtrack' routine above. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn step( + &self, + cache: &mut Cache, + input: &Input<'_>, + mut sid: StateID, + mut at: usize, + slots: &mut [Option<NonMaxUsize>], + ) -> Option<HalfMatch> { + loop { + if !cache.visited.insert(sid, at - input.start()) { + return None; + } + match *self.nfa.state(sid) { + State::ByteRange { ref trans } => { + // Why do we need this? Unlike other regex engines in this + // crate, the backtracker can steamroll ahead in the + // haystack outside of the main loop over the bytes in the + // haystack. While 'trans.matches()' below handles the case + // of 'at' being out of bounds of 'input.haystack()', we + // also need to handle the case of 'at' going out of bounds + // of the span the caller asked to search. + // + // We should perhaps make the 'trans.matches()' API accept + // an '&Input' instead of a '&[u8]'. Or at least, add a new + // API that does it. 
+ if at >= input.end() {
+ return None;
+ }
+ if !trans.matches(input.haystack(), at) {
+ return None;
+ }
+ sid = trans.next;
+ at += 1;
+ }
+ State::Sparse(ref sparse) => {
+ if at >= input.end() {
+ return None;
+ }
+ sid = sparse.matches(input.haystack(), at)?;
+ at += 1;
+ }
+ State::Dense(ref dense) => {
+ if at >= input.end() {
+ return None;
+ }
+ sid = dense.matches(input.haystack(), at)?;
+ at += 1;
+ }
+ State::Look { look, next } => {
+ // OK because we don't permit building a searcher with a
+ // Unicode word boundary if the requisite Unicode data is
+ // unavailable.
+ if !self.nfa.look_matcher().matches_inline(
+ look,
+ input.haystack(),
+ at,
+ ) {
+ return None;
+ }
+ sid = next;
+ }
+ State::Union { ref alternates } => {
+ sid = match alternates.get(0) {
+ None => return None,
+ Some(&sid) => sid,
+ };
+ cache.stack.extend(
+ alternates[1..]
+ .iter()
+ .copied()
+ .rev()
+ .map(|sid| Frame::Step { sid, at }),
+ );
+ }
+ State::BinaryUnion { alt1, alt2 } => {
+ sid = alt1;
+ cache.stack.push(Frame::Step { sid: alt2, at });
+ }
+ State::Capture { next, slot, .. } => {
+ if slot.as_usize() < slots.len() {
+ cache.stack.push(Frame::RestoreCapture {
+ slot,
+ offset: slots[slot],
+ });
+ slots[slot] = NonMaxUsize::new(at);
+ }
+ sid = next;
+ }
+ State::Fail => return None,
+ State::Match { pattern_id } => {
+ return Some(HalfMatch::new(pattern_id, at));
+ }
+ }
+ }
+ }
+}
+
+/// An iterator over all non-overlapping matches for a fallible search.
+///
+/// The iterator yields a `Result<Match, MatchError>` value until no more
+/// matches could be found.
+///
+/// The lifetime parameters are as follows:
+///
+/// * `'r` represents the lifetime of the BoundedBacktracker.
+/// * `'c` represents the lifetime of the BoundedBacktracker's cache.
+/// * `'h` represents the lifetime of the haystack being searched.
+///
+/// This iterator can be created with the [`BoundedBacktracker::try_find_iter`]
+/// method.
+#[derive(Debug)]
+pub struct TryFindMatches<'r, 'c, 'h> {
+ re: &'r BoundedBacktracker,
+ cache: &'c mut Cache,
+ caps: Captures,
+ it: iter::Searcher<'h>,
+}
+
+impl<'r, 'c, 'h> Iterator for TryFindMatches<'r, 'c, 'h> {
+ type Item = Result<Match, MatchError>;
+
+ #[inline]
+ fn next(&mut self) -> Option<Result<Match, MatchError>> {
+ // Splitting 'self' apart seems necessary to appease borrowck.
+ let TryFindMatches { re, ref mut cache, ref mut caps, ref mut it } =
+ *self;
+ it.try_advance(|input| {
+ re.try_search(cache, input, caps)?;
+ Ok(caps.get_match())
+ })
+ .transpose()
+ }
+}
+
+/// An iterator over all non-overlapping leftmost matches, with their capturing
+/// groups, for a fallible search.
+///
+/// The iterator yields a `Result<Captures, MatchError>` value until no more
+/// matches could be found.
+///
+/// The lifetime parameters are as follows:
+///
+/// * `'r` represents the lifetime of the BoundedBacktracker.
+/// * `'c` represents the lifetime of the BoundedBacktracker's cache.
+/// * `'h` represents the lifetime of the haystack being searched.
+///
+/// This iterator can be created with the
+/// [`BoundedBacktracker::try_captures_iter`] method.
+#[derive(Debug)]
+pub struct TryCapturesMatches<'r, 'c, 'h> {
+ re: &'r BoundedBacktracker,
+ cache: &'c mut Cache,
+ caps: Captures,
+ it: iter::Searcher<'h>,
+}
+
+impl<'r, 'c, 'h> Iterator for TryCapturesMatches<'r, 'c, 'h> {
+ type Item = Result<Captures, MatchError>;
+
+ #[inline]
+ fn next(&mut self) -> Option<Result<Captures, MatchError>> {
+ // Splitting 'self' apart seems necessary to appease borrowck.
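+ // Without it, capturing 're', 'cache' and 'caps' in the closure
+ // below would count as multiple simultaneous borrows of 'self'.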
+ let TryCapturesMatches { re, ref mut cache, ref mut caps, ref mut it } =
+ *self;
+ let _ = it
+ .try_advance(|input| {
+ re.try_search(cache, input, caps)?;
+ Ok(caps.get_match())
+ })
+ .transpose()?;
+ if caps.is_match() {
+ Some(Ok(caps.clone()))
+ } else {
+ None
+ }
+ }
+}
+
+/// A cache represents mutable state that a [`BoundedBacktracker`] requires
+/// during a search.
+///
+/// For a given [`BoundedBacktracker`], its corresponding cache may be created
+/// either via [`BoundedBacktracker::create_cache`], or via [`Cache::new`].
+/// They are equivalent in every way, except the former does not require
+/// explicitly importing `Cache`.
+///
+/// A particular `Cache` is coupled with the [`BoundedBacktracker`] from which
+/// it was created. It may only be used with that `BoundedBacktracker`. A cache
+/// and its allocations may be re-purposed via [`Cache::reset`], in which case,
+/// it can only be used with the new `BoundedBacktracker` (and not the old
+/// one).
+#[derive(Clone, Debug)]
+pub struct Cache {
+ /// Stack used on the heap for doing backtracking instead of the
+ /// traditional recursive approach. We don't want recursion because then
+ /// we're likely to hit a stack overflow for bigger regexes.
+ stack: Vec<Frame>,
+ /// The set of (StateID, HaystackOffset) pairs that have been visited
+ /// by the backtracker within a single search. If such a pair has been
+ /// visited, then we avoid doing the work for that pair again. This is
+ /// what "bounds" the backtracking and prevents it from having worst case
+ /// exponential time.
+ visited: Visited,
+}
+
+impl Cache {
+ /// Create a new [`BoundedBacktracker`] cache.
+ ///
+ /// A potentially more convenient routine to create a cache is
+ /// [`BoundedBacktracker::create_cache`], as it does not require also
+ /// importing the `Cache` type.
+ ///
+ /// If you want to reuse the returned `Cache` with some other
+ /// `BoundedBacktracker`, then you must call [`Cache::reset`] with the
+ /// desired `BoundedBacktracker`.
+ pub fn new(re: &BoundedBacktracker) -> Cache {
+ Cache { stack: vec![], visited: Visited::new(re) }
+ }
+
+ /// Reset this cache such that it can be used for searching with a
+ /// different [`BoundedBacktracker`].
+ ///
+ /// A cache reset permits reusing memory already allocated in this cache
+ /// with a different `BoundedBacktracker`.
+ ///
+ /// # Example
+ ///
+ /// This shows how to re-purpose a cache for use with a different
+ /// `BoundedBacktracker`.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::{
+ /// nfa::thompson::backtrack::BoundedBacktracker,
+ /// Match,
+ /// };
+ ///
+ /// let re1 = BoundedBacktracker::new(r"\w")?;
+ /// let re2 = BoundedBacktracker::new(r"\W")?;
+ ///
+ /// let mut cache = re1.create_cache();
+ /// assert_eq!(
+ /// Some(Ok(Match::must(0, 0..2))),
+ /// re1.try_find_iter(&mut cache, "Δ").next(),
+ /// );
+ ///
+ /// // Using 'cache' with re2 is not allowed. It may result in panics or
+ /// // incorrect results. In order to re-purpose the cache, we must reset
+ /// // it with the BoundedBacktracker we'd like to use it with.
+ /// //
+ /// // Similarly, after this reset, using the cache with 're1' is also not
+ /// // allowed.
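+ /// //
+ /// // Resetting is cheap: it re-purposes the cache's existing
+ /// // allocations for the new BoundedBacktracker where possible.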
+ /// cache.reset(&re2);
+ /// assert_eq!(
+ /// Some(Ok(Match::must(0, 0..3))),
+ /// re2.try_find_iter(&mut cache, "☃").next(),
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn reset(&mut self, re: &BoundedBacktracker) {
+ self.visited.reset(re);
+ }
+
+ /// Returns the heap memory usage, in bytes, of this cache.
+ ///
+ /// This does **not** include the stack size used up by this cache. To
+ /// compute that, use `std::mem::size_of::<Cache>()`.
+ pub fn memory_usage(&self) -> usize {
+ self.stack.len() * core::mem::size_of::<Frame>()
+ + self.visited.memory_usage()
+ }
+
+ /// Clears this cache. This should be called at the start of every search
+ /// to ensure we start with a clean slate.
+ ///
+ /// This also prepares the 'visited' set for the span of the haystack
+ /// that will be searched, which is how the backtracker guarantees that
+ /// each (StateID, offset) pair is explored at most once. This can fail
+ /// if the span is too big to fit within the configured visited capacity.
+ fn setup_search(
+ &mut self,
+ re: &BoundedBacktracker,
+ input: &Input<'_>,
+ ) -> Result<(), MatchError> {
+ self.stack.clear();
+ self.visited.setup_search(re, input)?;
+ Ok(())
+ }
+}
+
+/// Represents a stack frame on the heap while doing backtracking.
+///
+/// Instead of using explicit recursion for backtracking, we use a stack on
+/// the heap to keep track of things that we want to explore if the current
+/// backtracking branch turns out to not lead to a match.
+#[derive(Clone, Debug)]
+enum Frame {
+ /// Look for a match starting at the state `sid` and the given position
+ /// in the haystack.
+ Step { sid: StateID, at: usize },
+ /// Reset the given `slot` to the given `offset` (which might be `None`).
+ /// This effectively gives a "scope" to capturing groups, such that an
+ /// offset for a particular group only gets returned if the match goes
+ /// through that capturing group. If backtracking ends up going down a
+ /// different branch that results in a different offset (or perhaps none at
+ /// all), then this "restore capture" frame will cause the offset to get
+ /// reset.
+ RestoreCapture { slot: SmallIndex, offset: Option<NonMaxUsize> },
+}
+
+/// A bitset that keeps track of whether a particular (StateID, offset) has
+/// been considered during backtracking. If it has already been visited, then
+/// backtracking skips it. This is what gives backtracking its "bound."
+#[derive(Clone, Debug)]
+struct Visited {
+ /// The actual underlying bitset. Each element in the bitset corresponds
+ /// to a particular (StateID, offset) pair. States correspond to the rows
+ /// and the offsets correspond to the columns.
+ ///
+ /// If our underlying NFA has N states and the haystack we're searching
+ /// has M bytes, then we have N*(M+1) entries in our bitset table. The
+ /// M+1 occurs because our matches are delayed by one byte (to support
+ /// look-around), and so we need to handle the end position itself rather
+ /// than stopping just before the end. (If there is no end position, then
+ /// it's treated as "end-of-input," which is matched by things like '$'.)
+ ///
+ /// Given BITS=N*(M+1), we wind up with div_ceil(BITS, sizeof(usize))
+ /// blocks.
+ ///
+ /// We use 'usize' to represent our blocks because it makes some of the
+ /// arithmetic in 'insert' a bit nicer. For example, if we used 'u32' for
+ /// our block, we'd either need to cast u32s to usizes or usizes to u32s.
+ bitset: Vec<usize>,
+ /// The stride represents one plus the length of the haystack we're
+ /// searching (as described above). The stride must be initialized for
+ /// each search.
+ stride: usize,
+}
+
+impl Visited {
+ /// The size of each block, in bits.
+ const BLOCK_SIZE: usize = 8 * core::mem::size_of::<usize>();
+
+ /// Create a new visited set for the given backtracker.
+ ///
+ /// The set is ready to use, but must be set up at the beginning of each
+ /// search by calling `setup_search`.
+ fn new(re: &BoundedBacktracker) -> Visited {
+ let mut visited = Visited { bitset: vec![], stride: 0 };
+ visited.reset(re);
+ visited
+ }
+
+ /// Insert the given (StateID, offset) pair into this set. If it already
+ /// exists, then this is a no-op and it returns false. Otherwise this
+ /// returns true.
+ fn insert(&mut self, sid: StateID, at: usize) -> bool {
+ let table_index = sid.as_usize() * self.stride + at;
+ let block_index = table_index / Visited::BLOCK_SIZE;
+ let bit = table_index % Visited::BLOCK_SIZE;
+ let block_with_bit = 1 << bit;
+ if self.bitset[block_index] & block_with_bit != 0 {
+ return false;
+ }
+ self.bitset[block_index] |= block_with_bit;
+ true
+ }
+
+ /// Reset this visited set to work with the given bounded backtracker.
+ fn reset(&mut self, _: &BoundedBacktracker) {
+ self.bitset.truncate(0);
+ }
+
+ /// Set up this visited set to work for a search using the given NFA
+ /// and input configuration. The NFA must be the same NFA used by the
+ /// BoundedBacktracker given to Visited::reset. Failing to call this might
+ /// result in panics or silently incorrect search behavior.
+ fn setup_search(
+ &mut self,
+ re: &BoundedBacktracker,
+ input: &Input<'_>,
+ ) -> Result<(), MatchError> {
+ // Our haystack length is only the length of the span of the entire
+ // haystack that we'll be searching.
+ let haylen = input.get_span().len();
+ let err = || MatchError::haystack_too_long(haylen);
+ // Our stride is one more than the length of the input because our main
+ // search loop includes the position at input.end(). (And it does this
+ // because matches are delayed by one byte to account for look-around.)
+ self.stride = haylen + 1;
+ let needed_capacity =
+ match re.get_nfa().states().len().checked_mul(self.stride) {
+ None => return Err(err()),
+ Some(capacity) => capacity,
+ };
+ let max_capacity = 8 * re.get_config().get_visited_capacity();
+ if needed_capacity > max_capacity {
+ return Err(err());
+ }
+ let needed_blocks = div_ceil(needed_capacity, Visited::BLOCK_SIZE);
+ self.bitset.truncate(needed_blocks);
+ for block in self.bitset.iter_mut() {
+ *block = 0;
+ }
+ if needed_blocks > self.bitset.len() {
+ self.bitset.resize(needed_blocks, 0);
+ }
+ Ok(())
+ }
+
+ /// Return the heap memory usage, in bytes, of this visited set.
+ fn memory_usage(&self) -> usize {
+ self.bitset.len() * core::mem::size_of::<usize>()
+ }
+}
+
+/// Integer division, but rounds up instead of down.
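+///
+/// For example, 'div_ceil(5, 2)' is '3' while '5 / 2' is '2'. When 'rhs'
+/// divides 'lhs' evenly, the two agree, e.g., 'div_ceil(4, 2)' is '2'.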
+fn div_ceil(lhs: usize, rhs: usize) -> usize {
+ if lhs % rhs == 0 {
+ lhs / rhs
+ } else {
+ (lhs / rhs) + 1
+ }
+}
diff --git a/vendor/regex-automata/src/nfa/thompson/builder.rs b/vendor/regex-automata/src/nfa/thompson/builder.rs
new file mode 100644
index 000000000..b57e5bc0f
--- /dev/null
+++ b/vendor/regex-automata/src/nfa/thompson/builder.rs
@@ -0,0 +1,1337 @@
+use core::mem;
+
+use alloc::{sync::Arc, vec, vec::Vec};
+
+use crate::{
+ nfa::thompson::{
+ error::BuildError,
+ nfa::{self, SparseTransitions, Transition, NFA},
+ },
+ util::{
+ look::{Look, LookMatcher},
+ primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID},
+ },
+};
+
+/// An intermediate NFA state used during construction.
+///
+/// During construction of an NFA, it is often convenient to work with states
+/// that are amenable to mutation and that carry more information than we
+/// otherwise need once an NFA has been built. This type serves those
+/// needs.
+///
+/// Once construction is finished, the builder will convert these states to a
+/// [`nfa::thompson::State`](crate::nfa::thompson::State). This conversion not
+/// only results in a simpler representation, but in some cases, entire classes
+/// of states are completely removed (such as [`State::Empty`]).
+#[derive(Clone, Debug, Eq, PartialEq)]
+enum State {
+ /// An empty state whose only purpose is to forward the automaton to
+ /// another state via an unconditional epsilon transition.
+ ///
+ /// Unconditional epsilon transitions are quite useful during the
+ /// construction of an NFA, as they permit the insertion of no-op
+ /// placeholders that make it easier to compose NFA sub-graphs. When
+ /// the Thompson NFA builder produces a final NFA, all unconditional
+ /// epsilon transitions are removed, and state identifiers are remapped
+ /// accordingly.
+ Empty {
+ /// The next state that this state should transition to.
+ next: StateID,
+ },
+ /// A state that only transitions to another state if the current input
+ /// byte is in a particular range of bytes.
+ ByteRange { trans: Transition },
+ /// A state with possibly many transitions, represented in a sparse
+ /// fashion. Transitions must be ordered lexicographically by input range
+ /// and be non-overlapping. As such, this may only be used when every
+ /// transition has equal priority. (In practice, this is only used for
+ /// encoding large UTF-8 automata.) In contrast, a `Union` state has each
+ /// alternate in order of priority. Priority is used to implement greedy
+ /// matching and also alternations themselves, e.g., `abc|a` where `abc`
+ /// has priority over `a`.
+ ///
+ /// To clarify, it is possible to remove `Sparse` and represent all things
+ /// that `Sparse` is used for via `Union`. But this creates a more bloated
+ /// NFA with more epsilon transitions than is necessary in the special case
+ /// of character classes.
+ Sparse { transitions: Vec<Transition> },
+ /// A conditional epsilon transition satisfied via some sort of
+ /// look-around.
+ Look { look: Look, next: StateID },
+ /// An empty state that records the start of a capture location. This is an
+ /// unconditional epsilon transition like `Empty`, except it can be used to
+ /// record position information for a capture group when using the NFA for
+ /// search.
+ CaptureStart {
+ /// The ID of the pattern in which this capture was defined.
+ pattern_id: PatternID,
+ /// The capture group index that this capture state corresponds to.
+ /// The capture group index is always relative to its corresponding
+ /// pattern. Therefore, in the presence of multiple patterns, both the
+ /// pattern ID and the capture group index are required to uniquely
+ /// identify a capturing group.
+ group_index: SmallIndex,
+ /// The next state that this state should transition to.
+ next: StateID,
+ },
+ /// An empty state that records the end of a capture location. This is an
+ /// unconditional epsilon transition like `Empty`, except it can be used to
+ /// record position information for a capture group when using the NFA for
+ /// search.
+ CaptureEnd {
+ /// The ID of the pattern in which this capture was defined.
+ pattern_id: PatternID,
+ /// The capture group index that this capture state corresponds to.
+ /// The capture group index is always relative to its corresponding
+ /// pattern. Therefore, in the presence of multiple patterns, both the
+ /// pattern ID and the capture group index are required to uniquely
+ /// identify a capturing group.
+ group_index: SmallIndex,
+ /// The next state that this state should transition to.
+ next: StateID,
+ },
+ /// An alternation such that there exists an epsilon transition to all
+ /// states in `alternates`, where matches found via earlier transitions
+ /// are preferred over later transitions.
+ Union { alternates: Vec<StateID> },
+ /// An alternation such that there exists an epsilon transition to all
+ /// states in `alternates`, where matches found via later transitions are
+ /// preferred over earlier transitions.
+ ///
+ /// This "reverse" state exists as a convenience during compilation,
+ /// permitting easy construction of non-greedy combinations of NFA states.
+ /// At the end of compilation, Union and UnionReverse states are merged
+ /// into one Union type of state, where the latter has its epsilon
+ /// transitions reversed to reflect the priority inversion.
+ ///
+ /// The "convenience" here arises from the fact that as new states are
+ /// added to the list of `alternates`, we would like that add operation
+ /// to be amortized constant time. But if we used a `Union`, we'd need to
+ /// prepend the state, which takes O(n) time. There are other approaches we
+ /// could use to solve this, but this seems simple enough.
+ UnionReverse { alternates: Vec<StateID> },
+ /// A state that cannot be transitioned out of. This is useful for cases
+ /// where you want to prevent matching from occurring. For example, if your
+ /// regex parser permits empty character classes, then one could choose a
+ /// `Fail` state to represent them.
+ Fail,
+ /// A match state. There is at most one such occurrence of this state in
+ /// an NFA for each pattern compiled into the NFA. At time of writing, a
+ /// match state is always produced for every pattern given, but in theory,
+ /// if a pattern can never lead to a match, then the match state could be
+ /// omitted.
+ ///
+ /// `pattern_id` refers to the ID of the pattern itself, which corresponds
+ /// to the pattern's index (starting at 0).
+ Match { pattern_id: PatternID },
+}
+
+impl State {
+ /// If this state is an unconditional epsilon transition, then this returns
+ /// the target of the transition.
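+ ///
+ /// That covers `Empty` states as well as `Union` and `UnionReverse`
+ /// states with exactly one alternate, as the match arms below show.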
+ fn goto(&self) -> Option<StateID> {
+ match *self {
+ State::Empty { next } => Some(next),
+ State::Union { ref alternates } if alternates.len() == 1 => {
+ Some(alternates[0])
+ }
+ State::UnionReverse { ref alternates }
+ if alternates.len() == 1 =>
+ {
+ Some(alternates[0])
+ }
+ _ => None,
+ }
+ }
+
+ /// Returns the heap memory usage, in bytes, of this state.
+ fn memory_usage(&self) -> usize {
+ match *self {
+ State::Empty { .. }
+ | State::ByteRange { .. }
+ | State::Look { .. }
+ | State::CaptureStart { .. }
+ | State::CaptureEnd { .. }
+ | State::Fail
+ | State::Match { .. } => 0,
+ State::Sparse { ref transitions } => {
+ transitions.len() * mem::size_of::<Transition>()
+ }
+ State::Union { ref alternates } => {
+ alternates.len() * mem::size_of::<StateID>()
+ }
+ State::UnionReverse { ref alternates } => {
+ alternates.len() * mem::size_of::<StateID>()
+ }
+ }
+ }
+}
+
+/// An abstraction for building Thompson NFAs by hand.
+///
+/// A builder is what a [`thompson::Compiler`](crate::nfa::thompson::Compiler)
+/// uses internally to translate a regex's high-level intermediate
+/// representation into an [`NFA`].
+///
+/// The primary function of this builder is to abstract away the internal
+/// representation of an NFA and make it difficult to produce NFAs that are
+/// internally invalid or inconsistent. This builder also provides a way to
+/// add "empty" states (which can be thought of as unconditional epsilon
+/// transitions), despite the fact that [`thompson::State`](nfa::State) does
+/// not have any "empty" representation. The advantage of "empty" states is
+/// that they make the code for constructing a Thompson NFA logically simpler.
+///
+/// Many of the routines on this builder may panic or return errors. Generally
+/// speaking, panics occur when an invalid sequence of method calls was made,
+/// whereas an error occurs if things get too big. (Where "too big" might mean
+/// exhausting identifier space or using up too much heap memory in accordance
+/// with the configured [`size_limit`](Builder::set_size_limit).)
+///
+/// # Overview
+///
+/// ## Adding multiple patterns
+///
+/// Each pattern you add to an NFA should correspond to a pair of
+/// [`Builder::start_pattern`] and [`Builder::finish_pattern`] calls, with
+/// calls in between that add NFA states for that pattern. NFA states may be
+/// added without first calling `start_pattern`, with the exception of adding
+/// capturing states.
+///
+/// ## Adding NFA states
+///
+/// Here is a very brief overview of each of the methods that add NFA states.
+/// Every method adds a single state.
+///
+/// * [`add_empty`](Builder::add_empty): Adds a state with a single
+/// unconditional epsilon transition to another state.
+/// * [`add_union`](Builder::add_union): Adds a state with unconditional
+/// epsilon transitions to two or more states, with earlier transitions
+/// preferred over later ones.
+/// * [`add_union_reverse`](Builder::add_union_reverse): Adds a state with
+/// unconditional epsilon transitions to two or more states, with later
+/// transitions preferred over earlier ones.
+/// * [`add_range`](Builder::add_range): Adds a state with a single transition
+/// to another state that can only be followed if the current input byte is
+/// within the range given.
+/// * [`add_sparse`](Builder::add_sparse): Adds a state with two or more
+/// range transitions to other states, where a transition is only followed
+/// if the current input byte is within one of the ranges.
 All transitions
+/// in this state have equal priority, and the corresponding ranges must be
+/// non-overlapping.
+/// * [`add_look`](Builder::add_look): Adds a state with a single *conditional*
+/// epsilon transition to another state, where the condition depends on a
+/// limited look-around property.
+/// * [`add_capture_start`](Builder::add_capture_start): Adds a state with
+/// a single unconditional epsilon transition that also instructs an NFA
+/// simulation to record the current input position to a specific location in
+/// memory. This is intended to represent the starting location of a capturing
+/// group.
+/// * [`add_capture_end`](Builder::add_capture_end): Adds a state with
+/// a single unconditional epsilon transition that also instructs an NFA
+/// simulation to record the current input position to a specific location in
+/// memory. This is intended to represent the ending location of a capturing
+/// group.
+/// * [`add_fail`](Builder::add_fail): Adds a state that never transitions to
+/// another state.
+/// * [`add_match`](Builder::add_match): Adds a state that indicates a match
+/// has been found for a particular pattern. A match state is a final state
+/// with no outgoing transitions.
+///
+/// ## Setting transitions between NFA states
+///
+/// The [`Builder::patch`] method creates a transition from one state to the
+/// next. If the `from` state corresponds to a state that supports multiple
+/// outgoing transitions (such as "union"), then this adds the corresponding
+/// transition. Otherwise, it sets the single transition. (This routine panics
+/// if `from` corresponds to a state added by `add_sparse`, since sparse states
+/// need more specialized handling.)
+///
+/// # Example
+///
+/// This annotated example shows how to hand construct the regex `[a-z]+`
+/// (without an unanchored prefix).
+///
+/// ```
+/// use regex_automata::{
+/// nfa::thompson::{pikevm::PikeVM, Builder, Transition},
+/// util::primitives::StateID,
+/// Match,
+/// };
+///
+/// let mut builder = Builder::new();
+/// // Before adding NFA states for our pattern, we need to tell the builder
+/// // that we are starting the pattern.
+/// builder.start_pattern()?;
+/// // Since we use the Pike VM below for searching, we need to add capturing
+/// // states. If you're just going to build a DFA from the NFA, then capturing
+/// // states do not need to be added.
+/// let start = builder.add_capture_start(StateID::ZERO, 0, None)?;
+/// let range = builder.add_range(Transition {
+/// // We don't know the state ID of the 'next' state yet, so we just fill
+/// // in a dummy 'ZERO' value.
+/// start: b'a', end: b'z', next: StateID::ZERO,
+/// })?;
+/// // This state will point back to 'range', but also enable us to move ahead.
+/// // That is, this implements the '+' repetition operator. We add 'range' and
+/// // then 'end' below to this alternation.
+/// let alt = builder.add_union(vec![])?;
+/// // The final state before the match state, which serves to capture the
+/// // end location of the match.
+/// let end = builder.add_capture_end(StateID::ZERO, 0)?;
+/// // The match state for our pattern.
+/// let mat = builder.add_match()?;
+/// // Now we fill in the transitions between states.
+/// builder.patch(start, range)?;
+/// builder.patch(range, alt)?;
+/// // If we added 'end' before 'range', then we'd implement non-greedy
+/// // matching, i.e., '+?'.
+/// builder.patch(alt, range)?;
+/// builder.patch(alt, end)?;
+/// builder.patch(end, mat)?;
+/// // We must explicitly finish pattern and provide the starting state ID for
+/// // this particular pattern.
+/// builder.finish_pattern(start)?;
+/// // Finally, when we build the NFA, we provide the anchored and unanchored
+/// // starting state IDs. Since we didn't bother with an unanchored prefix
+/// // here, we only support anchored searching. Thus, both starting states are
+/// // the same.
+/// let nfa = builder.build(start, start)?;
+///
+/// // Now build a Pike VM from our NFA, and use it for searching. This shows
+/// // how we can use a regex engine without ever worrying about syntax!
+/// let re = PikeVM::new_from_nfa(nfa)?;
+/// let mut cache = re.create_cache();
+/// let mut caps = re.create_captures();
+/// let expected = Some(Match::must(0, 0..3));
+/// re.captures(&mut cache, "foo0", &mut caps);
+/// assert_eq!(expected, caps.get_match());
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone, Debug, Default)]
+pub struct Builder {
+ /// The ID of the pattern that we're currently building.
+ ///
+ /// Callers are required to set (and unset) this by calling
+ /// {start,finish}_pattern. Otherwise, most methods will panic.
+ pattern_id: Option<PatternID>,
+ /// A sequence of intermediate NFA states. Once a state is added to this
+ /// sequence, it is assigned a state ID equivalent to its index. Even
+ /// after a state has been added, it may still be mutated, e.g., to set
+ /// its transition to a state that didn't exist at the time it was added.
+ states: Vec<State>,
+ /// The starting states for each individual pattern. Starting at any
+ /// of these states will result in only an anchored search for the
+ /// corresponding pattern. The vec is indexed by pattern ID. When the NFA
+ /// contains a single regex, then `start_pattern[0]` and `start_anchored`
+ /// are always equivalent.
+ start_pattern: Vec<StateID>,
+ /// A map from pattern ID to capture group index to name. (If no name
+ /// exists, then a None entry is present. Thus, all capturing groups are
+ /// present in this mapping.)
+ ///
+ /// The outer vec is indexed by pattern ID, while the inner vec is indexed
+ /// by capture index offset for the corresponding pattern.
+ ///
+ /// The first capture group for each pattern is always unnamed and is thus
+ /// always None.
+ captures: Vec<Vec<Option<Arc<str>>>>,
+ /// The combined memory used by each of the 'State's in 'states'. This
+ /// only includes heap usage by each state, and not the size of the state
+ /// itself. In other words, this tracks heap memory used that isn't
+ /// captured via `size_of::<State>() * states.len()`.
+ memory_states: usize,
+ /// Whether this NFA only matches UTF-8 and whether regex engines using
+ /// this NFA for searching should avoid reporting empty matches that split
+ /// a codepoint.
+ utf8: bool,
+ /// Whether this NFA should be matched in reverse or not.
+ reverse: bool,
+ /// The matcher to use for look-around assertions.
+ look_matcher: LookMatcher,
+ /// A size limit to respect when building an NFA. If the total heap memory
+ /// of the intermediate NFA states exceeds (or would exceed) this amount,
+ /// then an error is returned.
+ size_limit: Option<usize>,
+}
+
+impl Builder {
+ /// Create a new builder for hand-assembling NFAs.
+ pub fn new() -> Builder {
+ Builder::default()
+ }
+
+ /// Clear this builder.
+ ///
+ /// Clearing removes all state associated with building an NFA, but does
+ /// not reset configuration (such as size limits and whether the NFA
+ /// should only match UTF-8). After clearing, the builder can be reused to
+ /// assemble an entirely new NFA.
+ pub fn clear(&mut self) {
+ self.pattern_id = None;
+ self.states.clear();
+ self.start_pattern.clear();
+ self.captures.clear();
+ self.memory_states = 0;
+ }
+
+ /// Assemble a [`NFA`] from the states added so far.
+ ///
+ /// After building an NFA, more states may be added and `build` may be
+ /// called again. To reuse a builder to produce an entirely new NFA from
+ /// scratch, call the [`clear`](Builder::clear) method first.
+ ///
+ /// `start_anchored` refers to the ID of the starting state that anchored
+ /// searches should use. That is, searches whose matches are limited to the
+ /// starting position of the search.
+ ///
+ /// `start_unanchored` refers to the ID of the starting state that
+ /// unanchored searches should use. This permits searches to report matches
+ /// that start after the beginning of the search. In cases where unanchored
+ /// searches are not supported, the unanchored starting state ID must be
+ /// the same as the anchored starting state ID.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if there was a problem producing the final NFA.
+ /// In particular, this might include an error if the capturing groups
+ /// added to this builder violate any of the invariants documented on
+ /// [`GroupInfo`](crate::util::captures::GroupInfo).
+ ///
+ /// # Panics
+ ///
+ /// If `start_pattern` was called, then `finish_pattern` must be called
+ /// before `build`, otherwise this panics.
+ ///
+ /// This may panic for other invalid uses of a builder. For example, if
+ /// a "start capture" state was added without a corresponding "end capture"
+ /// state.
+ pub fn build(
+ &self,
+ start_anchored: StateID,
+ start_unanchored: StateID,
+ ) -> Result<NFA, BuildError> {
+ assert!(self.pattern_id.is_none(), "must call 'finish_pattern' first");
+ debug!(
+ "intermediate NFA compilation via builder is complete, \
+ intermediate NFA size: {} states, {} bytes on heap",
+ self.states.len(),
+ self.memory_usage(),
+ );
+
+ let mut nfa = nfa::Inner::default();
+ nfa.set_utf8(self.utf8);
+ nfa.set_reverse(self.reverse);
+ nfa.set_look_matcher(self.look_matcher.clone());
+ // A set of compiler internal state IDs that correspond to states
+ // that are exclusively epsilon transitions, i.e., goto instructions,
+ // combined with the state that they point to. This is used to
+ // record said states while transforming the compiler's internal NFA
+ // representation to the external form.
+ let mut empties = vec![];
+ // A map used to re-map state IDs when translating this builder's
+ // internal NFA state representation to the final NFA representation.
+ let mut remap = vec![];
+ remap.resize(self.states.len(), StateID::ZERO);
+
+ nfa.set_starts(start_anchored, start_unanchored, &self.start_pattern);
+ nfa.set_captures(&self.captures).map_err(BuildError::captures)?;
+ // The idea here is to convert our intermediate states to their final
+ // form. The only real complexity here is the process of converting
+ // transitions, which are expressed in terms of state IDs. The new
+ // set of states will be smaller because of partial epsilon removal,
+ // so the state IDs will not be the same.
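+ // In particular, 'remap[old_sid]' will hold the new ID for each
+ // retained state, while removed empty states are later resolved to
+ // the new ID of their ultimate non-empty target.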
+ for (sid, state) in self.states.iter().with_state_ids() { + match *state { + State::Empty { next } => { + // Since we're removing empty states, we need to handle + // them later since we don't yet know which new state this + // empty state will be mapped to. + empties.push((sid, next)); + } + State::ByteRange { trans } => { + remap[sid] = nfa.add(nfa::State::ByteRange { trans }); + } + State::Sparse { ref transitions } => { + remap[sid] = match transitions.len() { + 0 => nfa.add(nfa::State::Fail), + 1 => nfa.add(nfa::State::ByteRange { + trans: transitions[0], + }), + _ => { + let transitions = + transitions.to_vec().into_boxed_slice(); + let sparse = SparseTransitions { transitions }; + nfa.add(nfa::State::Sparse(sparse)) + } + } + } + State::Look { look, next } => { + remap[sid] = nfa.add(nfa::State::Look { look, next }); + } + State::CaptureStart { pattern_id, group_index, next } => { + // We can't remove this empty state because of the side + // effect of capturing an offset for this capture slot. + let slot = nfa + .group_info() + .slot(pattern_id, group_index.as_usize()) + .expect("invalid capture index"); + let slot = + SmallIndex::new(slot).expect("a small enough slot"); + remap[sid] = nfa.add(nfa::State::Capture { + next, + pattern_id, + group_index, + slot, + }); + } + State::CaptureEnd { pattern_id, group_index, next } => { + // We can't remove this empty state because of the side + // effect of capturing an offset for this capture slot. + // Also, this always succeeds because we check that all + // slot indices are valid for all capture indices when they + // are initially added. + let slot = nfa + .group_info() + .slot(pattern_id, group_index.as_usize()) + .expect("invalid capture index") + .checked_add(1) + .unwrap(); + let slot = + SmallIndex::new(slot).expect("a small enough slot"); + remap[sid] = nfa.add(nfa::State::Capture { + next, + pattern_id, + group_index, + slot, + }); + } + State::Union { ref alternates } => { + if alternates.is_empty() { + remap[sid] = nfa.add(nfa::State::Fail); + } else if alternates.len() == 1 { + empties.push((sid, alternates[0])); + remap[sid] = alternates[0]; + } else if alternates.len() == 2 { + remap[sid] = nfa.add(nfa::State::BinaryUnion { + alt1: alternates[0], + alt2: alternates[1], + }); + } else { + let alternates = + alternates.to_vec().into_boxed_slice(); + remap[sid] = nfa.add(nfa::State::Union { alternates }); + } + } + State::UnionReverse { ref alternates } => { + if alternates.is_empty() { + remap[sid] = nfa.add(nfa::State::Fail); + } else if alternates.len() == 1 { + empties.push((sid, alternates[0])); + remap[sid] = alternates[0]; + } else if alternates.len() == 2 { + remap[sid] = nfa.add(nfa::State::BinaryUnion { + alt1: alternates[1], + alt2: alternates[0], + }); + } else { + let mut alternates = + alternates.to_vec().into_boxed_slice(); + alternates.reverse(); + remap[sid] = nfa.add(nfa::State::Union { alternates }); + } + } + State::Fail => { + remap[sid] = nfa.add(nfa::State::Fail); + } + State::Match { pattern_id } => { + remap[sid] = nfa.add(nfa::State::Match { pattern_id }); + } + } + } + // Some of the new states still point to empty state IDs, so we need to + // follow each of them and remap the empty state IDs to their non-empty + // state IDs. + // + // We also keep track of which states we've already mapped. This helps + // avoid quadratic behavior in a long chain of empty states. For + // example, in 'a{0}{50000}'. 
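+ // A pattern like 'a{0}{50000}' compiles to a long chain of empty
+ // states; re-walking the chain from scratch for every member would
+ // cost time quadratic in the chain's length, so we mark each state
+ // as we resolve it and never walk the same suffix twice.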
+ let mut remapped = vec![false; self.states.len()];
+ for &(empty_id, empty_next) in empties.iter() {
+ if remapped[empty_id] {
+ continue;
+ }
+ // Empty states can point to other empty states, forming a chain.
+ // So we must follow the chain to its end, which must be a
+ // non-empty state, and therefore, a state that is correctly
+ // remapped. We are guaranteed to terminate because our compiler
+ // never builds a loop among only empty states.
+ let mut new_next = empty_next;
+ while let Some(next) = self.states[new_next].goto() {
+ new_next = next;
+ }
+ remap[empty_id] = remap[new_next];
+ remapped[empty_id] = true;
+
+ // Now that we've remapped the main 'empty_id' above, we re-follow
+ // the chain from above and remap every empty state we found along
+ // the way to our ultimate non-empty target. We are careful to set
+ // 'remapped' to true for each such state. We thus will not need
+ // to re-compute this chain for any subsequent empty states in
+ // 'empties' that are part of this chain.
+ let mut next2 = empty_next;
+ while let Some(next) = self.states[next2].goto() {
+ remap[next2] = remap[new_next];
+ remapped[next2] = true;
+ next2 = next;
+ }
+ }
+ // Finally, remap all of the state IDs.
+ nfa.remap(&remap);
+ let final_nfa = nfa.into_nfa();
+ debug!(
+ "NFA compilation via builder complete, \
+ final NFA size: {} states, {} bytes on heap, \
+ has empty? {:?}, utf8? {:?}",
+ final_nfa.states().len(),
+ final_nfa.memory_usage(),
+ final_nfa.has_empty(),
+ final_nfa.is_utf8(),
+ );
+ Ok(final_nfa)
+ }
+
+ /// Start the assembly of a pattern in this NFA.
+ ///
+ /// Upon success, this returns the identifier for the new pattern.
+ /// Identifiers start at `0` and are incremented by 1 for each new pattern.
+ ///
+ /// It is necessary to call this routine before adding capturing states.
+ /// Any other NFA state may, however, be added before starting a pattern.
+ ///
+ /// # Errors
+ ///
+ /// If the pattern identifier space is exhausted, then this returns an
+ /// error.
+ ///
+ /// # Panics
+ ///
+ /// If this is called while assembling another pattern (i.e., before
+ /// `finish_pattern` is called), then this panics.
+ pub fn start_pattern(&mut self) -> Result<PatternID, BuildError> {
+ assert!(self.pattern_id.is_none(), "must call 'finish_pattern' first");
+
+ let proposed = self.start_pattern.len();
+ let pid = PatternID::new(proposed)
+ .map_err(|_| BuildError::too_many_patterns(proposed))?;
+ self.pattern_id = Some(pid);
+ // This gets filled in when 'finish_pattern' is called.
+ self.start_pattern.push(StateID::ZERO);
+ Ok(pid)
+ }
+
+ /// Finish the assembly of a pattern in this NFA.
+ ///
+ /// Upon success, this returns the identifier for the new pattern.
+ /// Identifiers start at `0` and are incremented by 1 for each new
+ /// pattern. This is the same identifier returned by the corresponding
+ /// `start_pattern` call.
+ ///
+ /// Note that `start_pattern` and `finish_pattern` pairs cannot be
+ /// interleaved or nested. A correct `finish_pattern` call _always_
+ /// corresponds to the most recently called `start_pattern` routine.
+ ///
+ /// # Errors
+ ///
+ /// This currently never returns an error, but this is subject to change.
+ ///
+ /// # Panics
+ ///
+ /// If this is called without a corresponding `start_pattern` call, then
+ /// this panics.
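+ ///
+ /// # Example
+ ///
+ /// A minimal sketch of the expected call pairing, using a pattern that
+ /// consists of nothing but a match state:
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::Builder;
+ ///
+ /// let mut builder = Builder::new();
+ /// let pid = builder.start_pattern()?;
+ /// let mat = builder.add_match()?;
+ /// // 'finish_pattern' returns the same ID that 'start_pattern' did.
+ /// assert_eq!(pid, builder.finish_pattern(mat)?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```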
+ pub fn finish_pattern(
+ &mut self,
+ start_id: StateID,
+ ) -> Result<PatternID, BuildError> {
+ let pid = self.current_pattern_id();
+ self.start_pattern[pid] = start_id;
+ self.pattern_id = None;
+ Ok(pid)
+ }
+
+ /// Returns the pattern identifier of the current pattern.
+ ///
+ /// # Panics
+ ///
+ /// If this doesn't occur after a `start_pattern` call and before the
+ /// corresponding `finish_pattern` call, then this panics.
+ pub fn current_pattern_id(&self) -> PatternID {
+ self.pattern_id.expect("must call 'start_pattern' first")
+ }
+
+ /// Returns the number of patterns added to this builder so far.
+ ///
+ /// This only includes patterns that have had `finish_pattern` called
+ /// for them.
+ pub fn pattern_len(&self) -> usize {
+ self.start_pattern.len()
+ }
+
+ /// Add an "empty" NFA state.
+ ///
+ /// An "empty" NFA state is a state with a single unconditional epsilon
+ /// transition to another NFA state. Such empty states are removed before
+ /// building the final [`NFA`] (which has no such "empty" states), but they
+ /// can be quite useful in the construction process of an NFA.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the state identifier space is exhausted, or if
+ /// the configured heap size limit has been exceeded.
+ pub fn add_empty(&mut self) -> Result<StateID, BuildError> {
+ self.add(State::Empty { next: StateID::ZERO })
+ }
+
+ /// Add a "union" NFA state.
+ ///
+ /// A "union" NFA state contains zero or more unconditional epsilon
+ /// transitions to other NFA states. The order of these transitions
+ /// reflects a priority order where earlier transitions are preferred over
+ /// later transitions.
+ ///
+ /// Callers may provide an empty set of alternates to this method call, and
+ /// then later add transitions via `patch`. At final build time, a "union"
+ /// state with no alternates is converted to a "fail" state, and a "union"
+ /// state with exactly one alternate is treated as if it were an "empty"
+ /// state.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the state identifier space is exhausted, or if
+ /// the configured heap size limit has been exceeded.
+ pub fn add_union(
+ &mut self,
+ alternates: Vec<StateID>,
+ ) -> Result<StateID, BuildError> {
+ self.add(State::Union { alternates })
+ }
+
+ /// Add a "reverse union" NFA state.
+ ///
+ /// A "reverse union" NFA state contains zero or more unconditional epsilon
+ /// transitions to other NFA states. The order of these transitions
+ /// reflects a priority order where later transitions are preferred
+ /// over earlier transitions. This is an inverted priority order when
+ /// compared to `add_union`. This is useful, for example, for implementing
+ /// non-greedy repetition operators.
+ ///
+ /// Callers may provide an empty set of alternates to this method call, and
+ /// then later add transitions via `patch`. At final build time, a "reverse
+ /// union" state with no alternates is converted to a "fail" state, and a
+ /// "reverse union" state with exactly one alternate is treated as if it
+ /// were an "empty" state.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the state identifier space is exhausted, or if
+ /// the configured heap size limit has been exceeded.
+ pub fn add_union_reverse(
+ &mut self,
+ alternates: Vec<StateID>,
+ ) -> Result<StateID, BuildError> {
+ self.add(State::UnionReverse { alternates })
+ }
+
+ /// Add a "range" NFA state.
+ ///
+ /// A "range" NFA state is a state with one outgoing transition to another
+ /// state, where that transition may only be followed if the current input
+ /// byte falls within the range of bytes given.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the state identifier space is exhausted, or if
+ /// the configured heap size limit has been exceeded.
+ pub fn add_range(
+ &mut self,
+ trans: Transition,
+ ) -> Result<StateID, BuildError> {
+ self.add(State::ByteRange { trans })
+ }
+
+ /// Add a "sparse" NFA state.
+ ///
+ /// A "sparse" NFA state contains zero or more outgoing transitions, where
+ /// the transition to be followed (if any) is chosen based on whether the
+ /// current input byte falls in the range of one such transition. The
+ /// transitions given *must* be non-overlapping and in ascending order. (A
+ /// "sparse" state with no transitions is equivalent to a "fail" state.)
+ ///
+ /// A "sparse" state is like adding a "union" state and pointing it at a
+ /// bunch of "range" states, except that the different alternates have
+ /// equal priority.
+ ///
+ /// Note that a "sparse" state is the only state that cannot be patched.
+ /// This is because a "sparse" state has many transitions, each of which
+ /// may point to a different NFA state. Moreover, adding more such
+ /// transitions requires more than just an NFA state ID to point to. It
+ /// also requires a byte range. The `patch` routine does not support the
+ /// additional information required. Therefore, callers must ensure that
+ /// all outgoing transitions for this state are included when `add_sparse`
+ /// is called. There is no way to add more later.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the state identifier space is exhausted, or if
+ /// the configured heap size limit has been exceeded.
+ ///
+ /// # Panics
+ ///
+ /// This routine _may_ panic if the transitions given overlap or are not
+ /// in ascending order.
+ pub fn add_sparse(
+ &mut self,
+ transitions: Vec<Transition>,
+ ) -> Result<StateID, BuildError> {
+ self.add(State::Sparse { transitions })
+ }
+
+ /// Add a "look" NFA state.
+ ///
+ /// A "look" NFA state corresponds to a state with exactly one
+ /// *conditional* epsilon transition to another NFA state. Namely, it
+ /// represents one of a small set of simplistic look-around operators.
+ ///
+ /// Callers may provide a "dummy" state ID (typically [`StateID::ZERO`]),
+ /// and then change it later with [`patch`](Builder::patch).
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the state identifier space is exhausted, or if
+ /// the configured heap size limit has been exceeded.
+ pub fn add_look(
+ &mut self,
+ next: StateID,
+ look: Look,
+ ) -> Result<StateID, BuildError> {
+ self.add(State::Look { look, next })
+ }
+
+ /// Add a "start capture" NFA state.
+ ///
+ /// A "start capture" NFA state corresponds to a state with exactly one
+ /// outgoing unconditional epsilon transition to another state. Unlike
+ /// "empty" states, a "start capture" state also carries with it an
+ /// instruction for saving the current position of input to a particular
+ /// location in memory. NFA simulations, like the Pike VM, may use this
+ /// information to report the match locations of capturing groups in a
+ /// regex pattern.
+ ///
+ /// If the corresponding capturing group has a name, then callers should
+ /// include it here.
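+ /// Otherwise, passing `None` leaves the group unnamed, just like the
+ /// implicit 0th capture group of every pattern.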
+ ///
+ /// Callers may provide a "dummy" state ID (typically [`StateID::ZERO`]),
+ /// and then change it later with [`patch`](Builder::patch).
+ ///
+ /// Note that unlike `start_pattern`/`finish_pattern`, capturing start and
+ /// end states may be interleaved. Indeed, it is typical for many "start
+ /// capture" NFA states to appear before the first "end capture" state.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the state identifier space is exhausted, or if
+ /// the configured heap size limit has been exceeded or if the given
+ /// capture index overflows `usize`.
+ ///
+ /// While the above are the only conditions in which this routine can
+ /// currently return an error, it is possible to call this method with
+ /// inputs that result in the final `build()` step failing to produce an
+ /// NFA. For example, if one adds two distinct capturing groups with the
+ /// same name, then that will result in `build()` failing with an error.
+ ///
+ /// See the [`GroupInfo`](crate::util::captures::GroupInfo) type for
+ /// more information on what qualifies as valid capturing groups.
+ ///
+ /// # Example
+ ///
+ /// This example shows that an error occurs when one tries to add multiple
+ /// capturing groups with the same name to the same pattern.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::Builder,
+ /// util::primitives::StateID,
+ /// };
+ ///
+ /// let name = Some(std::sync::Arc::from("foo"));
+ /// let mut builder = Builder::new();
+ /// builder.start_pattern()?;
+ /// // 0th capture group should always be unnamed.
+ /// let start = builder.add_capture_start(StateID::ZERO, 0, None)?;
+ /// // OK
+ /// builder.add_capture_start(StateID::ZERO, 1, name.clone())?;
+ /// // This is not OK, but 'add_capture_start' still succeeds. We don't
+ /// // get an error until we call 'build' below. Without this call, the
+ /// // call to 'build' below would succeed.
+ /// builder.add_capture_start(StateID::ZERO, 2, name.clone())?;
+ /// // Finish our pattern so we can try to build the NFA.
+ /// builder.finish_pattern(start)?;
+ /// let result = builder.build(start, start);
+ /// assert!(result.is_err());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// However, adding multiple capturing groups with the same name to
+ /// distinct patterns is okay:
+ ///
+ /// ```
+ /// use std::sync::Arc;
+ ///
+ /// use regex_automata::{
+ /// nfa::thompson::{pikevm::PikeVM, Builder, Transition},
+ /// util::{
+ /// captures::Captures,
+ /// primitives::{PatternID, StateID},
+ /// },
+ /// Span,
+ /// };
+ ///
+ /// // Hand-compile the patterns '(?P<foo>[a-z])' and '(?P<foo>[A-Z])'.
+ /// let mut builder = Builder::new();
+ /// // We compile them to support an unanchored search, which requires
+ /// // adding an implicit '(?s-u:.)*?' prefix before adding either pattern.
+ /// let unanchored_prefix = builder.add_union_reverse(vec![])?;
+ /// let any = builder.add_range(Transition {
+ /// start: b'\x00', end: b'\xFF', next: StateID::ZERO,
+ /// })?;
+ /// builder.patch(unanchored_prefix, any)?;
+ /// builder.patch(any, unanchored_prefix)?;
+ ///
+ /// // Compile an alternation that permits matching multiple patterns.
+ /// let alt = builder.add_union(vec![])?;
+ /// builder.patch(unanchored_prefix, alt)?;
+ ///
+ /// // Compile '(?P<foo>[a-z])'.
+ /// builder.start_pattern()?;
+ /// let start0 = builder.add_capture_start(StateID::ZERO, 0, None)?;
+ /// // N.B. 0th capture group must always be unnamed.
+ /// let foo_start0 = builder.add_capture_start(
+ /// StateID::ZERO, 1, Some(Arc::from("foo")),
+ /// )?;
+ /// let lowercase = builder.add_range(Transition {
+ /// start: b'a', end: b'z', next: StateID::ZERO,
+ /// })?;
+ /// let foo_end0 = builder.add_capture_end(StateID::ZERO, 1)?;
+ /// let end0 = builder.add_capture_end(StateID::ZERO, 0)?;
+ /// let match0 = builder.add_match()?;
+ /// builder.patch(start0, foo_start0)?;
+ /// builder.patch(foo_start0, lowercase)?;
+ /// builder.patch(lowercase, foo_end0)?;
+ /// builder.patch(foo_end0, end0)?;
+ /// builder.patch(end0, match0)?;
+ /// builder.finish_pattern(start0)?;
+ ///
+ /// // Compile '(?P<foo>[A-Z])'.
+ /// builder.start_pattern()?;
+ /// let start1 = builder.add_capture_start(StateID::ZERO, 0, None)?;
+ /// // N.B. 0th capture group must always be unnamed.
+ /// let foo_start1 = builder.add_capture_start(
+ /// StateID::ZERO, 1, Some(Arc::from("foo")),
+ /// )?;
+ /// let uppercase = builder.add_range(Transition {
+ /// start: b'A', end: b'Z', next: StateID::ZERO,
+ /// })?;
+ /// let foo_end1 = builder.add_capture_end(StateID::ZERO, 1)?;
+ /// let end1 = builder.add_capture_end(StateID::ZERO, 0)?;
+ /// let match1 = builder.add_match()?;
+ /// builder.patch(start1, foo_start1)?;
+ /// builder.patch(foo_start1, uppercase)?;
+ /// builder.patch(uppercase, foo_end1)?;
+ /// builder.patch(foo_end1, end1)?;
+ /// builder.patch(end1, match1)?;
+ /// builder.finish_pattern(start1)?;
+ ///
+ /// // Now add the patterns to our alternation that we started above.
+ /// builder.patch(alt, start0)?;
+ /// builder.patch(alt, start1)?;
+ ///
+ /// // Finally build the NFA. The first argument is the anchored starting
+ /// // state (the pattern alternation) whereas the second is the
+ /// // unanchored starting state (the unanchored prefix).
+ /// let nfa = builder.build(alt, unanchored_prefix)?;
+ ///
+ /// // Now build a Pike VM from our NFA and access the 'foo' capture
+ /// // group regardless of which pattern matched, since it is defined
+ /// // for both patterns.
+ /// let vm = PikeVM::new_from_nfa(nfa)?;
+ /// let mut cache = vm.create_cache();
+ /// let caps: Vec<Captures> =
+ /// vm.captures_iter(&mut cache, "0123aAaAA").collect();
+ /// assert_eq!(5, caps.len());
+ ///
+ /// assert_eq!(Some(PatternID::must(0)), caps[0].pattern());
+ /// assert_eq!(Some(Span::from(4..5)), caps[0].get_group_by_name("foo"));
+ ///
+ /// assert_eq!(Some(PatternID::must(1)), caps[1].pattern());
+ /// assert_eq!(Some(Span::from(5..6)), caps[1].get_group_by_name("foo"));
+ ///
+ /// assert_eq!(Some(PatternID::must(0)), caps[2].pattern());
+ /// assert_eq!(Some(Span::from(6..7)), caps[2].get_group_by_name("foo"));
+ ///
+ /// assert_eq!(Some(PatternID::must(1)), caps[3].pattern());
+ /// assert_eq!(Some(Span::from(7..8)), caps[3].get_group_by_name("foo"));
+ ///
+ /// assert_eq!(Some(PatternID::must(1)), caps[4].pattern());
+ /// assert_eq!(Some(Span::from(8..9)), caps[4].get_group_by_name("foo"));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn add_capture_start(
+ &mut self,
+ next: StateID,
+ group_index: u32,
+ name: Option<Arc<str>>,
+ ) -> Result<StateID, BuildError> {
+ let pid = self.current_pattern_id();
+ let group_index = match SmallIndex::try_from(group_index) {
+ Err(_) => {
+ return Err(BuildError::invalid_capture_index(group_index))
+ }
+ Ok(group_index) => group_index,
+ };
+ // Make sure we have space to insert our (pid,index)|-->name mapping.
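+ // Note that earlier patterns may never have added any capture
+ // states at all, so this may need to push empty entries for more
+ // than one pattern at once.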
+ if pid.as_usize() >= self.captures.len() {
+ for _ in 0..=(pid.as_usize() - self.captures.len()) {
+ self.captures.push(vec![]);
+ }
+ }
+ // In the case where 'group_index < self.captures[pid].len()', it means
+ // that we are adding a duplicate capture group. This is somewhat
+ // weird, but permissible because the capture group itself can be
+ // repeated in the syntax. For example, '([a-z]){4}' will add the same
+ // capture group 4 times. In practice, only the last will be set at
+ // search time when a match occurs. For duplicates, we don't need to
+ // push anything other than a CaptureStart NFA state.
+ if group_index.as_usize() >= self.captures[pid].len() {
+ // For discontiguous indices, push placeholders for earlier capture
+ // groups that weren't explicitly added.
+ for _ in 0..(group_index.as_usize() - self.captures[pid].len()) {
+ self.captures[pid].push(None);
+ }
+ self.captures[pid].push(name);
+ }
+ self.add(State::CaptureStart { pattern_id: pid, group_index, next })
+ }
+
+ /// Add an "end capture" NFA state.
+ ///
+ /// An "end capture" NFA state corresponds to a state with exactly one
+ /// outgoing unconditional epsilon transition to another state. Unlike
+ /// "empty" states, an "end capture" state also carries with it an
+ /// instruction for saving the current position of input to a particular
+ /// location in memory. NFA simulations, like the Pike VM, may use this
+ /// information to report the match locations of capturing groups in a
+ /// regex pattern.
+ ///
+ /// Callers may provide a "dummy" state ID (typically [`StateID::ZERO`]),
+ /// and then change it later with [`patch`](Builder::patch).
+ ///
+ /// Note that unlike `start_pattern`/`finish_pattern`, capturing start and
+ /// end states may be interleaved. Indeed, it is typical for many "start
+ /// capture" NFA states to appear before the first "end capture" state.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the state identifier space is exhausted, or if
+ /// the configured heap size limit has been exceeded or if the given
+ /// capture index overflows `usize`.
+ ///
+ /// While the above are the only conditions in which this routine can
+ /// currently return an error, it is possible to call this method with
+ /// inputs that result in the final `build()` step failing to produce an
+ /// NFA. For example, if one adds two distinct capturing groups with the
+ /// same name, then that will result in `build()` failing with an error.
+ ///
+ /// See the [`GroupInfo`](crate::util::captures::GroupInfo) type for
+ /// more information on what qualifies as valid capturing groups.
+ pub fn add_capture_end(
+ &mut self,
+ next: StateID,
+ group_index: u32,
+ ) -> Result<StateID, BuildError> {
+ let pid = self.current_pattern_id();
+ let group_index = match SmallIndex::try_from(group_index) {
+ Err(_) => {
+ return Err(BuildError::invalid_capture_index(group_index))
+ }
+ Ok(group_index) => group_index,
+ };
+ self.add(State::CaptureEnd { pattern_id: pid, group_index, next })
+ }
+
+ /// Adds a "fail" NFA state.
+ ///
+ /// A "fail" state is simply a state that has no outgoing transitions. It
+ /// acts as a way to cause a search to stop without reporting a match.
+ /// For example, one way to represent an NFA with zero patterns is with a
+ /// single "fail" state.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the state identifier space is exhausted, or if
+ /// the configured heap size limit has been exceeded.
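+ ///
+ /// # Example
+ ///
+ /// A minimal sketch of the zero-pattern case mentioned above, where both
+ /// starting states are a single "fail" state:
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::Builder;
+ ///
+ /// let mut builder = Builder::new();
+ /// let fail = builder.add_fail()?;
+ /// // No patterns were started, so the resulting NFA can never match.
+ /// let nfa = builder.build(fail, fail)?;
+ /// assert_eq!(0, nfa.pattern_len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```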
+ pub fn add_fail(&mut self) -> Result<StateID, BuildError> {
+ self.add(State::Fail)
+ }
+
+ /// Adds a "match" NFA state.
+ ///
+ /// A "match" state has no outgoing transitions (just like a "fail"
+ /// state), but it has special significance in that if a search enters
+ /// this state, then a match has been found. The match state that is added
+ /// automatically has the current pattern ID associated with it. This is
+ /// used to report the matching pattern ID at search time.
+ ///
+ /// # Errors
+ ///
+ /// This returns an error if the state identifier space is exhausted, or if
+ /// the configured heap size limit has been exceeded.
+ ///
+ /// # Panics
+ ///
+ /// This must be called after a `start_pattern` call but before the
+ /// corresponding `finish_pattern` call. Otherwise, it panics.
+ pub fn add_match(&mut self) -> Result<StateID, BuildError> {
+ let pattern_id = self.current_pattern_id();
+ let sid = self.add(State::Match { pattern_id })?;
+ Ok(sid)
+ }
+
+ /// The common implementation of "add a state." It handles the common
+ /// error cases of state ID exhaustion (by owning state ID allocation) and
+ /// whether the size limit has been exceeded.
+ fn add(&mut self, state: State) -> Result<StateID, BuildError> {
+ let id = StateID::new(self.states.len())
+ .map_err(|_| BuildError::too_many_states(self.states.len()))?;
+ self.memory_states += state.memory_usage();
+ self.states.push(state);
+ self.check_size_limit()?;
+ Ok(id)
+ }
+
+ /// Add a transition from one state to another.
+ ///
+ /// This routine is called "patch" since it is very common to add the
+ /// states you want, typically with "dummy" state ID transitions, and then
+ /// "patch" in the real state IDs later. This is because you don't always
+ /// know all of the necessary state IDs to add because they might not
+ /// exist yet.
+ ///
+ /// # Errors
+ ///
+ /// This may error if patching leads to an increase in heap usage beyond
+ /// the configured size limit. Heap usage only grows when patching adds a
+ /// new transition (as in the case of a "union" state).
+ ///
+ /// # Panics
+ ///
+ /// This panics if `from` corresponds to a "sparse" state. When "sparse"
+ /// states are added, there is no way to patch them after-the-fact. (If you
+ /// have a use case where this would be helpful, please file an issue. It
+ /// will likely require a new API.)
+ pub fn patch(
+ &mut self,
+ from: StateID,
+ to: StateID,
+ ) -> Result<(), BuildError> {
+ let old_memory_states = self.memory_states;
+ match self.states[from] {
+ State::Empty { ref mut next } => {
+ *next = to;
+ }
+ State::ByteRange { ref mut trans } => {
+ trans.next = to;
+ }
+ State::Sparse { .. } => {
+ panic!("cannot patch from a sparse NFA state")
+ }
+ State::Look { ref mut next, .. } => {
+ *next = to;
+ }
+ State::Union { ref mut alternates } => {
+ alternates.push(to);
+ self.memory_states += mem::size_of::<StateID>();
+ }
+ State::UnionReverse { ref mut alternates } => {
+ alternates.push(to);
+ self.memory_states += mem::size_of::<StateID>();
+ }
+ State::CaptureStart { ref mut next, .. } => {
+ *next = to;
+ }
+ State::CaptureEnd { ref mut next, .. } => {
+ *next = to;
+ }
+ State::Fail => {}
+ State::Match { .. } => {}
+ }
+ if old_memory_states != self.memory_states {
+ self.check_size_limit()?;
+ }
+ Ok(())
+ }
+
+ /// Set whether the NFA produced by this builder should only match UTF-8.
+ ///
+ /// This should be set when both of the following are true:
+ ///
+ /// 1.
The caller guarantees that the NFA created by this builder will only
+ /// report non-empty matches with spans that are valid UTF-8.
+ /// 2. The caller desires regex engines using this NFA to avoid reporting
+ /// empty matches with a span that splits a valid UTF-8 encoded codepoint.
+ ///
+ /// Property (1) is not checked. Instead, this requires the caller to
+ /// promise that it is true. Property (2) corresponds to the behavior of
+ /// regex engines using the NFA created by this builder. Namely, there
+ /// is no way in the NFA's graph itself to say that empty matches found
+ /// by, for example, the regex `a*` will fall on valid UTF-8 boundaries.
+ /// Instead, this option is used to communicate the UTF-8 semantic to regex
+ /// engines that will typically implement it as a post-processing step by
+ /// filtering out empty matches that don't fall on UTF-8 boundaries.
+ ///
+ /// If you're building an NFA from an HIR (and not using a
+ /// [`thompson::Compiler`](crate::nfa::thompson::Compiler)), then you can
+ /// use the [`syntax::Config::utf8`](crate::util::syntax::Config::utf8)
+ /// option to guarantee that if the HIR detects a non-empty match, then it
+ /// is guaranteed to be valid UTF-8.
+ ///
+ /// Note that property (2) does *not* specify the behavior of executing
+ /// a search on a haystack that is not valid UTF-8. Therefore, if you're
+ /// *not* running this NFA on strings that are guaranteed to be valid
+ /// UTF-8, you almost certainly do not want to enable this option.
+ /// Similarly, if you are running the NFA on strings that *are* guaranteed
+ /// to be valid UTF-8, then you almost certainly want to enable this option
+ /// unless you can guarantee that your NFA will never produce a zero-width
+ /// match.
+ ///
+ /// It is disabled by default.
+ pub fn set_utf8(&mut self, yes: bool) {
+ self.utf8 = yes;
+ }
+
+ /// Returns whether UTF-8 mode is enabled for this builder.
+ ///
+ /// See [`Builder::set_utf8`] for more details about what "UTF-8 mode" is.
+ pub fn get_utf8(&self) -> bool {
+ self.utf8
+ }
+
+ /// Sets whether the NFA produced by this builder should be matched in
+ /// reverse or not. Generally speaking, when enabled, the NFA produced
+ /// should be matched by moving backwards through a haystack, from a higher
+ /// memory address to a lower memory address.
+ ///
+ /// See also [`NFA::is_reverse`] for more details.
+ ///
+ /// This is disabled by default, which means NFAs are by default matched
+ /// in the forward direction.
+ pub fn set_reverse(&mut self, yes: bool) {
+ self.reverse = yes;
+ }
+
+ /// Returns whether reverse mode is enabled for this builder.
+ ///
+ /// See [`Builder::set_reverse`] for more details about what "reverse mode"
+ /// is.
+ pub fn get_reverse(&self) -> bool {
+ self.reverse
+ }
+
+ /// Sets the look-around matcher that should be used for the resulting NFA.
+ ///
+ /// A look-around matcher can be used to configure how look-around
+ /// assertions are matched. For example, a matcher might carry
+ /// configuration that changes the line terminator used for `(?m:^)` and
+ /// `(?m:$)` assertions.
+ pub fn set_look_matcher(&mut self, m: LookMatcher) {
+ self.look_matcher = m;
+ }
+
+ /// Returns the look-around matcher used for this builder.
+ ///
+ /// If a matcher was not explicitly set, then `LookMatcher::default()` is
+ /// returned.
+ pub fn get_look_matcher(&self) -> &LookMatcher {
+ &self.look_matcher
+ }
+
+ /// Set the size limit on this builder.
+ /// + /// Setting the size limit will also check whether the NFA built so far + /// fits within the given size limit. If it doesn't, then an error is + /// returned. + /// + /// By default, there is no configured size limit. + pub fn set_size_limit( + &mut self, + limit: Option<usize>, + ) -> Result<(), BuildError> { + self.size_limit = limit; + self.check_size_limit() + } + + /// Return the currently configured size limit. + /// + /// By default, this returns `None`, which corresponds to no configured + /// size limit. + pub fn get_size_limit(&self) -> Option<usize> { + self.size_limit + } + + /// Returns the heap memory usage, in bytes, used by the NFA states added + /// so far. + /// + /// Note that this is an approximation of how big the final NFA will be. + /// In practice, the final NFA will likely be a bit smaller because of + /// its simpler state representation. (For example, using things like + /// `Box<[StateID]>` instead of `Vec<StateID>`.) + pub fn memory_usage(&self) -> usize { + self.states.len() * mem::size_of::<State>() + self.memory_states + } + + fn check_size_limit(&self) -> Result<(), BuildError> { + if let Some(limit) = self.size_limit { + if self.memory_usage() > limit { + return Err(BuildError::exceeded_size_limit(limit)); + } + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // This asserts that a builder state doesn't have its size changed. It is + // *really* easy to accidentally increase the size, and thus potentially + // dramatically increase the memory usage of NFA builder. + // + // This assert doesn't mean we absolutely cannot increase the size of a + // builder state. We can. It's just here to make sure we do it knowingly + // and intentionally. + // + // A builder state is unfortunately a little bigger than an NFA state, + // since we really want to support adding things to a pre-existing state. + // i.e., We use Vec<thing> instead of Box<[thing]>. So we end up using an + // extra 8 bytes per state. Sad, but at least it gets freed once the NFA + // is built. + #[test] + fn state_has_small_size() { + #[cfg(target_pointer_width = "64")] + assert_eq!(32, core::mem::size_of::<State>()); + #[cfg(target_pointer_width = "32")] + assert_eq!(16, core::mem::size_of::<State>()); + } +} diff --git a/vendor/regex-automata/src/nfa/thompson/compiler.rs b/vendor/regex-automata/src/nfa/thompson/compiler.rs index 301194005..065e9ef27 100644 --- a/vendor/regex-automata/src/nfa/thompson/compiler.rs +++ b/vendor/regex-automata/src/nfa/thompson/compiler.rs @@ -1,73 +1,37 @@ -/* -This module provides an NFA compiler using Thompson's construction -algorithm. The compiler takes a regex-syntax::Hir as input and emits an NFA -graph as output. The NFA graph is structured in a way that permits it to be -executed by a virtual machine and also used to efficiently build a DFA. - -The compiler deals with a slightly expanded set of NFA states that notably -includes an empty node that has exactly one epsilon transition to the next -state. In other words, it's a "goto" instruction if one views Thompson's NFA -as a set of bytecode instructions. These goto instructions are removed in -a subsequent phase before returning the NFA to the caller. The purpose of -these empty nodes is that they make the construction algorithm substantially -simpler to implement. We remove them before returning to the caller because -they can represent substantial overhead when traversing the NFA graph -(either while searching using the NFA directly or while building a DFA). 
-
-In the future, it would be nice to provide a Glushkov compiler as well,
-as it would work well as a bit-parallel NFA for smaller regexes. But
-the Thompson construction is one I'm more familiar with and seems more
-straight-forward to deal with when it comes to large Unicode character
-classes.
-
-Internally, the compiler uses interior mutability to improve composition
-in the face of the borrow checker. In particular, we'd really like to be
-able to write things like this:
-
- self.c_concat(exprs.iter().map(|e| self.c(e)))
-
-Which elegantly uses iterators to build up a sequence of compiled regex
-sub-expressions and then hands it off to the concatenating compiler
-routine. Without interior mutability, the borrow checker won't let us
-borrow `self` mutably both inside and outside the closure at the same
-time.
-*/
-
-use core::{
- borrow::Borrow,
- cell::{Cell, RefCell},
- mem,
-};
+use core::{borrow::Borrow, cell::RefCell};

 use alloc::{sync::Arc, vec, vec::Vec};

 use regex_syntax::{
- hir::{self, Anchor, Class, Hir, HirKind, Literal, WordBoundary},
+ hir::{self, Hir},
 utf8::{Utf8Range, Utf8Sequences},
 ParserBuilder,
 };

 use crate::{
 nfa::thompson::{
- error::Error,
+ builder::Builder,
+ error::BuildError,
+ literal_trie::LiteralTrie,
 map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap},
+ nfa::{Transition, NFA},
 range_trie::RangeTrie,
- Look, SparseTransitions, State, Transition, NFA,
 },
 util::{
- alphabet::ByteClassSet,
- id::{IteratorIDExt, PatternID, StateID},
+ look::{Look, LookMatcher},
+ primitives::{PatternID, StateID},
 },
 };

-/// The configuration used for compiling a Thompson NFA from a regex pattern.
-#[derive(Clone, Copy, Debug, Default)]
+/// The configuration used for a Thompson NFA compiler.
+#[derive(Clone, Debug, Default)]
 pub struct Config {
- reverse: Option<bool>,
 utf8: Option<bool>,
+ reverse: Option<bool>,
 nfa_size_limit: Option<Option<usize>>,
 shrink: Option<bool>,
- captures: Option<bool>,
+ which_captures: Option<WhichCaptures>,
+ look_matcher: Option<LookMatcher>,
 #[cfg(test)]
 unanchored_prefix: Option<bool>,
 }
@@ -78,42 +42,162 @@ impl Config {
 Config::default()
 }

+ /// Whether to enable UTF-8 mode during search or not.
+ ///
+ /// A regex engine is said to be in UTF-8 mode when it guarantees that
+ /// all matches returned by it have spans consisting of only valid UTF-8.
+ /// That is, it is impossible for a match span to be returned that
+ /// contains any invalid UTF-8.
+ ///
+ /// UTF-8 mode generally consists of two things:
+ ///
+ /// 1. Whether the NFA's states are constructed such that all paths to a
+ /// match state that consume at least one byte always correspond to valid
+ /// UTF-8.
+ /// 2. Whether all paths to a match state that do _not_ consume any bytes
+ /// should always correspond to valid UTF-8 boundaries.
+ ///
+ /// (1) is a guarantee made by whoever constructs the NFA.
+ /// If you're parsing a regex from its concrete syntax, then
+ /// [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) can make
+ /// this guarantee for you. It does it by returning an error if the regex
+ /// pattern could ever report a non-empty match span that contains invalid
+ /// UTF-8. So long as `syntax::Config::utf8` mode is enabled and your regex
+ /// successfully parses, then you're guaranteed that the corresponding NFA
+ /// will only ever report non-empty match spans containing valid UTF-8.
+ ///
+ /// (2) is a trickier guarantee because it cannot be enforced by the NFA
+ /// state graph itself. Consider, for example, the regex `a*`.
It matches
+ /// the empty strings in `☃` at positions `0`, `1`, `2` and `3`, where
+ /// positions `1` and `2` occur within the UTF-8 encoding of a codepoint,
+ /// and thus correspond to invalid UTF-8 boundaries. Therefore, this
+ /// guarantee must be made at a higher level than the NFA state graph
+ /// itself. This crate deals with this case in each regex engine. Namely,
+ /// when a zero-width match that splits a codepoint is found and UTF-8
+ /// mode is enabled, then it is ignored and the engine moves on looking for
+ /// the next match.
+ ///
+ /// Thus, UTF-8 mode is both a promise that the NFA built only reports
+ /// non-empty matches that are valid UTF-8, and an *instruction* to regex
+ /// engines that empty matches that split codepoints should be banned.
+ ///
+ /// Because UTF-8 mode is fundamentally about avoiding invalid UTF-8 spans,
+ /// it only makes sense to enable this option when you *know* your haystack
+ /// is valid UTF-8. (For example, a `&str`.) Enabling UTF-8 mode and
+ /// searching a haystack that contains invalid UTF-8 leads to **unspecified
+ /// behavior**.
+ ///
+ /// Therefore, it may make sense to enable `syntax::Config::utf8` while
+ /// simultaneously *disabling* this option. That would ensure all non-empty
+ /// match spans are valid UTF-8, but that empty match spans may still split
+ /// a codepoint or match at other places that aren't valid UTF-8.
+ ///
+ /// In general, this mode is only relevant if your regex can match the
+ /// empty string. Most regexes don't.
+ ///
+ /// This is enabled by default.
+ ///
+ /// # Example
+ ///
+ /// This example shows how UTF-8 mode can impact the match spans that may
+ /// be reported in certain cases.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::{self, pikevm::PikeVM},
+ /// Match, Input,
+ /// };
+ ///
+ /// let re = PikeVM::new("")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// // UTF-8 mode is enabled by default.
+ /// let mut input = Input::new("☃");
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(Some(Match::must(0, 0..0)), caps.get_match());
+ ///
+ /// // Even though an empty regex matches at 1..1, our next match is
+ /// // 3..3 because 1..1 and 2..2 split the snowman codepoint (which is
+ /// // three bytes long).
+ /// input.set_start(1);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
+ ///
+ /// // But if we disable UTF-8, then we'll get matches at 1..1 and 2..2:
+ /// let re = PikeVM::builder()
+ /// .thompson(thompson::Config::new().utf8(false))
+ /// .build("")?;
+ /// // A fresh cache (and `Captures`) must be created for the new regex.
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(Some(Match::must(0, 1..1)), caps.get_match());
+ ///
+ /// input.set_start(2);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(Some(Match::must(0, 2..2)), caps.get_match());
+ ///
+ /// input.set_start(3);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
+ ///
+ /// input.set_start(4);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(None, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn utf8(mut self, yes: bool) -> Config {
+ self.utf8 = Some(yes);
+ self
+ }
+
 /// Reverse the NFA.
 ///
 /// A NFA reversal is performed by reversing all of the concatenated
- /// sub-expressions in the original pattern, recursively.
The resulting - /// NFA can be used to match the pattern starting from the end of a string - /// instead of the beginning of a string. + /// sub-expressions in the original pattern, recursively. (Look around + /// operators are also inverted.) The resulting NFA can be used to match + /// the pattern starting from the end of a string instead of the beginning + /// of a string. /// /// Reversing the NFA is useful for building a reverse DFA, which is most /// useful for finding the start of a match after its ending position has - /// been found. + /// been found. NFA execution engines typically do not work on reverse + /// NFAs. For example, currently, the Pike VM reports the starting location + /// of matches without a reverse NFA. + /// + /// Currently, enabling this setting requires disabling the + /// [`captures`](Config::captures) setting. If both are enabled, then the + /// compiler will return an error. It is expected that this limitation will + /// be lifted in the future. /// /// This is disabled by default. - pub fn reverse(mut self, yes: bool) -> Config { - self.reverse = Some(yes); - self - } - - /// Whether to enable UTF-8 mode or not. /// - /// When UTF-8 mode is enabled (which is the default), unanchored searches - /// will only match through valid UTF-8. If invalid UTF-8 is seen, then - /// an unanchored search will stop at that point. This is equivalent to - /// putting a `(?s:.)*?` at the start of the regex. + /// # Example /// - /// When UTF-8 mode is disabled, then unanchored searches will match - /// through any arbitrary byte. This is equivalent to putting a - /// `(?s-u:.)*?` at the start of the regex. + /// This example shows how to build a DFA from a reverse NFA, and then use + /// the DFA to search backwards. /// - /// Generally speaking, UTF-8 mode should only be used when you know you - /// are searching valid UTF-8, such as a Rust `&str`. If UTF-8 mode is used - /// on input that is not valid UTF-8, then the regex is not likely to work - /// as expected. + /// ``` + /// use regex_automata::{ + /// dfa::{self, Automaton}, + /// nfa::thompson::{NFA, WhichCaptures}, + /// HalfMatch, Input, + /// }; /// - /// This is enabled by default. - pub fn utf8(mut self, yes: bool) -> Config { - self.utf8 = Some(yes); + /// let dfa = dfa::dense::Builder::new() + /// .thompson(NFA::config() + /// .which_captures(WhichCaptures::None) + /// .reverse(true) + /// ) + /// .build("baz[0-9]+")?; + /// let expected = Some(HalfMatch::must(0, 3)); + /// assert_eq!( + /// expected, + /// dfa.try_search_rev(&Input::new("foobaz12345bar"))?, + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn reverse(mut self, yes: bool) -> Config { + self.reverse = Some(yes); self } @@ -143,16 +227,17 @@ impl Config { /// size of the NFA. /// /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::nfa::thompson::NFA; /// /// // 300KB isn't enough! - /// NFA::builder() + /// NFA::compiler() /// .configure(NFA::config().nfa_size_limit(Some(300_000))) /// .build(r"\w{20}") /// .unwrap_err(); /// /// // ... but 400KB probably is. - /// let nfa = NFA::builder() + /// let nfa = NFA::compiler() /// .configure(NFA::config().nfa_size_limit(Some(400_000))) /// .build(r"\w{20}")?; /// @@ -168,17 +253,52 @@ impl Config { /// Apply best effort heuristics to shrink the NFA at the expense of more /// time/memory. /// - /// This is enabled by default. 
Generally speaking, if one is using an NFA - /// to compile a DFA, then the extra time used to shrink the NFA will be - /// more than made up for during DFA construction (potentially by a lot). - /// In other words, enabling this can substantially decrease the overall - /// amount of time it takes to build a DFA. + /// Generally speaking, if one is using an NFA to compile a DFA, then the + /// extra time used to shrink the NFA will be more than made up for during + /// DFA construction (potentially by a lot). In other words, enabling this + /// can substantially decrease the overall amount of time it takes to build + /// a DFA. /// - /// The only reason to disable this if you want to compile an NFA and start - /// using it as quickly as possible without needing to build a DFA. e.g., - /// for an NFA simulation or for a lazy DFA. + /// A reason to keep this disabled is if you want to compile an NFA and + /// start using it as quickly as possible without needing to build a DFA, + /// and you don't mind using a bit of extra memory for the NFA. e.g., for + /// an NFA simulation or for a lazy DFA. /// - /// This is enabled by default. + /// NFA shrinking is currently most useful when compiling a reverse + /// NFA with large Unicode character classes. In particular, it trades + /// additional CPU time during NFA compilation in favor of generating fewer + /// NFA states. + /// + /// This is disabled by default because it can increase compile times + /// quite a bit if you aren't building a full DFA. + /// + /// # Example + /// + /// This example shows that NFA shrinking can lead to substantial space + /// savings in some cases. Notice that, as noted above, we build a reverse + /// DFA and use a pattern with a large Unicode character class. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::nfa::thompson::{NFA, WhichCaptures}; + /// + /// // Currently we have to disable captures when enabling reverse NFA. + /// let config = NFA::config() + /// .which_captures(WhichCaptures::None) + /// .reverse(true); + /// let not_shrunk = NFA::compiler() + /// .configure(config.clone().shrink(false)) + /// .build(r"\w")?; + /// let shrunk = NFA::compiler() + /// .configure(config.clone().shrink(true)) + /// .build(r"\w")?; + /// + /// // While a specific shrink factor is not guaranteed, the savings can be + /// // considerable in some cases. + /// assert!(shrunk.states().len() * 2 < not_shrunk.states().len()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` pub fn shrink(mut self, yes: bool) -> Config { self.shrink = Some(yes); self @@ -186,13 +306,153 @@ impl Config { /// Whether to include 'Capture' states in the NFA. /// - /// This can only be enabled when compiling a forward NFA. This is - /// always disabled---with no way to override it---when the `reverse` - /// configuration is enabled. + /// Currently, enabling this setting requires disabling the + /// [`reverse`](Config::reverse) setting. If both are enabled, then the + /// compiler will return an error. It is expected that this limitation will + /// be lifted in the future. /// /// This is enabled by default. - pub fn captures(mut self, yes: bool) -> Config { - self.captures = Some(yes); + /// + /// # Example + /// + /// This example demonstrates that some regex engines, like the Pike VM, + /// require capturing states to be present in the NFA to report match + /// offsets. 
+ ///
+ /// (Note that since this method is deprecated, the example below uses
+ /// [`Config::which_captures`] to disable capture states.)
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::{
+ /// pikevm::PikeVM,
+ /// NFA,
+ /// WhichCaptures,
+ /// };
+ ///
+ /// let re = PikeVM::builder()
+ /// .thompson(NFA::config().which_captures(WhichCaptures::None))
+ /// .build(r"[a-z]+")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// assert!(re.is_match(&mut cache, "abc"));
+ /// assert_eq!(None, re.find(&mut cache, "abc"));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[deprecated(since = "0.3.5", note = "use which_captures instead")]
+ pub fn captures(self, yes: bool) -> Config {
+ self.which_captures(if yes {
+ WhichCaptures::All
+ } else {
+ WhichCaptures::None
+ })
+ }
+
+ /// Configures what kinds of capture groups are compiled into
+ /// [`State::Capture`](crate::nfa::thompson::State::Capture) states in a
+ /// Thompson NFA.
+ ///
+ /// Currently, using any option except for [`WhichCaptures::None`] requires
+ /// disabling the [`reverse`](Config::reverse) setting. If both are
+ /// enabled, then the compiler will return an error. It is expected that
+ /// this limitation will be lifted in the future.
+ ///
+ /// This is set to [`WhichCaptures::All`] by default. Callers may wish to
+ /// use [`WhichCaptures::Implicit`] in cases where one wants to avoid the
+ /// overhead of capture states for explicit groups. Usually this occurs
+ /// when one wants to use the `PikeVM` only for determining the overall
+ /// match. Otherwise, the `PikeVM` could use much more memory than is
+ /// necessary.
+ ///
+ /// # Example
+ ///
+ /// This example demonstrates that some regex engines, like the Pike VM,
+ /// require capturing states to be present in the NFA to report match
+ /// offsets.
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::{
+ /// pikevm::PikeVM,
+ /// NFA,
+ /// WhichCaptures,
+ /// };
+ ///
+ /// let re = PikeVM::builder()
+ /// .thompson(NFA::config().which_captures(WhichCaptures::None))
+ /// .build(r"[a-z]+")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// assert!(re.is_match(&mut cache, "abc"));
+ /// assert_eq!(None, re.find(&mut cache, "abc"));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// The same applies to the bounded backtracker:
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::{
+ /// backtrack::BoundedBacktracker,
+ /// NFA,
+ /// WhichCaptures,
+ /// };
+ ///
+ /// let re = BoundedBacktracker::builder()
+ /// .thompson(NFA::config().which_captures(WhichCaptures::None))
+ /// .build(r"[a-z]+")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// assert!(re.try_is_match(&mut cache, "abc")?);
+ /// assert_eq!(None, re.try_find(&mut cache, "abc")?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn which_captures(mut self, which_captures: WhichCaptures) -> Config {
+ self.which_captures = Some(which_captures);
+ self
+ }
+
+ /// Sets the look-around matcher that should be used with this NFA.
+ ///
+ /// A look-around matcher determines how to match look-around assertions.
+ /// In particular, some assertions are configurable. For example, the
+ /// `(?m:^)` and `(?m:$)` assertions can have their line terminator changed
+ /// from the default of `\n` to any other byte.
+ ///
+ /// # Example
+ ///
+ /// This shows how to change the line terminator for multi-line assertions.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::{self, pikevm::PikeVM},
+ /// util::look::LookMatcher,
+ /// Match, Input,
+ /// };
+ ///
+ /// let mut lookm = LookMatcher::new();
+ /// lookm.set_line_terminator(b'\x00');
+ ///
+ /// let re = PikeVM::builder()
+ /// .thompson(thompson::Config::new().look_matcher(lookm))
+ /// .build(r"(?m)^[a-z]+$")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// // Multi-line assertions now use NUL as a terminator.
+ /// assert_eq!(
+ /// Some(Match::must(0, 1..4)),
+ /// re.find(&mut cache, b"\x00abc\x00"),
+ /// );
+ /// // ... and \n is no longer recognized as a terminator.
+ /// assert_eq!(
+ /// None,
+ /// re.find(&mut cache, b"\nabc\n"),
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn look_matcher(mut self, m: LookMatcher) -> Config {
+ self.look_matcher = Some(m);
 self
 }
@@ -206,26 +466,47 @@ impl Config {
 self
 }

- pub fn get_reverse(&self) -> bool {
- self.reverse.unwrap_or(false)
- }
-
+ /// Returns whether this configuration has enabled UTF-8 mode.
 pub fn get_utf8(&self) -> bool {
 self.utf8.unwrap_or(true)
 }

+ /// Returns whether this configuration has enabled reverse NFA compilation.
+ pub fn get_reverse(&self) -> bool {
+ self.reverse.unwrap_or(false)
+ }
+
+ /// Return the configured NFA size limit, if it exists, in the number of
+ /// bytes of heap used.
 pub fn get_nfa_size_limit(&self) -> Option<usize> {
 self.nfa_size_limit.unwrap_or(None)
 }

+ /// Return whether NFA shrinking is enabled.
 pub fn get_shrink(&self) -> bool {
- self.shrink.unwrap_or(true)
+ self.shrink.unwrap_or(false)
 }

+ /// Return whether NFA compilation is configured to produce capture states.
+ #[deprecated(since = "0.3.5", note = "use get_which_captures instead")]
 pub fn get_captures(&self) -> bool {
- !self.get_reverse() && self.captures.unwrap_or(true)
+ self.get_which_captures().is_any()
+ }
+
+ /// Return what kinds of capture states will be compiled into an NFA.
+ pub fn get_which_captures(&self) -> WhichCaptures {
+ self.which_captures.unwrap_or(WhichCaptures::All)
+ }
+
+ /// Return the look-around matcher for this NFA.
+ pub fn get_look_matcher(&self) -> LookMatcher {
+ self.look_matcher.clone().unwrap_or(LookMatcher::default())
 }

+ /// Return whether NFA compilation is configured to include an unanchored
+ /// prefix.
+ ///
+ /// This is always true when not in test mode.
 fn get_unanchored_prefix(&self) -> bool {
 #[cfg(test)]
 {
+/// +/// This configuration can be used with [`Config::which_captures`] to control +/// which capture states are compiled into a Thompson NFA. +/// +/// The default configuration is [`WhichCaptures::All`]. +#[derive(Clone, Copy, Debug)] +pub enum WhichCaptures { + /// All capture states, including those corresponding to both implicit and + /// explicit capture groups, are included in the Thompson NFA. + All, + /// Only capture states corresponding to implicit capture groups are + /// included. Implicit capture groups appear in every pattern implicitly + /// and correspond to the overall match of a pattern. + /// + /// This is useful when one only cares about the overall match of a + /// pattern. By excluding capture states from explicit capture groups, + /// one might be able to reduce the memory usage of a multi-pattern regex + /// substantially if it was otherwise written to have many explicit capture + /// groups. + Implicit, + /// No capture states are compiled into the Thompson NFA. + /// + /// This is useful when capture states are either not needed (for example, + /// if one is only trying to build a DFA) or if they aren't supported (for + /// example, a reverse NFA). + None, +} + +impl Default for WhichCaptures { + fn default() -> WhichCaptures { + WhichCaptures::All + } +} + +impl WhichCaptures { + /// Returns true if this configuration indicates that no capture states + /// should be produced in an NFA. + pub fn is_none(&self) -> bool { + matches!(*self, WhichCaptures::None) + } + + /// Returns true if this configuration indicates that some capture states + /// should be added to an NFA. Note that this might only include capture + /// states for implicit capture groups. + pub fn is_any(&self) -> bool { + !self.is_none() + } +} + +/* +This compiler below uses Thompson's construction algorithm. The compiler takes +a regex-syntax::Hir as input and emits an NFA graph as output. The NFA graph +is structured in a way that permits it to be executed by a virtual machine and +also used to efficiently build a DFA. + +The compiler deals with a slightly expanded set of NFA states than what is +in a final NFA (as exhibited by builder::State and nfa::State). Notably a +compiler state includes an empty node that has exactly one unconditional +epsilon transition to the next state. In other words, it's a "goto" instruction +if one views Thompson's NFA as a set of bytecode instructions. These goto +instructions are removed in a subsequent phase before returning the NFA to the +caller. The purpose of these empty nodes is that they make the construction +algorithm substantially simpler to implement. We remove them before returning +to the caller because they can represent substantial overhead when traversing +the NFA graph (either while searching using the NFA directly or while building +a DFA). + +In the future, it would be nice to provide a Glushkov compiler as well, as it +would work well as a bit-parallel NFA for smaller regexes. But the Thompson +construction is one I'm more familiar with and seems more straight-forward to +deal with when it comes to large Unicode character classes. + +Internally, the compiler uses interior mutability to improve composition in the +face of the borrow checker. In particular, we'd really like to be able to write +things like this: + + self.c_concat(exprs.iter().map(|e| self.c(e))) + +Which elegantly uses iterators to build up a sequence of compiled regex +sub-expressions and then hands it off to the concatenating compiler routine. 
+Without interior mutability, the borrow checker won't let us borrow `self` +mutably both inside and outside the closure at the same time. +*/ + +/// A builder for compiling an NFA from a regex's high-level intermediate +/// representation (HIR). +/// +/// This compiler provides a way to translate a parsed regex pattern into an +/// NFA state graph. The NFA state graph can either be used directly to execute +/// a search (e.g., with a Pike VM), or it can be further used to build a DFA. +/// +/// This compiler provides APIs both for compiling regex patterns directly from +/// their concrete syntax, or via a [`regex_syntax::hir::Hir`]. +/// +/// This compiler has various options that may be configured via +/// [`thompson::Config`](Config). +/// +/// Note that a compiler is not the same as a [`thompson::Builder`](Builder). +/// A `Builder` provides a lower level API that is uncoupled from a regex +/// pattern's concrete syntax or even its HIR. Instead, it permits stitching +/// together an NFA by hand. See its docs for examples. +/// +/// # Example: compilation from concrete syntax +/// +/// This shows how to compile an NFA from a pattern string while setting a size +/// limit on how big the NFA is allowed to be (in terms of bytes of heap used). +/// +/// ``` +/// use regex_automata::{ +/// nfa::thompson::{NFA, pikevm::PikeVM}, +/// Match, +/// }; +/// +/// let config = NFA::config().nfa_size_limit(Some(1_000)); +/// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?; +/// +/// let re = PikeVM::new_from_nfa(nfa)?; +/// let mut cache = re.create_cache(); +/// let mut caps = re.create_captures(); +/// let expected = Some(Match::must(0, 3..4)); +/// re.captures(&mut cache, "!@#A#@!", &mut caps); +/// assert_eq!(expected, caps.get_match()); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// # Example: compilation from HIR +/// +/// This shows how to hand assemble a regular expression via its HIR, and then +/// compile an NFA directly from it. +/// +/// ``` +/// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; +/// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange}; +/// +/// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![ +/// ClassBytesRange::new(b'0', b'9'), +/// ClassBytesRange::new(b'A', b'Z'), +/// ClassBytesRange::new(b'_', b'_'), +/// ClassBytesRange::new(b'a', b'z'), +/// ]))); +/// +/// let config = NFA::config().nfa_size_limit(Some(1_000)); +/// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?; +/// +/// let re = PikeVM::new_from_nfa(nfa)?; +/// let mut cache = re.create_cache(); +/// let mut caps = re.create_captures(); +/// let expected = Some(Match::must(0, 3..4)); +/// re.captures(&mut cache, "!@#A#@!", &mut caps); +/// assert_eq!(expected, caps.get_match()); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` #[derive(Clone, Debug)] -pub struct Builder { - config: Config, +pub struct Compiler { + /// A regex parser, used when compiling an NFA directly from a pattern + /// string. parser: ParserBuilder, + /// The compiler configuration. + config: Config, + /// The builder for actually constructing an NFA. This provides a + /// convenient abstraction for writing a compiler. + builder: RefCell<Builder>, + /// State used for compiling character classes to UTF-8 byte automata. + /// State is not retained between character class compilations. This just + /// serves to amortize allocation to the extent possible. 
+ utf8_state: RefCell<Utf8State>, + /// State used for arranging character classes in reverse into a trie. + trie_state: RefCell<RangeTrie>, + /// State used for caching common suffixes when compiling reverse UTF-8 + /// automata (for Unicode character classes). + utf8_suffix: RefCell<Utf8SuffixMap>, } -impl Builder { +impl Compiler { /// Create a new NFA builder with its default configuration. - pub fn new() -> Builder { - Builder { config: Config::default(), parser: ParserBuilder::new() } + pub fn new() -> Compiler { + Compiler { + parser: ParserBuilder::new(), + config: Config::default(), + builder: RefCell::new(Builder::new()), + utf8_state: RefCell::new(Utf8State::new()), + trie_state: RefCell::new(RangeTrie::new()), + utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)), + } } - /// Compile the given regular expression into an NFA. + /// Compile the given regular expression pattern into an NFA. /// /// If there was a problem parsing the regex, then that error is returned. /// /// Otherwise, if there was a problem building the NFA, then an error is /// returned. The only error that can occur is if the compiled regex would - /// exceed the size limits configured on this builder. - pub fn build(&self, pattern: &str) -> Result<NFA, Error> { + /// exceed the size limits configured on this builder, or if any part of + /// the NFA would exceed the integer representations used. (For example, + /// too many states might plausibly occur on a 16-bit target.) + /// + /// # Example + /// + /// ``` + /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; + /// + /// let config = NFA::config().nfa_size_limit(Some(1_000)); + /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?; + /// + /// let re = PikeVM::new_from_nfa(nfa)?; + /// let mut cache = re.create_cache(); + /// let mut caps = re.create_captures(); + /// let expected = Some(Match::must(0, 3..4)); + /// re.captures(&mut cache, "!@#A#@!", &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn build(&self, pattern: &str) -> Result<NFA, BuildError> { self.build_many(&[pattern]) } + /// Compile the given regular expression patterns into a single NFA. + /// + /// When matches are returned, the pattern ID corresponds to the index of + /// the pattern in the slice given. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; + /// + /// let config = NFA::config().nfa_size_limit(Some(1_000)); + /// let nfa = NFA::compiler().configure(config).build_many(&[ + /// r"(?-u)\s", + /// r"(?-u)\w", + /// ])?; + /// + /// let re = PikeVM::new_from_nfa(nfa)?; + /// let mut cache = re.create_cache(); + /// let mut caps = re.create_captures(); + /// let expected = Some(Match::must(1, 1..2)); + /// re.captures(&mut cache, "!A! !A!", &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` pub fn build_many<P: AsRef<str>>( &self, patterns: &[P], - ) -> Result<NFA, Error> { + ) -> Result<NFA, BuildError> { let mut hirs = vec![]; for p in patterns { hirs.push( self.parser .build() .parse(p.as_ref()) - .map_err(Error::syntax)?, + .map_err(BuildError::syntax)?, ); - log!(log::trace!("parsed: {:?}", p.as_ref())); + debug!("parsed: {:?}", p.as_ref()); } self.build_many_from_hir(&hirs) } @@ -296,418 +804,219 @@ impl Builder { /// /// If there was a problem building the NFA, then an error is returned. 
The /// only error that can occur is if the compiled regex would exceed the - /// size limits configured on this builder. - pub fn build_from_hir(&self, expr: &Hir) -> Result<NFA, Error> { - self.build_from_hir_with(&mut Compiler::new(), expr) + /// size limits configured on this builder, or if any part of the NFA would + /// exceed the integer representations used. (For example, too many states + /// might plausibly occur on a 16-bit target.) + /// + /// # Example + /// + /// ``` + /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; + /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange}; + /// + /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![ + /// ClassBytesRange::new(b'0', b'9'), + /// ClassBytesRange::new(b'A', b'Z'), + /// ClassBytesRange::new(b'_', b'_'), + /// ClassBytesRange::new(b'a', b'z'), + /// ]))); + /// + /// let config = NFA::config().nfa_size_limit(Some(1_000)); + /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?; + /// + /// let re = PikeVM::new_from_nfa(nfa)?; + /// let mut cache = re.create_cache(); + /// let mut caps = re.create_captures(); + /// let expected = Some(Match::must(0, 3..4)); + /// re.captures(&mut cache, "!@#A#@!", &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn build_from_hir(&self, expr: &Hir) -> Result<NFA, BuildError> { + self.build_many_from_hir(&[expr]) } + /// Compile the given high level intermediate representations of regular + /// expressions into a single NFA. + /// + /// When matches are returned, the pattern ID corresponds to the index of + /// the pattern in the slice given. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; + /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange}; + /// + /// let hirs = &[ + /// Hir::class(Class::Bytes(ClassBytes::new(vec![ + /// ClassBytesRange::new(b'\t', b'\r'), + /// ClassBytesRange::new(b' ', b' '), + /// ]))), + /// Hir::class(Class::Bytes(ClassBytes::new(vec![ + /// ClassBytesRange::new(b'0', b'9'), + /// ClassBytesRange::new(b'A', b'Z'), + /// ClassBytesRange::new(b'_', b'_'), + /// ClassBytesRange::new(b'a', b'z'), + /// ]))), + /// ]; + /// + /// let config = NFA::config().nfa_size_limit(Some(1_000)); + /// let nfa = NFA::compiler().configure(config).build_many_from_hir(hirs)?; + /// + /// let re = PikeVM::new_from_nfa(nfa)?; + /// let mut cache = re.create_cache(); + /// let mut caps = re.create_captures(); + /// let expected = Some(Match::must(1, 1..2)); + /// re.captures(&mut cache, "!A! !A!", &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` pub fn build_many_from_hir<H: Borrow<Hir>>( &self, exprs: &[H], - ) -> Result<NFA, Error> { - self.build_many_from_hir_with(&mut Compiler::new(), exprs) - } - - /// Compile the given high level intermediate representation of a regular - /// expression into the NFA given using the given compiler. Callers may - /// prefer this over `build` if they would like to reuse allocations while - /// compiling many regular expressions. - /// - /// On success, the given NFA is completely overwritten with the NFA - /// produced by the compiler. - /// - /// If there was a problem building the NFA, then an error is returned. - /// The only error that can occur is if the compiled regex would exceed - /// the size limits configured on this builder. 
When an error is returned, - /// the contents of `nfa` are unspecified and should not be relied upon. - /// However, it can still be reused in subsequent calls to this method. - fn build_from_hir_with( - &self, - compiler: &mut Compiler, - expr: &Hir, - ) -> Result<NFA, Error> { - self.build_many_from_hir_with(compiler, &[expr]) - } - - fn build_many_from_hir_with<H: Borrow<Hir>>( - &self, - compiler: &mut Compiler, - exprs: &[H], - ) -> Result<NFA, Error> { - compiler.configure(self.config); - compiler.compile(exprs) + ) -> Result<NFA, BuildError> { + self.compile(exprs) } /// Apply the given NFA configuration options to this builder. - pub fn configure(&mut self, config: Config) -> &mut Builder { + /// + /// # Example + /// + /// ``` + /// use regex_automata::nfa::thompson::NFA; + /// + /// let config = NFA::config().nfa_size_limit(Some(1_000)); + /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?; + /// assert_eq!(nfa.pattern_len(), 1); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn configure(&mut self, config: Config) -> &mut Compiler { self.config = self.config.overwrite(config); self } /// Set the syntax configuration for this builder using - /// [`SyntaxConfig`](../../struct.SyntaxConfig.html). + /// [`syntax::Config`](crate::util::syntax::Config). /// /// This permits setting things like case insensitivity, Unicode and multi /// line mode. /// - /// This syntax configuration generally only applies when an NFA is built - /// directly from a pattern string. If an NFA is built from an HIR, then - /// all syntax settings are ignored. + /// This syntax configuration only applies when an NFA is built directly + /// from a pattern string. If an NFA is built from an HIR, then all syntax + /// settings are ignored. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{nfa::thompson::NFA, util::syntax}; + /// + /// let syntax_config = syntax::Config::new().unicode(false); + /// let nfa = NFA::compiler().syntax(syntax_config).build(r"\w")?; + /// // If Unicode were enabled, the number of states would be much bigger. + /// assert!(nfa.states().len() < 15); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` pub fn syntax( &mut self, - config: crate::util::syntax::SyntaxConfig, - ) -> &mut Builder { + config: crate::util::syntax::Config, + ) -> &mut Compiler { config.apply(&mut self.parser); self } } -/// A compiler that converts a regex abstract syntax to an NFA via Thompson's -/// construction. Namely, this compiler permits epsilon transitions between -/// states. -#[derive(Clone, Debug)] -pub struct Compiler { - /// The configuration from the builder. - config: Config, - /// The final NFA that is built. - /// - /// Parts of this NFA are constructed during compilation, but the actual - /// states aren't added until a final "finish" step. This is because the - /// states constructed during compilation have unconditional epsilon - /// transitions, which makes the logic of compilation much simpler. The - /// "finish" step removes these unconditional epsilon transitions and must - /// therefore remap all of the transition state IDs. - nfa: RefCell<NFA>, - /// The set of compiled NFA states. Once a state is compiled, it is - /// assigned a state ID equivalent to its index in this list. Subsequent - /// compilation can modify previous states by adding new transitions. - states: RefCell<Vec<CState>>, - /// State used for compiling character classes to UTF-8 byte automata. 
- /// State is not retained between character class compilations. This just - /// serves to amortize allocation to the extent possible. - utf8_state: RefCell<Utf8State>, - /// State used for arranging character classes in reverse into a trie. - trie_state: RefCell<RangeTrie>, - /// State used for caching common suffixes when compiling reverse UTF-8 - /// automata (for Unicode character classes). - utf8_suffix: RefCell<Utf8SuffixMap>, - /// A map used to re-map state IDs when translating the compiler's internal - /// NFA state representation to the external NFA representation. - remap: RefCell<Vec<StateID>>, - /// A set of compiler internal state IDs that correspond to states that are - /// exclusively epsilon transitions, i.e., goto instructions, combined with - /// the state that they point to. This is used to record said states while - /// transforming the compiler's internal NFA representation to the external - /// form. - empties: RefCell<Vec<(StateID, StateID)>>, - /// The total memory used by each of the 'CState's in 'states'. This only - /// includes heap usage by each state, and not the size of the state - /// itself. - memory_cstates: Cell<usize>, -} - -/// A compiler intermediate state representation for an NFA that is only used -/// during compilation. Once compilation is done, `CState`s are converted -/// to `State`s (defined in the parent module), which have a much simpler -/// representation. -#[derive(Clone, Debug, Eq, PartialEq)] -enum CState { - /// An empty state whose only purpose is to forward the automaton to - /// another state via en epsilon transition. These are useful during - /// compilation but are otherwise removed at the end. - Empty { - next: StateID, - }, - /// An empty state that records a capture location. - /// - /// From the perspective of finite automata, this is precisely equivalent - /// to 'Empty', but serves the purpose of instructing NFA simulations to - /// record additional state when the finite state machine passes through - /// this epsilon transition. - /// - /// These transitions are treated as epsilon transitions with no additional - /// effects in DFAs. - /// - /// 'slot' in this context refers to the specific capture group offset that - /// is being recorded. Each capturing group has two slots corresponding to - /// the start and end of the matching portion of that group. - CaptureStart { - next: StateID, - capture_index: u32, - name: Option<Arc<str>>, - }, - CaptureEnd { - next: StateID, - capture_index: u32, - }, - /// A state that only transitions to `next` if the current input byte is - /// in the range `[start, end]` (inclusive on both ends). - Range { - range: Transition, - }, - /// A state with possibly many transitions, represented in a sparse - /// fashion. Transitions are ordered lexicographically by input range. - /// As such, this may only be used when every transition has equal - /// priority. (In practice, this is only used for encoding large UTF-8 - /// automata.) In contrast, a `Union` state has each alternate in order - /// of priority. Priority is used to implement greedy matching and also - /// alternations themselves, e.g., `abc|a` where `abc` has priority over - /// `a`. - /// - /// To clarify, it is possible to remove `Sparse` and represent all things - /// that `Sparse` is used for via `Union`. But this creates a more bloated - /// NFA with more epsilon transitions than is necessary in the special case - /// of character classes. 
- Sparse { - ranges: Vec<Transition>, - }, - /// A conditional epsilon transition satisfied via some sort of - /// look-around. - Look { - look: Look, - next: StateID, - }, - /// An alternation such that there exists an epsilon transition to all - /// states in `alternates`, where matches found via earlier transitions - /// are preferred over later transitions. - Union { - alternates: Vec<StateID>, - }, - /// An alternation such that there exists an epsilon transition to all - /// states in `alternates`, where matches found via later transitions are - /// preferred over earlier transitions. - /// - /// This "reverse" state exists for convenience during compilation that - /// permits easy construction of non-greedy combinations of NFA states. At - /// the end of compilation, Union and UnionReverse states are merged into - /// one Union type of state, where the latter has its epsilon transitions - /// reversed to reflect the priority inversion. - /// - /// The "convenience" here arises from the fact that as new states are - /// added to the list of `alternates`, we would like that add operation - /// to be amortized constant time. But if we used a `Union`, we'd need to - /// prepend the state, which takes O(n) time. There are other approaches we - /// could use to solve this, but this seems simple enough. - UnionReverse { - alternates: Vec<StateID>, - }, - /// A match state. There is at most one such occurrence of this state in - /// an NFA for each pattern compiled into the NFA. At time of writing, a - /// match state is always produced for every pattern given, but in theory, - /// if a pattern can never lead to a match, then the match state could be - /// omitted. - /// - /// `id` refers to the ID of the pattern itself, which corresponds to the - /// pattern's index (starting at 0). `start_id` refers to the anchored - /// NFA starting state corresponding to this pattern. - Match { - pattern_id: PatternID, - start_id: StateID, - }, -} - -/// A value that represents the result of compiling a sub-expression of a -/// regex's HIR. Specifically, this represents a sub-graph of the NFA that -/// has an initial state at `start` and a final state at `end`. -#[derive(Clone, Copy, Debug)] -pub struct ThompsonRef { - start: StateID, - end: StateID, -} - impl Compiler { - /// Create a new compiler. - pub fn new() -> Compiler { - Compiler { - config: Config::default(), - nfa: RefCell::new(NFA::empty()), - states: RefCell::new(vec![]), - utf8_state: RefCell::new(Utf8State::new()), - trie_state: RefCell::new(RangeTrie::new()), - utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)), - remap: RefCell::new(vec![]), - empties: RefCell::new(vec![]), - memory_cstates: Cell::new(0), - } - } - - /// Configure and prepare this compiler from the builder's knobs. + /// Compile the sequence of HIR expressions given. Pattern IDs are + /// allocated starting from 0, in correspondence with the slice given. /// - /// The compiler is must always reconfigured by the builder before using it - /// to build an NFA. Namely, this will also clear any latent state in the - /// compiler used during previous compilations. - fn configure(&mut self, config: Config) { - self.config = config; - self.nfa.borrow_mut().clear(); - self.states.borrow_mut().clear(); - self.memory_cstates.set(0); - // We don't need to clear anything else since they are cleared on - // their own and only when they are used. - } - - /// Convert the current intermediate NFA to its final compiled form. 
- fn compile<H: Borrow<Hir>>(&self, exprs: &[H]) -> Result<NFA, Error> { - if exprs.is_empty() { - return Ok(NFA::never_match()); - } + /// It is legal to provide an empty slice. In that case, the NFA returned + /// has no patterns and will never match anything. + fn compile<H: Borrow<Hir>>(&self, exprs: &[H]) -> Result<NFA, BuildError> { if exprs.len() > PatternID::LIMIT { - return Err(Error::too_many_patterns(exprs.len())); + return Err(BuildError::too_many_patterns(exprs.len())); + } + if self.config.get_reverse() + && self.config.get_which_captures().is_any() + { + return Err(BuildError::unsupported_captures()); } + self.builder.borrow_mut().clear(); + self.builder.borrow_mut().set_utf8(self.config.get_utf8()); + self.builder.borrow_mut().set_reverse(self.config.get_reverse()); + self.builder + .borrow_mut() + .set_look_matcher(self.config.get_look_matcher()); + self.builder + .borrow_mut() + .set_size_limit(self.config.get_nfa_size_limit())?; + // We always add an unanchored prefix unless we were specifically told // not to (for tests only), or if we know that the regex is anchored // for all matches. When an unanchored prefix is not added, then the // NFA's anchored and unanchored start states are equivalent. - let all_anchored = - exprs.iter().all(|e| e.borrow().is_anchored_start()); + let all_anchored = exprs.iter().all(|e| { + e.borrow() + .properties() + .look_set_prefix() + .contains(hir::Look::Start) + }); let anchored = !self.config.get_unanchored_prefix() || all_anchored; let unanchored_prefix = if anchored { self.c_empty()? } else { - if self.config.get_utf8() { - self.c_unanchored_prefix_valid_utf8()? - } else { - self.c_unanchored_prefix_invalid_utf8()? - } + self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)? }; - let compiled = self.c_alternation( - exprs.iter().with_pattern_ids().map(|(pid, e)| { - let group_kind = hir::GroupKind::CaptureIndex(0); - let one = self.c_group(&group_kind, e.borrow())?; - let match_state_id = self.add_match(pid, one.start)?; - self.patch(one.end, match_state_id)?; - Ok(ThompsonRef { start: one.start, end: match_state_id }) - }), - )?; + let compiled = self.c_alt_iter(exprs.iter().map(|e| { + let _ = self.start_pattern()?; + let one = self.c_cap(0, None, e.borrow())?; + let match_state_id = self.add_match()?; + self.patch(one.end, match_state_id)?; + let _ = self.finish_pattern(one.start)?; + Ok(ThompsonRef { start: one.start, end: match_state_id }) + }))?; self.patch(unanchored_prefix.end, compiled.start)?; - self.finish(compiled.start, unanchored_prefix.start)?; - Ok(self.nfa.replace(NFA::empty())) - } + let nfa = self + .builder + .borrow_mut() + .build(compiled.start, unanchored_prefix.start)?; - /// Finishes the compilation process and populates the NFA attached to this - /// compiler with the final graph. - fn finish( - &self, - start_anchored: StateID, - start_unanchored: StateID, - ) -> Result<(), Error> { - trace!( - "intermediate NFA compilation complete, \ - intermediate NFA size: {} states, {} bytes on heap", - self.states.borrow().len(), - self.nfa_memory_usage(), - ); - let mut nfa = self.nfa.borrow_mut(); - let mut bstates = self.states.borrow_mut(); - let mut remap = self.remap.borrow_mut(); - let mut empties = self.empties.borrow_mut(); - remap.resize(bstates.len(), StateID::ZERO); - empties.clear(); - - // The idea here is to convert our intermediate states to their final - // form. The only real complexity here is the process of converting - // transitions, which are expressed in terms of state IDs. 
The new - // set of states will be smaller because of partial epsilon removal, - // so the state IDs will not be the same. - for (sid, bstate) in bstates.iter_mut().with_state_ids() { - match *bstate { - CState::Empty { next } => { - // Since we're removing empty states, we need to handle - // them later since we don't yet know which new state this - // empty state will be mapped to. - empties.push((sid, next)); - } - CState::CaptureStart { next, capture_index, ref name } => { - // We can't remove this empty state because of the side - // effect of capturing an offset for this capture slot. - remap[sid] = nfa.add_capture_start( - next, - capture_index, - name.clone(), - )?; - } - CState::CaptureEnd { next, capture_index } => { - // We can't remove this empty state because of the side - // effect of capturing an offset for this capture slot. - remap[sid] = nfa.add_capture_end(next, capture_index)?; - } - CState::Range { range } => { - remap[sid] = nfa.add_range(range)?; - } - CState::Sparse { ref mut ranges } => { - let ranges = - mem::replace(ranges, vec![]).into_boxed_slice(); - remap[sid] = - nfa.add_sparse(SparseTransitions { ranges })?; - } - CState::Look { look, next } => { - remap[sid] = nfa.add_look(next, look)?; - } - CState::Union { ref mut alternates } => { - let alternates = - mem::replace(alternates, vec![]).into_boxed_slice(); - remap[sid] = nfa.add_union(alternates)?; - } - CState::UnionReverse { ref mut alternates } => { - let mut alternates = - mem::replace(alternates, vec![]).into_boxed_slice(); - alternates.reverse(); - remap[sid] = nfa.add_union(alternates)?; - } - CState::Match { start_id, .. } => { - remap[sid] = nfa.add_match()?; - nfa.finish_pattern(start_id)?; - } - } - } - for &(empty_id, mut empty_next) in empties.iter() { - // empty states can point to other empty states, forming a chain. - // So we must follow the chain until the end, which must end at - // a non-empty state, and therefore, a state that is correctly - // remapped. We are guaranteed to terminate because our compiler - // never builds a loop among only empty states. - while let CState::Empty { next } = bstates[empty_next] { - empty_next = next; - } - remap[empty_id] = remap[empty_next]; - } - nfa.set_start_anchored(start_anchored); - nfa.set_start_unanchored(start_unanchored); - nfa.remap(&remap); - trace!( - "final NFA (reverse? {:?}) compilation complete, \ - final NFA size: {} states, {} bytes on heap", - self.config.get_reverse(), - nfa.states().len(), - nfa.memory_usage(), - ); - Ok(()) + debug!("HIR-to-NFA compilation complete, config: {:?}", self.config); + Ok(nfa) } - fn c(&self, expr: &Hir) -> Result<ThompsonRef, Error> { + /// Compile an arbitrary HIR expression. 
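For orientation, the `c` routine below is the compiler's entry point for a parsed HIR. It is reachable from outside through the compiler's `build_from_hir` method, which the tests at the end of this file also use. A minimal sketch against the crate's public API (the pattern and the `pattern_len` assertion are illustrative):

```rust
use regex_automata::nfa::thompson::NFA;
use regex_syntax::ParserBuilder;

fn main() {
    // Parse to an HIR first, then hand the HIR to the NFA compiler. This
    // mirrors what the string-based 'build' method does internally.
    let mut parser = ParserBuilder::new().build();
    let hir = parser.parse(r"a|b").unwrap();
    let nfa = NFA::compiler().build_from_hir(&hir).unwrap();
    assert_eq!(1, nfa.pattern_len());
}
```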
+ fn c(&self, expr: &Hir) -> Result<ThompsonRef, BuildError> { + use regex_syntax::hir::{Class, HirKind::*}; + match *expr.kind() { - HirKind::Empty => self.c_empty(), - HirKind::Literal(Literal::Unicode(ch)) => self.c_char(ch), - HirKind::Literal(Literal::Byte(b)) => self.c_range(b, b), - HirKind::Class(Class::Bytes(ref c)) => self.c_byte_class(c), - HirKind::Class(Class::Unicode(ref c)) => self.c_unicode_class(c), - HirKind::Anchor(ref anchor) => self.c_anchor(anchor), - HirKind::WordBoundary(ref wb) => self.c_word_boundary(wb), - HirKind::Repetition(ref rep) => self.c_repetition(rep), - HirKind::Group(ref group) => self.c_group(&group.kind, &group.hir), - HirKind::Concat(ref es) => { - self.c_concat(es.iter().map(|e| self.c(e))) - } - HirKind::Alternation(ref es) => { - self.c_alternation(es.iter().map(|e| self.c(e))) - } + Empty => self.c_empty(), + Literal(hir::Literal(ref bytes)) => self.c_literal(bytes), + Class(Class::Bytes(ref c)) => self.c_byte_class(c), + Class(Class::Unicode(ref c)) => self.c_unicode_class(c), + Look(ref look) => self.c_look(look), + Repetition(ref rep) => self.c_repetition(rep), + Capture(ref c) => self.c_cap(c.index, c.name.as_deref(), &c.sub), + Concat(ref es) => self.c_concat(es.iter().map(|e| self.c(e))), + Alternation(ref es) => self.c_alt_slice(es), } } - fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, Error> + /// Compile a concatenation of the sub-expressions yielded by the given + /// iterator. If the iterator yields no elements, then this compiles down + /// to an "empty" state that always matches. + /// + /// If the compiler is in reverse mode, then the expressions given are + /// automatically compiled in reverse. + fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError> where - I: DoubleEndedIterator<Item = Result<ThompsonRef, Error>>, + I: DoubleEndedIterator<Item = Result<ThompsonRef, BuildError>>, { let first = if self.is_reverse() { it.next_back() } else { it.next() }; let ThompsonRef { start, mut end } = match first { @@ -727,11 +1036,57 @@ impl Compiler { Ok(ThompsonRef { start, end }) } - fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef, Error> + /// Compile an alternation of the given HIR values. + /// + /// This is like 'c_alt_iter', but it accepts a slice of HIR values instead + /// of an iterator of compiled NFA subgraphs. The point of accepting a + /// slice here is that it opens up some optimization opportunities. For + /// example, if all of the HIR values are literals, then this routine might + /// re-shuffle them to make NFA epsilon closures substantially faster. + fn c_alt_slice(&self, exprs: &[Hir]) -> Result<ThompsonRef, BuildError> { + // self.c_alt_iter(exprs.iter().map(|e| self.c(e))) + let literal_count = exprs + .iter() + .filter(|e| { + matches!(*e.kind(), hir::HirKind::Literal(hir::Literal(_))) + }) + .count(); + if literal_count <= 1 || literal_count < exprs.len() { + return self.c_alt_iter(exprs.iter().map(|e| self.c(e))); + } + + let mut trie = if self.is_reverse() { + LiteralTrie::reverse() + } else { + LiteralTrie::forward() + }; + for expr in exprs.iter() { + let literal = match *expr.kind() { + hir::HirKind::Literal(hir::Literal(ref bytes)) => bytes, + _ => unreachable!(), + }; + trie.add(literal)?; + } + trie.compile(&mut self.builder.borrow_mut()) + } + + /// Compile an alternation, where each element yielded by the given + /// iterator represents an item in the alternation. If the iterator yields + /// no elements, then this compiles down to a "fail" state. 
+ ///
+ /// In an alternation, expressions appearing earlier are "preferred" at
+ /// match time over expressions appearing later. At least, this is true
+ /// when using "leftmost first" match semantics. (If "leftmost longest" is
+ /// ever added in the future, then this preference order would not apply
+ /// in that mode.)
+ fn c_alt_iter<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError>
 where
- I: Iterator<Item = Result<ThompsonRef, Error>>,
+ I: Iterator<Item = Result<ThompsonRef, BuildError>>,
 {
- let first = it.next().expect("alternations must be non-empty")?;
+ let first = match it.next() {
+ None => return self.c_fail(),
+ Some(result) => result?,
+ };
 let second = match it.next() {
 None => return Ok(first),
 Some(result) => result?,
@@ -751,66 +1106,64 @@ impl Compiler {
 Ok(ThompsonRef { start: union, end })
 }

- fn c_group(
+ /// Compile the given capture sub-expression. `expr` should be the
+ /// sub-expression contained inside the capture. If "capture" states are
+ /// enabled, then they are added as appropriate.
+ ///
+ /// This accepts the pieces of a capture instead of a `hir::Capture` so
+ /// that it's easy to manufacture a "fake" group when necessary, e.g., for
+ /// adding the entire pattern as if it were a group in order to create
+ /// appropriate "capture" states in the NFA.
+ fn c_cap(
 &self,
- kind: &hir::GroupKind,
+ index: u32,
+ name: Option<&str>,
 expr: &Hir,
- ) -> Result<ThompsonRef, Error> {
- if !self.config.get_captures() {
- return self.c(expr);
+ ) -> Result<ThompsonRef, BuildError> {
+ match self.config.get_which_captures() {
+ // No capture states means we always skip them.
+ WhichCaptures::None => return self.c(expr),
+ // Implicit capture states means we only add when index==0 since
+ // index==0 implies the group is implicit.
+ WhichCaptures::Implicit if index > 0 => return self.c(expr),
+ _ => {}
 }
- let (capi, name) = match *kind {
- hir::GroupKind::NonCapturing => return self.c(expr),
- hir::GroupKind::CaptureIndex(index) => (index, None),
- hir::GroupKind::CaptureName { ref name, index } => {
- (index, Some(Arc::from(&**name)))
- }
- };
- let start = self.add_capture_start(capi, name)?;
+ let start = self.add_capture_start(index, name)?;
 let inner = self.c(expr)?;
- let end = self.add_capture_end(capi)?;
-
+ let end = self.add_capture_end(index)?;
 self.patch(start, inner.start)?;
 self.patch(inner.end, end)?;
 Ok(ThompsonRef { start, end })
 }

+ /// Compile the given repetition expression. This handles all types of
+ /// repetitions and greediness, dispatching on the `(min, max)` bounds as
+ /// restated in the standalone sketch just below.
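The following self-contained sketch restates that `(min, max)` dispatch outside the compiler; the names here are illustrative only and are not part of the crate:

```rust
// Which compilation strategy 'c_repetition' picks for an HIR repetition,
// keyed on its (min, max) bounds. Names are illustrative only.
#[derive(Debug, PartialEq)]
enum Strategy {
    ZeroOrOne,         // e?   => (0, Some(1))
    AtLeast(u32),      // e{n,}, including e* as (0, None) and e+ as (1, None)
    Exactly(u32),      // e{n} => (n, Some(n))
    Bounded(u32, u32), // e{m,n} with m < n
}

fn pick(min: u32, max: Option<u32>) -> Strategy {
    match (min, max) {
        (0, Some(1)) => Strategy::ZeroOrOne,
        (min, None) => Strategy::AtLeast(min),
        (min, Some(max)) if min == max => Strategy::Exactly(min),
        (min, Some(max)) => Strategy::Bounded(min, max),
    }
}

fn main() {
    assert_eq!(Strategy::ZeroOrOne, pick(0, Some(1)));     // a?
    assert_eq!(Strategy::AtLeast(0), pick(0, None));       // a*
    assert_eq!(Strategy::Exactly(3), pick(3, Some(3)));    // a{3}
    assert_eq!(Strategy::Bounded(2, 4), pick(2, Some(4))); // a{2,4}
}
```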
fn c_repetition( &self, rep: &hir::Repetition, - ) -> Result<ThompsonRef, Error> { - match rep.kind { - hir::RepetitionKind::ZeroOrOne => { - self.c_zero_or_one(&rep.hir, rep.greedy) - } - hir::RepetitionKind::ZeroOrMore => { - self.c_at_least(&rep.hir, rep.greedy, 0) - } - hir::RepetitionKind::OneOrMore => { - self.c_at_least(&rep.hir, rep.greedy, 1) - } - hir::RepetitionKind::Range(ref rng) => match *rng { - hir::RepetitionRange::Exactly(count) => { - self.c_exactly(&rep.hir, count) - } - hir::RepetitionRange::AtLeast(m) => { - self.c_at_least(&rep.hir, rep.greedy, m) - } - hir::RepetitionRange::Bounded(min, max) => { - self.c_bounded(&rep.hir, rep.greedy, min, max) - } - }, + ) -> Result<ThompsonRef, BuildError> { + match (rep.min, rep.max) { + (0, Some(1)) => self.c_zero_or_one(&rep.sub, rep.greedy), + (min, None) => self.c_at_least(&rep.sub, rep.greedy, min), + (min, Some(max)) if min == max => self.c_exactly(&rep.sub, min), + (min, Some(max)) => self.c_bounded(&rep.sub, rep.greedy, min, max), } } + /// Compile the given expression such that it matches at least `min` times, + /// but no more than `max` times. + /// + /// When `greedy` is true, then the preference is for the expression to + /// match as much as possible. Otherwise, it will match as little as + /// possible. fn c_bounded( &self, expr: &Hir, greedy: bool, min: u32, max: u32, - ) -> Result<ThompsonRef, Error> { + ) -> Result<ThompsonRef, BuildError> { let prefix = self.c_exactly(expr, min)?; if min == max { return Ok(prefix); @@ -851,7 +1204,7 @@ impl Compiler { let union = if greedy { self.add_union() } else { - self.add_reverse_union() + self.add_union_reverse() }?; let compiled = self.c(expr)?; self.patch(prev_end, union)?; @@ -863,22 +1216,29 @@ impl Compiler { Ok(ThompsonRef { start: prefix.start, end: empty }) } + /// Compile the given expression such that it may be matched `n` or more + /// times, where `n` can be any integer. (Although a particularly large + /// integer is likely to run afoul of any configured size limits.) + /// + /// When `greedy` is true, then the preference is for the expression to + /// match as much as possible. Otherwise, it will match as little as + /// possible. fn c_at_least( &self, expr: &Hir, greedy: bool, n: u32, - ) -> Result<ThompsonRef, Error> { + ) -> Result<ThompsonRef, BuildError> { if n == 0 { // When the expression cannot match the empty string, then we // can get away with something much simpler: just one 'alt' // instruction that optionally repeats itself. But if the expr // can match the empty string... see below. 
- if !expr.is_match_empty() {
+ if expr.properties().minimum_len().map_or(false, |len| len > 0) {
 let union = if greedy {
 self.add_union()
 } else {
- self.add_reverse_union()
+ self.add_union_reverse()
 }?;
 let compiled = self.c(expr)?;
 self.patch(union, compiled.start)?;
@@ -898,7 +1258,7 @@ impl Compiler {
 let plus = if greedy {
 self.add_union()
 } else {
- self.add_reverse_union()
+ self.add_union_reverse()
 }?;
 self.patch(compiled.end, plus)?;
 self.patch(plus, compiled.start)?;
@@ -906,7 +1266,7 @@ impl Compiler {
 let question = if greedy {
 self.add_union()
 } else {
- self.add_reverse_union()
+ self.add_union_reverse()
 }?;
 let empty = self.add_empty()?;
 self.patch(question, compiled.start)?;
@@ -918,7 +1278,7 @@ impl Compiler {
 let union = if greedy {
 self.add_union()
 } else {
- self.add_reverse_union()
+ self.add_union_reverse()
 }?;
 self.patch(compiled.end, union)?;
 self.patch(union, compiled.start)?;
@@ -929,7 +1289,7 @@ impl Compiler {
 let union = if greedy {
 self.add_union()
 } else {
- self.add_reverse_union()
+ self.add_union_reverse()
 }?;
 self.patch(prefix.end, last.start)?;
 self.patch(last.end, union)?;
@@ -938,13 +1298,19 @@ impl Compiler {
 }
 }

+ /// Compile the given expression such that it may be matched zero or one
+ /// times.
+ ///
+ /// When `greedy` is true, then the preference is for the expression to
+ /// match as much as possible. Otherwise, it will match as little as
+ /// possible.
 fn c_zero_or_one(
 &self,
 expr: &Hir,
 greedy: bool,
- ) -> Result<ThompsonRef, Error> {
+ ) -> Result<ThompsonRef, BuildError> {
 let union =
- if greedy { self.add_union() } else { self.add_reverse_union() }?;
+ if greedy { self.add_union() } else { self.add_union_reverse() }?;
 let compiled = self.c(expr)?;
 let empty = self.add_empty()?;
 self.patch(union, compiled.start)?;
@@ -953,15 +1319,30 @@ impl Compiler {
 Ok(ThompsonRef { start: union, end: empty })
 }

- fn c_exactly(&self, expr: &Hir, n: u32) -> Result<ThompsonRef, Error> {
+ /// Compile the given HIR expression exactly `n` times.
+ fn c_exactly(
+ &self,
+ expr: &Hir,
+ n: u32,
+ ) -> Result<ThompsonRef, BuildError> {
 let it = (0..n).map(|_| self.c(expr));
 self.c_concat(it)
 }

+ /// Compile the given byte oriented character class.
+ ///
+ /// This uses "sparse" states to represent an alternation between ranges in
+ /// this character class. We can use "sparse" states instead of stitching
+ /// together a "union" state because all ranges in a character class have
+ /// equal priority *and* are non-overlapping (thus, only one can match, so
+ /// there's never a question of priority in the first place). This saves a
+ /// fair bit of overhead when traversing an NFA.
+ ///
+ /// This routine compiles an empty character class into a "fail" state.
 fn c_byte_class(
 &self,
 cls: &hir::ClassBytes,
- ) -> Result<ThompsonRef, Error> {
+ ) -> Result<ThompsonRef, BuildError> {
 let end = self.add_empty()?;
 let mut trans = Vec::with_capacity(cls.ranges().len());
 for r in cls.iter() {
@@ -974,22 +1355,36 @@ impl Compiler {
 Ok(ThompsonRef { start: self.add_sparse(trans)?, end })
 }
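The sparse representation for byte classes is visible from the outside by building an NFA for a small class and inspecting its Debug output; a minimal sketch using only the public `NFA::new` constructor (the printed shape also includes the unanchored prefix states):

```rust
use regex_automata::nfa::thompson::NFA;

fn main() {
    // '[x-za-c]' has two non-overlapping ranges with equal priority, so
    // it compiles to a single sparse state rather than a union.
    let nfa = NFA::new(r"[x-za-c]").unwrap();
    println!("{:?}", nfa);
}
```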
+ /// Compile the given Unicode character class.
+ ///
+ /// This routine specifically tries to use various types of compression,
+ /// since UTF-8 automata of large classes can get quite large. The specific
+ /// type of compression used depends on forward vs reverse compilation, and
+ /// whether NFA shrinking is enabled or not.
+ ///
+ /// Aside from repetitions causing lots of repeated groups, this is likely
+ /// the single most expensive part of regex compilation. Therefore, a large
+ /// part of the expense of compilation may be reduced by disabling Unicode
+ /// in the pattern.
+ ///
+ /// This routine compiles an empty character class into a "fail" state.
 fn c_unicode_class(
 &self,
 cls: &hir::ClassUnicode,
- ) -> Result<ThompsonRef, Error> {
+ ) -> Result<ThompsonRef, BuildError> {
 // If all we have are ASCII ranges wrapped in a Unicode package, then
 // there is zero reason to bring out the big guns. We can fit all ASCII
 // ranges within a single sparse state.
- if cls.is_all_ascii() {
+ if cls.is_ascii() {
 let end = self.add_empty()?;
 let mut trans = Vec::with_capacity(cls.ranges().len());
 for r in cls.iter() {
- assert!(r.start() <= '\x7F');
- assert!(r.end() <= '\x7F');
+ // The unwraps below are OK because we've verified that this
+ // class only contains ASCII codepoints.
 trans.push(Transition {
+ // FIXME(1.59): use the 'TryFrom<char> for u8' impl.
- start: r.start() as u8,
- end: r.end() as u8,
+ start: u8::try_from(u32::from(r.start())).unwrap(),
+ end: u8::try_from(u32::from(r.end())).unwrap(),
 next: end,
 });
 }
@@ -1022,8 +1417,10 @@ impl Compiler {
 trie.insert(seq.as_slice());
 }
 }
+ let mut builder = self.builder.borrow_mut();
 let mut utf8_state = self.utf8_state.borrow_mut();
- let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state)?;
+ let mut utf8c =
+ Utf8Compiler::new(&mut *builder, &mut *utf8_state)?;
 trie.iter(|seq| {
 utf8c.add(&seq)?;
 Ok(())
@@ -1035,8 +1432,10 @@ impl Compiler {
 // because we can stream it right into the UTF-8 compiler. There
 // is almost no downside (in either memory or time) to using this
 // approach.
+ let mut builder = self.builder.borrow_mut();
 let mut utf8_state = self.utf8_state.borrow_mut();
- let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state)?;
+ let mut utf8c =
+ Utf8Compiler::new(&mut *builder, &mut *utf8_state)?;
 for rng in cls.iter() {
 for seq in Utf8Sequences::new(rng.start(), rng.end()) {
 utf8c.add(seq.as_slice())?;
@@ -1058,7 +1457,23 @@ impl Compiler {
 //
 // The code below is kept as a reference point in order to make it
 // easier to understand the higher level goal here. Although, it will
- // almost certainly bit-rot, so keep that in mind.
+ // almost certainly bit-rot, so keep that in mind. Also, if you try to
+ // use it, some of the tests in this module will fail because they look
+ // for terser byte code produced by the more optimized handling above.
+ // But the integration test suite should still pass.
+ //
+ // One good example of the substantial difference this can make is to
+ // compare and contrast performance of the Pike VM when the code below
+ // is active vs the code above. Here's an example to try:
+ //
+ // regex-cli find match pikevm -b -p '(?m)^\w{20}' -y '@$smallishru'
+ //
+ // With Unicode classes generated below, this search takes about 45s on
+ // my machine. But with the compressed version above, the search takes
+ // only around 1.4s. The NFA is also 20% smaller. This is in part due
+ // to the compression, but also because of the utilization of 'sparse'
+ // NFA states. They lead to much less state shuffling during the NFA
+ // search.
 /*
 let it = cls
 .iter()
@@ -1070,14 +1485,29 @@ impl Compiler {
 .map(|rng| self.c_range(rng.start, rng.end));
 self.c_concat(it)
 });
- self.c_alternation(it)
+ self.c_alt_iter(it)
 */
 }
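The compression above starts from the fact that any codepoint range decomposes into a small set of byte-range sequences. That decomposition is observable directly through `regex-syntax`'s `utf8` module; a small sketch (the exact Debug formatting of a sequence is approximate):

```rust
use regex_syntax::utf8::Utf8Sequences;

fn main() {
    // U+03B1..U+03B4 (the Greek range used in the tests below) needs just
    // one two-byte sequence, which is why its NFA needs only two states.
    for seq in Utf8Sequences::new('\u{03B1}', '\u{03B4}') {
        println!("{:?}", seq); // roughly: [CE][B1-B4]
    }
}
```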
+ /// Compile the given Unicode character class in reverse with suffix
+ /// caching.
+ ///
+ /// This is a "quick" way to compile large Unicode classes into reverse
+ /// UTF-8 automata while doing a small amount of compression on those
+ /// automata by reusing common suffixes.
+ ///
+ /// A more comprehensive compression scheme can be accomplished by using
+ /// a range trie to efficiently sort a reverse sequence of UTF-8 byte
+ /// ranges, and then use Daciuk's algorithm via `Utf8Compiler`.
+ ///
+ /// This is the technique used when "NFA shrinking" is disabled.
+ ///
+ /// (This also tries to use "sparse" states where possible, just like
+ /// `c_byte_class` does.)
 fn c_unicode_class_reverse_with_suffix(
 &self,
 cls: &hir::ClassUnicode,
- ) -> Result<ThompsonRef, Error> {
+ ) -> Result<ThompsonRef, BuildError> {
 // N.B. It would likely be better to cache common *prefixes* in the
 // reverse direction, but it's not quite clear how to do that. The
 // advantage of caching suffixes is that it does give us a win, and
@@ -1113,229 +1543,178 @@ impl Compiler {
 Ok(ThompsonRef { start: union, end: alt_end })
 }

- fn c_anchor(&self, anchor: &Anchor) -> Result<ThompsonRef, Error> {
+ /// Compile the given HIR look-around assertion to an NFA look-around
+ /// assertion.
+ fn c_look(&self, anchor: &hir::Look) -> Result<ThompsonRef, BuildError> {
 let look = match *anchor {
- Anchor::StartLine => Look::StartLine,
- Anchor::EndLine => Look::EndLine,
- Anchor::StartText => Look::StartText,
- Anchor::EndText => Look::EndText,
+ hir::Look::Start => Look::Start,
+ hir::Look::End => Look::End,
+ hir::Look::StartLF => Look::StartLF,
+ hir::Look::EndLF => Look::EndLF,
+ hir::Look::StartCRLF => Look::StartCRLF,
+ hir::Look::EndCRLF => Look::EndCRLF,
+ hir::Look::WordAscii => Look::WordAscii,
+ hir::Look::WordAsciiNegate => Look::WordAsciiNegate,
+ hir::Look::WordUnicode => Look::WordUnicode,
+ hir::Look::WordUnicodeNegate => Look::WordUnicodeNegate,
 };
 let id = self.add_look(look)?;
 Ok(ThompsonRef { start: id, end: id })
 }

- fn c_word_boundary(
- &self,
- wb: &WordBoundary,
- ) -> Result<ThompsonRef, Error> {
- let look = match *wb {
- WordBoundary::Unicode => Look::WordBoundaryUnicode,
- WordBoundary::UnicodeNegate => Look::WordBoundaryUnicodeNegate,
- WordBoundary::Ascii => Look::WordBoundaryAscii,
- WordBoundary::AsciiNegate => Look::WordBoundaryAsciiNegate,
- };
- let id = self.add_look(look)?;
- Ok(ThompsonRef { start: id, end: id })
- }
-
- fn c_char(&self, ch: char) -> Result<ThompsonRef, Error> {
- let mut buf = [0; 4];
- let it = ch
- .encode_utf8(&mut buf)
- .as_bytes()
- .iter()
- .map(|&b| self.c_range(b, b));
- self.c_concat(it)
+ /// Compile the given byte string to a concatenation of bytes.
+ fn c_literal(&self, bytes: &[u8]) -> Result<ThompsonRef, BuildError> {
+ self.c_concat(bytes.iter().copied().map(|b| self.c_range(b, b)))
 }

- fn c_range(&self, start: u8, end: u8) -> Result<ThompsonRef, Error> {
+ /// Compile a "range" state with one transition that may only be followed
+ /// if the input byte is in the (inclusive) range given.
+ ///
+ /// Both the `start` and `end` locations point to the state created.
+ /// Callers will likely want to keep the `start`, but patch the `end` to
+ /// point to some other state.
+ fn c_range(&self, start: u8, end: u8) -> Result<ThompsonRef, BuildError> {
 let id = self.add_range(start, end)?;
 Ok(ThompsonRef { start: id, end: id })
 }

- fn c_empty(&self) -> Result<ThompsonRef, Error> {
+ /// Compile an "empty" state with one unconditional epsilon transition.
+ ///
+ /// Both the `start` and `end` locations point to the state created.
+ /// Callers will likely want to keep the `start`, but patch the `end` to + /// point to some other state. + fn c_empty(&self) -> Result<ThompsonRef, BuildError> { let id = self.add_empty()?; Ok(ThompsonRef { start: id, end: id }) } - fn c_unanchored_prefix_valid_utf8(&self) -> Result<ThompsonRef, Error> { - self.c_at_least(&Hir::any(false), false, 0) + /// Compile a "fail" state that can never have any outgoing transitions. + fn c_fail(&self) -> Result<ThompsonRef, BuildError> { + let id = self.add_fail()?; + Ok(ThompsonRef { start: id, end: id }) } - fn c_unanchored_prefix_invalid_utf8(&self) -> Result<ThompsonRef, Error> { - self.c_at_least(&Hir::any(true), false, 0) - } + // The below helpers are meant to be simple wrappers around the + // corresponding Builder methods. For the most part, they let us write + // 'self.add_foo()' instead of 'self.builder.borrow_mut().add_foo()', where + // the latter is a mouthful. Some of the methods do inject a little bit + // of extra logic. e.g., Flipping look-around operators when compiling in + // reverse mode. - fn patch(&self, from: StateID, to: StateID) -> Result<(), Error> { - let old_memory_cstates = self.memory_cstates.get(); - match self.states.borrow_mut()[from] { - CState::Empty { ref mut next } => { - *next = to; - } - CState::Range { ref mut range } => { - range.next = to; - } - CState::Sparse { .. } => { - panic!("cannot patch from a sparse NFA state") - } - CState::Look { ref mut next, .. } => { - *next = to; - } - CState::Union { ref mut alternates } => { - alternates.push(to); - self.memory_cstates - .set(old_memory_cstates + mem::size_of::<StateID>()); - } - CState::UnionReverse { ref mut alternates } => { - alternates.push(to); - self.memory_cstates - .set(old_memory_cstates + mem::size_of::<StateID>()); - } - CState::CaptureStart { ref mut next, .. } => { - *next = to; - } - CState::CaptureEnd { ref mut next, .. } => { - *next = to; - } - CState::Match { .. 
} => {} - } - if old_memory_cstates != self.memory_cstates.get() { - self.check_nfa_size_limit()?; - } - Ok(()) + fn patch(&self, from: StateID, to: StateID) -> Result<(), BuildError> { + self.builder.borrow_mut().patch(from, to) } - fn add_empty(&self) -> Result<StateID, Error> { - self.add_state(CState::Empty { next: StateID::ZERO }) + fn start_pattern(&self) -> Result<PatternID, BuildError> { + self.builder.borrow_mut().start_pattern() } - fn add_capture_start( + fn finish_pattern( &self, - capture_index: u32, - name: Option<Arc<str>>, - ) -> Result<StateID, Error> { - self.add_state(CState::CaptureStart { - next: StateID::ZERO, - capture_index, - name, - }) + start_id: StateID, + ) -> Result<PatternID, BuildError> { + self.builder.borrow_mut().finish_pattern(start_id) } - fn add_capture_end(&self, capture_index: u32) -> Result<StateID, Error> { - self.add_state(CState::CaptureEnd { - next: StateID::ZERO, - capture_index, - }) + fn add_empty(&self) -> Result<StateID, BuildError> { + self.builder.borrow_mut().add_empty() } - fn add_range(&self, start: u8, end: u8) -> Result<StateID, Error> { - let trans = Transition { start, end, next: StateID::ZERO }; - self.add_state(CState::Range { range: trans }) + fn add_range(&self, start: u8, end: u8) -> Result<StateID, BuildError> { + self.builder.borrow_mut().add_range(Transition { + start, + end, + next: StateID::ZERO, + }) } - fn add_sparse(&self, ranges: Vec<Transition>) -> Result<StateID, Error> { - if ranges.len() == 1 { - self.add_state(CState::Range { range: ranges[0] }) - } else { - self.add_state(CState::Sparse { ranges }) - } + fn add_sparse( + &self, + ranges: Vec<Transition>, + ) -> Result<StateID, BuildError> { + self.builder.borrow_mut().add_sparse(ranges) } - fn add_look(&self, mut look: Look) -> Result<StateID, Error> { + fn add_look(&self, mut look: Look) -> Result<StateID, BuildError> { if self.is_reverse() { look = look.reversed(); } - self.add_state(CState::Look { look, next: StateID::ZERO }) + self.builder.borrow_mut().add_look(StateID::ZERO, look) } - fn add_union(&self) -> Result<StateID, Error> { - self.add_state(CState::Union { alternates: vec![] }) + fn add_union(&self) -> Result<StateID, BuildError> { + self.builder.borrow_mut().add_union(vec![]) } - fn add_reverse_union(&self) -> Result<StateID, Error> { - self.add_state(CState::UnionReverse { alternates: vec![] }) + fn add_union_reverse(&self) -> Result<StateID, BuildError> { + self.builder.borrow_mut().add_union_reverse(vec![]) } - fn add_match( + fn add_capture_start( &self, - pattern_id: PatternID, - start_id: StateID, - ) -> Result<StateID, Error> { - self.add_state(CState::Match { pattern_id, start_id }) - } - - fn add_state(&self, state: CState) -> Result<StateID, Error> { - let mut states = self.states.borrow_mut(); - let id = StateID::new(states.len()) - .map_err(|_| Error::too_many_states(states.len()))?; - self.memory_cstates - .set(self.memory_cstates.get() + state.memory_usage()); - states.push(state); - // If we don't explicitly drop this, then 'nfa_memory_usage' will also - // try to borrow it when we check the size limit and hit an error. 
- drop(states);
- self.check_nfa_size_limit()?;
- Ok(id)
+ capture_index: u32,
+ name: Option<&str>,
+ ) -> Result<StateID, BuildError> {
+ let name = name.map(|n| Arc::from(n));
+ self.builder.borrow_mut().add_capture_start(
+ StateID::ZERO,
+ capture_index,
+ name,
+ )
 }

- fn is_reverse(&self) -> bool {
- self.config.get_reverse()
+ fn add_capture_end(
+ &self,
+ capture_index: u32,
+ ) -> Result<StateID, BuildError> {
+ self.builder.borrow_mut().add_capture_end(StateID::ZERO, capture_index)
 }

- /// If an NFA size limit was set, this checks that the NFA compiled so far
- /// fits within that limit. If so, then nothing is returned. Otherwise, an
- /// error is returned.
- ///
- /// This should be called after increasing the heap usage of the
- /// intermediate NFA.
- ///
- /// Note that this borrows 'self.states', so callers should ensure there is
- /// no mutable borrow of it outstanding.
- fn check_nfa_size_limit(&self) -> Result<(), Error> {
- if let Some(limit) = self.config.get_nfa_size_limit() {
- if self.nfa_memory_usage() > limit {
- return Err(Error::exceeded_size_limit(limit));
- }
- }
- Ok(())
+ fn add_fail(&self) -> Result<StateID, BuildError> {
+ self.builder.borrow_mut().add_fail()
 }

- /// Returns the heap memory usage, in bytes, of the NFA compiled so far.
- ///
- /// Note that this is an approximation of how big the final NFA will be.
- /// In practice, the final NFA will likely be a bit smaller since it uses
- /// things like `Box<[T]>` instead of `Vec<T>`.
- fn nfa_memory_usage(&self) -> usize {
- self.states.borrow().len() * mem::size_of::<CState>()
- + self.memory_cstates.get()
+ fn add_match(&self) -> Result<StateID, BuildError> {
+ self.builder.borrow_mut().add_match()
 }
-}

-impl CState {
- fn memory_usage(&self) -> usize {
- match *self {
- CState::Empty { .. }
- | CState::Range { .. }
- | CState::Look { .. }
- | CState::CaptureStart { .. }
- | CState::CaptureEnd { .. }
- | CState::Match { .. } => 0,
- CState::Sparse { ref ranges } => {
- ranges.len() * mem::size_of::<Transition>()
- }
- CState::Union { ref alternates } => {
- alternates.len() * mem::size_of::<StateID>()
- }
- CState::UnionReverse { ref alternates } => {
- alternates.len() * mem::size_of::<StateID>()
- }
- }
+ fn is_reverse(&self) -> bool {
+ self.config.get_reverse()
 }
 }

+/// A value that represents the result of compiling a sub-expression of a
+/// regex's HIR. Specifically, this represents a sub-graph of the NFA that
+/// has an initial state at `start` and a final state at `end`.
+#[derive(Clone, Copy, Debug)]
+pub(crate) struct ThompsonRef {
+ pub(crate) start: StateID,
+ pub(crate) end: StateID,
+}
+
+/// A UTF-8 compiler based on Daciuk's algorithm for compiling minimal DFAs
+/// from a lexicographically sorted sequence of strings in linear time.
+///
+/// The trick here is that any Unicode codepoint range can be converted to
+/// a sequence of byte ranges that form a UTF-8 automaton. Connecting them
+/// together via an alternation is trivial, and indeed, it works. However,
+/// there is a lot of redundant structure in many UTF-8 automata. Since our
+/// UTF-8 ranges are in lexicographic order, we can use Daciuk's algorithm
+/// to build nearly minimal DFAs in linear time. (They are only nearly
+/// minimal because we use a bounded cache of previously built DFA states.)
+///
+/// The drawback is that this sadly doesn't work for reverse automata, since
+/// the ranges are no longer in lexicographic order. For that, we invented the
+/// range trie (which gets its own module).
Once a range trie is built, we then +/// use this same Utf8Compiler to build a reverse UTF-8 automaton. +/// +/// The high level idea is described here: +/// https://blog.burntsushi.net/transducers/#finite-state-machines-as-data-structures +/// +/// There is also another implementation of this in the `fst` crate. #[derive(Debug)] struct Utf8Compiler<'a> { - nfac: &'a Compiler, + builder: &'a mut Builder, state: &'a mut Utf8State, target: StateID, } @@ -1371,24 +1750,24 @@ impl Utf8State { impl<'a> Utf8Compiler<'a> { fn new( - nfac: &'a Compiler, + builder: &'a mut Builder, state: &'a mut Utf8State, - ) -> Result<Utf8Compiler<'a>, Error> { - let target = nfac.add_empty()?; + ) -> Result<Utf8Compiler<'a>, BuildError> { + let target = builder.add_empty()?; state.clear(); - let mut utf8c = Utf8Compiler { nfac, state, target }; + let mut utf8c = Utf8Compiler { builder, state, target }; utf8c.add_empty(); Ok(utf8c) } - fn finish(&mut self) -> Result<ThompsonRef, Error> { + fn finish(&mut self) -> Result<ThompsonRef, BuildError> { self.compile_from(0)?; let node = self.pop_root(); let start = self.compile(node)?; Ok(ThompsonRef { start, end: self.target }) } - fn add(&mut self, ranges: &[Utf8Range]) -> Result<(), Error> { + fn add(&mut self, ranges: &[Utf8Range]) -> Result<(), BuildError> { let prefix_len = ranges .iter() .zip(&self.state.uncompiled) @@ -1404,7 +1783,7 @@ impl<'a> Utf8Compiler<'a> { Ok(()) } - fn compile_from(&mut self, from: usize) -> Result<(), Error> { + fn compile_from(&mut self, from: usize) -> Result<(), BuildError> { let mut next = self.target; while from + 1 < self.state.uncompiled.len() { let node = self.pop_freeze(next); @@ -1414,12 +1793,15 @@ impl<'a> Utf8Compiler<'a> { Ok(()) } - fn compile(&mut self, node: Vec<Transition>) -> Result<StateID, Error> { + fn compile( + &mut self, + node: Vec<Transition>, + ) -> Result<StateID, BuildError> { let hash = self.state.compiled.hash(&node); if let Some(id) = self.state.compiled.get(&node, hash) { return Ok(id); } - let id = self.nfac.add_sparse(node.clone())?; + let id = self.builder.add_sparse(node.clone())?; self.state.compiled.set(node, hash, id); Ok(id) } @@ -1486,16 +1868,22 @@ impl Utf8Node { #[cfg(test)] mod tests { - use alloc::vec::Vec; + use alloc::{vec, vec::Vec}; - use super::{ - Builder, Config, PatternID, SparseTransitions, State, StateID, - Transition, NFA, + use crate::{ + nfa::thompson::{SparseTransitions, State, Transition, NFA}, + util::primitives::{PatternID, SmallIndex, StateID}, }; + use super::*; + fn build(pattern: &str) -> NFA { - Builder::new() - .configure(Config::new().captures(false).unanchored_prefix(false)) + NFA::compiler() + .configure( + NFA::config() + .which_captures(WhichCaptures::None) + .unanchored_prefix(false), + ) .build(pattern) .unwrap() } @@ -1511,17 +1899,17 @@ mod tests { fn s_byte(byte: u8, next: usize) -> State { let next = sid(next); let trans = Transition { start: byte, end: byte, next }; - State::Range { range: trans } + State::ByteRange { trans } } fn s_range(start: u8, end: u8, next: usize) -> State { let next = sid(next); let trans = Transition { start, end, next }; - State::Range { range: trans } + State::ByteRange { trans } } - fn s_sparse(ranges: &[(u8, u8, usize)]) -> State { - let ranges = ranges + fn s_sparse(transitions: &[(u8, u8, usize)]) -> State { + let transitions = transitions .iter() .map(|&(start, end, next)| Transition { start, @@ -1529,7 +1917,11 @@ mod tests { next: sid(next), }) .collect(); - State::Sparse(SparseTransitions { ranges }) + 
State::Sparse(SparseTransitions { transitions }) + } + + fn s_bin_union(alt1: usize, alt2: usize) -> State { + State::BinaryUnion { alt1: sid(alt1), alt2: sid(alt2) } } fn s_union(alts: &[usize]) -> State { @@ -1542,34 +1934,35 @@ mod tests { } } + fn s_cap(next: usize, pattern: usize, index: usize, slot: usize) -> State { + State::Capture { + next: sid(next), + pattern_id: pid(pattern), + group_index: SmallIndex::new(index).unwrap(), + slot: SmallIndex::new(slot).unwrap(), + } + } + + fn s_fail() -> State { + State::Fail + } + fn s_match(id: usize) -> State { - State::Match { id: pid(id) } + State::Match { pattern_id: pid(id) } } // Test that building an unanchored NFA has an appropriate `(?s:.)*?` // prefix. #[test] fn compile_unanchored_prefix() { - // When the machine can only match valid UTF-8. - let nfa = Builder::new() - .configure(Config::new().captures(false)) - .build(r"a") - .unwrap(); - // There should be many states since the `.` in `(?s:.)*?` matches any - // Unicode scalar value. - assert_eq!(11, nfa.len()); - assert_eq!(nfa.states[10], s_match(0)); - assert_eq!(nfa.states[9], s_byte(b'a', 10)); - - // When the machine can match through invalid UTF-8. - let nfa = Builder::new() - .configure(Config::new().captures(false).utf8(false)) + let nfa = NFA::compiler() + .configure(NFA::config().which_captures(WhichCaptures::None)) .build(r"a") .unwrap(); assert_eq!( - nfa.states, + nfa.states(), &[ - s_union(&[2, 1]), + s_bin_union(2, 1), s_range(0, 255, 0), s_byte(b'a', 3), s_match(0), @@ -1579,51 +1972,55 @@ mod tests { #[test] fn compile_empty() { - assert_eq!(build("").states, &[s_match(0),]); + assert_eq!(build("").states(), &[s_match(0),]); } #[test] fn compile_literal() { - assert_eq!(build("a").states, &[s_byte(b'a', 1), s_match(0),]); + assert_eq!(build("a").states(), &[s_byte(b'a', 1), s_match(0),]); assert_eq!( - build("ab").states, + build("ab").states(), &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0),] ); assert_eq!( - build("☃").states, + build("☃").states(), &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(0)] ); // Check that non-UTF-8 literals work. 
- let nfa = Builder::new() + let nfa = NFA::compiler() .configure( - Config::new() - .captures(false) - .utf8(false) + NFA::config() + .which_captures(WhichCaptures::None) .unanchored_prefix(false), ) - .syntax(crate::SyntaxConfig::new().utf8(false)) + .syntax(crate::util::syntax::Config::new().utf8(false)) .build(r"(?-u)\xFF") .unwrap(); - assert_eq!(nfa.states, &[s_byte(b'\xFF', 1), s_match(0),]); + assert_eq!(nfa.states(), &[s_byte(b'\xFF', 1), s_match(0),]); } #[test] - fn compile_class() { + fn compile_class_ascii() { assert_eq!( - build(r"[a-z]").states, + build(r"[a-z]").states(), &[s_range(b'a', b'z', 1), s_match(0),] ); assert_eq!( - build(r"[x-za-c]").states, + build(r"[x-za-c]").states(), &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match(0)] ); + } + + #[test] + #[cfg(not(miri))] + fn compile_class_unicode() { assert_eq!( - build(r"[\u03B1-\u03B4]").states, + build(r"[\u03B1-\u03B4]").states(), &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match(0)] ); assert_eq!( - build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states, + build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states(), &[ s_range(0xB1, 0xB4, 5), s_range(0x99, 0x9E, 5), @@ -1634,7 +2031,7 @@ mod tests { ] ); assert_eq!( - build(r"[a-z☃]").states, + build(r"[a-z☃]").states(), &[ s_byte(0x83, 3), s_byte(0x98, 0), @@ -1647,67 +2044,214 @@ mod tests { #[test] fn compile_repetition() { assert_eq!( - build(r"a?").states, - &[s_union(&[1, 2]), s_byte(b'a', 2), s_match(0),] + build(r"a?").states(), + &[s_bin_union(1, 2), s_byte(b'a', 2), s_match(0),] ); assert_eq!( - build(r"a??").states, - &[s_union(&[2, 1]), s_byte(b'a', 2), s_match(0),] + build(r"a??").states(), + &[s_bin_union(2, 1), s_byte(b'a', 2), s_match(0),] ); } #[test] fn compile_group() { assert_eq!( - build(r"ab+").states, - &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[1, 3]), s_match(0)] + build(r"ab+").states(), + &[s_byte(b'a', 1), s_byte(b'b', 2), s_bin_union(1, 3), s_match(0)] ); assert_eq!( - build(r"(ab)").states, + build(r"(ab)").states(), &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0)] ); assert_eq!( - build(r"(ab)+").states, - &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[0, 3]), s_match(0)] + build(r"(ab)+").states(), + &[s_byte(b'a', 1), s_byte(b'b', 2), s_bin_union(0, 3), s_match(0)] ); } #[test] fn compile_alternation() { assert_eq!( - build(r"a|b").states, - &[s_byte(b'a', 3), s_byte(b'b', 3), s_union(&[0, 1]), s_match(0)] + build(r"a|b").states(), + &[s_range(b'a', b'b', 1), s_match(0)] + ); + assert_eq!( + build(r"ab|cd").states(), + &[ + s_byte(b'b', 3), + s_byte(b'd', 3), + s_sparse(&[(b'a', b'a', 0), (b'c', b'c', 1)]), + s_match(0) + ], ); assert_eq!( - build(r"|b").states, - &[s_byte(b'b', 2), s_union(&[2, 0]), s_match(0)] + build(r"|b").states(), + &[s_byte(b'b', 2), s_bin_union(2, 0), s_match(0)] ); assert_eq!( - build(r"a|").states, - &[s_byte(b'a', 2), s_union(&[0, 2]), s_match(0)] + build(r"a|").states(), + &[s_byte(b'a', 2), s_bin_union(0, 2), s_match(0)] ); } + // This tests the use of a non-binary union, i.e., a state with more than + // 2 unconditional epsilon transitions. The only place they tend to appear + // is in reverse NFAs when shrinking is disabled. Otherwise, 'binary-union' + // and 'sparse' tend to cover all other cases of alternation. 
#[test] - fn many_start_pattern() { - let nfa = Builder::new() - .configure(Config::new().captures(false).unanchored_prefix(false)) + fn compile_non_binary_union() { + let nfa = NFA::compiler() + .configure( + NFA::config() + .which_captures(WhichCaptures::None) + .reverse(true) + .shrink(false) + .unanchored_prefix(false), + ) + .build(r"[\u1000\u2000\u3000]") + .unwrap(); + assert_eq!( + nfa.states(), + &[ + s_union(&[3, 6, 9]), + s_byte(0xE1, 10), + s_byte(0x80, 1), + s_byte(0x80, 2), + s_byte(0xE2, 10), + s_byte(0x80, 4), + s_byte(0x80, 5), + s_byte(0xE3, 10), + s_byte(0x80, 7), + s_byte(0x80, 8), + s_match(0), + ] + ); + } + + #[test] + fn compile_many_start_pattern() { + let nfa = NFA::compiler() + .configure( + NFA::config() + .which_captures(WhichCaptures::None) + .unanchored_prefix(false), + ) .build_many(&["a", "b"]) .unwrap(); assert_eq!( - nfa.states, + nfa.states(), &[ s_byte(b'a', 1), s_match(0), s_byte(b'b', 3), s_match(1), - s_union(&[0, 2]), + s_bin_union(0, 2), ] ); assert_eq!(nfa.start_anchored().as_usize(), 4); assert_eq!(nfa.start_unanchored().as_usize(), 4); // Test that the start states for each individual pattern are correct. - assert_eq!(nfa.start_pattern(pid(0)), sid(0)); - assert_eq!(nfa.start_pattern(pid(1)), sid(2)); + assert_eq!(nfa.start_pattern(pid(0)).unwrap(), sid(0)); + assert_eq!(nfa.start_pattern(pid(1)).unwrap(), sid(2)); + } + + // This tests that our compiler can handle an empty character class. At the + // time of writing, the regex parser forbids it, so the only way to test it + // is to provide a hand written HIR. + #[test] + fn empty_class_bytes() { + use regex_syntax::hir::{Class, ClassBytes, Hir}; + + let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![]))); + let config = NFA::config() + .which_captures(WhichCaptures::None) + .unanchored_prefix(false); + let nfa = + NFA::compiler().configure(config).build_from_hir(&hir).unwrap(); + assert_eq!(nfa.states(), &[s_fail(), s_match(0)]); + } + + // Like empty_class_bytes, but for a Unicode class. 
+ #[test]
+ fn empty_class_unicode() {
+ use regex_syntax::hir::{Class, ClassUnicode, Hir};
+
+ let hir = Hir::class(Class::Unicode(ClassUnicode::new(vec![])));
+ let config = NFA::config()
+ .which_captures(WhichCaptures::None)
+ .unanchored_prefix(false);
+ let nfa =
+ NFA::compiler().configure(config).build_from_hir(&hir).unwrap();
+ assert_eq!(nfa.states(), &[s_fail(), s_match(0)]);
+ }
+
+ #[test]
+ fn compile_captures_all() {
+ let nfa = NFA::compiler()
+ .configure(
+ NFA::config()
+ .unanchored_prefix(false)
+ .which_captures(WhichCaptures::All),
+ )
+ .build("a(b)c")
+ .unwrap();
+ assert_eq!(
+ nfa.states(),
+ &[
+ s_cap(1, 0, 0, 0),
+ s_byte(b'a', 2),
+ s_cap(3, 0, 1, 2),
+ s_byte(b'b', 4),
+ s_cap(5, 0, 1, 3),
+ s_byte(b'c', 6),
+ s_cap(7, 0, 0, 1),
+ s_match(0)
+ ]
+ );
+ let ginfo = nfa.group_info();
+ assert_eq!(2, ginfo.all_group_len());
+ }
+
+ #[test]
+ fn compile_captures_implicit() {
+ let nfa = NFA::compiler()
+ .configure(
+ NFA::config()
+ .unanchored_prefix(false)
+ .which_captures(WhichCaptures::Implicit),
+ )
+ .build("a(b)c")
+ .unwrap();
+ assert_eq!(
+ nfa.states(),
+ &[
+ s_cap(1, 0, 0, 0),
+ s_byte(b'a', 2),
+ s_byte(b'b', 3),
+ s_byte(b'c', 4),
+ s_cap(5, 0, 0, 1),
+ s_match(0)
+ ]
+ );
+ let ginfo = nfa.group_info();
+ assert_eq!(1, ginfo.all_group_len());
+ }
+
+ #[test]
+ fn compile_captures_none() {
+ let nfa = NFA::compiler()
+ .configure(
+ NFA::config()
+ .unanchored_prefix(false)
+ .which_captures(WhichCaptures::None),
+ )
+ .build("a(b)c")
+ .unwrap();
+ assert_eq!(
+ nfa.states(),
+ &[s_byte(b'a', 1), s_byte(b'b', 2), s_byte(b'c', 3), s_match(0)]
+ );
+ let ginfo = nfa.group_info();
+ assert_eq!(0, ginfo.all_group_len());
+ }
 }
diff --git a/vendor/regex-automata/src/nfa/thompson/error.rs b/vendor/regex-automata/src/nfa/thompson/error.rs
index 52f02e888..3c2fa8a21 100644
--- a/vendor/regex-automata/src/nfa/thompson/error.rs
+++ b/vendor/regex-automata/src/nfa/thompson/error.rs
@@ -1,6 +1,9 @@
-use crate::util::id::{PatternID, StateID};
+use crate::util::{
+ captures, look,
+ primitives::{PatternID, StateID},
+};

-/// An error that can occured during the construction of a thompson NFA.
+/// An error that can occur during the construction of a thompson NFA.
 ///
 /// This error does not provide many introspection capabilities. There are
 /// generally only two things you can do with it:
@@ -15,17 +18,27 @@ use crate::util::id::{PatternID, StateID};
 /// set by [`Config::nfa_size_limit`](crate::nfa::thompson::Config), then
 /// building the NFA will fail.
 #[derive(Clone, Debug)]
-pub struct Error {
- kind: ErrorKind,
+pub struct BuildError {
+ kind: BuildErrorKind,
 }

 /// The kind of error that occurred during the construction of a thompson NFA.
 #[derive(Clone, Debug)]
-enum ErrorKind {
+enum BuildErrorKind {
 /// An error that occurred while parsing a regular expression. Note that
 /// this error may be printed over multiple lines, and is generally
 /// intended to be end user readable on its own.
+ #[cfg(feature = "syntax")]
 Syntax(regex_syntax::Error),
+ /// An error that occurs if the capturing groups provided to an NFA builder
+ /// do not satisfy the documented invariants. For example, things like
+ /// too many groups, missing groups, having the first (zeroth) group be
+ /// named or duplicate group names within the same pattern.
+ Captures(captures::GroupInfoError),
+ /// An error that occurs when an NFA contains a Unicode word boundary, but
+ /// where the crate was compiled without the necessary data for dealing
+ /// with Unicode word boundaries.
+ Word(look::UnicodeWordBoundaryError), /// An error that occurs if too many patterns were given to the NFA /// compiler. TooManyPatterns { @@ -49,96 +62,123 @@ enum ErrorKind { limit: usize, }, /// An error that occurs when an invalid capture group index is added to - /// the NFA. An "invalid" index can be one that is too big (e.g., results - /// in an integer overflow) or one that is discontinuous from previous - /// capture group indices added. + /// the NFA. An "invalid" index can be one that would otherwise overflow + /// a `usize` on the current target. InvalidCaptureIndex { /// The invalid index that was given. - index: usize, + index: u32, }, - /// An error that occurs when an NFA contains a Unicode word boundary, but - /// where the crate was compiled without the necessary data for dealing - /// with Unicode word boundaries. - UnicodeWordUnavailable, + /// An error that occurs when one tries to build a reverse NFA with + /// captures enabled. Currently, this isn't supported, but we probably + /// should support it at some point. + #[cfg(feature = "syntax")] + UnsupportedCaptures, } -impl Error { - fn kind(&self) -> &ErrorKind { +impl BuildError { + /// If this error occurred because the NFA exceeded the configured size + /// limit before being built, then this returns the configured size limit. + /// + /// The limit returned is what was configured, and corresponds to the + /// maximum amount of heap usage in bytes. + pub fn size_limit(&self) -> Option<usize> { + match self.kind { + BuildErrorKind::ExceededSizeLimit { limit } => Some(limit), + _ => None, + } + } + + fn kind(&self) -> &BuildErrorKind { &self.kind } - pub(crate) fn syntax(err: regex_syntax::Error) -> Error { - Error { kind: ErrorKind::Syntax(err) } + #[cfg(feature = "syntax")] + pub(crate) fn syntax(err: regex_syntax::Error) -> BuildError { + BuildError { kind: BuildErrorKind::Syntax(err) } + } + + pub(crate) fn captures(err: captures::GroupInfoError) -> BuildError { + BuildError { kind: BuildErrorKind::Captures(err) } + } + + pub(crate) fn word(err: look::UnicodeWordBoundaryError) -> BuildError { + BuildError { kind: BuildErrorKind::Word(err) } } - pub(crate) fn too_many_patterns(given: usize) -> Error { + pub(crate) fn too_many_patterns(given: usize) -> BuildError { let limit = PatternID::LIMIT; - Error { kind: ErrorKind::TooManyPatterns { given, limit } } + BuildError { kind: BuildErrorKind::TooManyPatterns { given, limit } } } - pub(crate) fn too_many_states(given: usize) -> Error { + pub(crate) fn too_many_states(given: usize) -> BuildError { let limit = StateID::LIMIT; - Error { kind: ErrorKind::TooManyStates { given, limit } } + BuildError { kind: BuildErrorKind::TooManyStates { given, limit } } } - pub(crate) fn exceeded_size_limit(limit: usize) -> Error { - Error { kind: ErrorKind::ExceededSizeLimit { limit } } + pub(crate) fn exceeded_size_limit(limit: usize) -> BuildError { + BuildError { kind: BuildErrorKind::ExceededSizeLimit { limit } } } - pub(crate) fn invalid_capture_index(index: usize) -> Error { - Error { kind: ErrorKind::InvalidCaptureIndex { index } } + pub(crate) fn invalid_capture_index(index: u32) -> BuildError { + BuildError { kind: BuildErrorKind::InvalidCaptureIndex { index } } } - pub(crate) fn unicode_word_unavailable() -> Error { - Error { kind: ErrorKind::UnicodeWordUnavailable } + #[cfg(feature = "syntax")] + pub(crate) fn unsupported_captures() -> BuildError { + BuildError { kind: BuildErrorKind::UnsupportedCaptures } } } #[cfg(feature = "std")] -impl std::error::Error for Error { 
+impl std::error::Error for BuildError {
 fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
 match self.kind() {
- ErrorKind::Syntax(ref err) => Some(err),
- ErrorKind::TooManyPatterns { .. } => None,
- ErrorKind::TooManyStates { .. } => None,
- ErrorKind::ExceededSizeLimit { .. } => None,
- ErrorKind::InvalidCaptureIndex { .. } => None,
- ErrorKind::UnicodeWordUnavailable => None,
+ #[cfg(feature = "syntax")]
+ BuildErrorKind::Syntax(ref err) => Some(err),
+ BuildErrorKind::Captures(ref err) => Some(err),
+ _ => None,
 }
 }
}

-impl core::fmt::Display for Error {
+impl core::fmt::Display for BuildError {
 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
 match self.kind() {
- ErrorKind::Syntax(_) => write!(f, "error parsing regex"),
- ErrorKind::TooManyPatterns { given, limit } => write!(
+ #[cfg(feature = "syntax")]
+ BuildErrorKind::Syntax(_) => write!(f, "error parsing regex"),
+ BuildErrorKind::Captures(_) => {
+ write!(f, "error with capture groups")
+ }
+ BuildErrorKind::Word(_) => {
+ write!(f, "NFA contains Unicode word boundary")
+ }
+ BuildErrorKind::TooManyPatterns { given, limit } => write!(
 f,
- "attemped to compile {} patterns, \
+ "attempted to compile {} patterns, \
 which exceeds the limit of {}",
 given, limit,
 ),
- ErrorKind::TooManyStates { given, limit } => write!(
+ BuildErrorKind::TooManyStates { given, limit } => write!(
 f,
- "attemped to compile {} NFA states, \
+ "attempted to compile {} NFA states, \
 which exceeds the limit of {}",
 given, limit,
 ),
- ErrorKind::ExceededSizeLimit { limit } => write!(
+ BuildErrorKind::ExceededSizeLimit { limit } => write!(
 f,
 "heap usage during NFA compilation exceeded limit of {}",
 limit,
 ),
- ErrorKind::InvalidCaptureIndex { index } => write!(
+ BuildErrorKind::InvalidCaptureIndex { index } => write!(
 f,
 "capture group index {} is invalid (too big or discontinuous)",
 index,
 ),
- ErrorKind::UnicodeWordUnavailable => write!(
+ #[cfg(feature = "syntax")]
+ BuildErrorKind::UnsupportedCaptures => write!(
 f,
- "crate has been compiled without Unicode word boundary \
- support, but the NFA contains Unicode word boundary \
- assertions",
+ "currently captures must be disabled when compiling \
+ a reverse NFA",
 ),
 }
 }
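The new `size_limit` accessor above gives callers a programmatic way to distinguish size-limit failures from other build errors; a minimal sketch against the public API (the specific pattern and budget are arbitrary):

```rust
use regex_automata::nfa::thompson::NFA;

fn main() {
    // A Unicode-aware \w{50} cannot fit in a 100 byte budget, so building
    // fails and the error reports the configured limit.
    let result = NFA::compiler()
        .configure(NFA::config().nfa_size_limit(Some(100)))
        .build(r"\w{50}");
    let err = result.unwrap_err();
    assert_eq!(Some(100), err.size_limit());
}
```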
diff --git a/vendor/regex-automata/src/nfa/thompson/literal_trie.rs b/vendor/regex-automata/src/nfa/thompson/literal_trie.rs
new file mode 100644
index 000000000..7ed129afd
--- /dev/null
+++ b/vendor/regex-automata/src/nfa/thompson/literal_trie.rs
@@ -0,0 +1,528 @@
+use core::mem;
+
+use alloc::{vec, vec::Vec};
+
+use crate::{
+ nfa::thompson::{self, compiler::ThompsonRef, BuildError, Builder},
+ util::primitives::{IteratorIndexExt, StateID},
+};
+
+/// A trie that preserves leftmost-first match semantics.
+///
+/// This is a purpose-built data structure for optimizing 'lit1|lit2|..|litN'
+/// patterns. It can *only* handle alternations of literals, which makes it
+/// somewhat restricted in its scope, but literal alternations are fairly
+/// common.
+///
+/// At a 5,000 foot level, the main idea of this trie is to make an alternation
+/// of literals look more like a DFA than an NFA via epsilon removal.
+///
+/// More precisely, the main issue is in how alternations are compiled into
+/// a Thompson NFA. Namely, each alternation gets a single NFA "union" state
+/// with an epsilon transition for every branch of the alternation pointing to
+/// an NFA state corresponding to the start of that branch. The main problem
+/// with this representation is the cost of computing an epsilon closure. Once
+/// you hit the alternation's start state, it acts as a sort of "clog" that
+/// requires you to traverse all of the epsilon transitions to compute the full
+/// closure.
+///
+/// Fixing such clogs in the general case is pretty tricky without going
+/// to a DFA (or perhaps a Glushkov NFA, which comes with other problems).
+/// But at least in the case of an alternation of literals, we can convert
+/// that to a prefix trie without too much cost. In theory, that's all you
+/// really need to do: build the trie and then compile it to a Thompson NFA.
+/// For example, if you have the pattern 'bar|baz|foo', then using a trie, it
+/// is transformed to something like 'b(a(r|z))|f'. This reduces the clog by
+/// reducing the number of epsilon transitions out of the alternation's start
+/// state from 3 to 2 (it actually gets down to 1 when you use a sparse state,
+/// which we do below). It's a small effect here, but when your alternation is
+/// huge, the savings are also huge.
+///
+/// And that is... essentially what a LiteralTrie does. But there is one
+/// hiccup. Consider a regex like 'sam|samwise'. How does a prefix trie compile
+/// that when leftmost-first semantics are used? If 'sam|samwise' was the
+/// entire regex, then you could just drop the 'samwise' branch entirely since
+/// it is impossible to match ('sam' will always take priority, and since it
+/// is a prefix of 'samwise', 'samwise' will never match). But what about the
+/// regex '\b(sam|samwise)\b'? In that case, you can't remove 'samwise' because
+/// it might match when 'sam' doesn't fall on a word boundary.
+///
+/// The main idea is that 'sam|samwise' can be translated to 'sam(?:|wise)',
+/// which is a precisely equivalent regex that also gets rid of the clog.
+///
+/// Another example is 'zapper|z|zap'. That gets translated to
+/// 'z(?:apper||ap)'.
+///
+/// We accomplish this by giving each state in the trie multiple "chunks" of
+/// transitions. Each chunk barrier represents a match. The idea is that once
+/// you know a match occurs, none of the transitions after the match can be
+/// re-ordered and mixed in with the transitions before the match. Otherwise,
+/// the match semantics could be changed.
+///
+/// See the 'State' data type for a bit more detail.
+///
+/// Future work:
+///
+/// * In theory, it would be nice to generalize the idea of removing clogs and
+/// apply it to the NFA graph itself. Then this could in theory work for
+/// case insensitive alternations of literals, or even just alternations where
+/// each branch starts with a non-epsilon transition.
+/// * Could we instead use the Aho-Corasick algorithm here? The aho-corasick
+/// crate deals with leftmost-first matches correctly, but I think this implies
+/// encoding failure transitions into a Thompson NFA somehow. Which seems fine,
+/// because failure transitions are just unconditional epsilon transitions?
+/// * Or perhaps even better, could we use an aho_corasick::AhoCorasick
+/// directly? At time of writing, 0.7 is the current version of the
+/// aho-corasick crate, and that definitely cannot be used as-is. But if we
+/// expose the underlying finite state machine API, then could we use it? That
+/// would be super. If we could figure that out, it might also lend itself to
+/// more general composition of finite state machines.
+#[derive(Clone)]
+pub(crate) struct LiteralTrie {
+ /// The set of trie states. Each state contains one or more chunks, where
+ /// each chunk is a sparse set of transitions to other states. A leaf state
+ /// is always a match state that contains only empty chunks (i.e., no
+ /// transitions).
+ states: Vec<State>,
+ /// Whether to add literals in reverse to the trie. Useful when building
+ /// a reverse NFA automaton.
+ rev: bool,
+}
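The 'sam|samwise' discussion above is observable through the PikeVM: under leftmost-first semantics the alternation and its trie-shaped rewrite must report identical matches. A sketch using the crate's public API:

```rust
use regex_automata::nfa::thompson::pikevm::PikeVM;

fn main() {
    // 'sam' is preferred over 'samwise', and the rewrite 'sam(?:|wise)'
    // must preserve exactly that preference.
    let re1 = PikeVM::new("sam|samwise").unwrap();
    let re2 = PikeVM::new("sam(?:|wise)").unwrap();
    let (mut c1, mut c2) = (re1.create_cache(), re2.create_cache());
    let m1 = re1.find(&mut c1, "samwise").unwrap();
    let m2 = re2.find(&mut c2, "samwise").unwrap();
    assert_eq!(m1.range(), m2.range()); // both report 0..3, i.e. 'sam'
}
```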
+impl LiteralTrie {
+ /// Create a new literal trie that adds literals in the forward direction.
+ pub(crate) fn forward() -> LiteralTrie {
+ let root = State::default();
+ LiteralTrie { states: vec![root], rev: false }
+ }
+
+ /// Create a new literal trie that adds literals in reverse.
+ pub(crate) fn reverse() -> LiteralTrie {
+ let root = State::default();
+ LiteralTrie { states: vec![root], rev: true }
+ }
+
+ /// Add the given literal to this trie.
+ ///
+ /// If the literal could not be added because the `StateID` space was
+ /// exhausted, then an error is returned. If an error is returned, the trie
+ /// is in an unspecified state.
+ pub(crate) fn add(&mut self, bytes: &[u8]) -> Result<(), BuildError> {
+ let mut prev = StateID::ZERO;
+ let mut it = bytes.iter().copied();
+ while let Some(b) = if self.rev { it.next_back() } else { it.next() } {
+ prev = self.get_or_add_state(prev, b)?;
+ }
+ self.states[prev].add_match();
+ Ok(())
+ }
+
+ /// If the given transition is defined, then return the next state ID.
+ /// Otherwise, add the transition to `from` and point it to a new state.
+ ///
+ /// If a new state ID could not be allocated, then an error is returned.
+ fn get_or_add_state(
+ &mut self,
+ from: StateID,
+ byte: u8,
+ ) -> Result<StateID, BuildError> {
+ let active = self.states[from].active_chunk();
+ match active.binary_search_by_key(&byte, |t| t.byte) {
+ Ok(i) => Ok(active[i].next),
+ Err(i) => {
+ // Add a new state and get its ID.
+ let next = StateID::new(self.states.len()).map_err(|_| {
+ BuildError::too_many_states(self.states.len())
+ })?;
+ self.states.push(State::default());
+ // Offset our position to account for all transitions and not
+ // just the ones in the active chunk.
+ let i = self.states[from].active_chunk_start() + i;
+ let t = Transition { byte, next };
+ self.states[from].transitions.insert(i, t);
+ Ok(next)
+ }
+ }
+ }
+
+ /// Compile this literal trie to the NFA builder given.
+ ///
+ /// This forwards any errors that may occur while using the given builder.
+ pub(crate) fn compile(
+ &self,
+ builder: &mut Builder,
+ ) -> Result<ThompsonRef, BuildError> {
+ // Compilation proceeds via depth-first traversal of the trie.
+ //
+ // This is overall pretty brutal. The recursive version of this is
+ // deliciously simple. (See 'compile_to_hir' below for what it might
+ // look like.) But recursion on a trie means your call stack grows
+ // in accordance with the longest literal, which just does not seem
+ // appropriate. So we push the call stack to the heap. But as a result,
+ // the trie traversal becomes pretty brutal because we essentially
+ // have to encode the state of a double for-loop into an explicit call
+ // frame. If someone can simplify this without using recursion, that'd
+ // be great.
+
+ // 'end' is our match state for this trie, but represented in the
+ // NFA. Any time we see a match in the trie, we insert a transition
+ // from the current state we're in to 'end'.
+    /// Compile this literal trie to the NFA builder given.
+    ///
+    /// This forwards any errors that may occur while using the given builder.
+    pub(crate) fn compile(
+        &self,
+        builder: &mut Builder,
+    ) -> Result<ThompsonRef, BuildError> {
+        // Compilation proceeds via depth-first traversal of the trie.
+        //
+        // This is overall pretty brutal. The recursive version of this is
+        // deliciously simple. (See 'compile_to_hir' below for what it might
+        // look like.) But recursion on a trie means your call stack grows
+        // in accordance with the longest literal, which just does not seem
+        // appropriate. So we push the call stack to the heap. But as a
+        // result, the trie traversal becomes pretty brutal because we
+        // essentially have to encode the state of a double for-loop into an
+        // explicit call frame. If someone can simplify this without using
+        // recursion, that'd be great.
+
+        // 'end' is our match state for this trie, but represented in the
+        // NFA. Any time we see a match in the trie, we insert a transition
+        // from the current state we're in to 'end'.
+        let end = builder.add_empty()?;
+        let mut stack = vec![];
+        let mut f = Frame::new(&self.states[StateID::ZERO]);
+        loop {
+            if let Some(t) = f.transitions.next() {
+                if self.states[t.next].is_leaf() {
+                    f.sparse.push(thompson::Transition {
+                        start: t.byte,
+                        end: t.byte,
+                        next: end,
+                    });
+                } else {
+                    f.sparse.push(thompson::Transition {
+                        start: t.byte,
+                        end: t.byte,
+                        // This is a little funny, but when the frame we create
+                        // below completes, it will pop this parent frame off
+                        // and modify this transition to point to the correct
+                        // state.
+                        next: StateID::ZERO,
+                    });
+                    stack.push(f);
+                    f = Frame::new(&self.states[t.next]);
+                }
+                continue;
+            }
+            // At this point, we have visited all transitions in the current
+            // chunk, so add it as a sparse NFA state, unless the chunk was
+            // empty, in which case we don't do anything.
+            if !f.sparse.is_empty() {
+                let chunk_id = if f.sparse.len() == 1 {
+                    builder.add_range(f.sparse.pop().unwrap())?
+                } else {
+                    let sparse = mem::replace(&mut f.sparse, vec![]);
+                    builder.add_sparse(sparse)?
+                };
+                f.union.push(chunk_id);
+            }
+            // Now we need to look to see if there are other chunks to visit.
+            if let Some(chunk) = f.chunks.next() {
+                // If we're here, it means we're on the second (or greater)
+                // chunk, which implies there is a match at this point. So
+                // connect this state to the final end state.
+                f.union.push(end);
+                // Advance to the next chunk.
+                f.transitions = chunk.iter();
+                continue;
+            }
+            // Now that we are out of chunks, we have completely visited
+            // this state. So turn our union of chunks into an NFA union
+            // state, and add that union state to the parent state's current
+            // sparse state. (If there is no parent, we're done.)
+            let start = builder.add_union(f.union)?;
+            match stack.pop() {
+                None => {
+                    return Ok(ThompsonRef { start, end });
+                }
+                Some(mut parent) => {
+                    // OK because the only way a frame gets pushed on to the
+                    // stack (aside from the root) is when a transition has
+                    // been added to 'sparse'.
+                    parent.sparse.last_mut().unwrap().next = start;
+                    f = parent;
+                }
+            }
+        }
+    }
+
+    /// Converts this trie to an equivalent HIR expression.
+    ///
+    /// We don't actually use this, but it's useful for tests. In particular,
+    /// it provides a (somewhat) human readable representation of the trie
+    /// itself.
+    #[cfg(test)]
+    fn compile_to_hir(&self) -> regex_syntax::hir::Hir {
+        self.compile_state_to_hir(StateID::ZERO)
+    }
+
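To make the 'sam|samwise' discussion above concrete, here is a small demonstration that the lower-priority branch must stay reachable when the alternation sits between word boundaries. (A sketch only: it uses the top-level 'regex' crate purely for illustration; it is not part of this module.)

use regex::Regex; // illustration only, not a dependency of this file

fn main() {
    let re = Regex::new(r"\b(?:sam|samwise)\b").unwrap();
    // On a bare word, the higher-priority 'sam' branch wins.
    assert_eq!("sam", re.find("sam went east").unwrap().as_str());
    // In 'samwise', the 'sam' branch fails the trailing \b (both 'm' and
    // 'w' are word bytes), so the 'samwise' branch must still be there.
    assert_eq!("samwise", re.find("samwise went east").unwrap().as_str());
}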
+    /// The recursive implementation of 'compile_to_hir'.
+    ///
+    /// Notice how simple this is compared to 'compile' above. 'compile' could
+    /// be similarly simple, but we opt to not use recursion in order to avoid
+    /// overflowing the stack in the case of a long literal.
+    #[cfg(test)]
+    fn compile_state_to_hir(&self, sid: StateID) -> regex_syntax::hir::Hir {
+        use regex_syntax::hir::Hir;
+
+        let mut alt = vec![];
+        for (i, chunk) in self.states[sid].chunks().enumerate() {
+            if i > 0 {
+                alt.push(Hir::empty());
+            }
+            if chunk.is_empty() {
+                continue;
+            }
+            let mut chunk_alt = vec![];
+            for t in chunk.iter() {
+                chunk_alt.push(Hir::concat(vec![
+                    Hir::literal(vec![t.byte]),
+                    self.compile_state_to_hir(t.next),
+                ]));
+            }
+            alt.push(Hir::alternation(chunk_alt));
+        }
+        Hir::alternation(alt)
+    }
+}
+
+impl core::fmt::Debug for LiteralTrie {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        writeln!(f, "LiteralTrie(")?;
+        for (sid, state) in self.states.iter().with_state_ids() {
+            writeln!(f, "{:06?}: {:?}", sid.as_usize(), state)?;
+        }
+        writeln!(f, ")")?;
+        Ok(())
+    }
+}
+
+/// An explicit stack frame used for traversing the trie without using
+/// recursion.
+///
+/// Each frame is tied to the traversal of a single trie state. The frame is
+/// dropped once the entire state (and all of its children) have been visited.
+/// The "output" of compiling a state is the 'union' vector, which is in turn
+/// converted to an NFA union state. Each branch of the union corresponds to a
+/// chunk in the trie state.
+///
+/// 'sparse' corresponds to the set of transitions for a particular chunk in a
+/// trie state. It is ultimately converted to an NFA sparse state. The 'sparse'
+/// field, after being converted to a sparse NFA state, is reused for any
+/// subsequent chunks in the trie state, if any exist.
+#[derive(Debug)]
+struct Frame<'a> {
+    /// The remaining chunks to visit for a trie state.
+    chunks: StateChunksIter<'a>,
+    /// The transitions of the current chunk that we're iterating over. Since
+    /// every trie state has at least one chunk, every frame is initialized
+    /// with the first chunk's transitions ready to be consumed.
+    transitions: core::slice::Iter<'a, Transition>,
+    /// The NFA state IDs pointing to the start of each chunk compiled by
+    /// this trie state. This ultimately gets converted to an NFA union once
+    /// the entire trie state (and all of its children) have been compiled.
+    /// The order of these matters for leftmost-first match semantics, since
+    /// earlier matches in the union are preferred over later ones.
+    union: Vec<StateID>,
+    /// The actual NFA transitions for a single chunk in a trie state. This
+    /// gets converted to an NFA sparse state, and its corresponding NFA state
+    /// ID should get added to 'union'.
+    sparse: Vec<thompson::Transition>,
+}
+
+impl<'a> Frame<'a> {
+    /// Create a new stack frame for trie traversal. This initializes the
+    /// 'transitions' iterator to the transitions for the first chunk, with
+    /// the 'chunks' iterator being every chunk after the first one.
+    fn new(state: &'a State) -> Frame<'a> {
+        let mut chunks = state.chunks();
+        // every state has at least 1 chunk
+        let chunk = chunks.next().unwrap();
+        let transitions = chunk.iter();
+        Frame { chunks, transitions, union: vec![], sparse: vec![] }
+    }
+}
+
+/// A state in a trie.
+///
+/// This uses a sparse representation. Since we don't use literal tries
+/// for searching, and compilation ultimately requires visiting every
+/// transition anyway, we use a sparse representation for transitions. This
+/// means we save on memory, at the expense of 'LiteralTrie::add' being
+/// perhaps a bit slower.
+///
+/// While 'transitions' is pretty standard as far as tries go, the 'chunks'
+/// piece here is more unusual.
In effect, 'chunks' defines a partitioning
+/// of 'transitions', where each chunk corresponds to a distinct set of
+/// transitions. The key invariant is that a transition in one chunk cannot
+/// be moved to another chunk. This is the secret sauce that preserves
+/// leftmost-first match semantics.
+///
+/// A new chunk is added whenever we mark a state as a match state. Once a
+/// new chunk is added, the old active chunk is frozen and is never mutated
+/// again. The new chunk becomes the active chunk, which is defined as
+/// '&transitions[chunks.last().map_or(0, |c| c.1)..]'. Thus, a state where
+/// 'chunks' is empty actually contains one chunk, and every state contains
+/// at least one (possibly empty) chunk.
+///
+/// A "leaf" state is a state that has no outgoing transitions (so
+/// 'transitions' is empty). Note that there is no way for a leaf state to be
+/// a non-matching state. (Although while building the trie, within 'add', a
+/// leaf state may exist while not containing any matches. But this invariant
+/// is only broken within 'add'. Once 'add' returns, the invariant is upheld.)
+#[derive(Clone, Default)]
+struct State {
+    transitions: Vec<Transition>,
+    chunks: Vec<(usize, usize)>,
+}
+
+impl State {
+    /// Mark this state as a match state and freeze the active chunk such
+    /// that it cannot be further mutated.
+    fn add_match(&mut self) {
+        // This is not strictly necessary, but there's no point in recording
+        // another match by adding another chunk if the state has no
+        // transitions. Note though that we only skip this if we already know
+        // this is a match state, which is only true if 'chunks' is not empty.
+        // Basically, if we didn't do this, nothing semantically would change,
+        // but we'd end up pushing another chunk and potentially triggering an
+        // alloc.
+        if self.transitions.is_empty() && !self.chunks.is_empty() {
+            return;
+        }
+        let chunk_start = self.active_chunk_start();
+        let chunk_end = self.transitions.len();
+        self.chunks.push((chunk_start, chunk_end));
+    }
+
+    /// Returns true if and only if this state is a leaf state. That is, a
+    /// state that has no outgoing transitions.
+    fn is_leaf(&self) -> bool {
+        self.transitions.is_empty()
+    }
+
+    /// Returns an iterator over all of the chunks (including the currently
+    /// active chunk) in this state. Since the active chunk is included, the
+    /// iterator is guaranteed to always yield at least one chunk (although
+    /// the chunk may be empty).
+    fn chunks(&self) -> StateChunksIter<'_> {
+        StateChunksIter {
+            transitions: &*self.transitions,
+            chunks: self.chunks.iter(),
+            active: Some(self.active_chunk()),
+        }
+    }
+
+    /// Returns the active chunk as a slice of transitions.
+    fn active_chunk(&self) -> &[Transition] {
+        let start = self.active_chunk_start();
+        &self.transitions[start..]
+    }
+
+    /// Returns the index into 'transitions' where the active chunk starts.
+    fn active_chunk_start(&self) -> usize {
+        self.chunks.last().map_or(0, |&(_, end)| end)
+    }
+}
+
+impl core::fmt::Debug for State {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        let mut spacing = " ";
+        for (i, chunk) in self.chunks().enumerate() {
+            if i > 0 {
+                write!(f, "{}MATCH", spacing)?;
+            }
+            spacing = "";
+            for (j, t) in chunk.iter().enumerate() {
+                spacing = " ";
+                if j == 0 && i > 0 {
+                    write!(f, " ")?;
+                } else if j > 0 {
+                    write!(f, ", ")?;
+                }
+                write!(f, "{:?}", t)?;
+            }
+        }
+        Ok(())
+    }
+}
+
+/// An iterator over all of the chunks in a state, including the active chunk.
+/// +/// This iterator is created by `State::chunks`. We name this iterator so that +/// we can include it in the `Frame` type for non-recursive trie traversal. +#[derive(Debug)] +struct StateChunksIter<'a> { + transitions: &'a [Transition], + chunks: core::slice::Iter<'a, (usize, usize)>, + active: Option<&'a [Transition]>, +} + +impl<'a> Iterator for StateChunksIter<'a> { + type Item = &'a [Transition]; + + fn next(&mut self) -> Option<&'a [Transition]> { + if let Some(&(start, end)) = self.chunks.next() { + return Some(&self.transitions[start..end]); + } + if let Some(chunk) = self.active.take() { + return Some(chunk); + } + None + } +} + +/// A single transition in a trie to another state. +#[derive(Clone, Copy)] +struct Transition { + byte: u8, + next: StateID, +} + +impl core::fmt::Debug for Transition { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!( + f, + "{:?} => {}", + crate::util::escape::DebugByte(self.byte), + self.next.as_usize() + ) + } +} + +#[cfg(test)] +mod tests { + use bstr::B; + use regex_syntax::hir::Hir; + + use super::*; + + #[test] + fn zap() { + let mut trie = LiteralTrie::forward(); + trie.add(b"zapper").unwrap(); + trie.add(b"z").unwrap(); + trie.add(b"zap").unwrap(); + + let got = trie.compile_to_hir(); + let expected = Hir::concat(vec![ + Hir::literal(B("z")), + Hir::alternation(vec![ + Hir::literal(B("apper")), + Hir::empty(), + Hir::literal(B("ap")), + ]), + ]); + assert_eq!(expected, got); + } + + #[test] + fn maker() { + let mut trie = LiteralTrie::forward(); + trie.add(b"make").unwrap(); + trie.add(b"maple").unwrap(); + trie.add(b"maker").unwrap(); + + let got = trie.compile_to_hir(); + let expected = Hir::concat(vec![ + Hir::literal(B("ma")), + Hir::alternation(vec![ + Hir::concat(vec![ + Hir::literal(B("ke")), + Hir::alternation(vec![Hir::empty(), Hir::literal(B("r"))]), + ]), + Hir::literal(B("ple")), + ]), + ]); + assert_eq!(expected, got); + } +} diff --git a/vendor/regex-automata/src/nfa/thompson/map.rs b/vendor/regex-automata/src/nfa/thompson/map.rs index 79ff63ca3..c36ce5386 100644 --- a/vendor/regex-automata/src/nfa/thompson/map.rs +++ b/vendor/regex-automata/src/nfa/thompson/map.rs @@ -25,17 +25,23 @@ // fast as the naive approach and typically winds up using less memory (since // it generates smaller NFAs) despite the presence of the cache. // -// These maps effectively represent caching mechanisms for CState::Sparse and -// CState::Range, respectively. The former represents a single NFA state with -// many transitions of equivalent priority while the latter represents a single -// NFA state with a single transition. (Neither state ever has or is an -// epsilon transition.) Thus, they have different key types. It's likely we -// could make one generic map, but the machinery didn't seem worth it. They -// are simple enough. +// These maps effectively represent caching mechanisms for sparse and +// byte-range NFA states, respectively. The former represents a single NFA +// state with many transitions of equivalent priority while the latter +// represents a single NFA state with a single transition. (Neither state ever +// has or is an epsilon transition.) Thus, they have different key types. It's +// likely we could make one generic map, but the machinery didn't seem worth +// it. They are simple enough. 
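For reference, the hashing scheme both maps use is 64-bit FNV-1a, per the Wikipedia link above. A minimal standalone sketch with the standard published constants (only 'INIT' is visible in the hunks below; the 'PRIME' value here is the standard 64-bit FNV prime, and the real maps fold transition fields rather than raw bytes into the hash):

const INIT: u64 = 14695981039346656037; // FNV-1a 64-bit offset basis
const PRIME: u64 = 1099511628211; // FNV-1a 64-bit prime

fn fnv1a(bytes: &[u8]) -> u64 {
    let mut h = INIT;
    for &b in bytes {
        h = (h ^ u64::from(b)).wrapping_mul(PRIME);
    }
    h
}

fn main() {
    // Reduce the hash to a bucket index, as the maps below do with
    // their fixed-capacity tables.
    let table_len = 10_000;
    let bucket = (fnv1a(b"abc") % (table_len as u64)) as usize;
    assert!(bucket < table_len);
}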
use alloc::{vec, vec::Vec}; -use crate::{nfa::thompson::Transition, util::id::StateID}; +use crate::{ + nfa::thompson::Transition, + util::{ + int::{Usize, U64}, + primitives::StateID, + }, +}; // Basic FNV-1a hash constants as described in: // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function @@ -137,11 +143,11 @@ impl Utf8BoundedMap { pub fn hash(&self, key: &[Transition]) -> usize { let mut h = INIT; for t in key { - h = (h ^ (t.start as u64)).wrapping_mul(PRIME); - h = (h ^ (t.end as u64)).wrapping_mul(PRIME); - h = (h ^ (t.next.as_usize() as u64)).wrapping_mul(PRIME); + h = (h ^ u64::from(t.start)).wrapping_mul(PRIME); + h = (h ^ u64::from(t.end)).wrapping_mul(PRIME); + h = (h ^ t.next.as_u64()).wrapping_mul(PRIME); } - (h as usize) % self.map.len() + (h % self.map.len().as_u64()).as_usize() } /// Retrieve the cached state ID corresponding to the given key. The hash @@ -252,10 +258,10 @@ impl Utf8SuffixMap { const INIT: u64 = 14695981039346656037; let mut h = INIT; - h = (h ^ (key.from.as_usize() as u64)).wrapping_mul(PRIME); - h = (h ^ (key.start as u64)).wrapping_mul(PRIME); - h = (h ^ (key.end as u64)).wrapping_mul(PRIME); - (h as usize) % self.map.len() + h = (h ^ key.from.as_u64()).wrapping_mul(PRIME); + h = (h ^ u64::from(key.start)).wrapping_mul(PRIME); + h = (h ^ u64::from(key.end)).wrapping_mul(PRIME); + (h % self.map.len().as_u64()).as_usize() } /// Retrieve the cached state ID corresponding to the given key. The hash diff --git a/vendor/regex-automata/src/nfa/thompson/mod.rs b/vendor/regex-automata/src/nfa/thompson/mod.rs index 88a438e8e..cf426736d 100644 --- a/vendor/regex-automata/src/nfa/thompson/mod.rs +++ b/vendor/regex-automata/src/nfa/thompson/mod.rs @@ -1,1555 +1,81 @@ -use core::{convert::TryFrom, fmt, mem, ops::Range}; - -use alloc::{boxed::Box, format, string::String, sync::Arc, vec, vec::Vec}; - -use crate::util::{ - alphabet::{self, ByteClassSet}, - decode_last_utf8, decode_utf8, - id::{IteratorIDExt, PatternID, PatternIDIter, StateID}, - is_word_byte, is_word_char_fwd, is_word_char_rev, -}; - -pub use self::{ - compiler::{Builder, Config}, - error::Error, -}; - +/*! +Defines a Thompson NFA and provides the [`PikeVM`](pikevm::PikeVM) and +[`BoundedBacktracker`](backtrack::BoundedBacktracker) regex engines. + +A Thompson NFA (non-deterministic finite automaton) is arguably _the_ central +data type in this library. It is the result of what is commonly referred to as +"regex compilation." That is, turning a regex pattern from its concrete syntax +string into something that can run a search looks roughly like this: + +* A `&str` is parsed into a [`regex-syntax::ast::Ast`](regex_syntax::ast::Ast). +* An `Ast` is translated into a [`regex-syntax::hir::Hir`](regex_syntax::hir::Hir). +* An `Hir` is compiled into a [`NFA`]. +* The `NFA` is then used to build one of a few different regex engines: + * An `NFA` is used directly in the `PikeVM` and `BoundedBacktracker` engines. + * An `NFA` is used by a [hybrid NFA/DFA](crate::hybrid) to build out a DFA's + transition table at search time. + * An `NFA`, assuming it is one-pass, is used to build a full + [one-pass DFA](crate::dfa::onepass) ahead of time. + * An `NFA` is used to build a [full DFA](crate::dfa) ahead of time. + +The [`meta`](crate::meta) regex engine makes all of these choices for you based +on various criteria. However, if you have a lower level use case, _you_ can +build any of the above regex engines and use them directly. But you must start +here by building an `NFA`. 
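As a concrete companion to the steps above, this is roughly what the highest-level path looks like with the `PikeVM`. (A hedged sketch: `PikeVM::new`, `create_cache` and `find` reflect this crate's public API at the time of writing and should be checked against the version in use.)

use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};

fn main() {
    // Internally this goes &str -> Ast -> Hir -> NFA, then wraps the
    // NFA in a PikeVM regex engine.
    let re = PikeVM::new(r"[a-z]+[0-9]+").unwrap();
    let mut cache = re.create_cache();
    let m = re.find(&mut cache, "xyz abc123");
    assert_eq!(Some(Match::must(0, 4..10)), m);
}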
+ +# Details + +It is perhaps worth expanding a bit more on what it means to go through the +`&str`->`Ast`->`Hir`->`NFA` process. + +* Parsing a string into an `Ast` gives it a structured representation. +Crucially, the size and amount of work done in this step is proportional to the +size of the original string. No optimization or Unicode handling is done at +this point. This means that parsing into an `Ast` has very predictable costs. +Moreover, an `Ast` can be roundtripped back to its original pattern string as +written. +* Translating an `Ast` into an `Hir` is a process by which the structured +representation is simplified down to its most fundamental components. +Translation deals with flags such as case insensitivity by converting things +like `(?i:a)` to `[Aa]`. Translation is also where Unicode tables are consulted +to resolve things like `\p{Emoji}` and `\p{Greek}`. It also flattens each +character class, regardless of how deeply nested it is, into a single sequence +of non-overlapping ranges. All the various literal forms are thrown out in +favor of one common representation. Overall, the `Hir` is small enough to fit +into your head and makes analysis and other tasks much simpler. +* Compiling an `Hir` into an `NFA` formulates the regex into a finite state +machine whose transitions are defined over bytes. For example, an `Hir` might +have a Unicode character class corresponding to a sequence of ranges defined +in terms of `char`. Compilation is then responsible for turning those ranges +into a UTF-8 automaton. That is, an automaton that matches the UTF-8 encoding +of just the codepoints specified by those ranges. Otherwise, the main job of +an `NFA` is to serve as a byte-code of sorts for a virtual machine. It can be +seen as a sequence of instructions for how to match a regex. +*/ + +#[cfg(feature = "nfa-backtrack")] +pub mod backtrack; +mod builder; +#[cfg(feature = "syntax")] mod compiler; mod error; +#[cfg(feature = "syntax")] +mod literal_trie; +#[cfg(feature = "syntax")] mod map; +mod nfa; +#[cfg(feature = "nfa-pikevm")] pub mod pikevm; +#[cfg(feature = "syntax")] mod range_trie; -/// A map from capture group name to its corresponding capture index. -/// -/// Since there are always two slots for each capture index, the pair of slots -/// corresponding to the capture index for a pattern ID of 0 are indexed at -/// `map["<name>"] * 2` and `map["<name>"] * 2 + 1`. -/// -/// This type is actually wrapped inside a Vec indexed by pattern ID on the -/// NFA, since multiple patterns may have the same capture group name. -/// -/// Note that this is somewhat of a sub-optimal representation, since it -/// requires a hashmap for each pattern. A better representation would be -/// HashMap<(PatternID, Arc<str>), usize>, but this makes it difficult to look -/// up a capture index by name without producing a `Arc<str>`, which requires -/// an allocation. To fix this, I think we'd need to define our own unsized -/// type or something? -#[cfg(feature = "std")] -type CaptureNameMap = std::collections::HashMap<Arc<str>, usize>; -#[cfg(not(feature = "std"))] -type CaptureNameMap = alloc::collections::BTreeMap<Arc<str>, usize>; - -// The NFA API below is not something I'm terribly proud of at the moment. In -// particular, it supports both mutating the NFA and actually using the NFA to -// perform a search. I think combining these two things muddies the waters a -// bit too much. 
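A sketch of the explicit `&str`->`Ast`->`Hir`->`NFA` steps described under "Details" above. (Hedged: this assumes `regex_syntax::parse`, which combines the `Ast` and `Hir` steps, and the `syntax`-gated `thompson::Compiler::build_from_hir` entry point; both should be checked against the versions in use.)

use regex_automata::nfa::thompson::Compiler;

fn main() {
    // Parse + translate: &str -> Ast -> Hir. Translation has already
    // rewritten the case-insensitive flag into a character class.
    let hir = regex_syntax::parse(r"(?i:a)[0-9]").unwrap();
    // Compile: Hir -> NFA, a byte-oriented finite state machine.
    let nfa = Compiler::new().build_from_hir(&hir).unwrap();
    assert_eq!(1, nfa.pattern_len());
}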
-// -// I think the issue is that I saw the compiler as the 'builder,' and where -// the compiler had the ability to manipulate the internal state of the NFA. -// However, one of my goals was to make it possible for others to build their -// own NFAs in a way that is *not* couple to the regex-syntax crate. -// -// So I think really, there should be an NFA, a NFABuilder and then the -// internal compiler which uses the NFABuilder API to build an NFA. Alas, at -// the time of writing, I kind of ran out of steam. - -/// A fully compiled Thompson NFA. -/// -/// The states of the NFA are indexed by state IDs, which are how transitions -/// are expressed. -#[derive(Clone)] -pub struct NFA { - /// The state list. This list is guaranteed to be indexable by all starting - /// state IDs, and it is also guaranteed to contain at most one `Match` - /// state for each pattern compiled into this NFA. (A pattern may not have - /// a corresponding `Match` state if a `Match` state is impossible to - /// reach.) - states: Vec<State>, - /// The anchored starting state of this NFA. - start_anchored: StateID, - /// The unanchored starting state of this NFA. - start_unanchored: StateID, - /// The starting states for each individual pattern. Starting at any - /// of these states will result in only an anchored search for the - /// corresponding pattern. The vec is indexed by pattern ID. When the NFA - /// contains a single regex, then `start_pattern[0]` and `start_anchored` - /// are always equivalent. - start_pattern: Vec<StateID>, - /// A map from PatternID to its corresponding range of capture slots. Each - /// range is guaranteed to be contiguous with the previous range. The - /// end of the last range corresponds to the total number of slots needed - /// for this NFA. - patterns_to_slots: Vec<Range<usize>>, - /// A map from capture name to its corresponding index. So e.g., given - /// a single regex like '(\w+) (\w+) (?P<word>\w+)', the capture name - /// 'word' for pattern ID=0 would corresponding to the index '3'. Its - /// corresponding slots would then be '3 * 2 = 6' and '3 * 2 + 1 = 7'. - capture_name_to_index: Vec<CaptureNameMap>, - /// A map from pattern ID to capture group index to name, if one exists. - /// This is effectively the inverse of 'capture_name_to_index'. The outer - /// vec is indexed by pattern ID, while the inner vec is index by capture - /// index offset for the corresponding pattern. - /// - /// The first capture group for each pattern is always unnamed and is thus - /// always None. - capture_index_to_name: Vec<Vec<Option<Arc<str>>>>, - /// A representation of equivalence classes over the transitions in this - /// NFA. Two bytes in the same equivalence class must not discriminate - /// between a match or a non-match. This map can be used to shrink the - /// total size of a DFA's transition table with a small match-time cost. - /// - /// Note that the NFA's transitions are *not* defined in terms of these - /// equivalence classes. The NFA's transitions are defined on the original - /// byte values. For the most part, this is because they wouldn't really - /// help the NFA much since the NFA already uses a sparse representation - /// to represent transitions. Byte classes are most effective in a dense - /// representation. 
- byte_class_set: ByteClassSet, - /// Various facts about this NFA, which can be used to improve failure - /// modes (e.g., rejecting DFA construction if an NFA has Unicode word - /// boundaries) or for performing optimizations (avoiding an increase in - /// states if there are no look-around states). - facts: Facts, - /// Heap memory used indirectly by NFA states. Since each state might use a - /// different amount of heap, we need to keep track of this incrementally. - memory_states: usize, -} - -impl NFA { - pub fn config() -> Config { - Config::new() - } - - pub fn builder() -> Builder { - Builder::new() - } - - /// Returns an NFA with no states. Its match semantics are unspecified. - /// - /// An empty NFA is useful as a starting point for building one. It is - /// itself not intended to be used for matching. For example, its starting - /// state identifiers are configured to be `0`, but since it has no states, - /// the identifiers are invalid. - /// - /// If you need an NFA that never matches is anything and can be correctly - /// used for matching, use [`NFA::never_match`]. - #[inline] - pub fn empty() -> NFA { - NFA { - states: vec![], - start_anchored: StateID::ZERO, - start_unanchored: StateID::ZERO, - start_pattern: vec![], - patterns_to_slots: vec![], - capture_name_to_index: vec![], - capture_index_to_name: vec![], - byte_class_set: ByteClassSet::empty(), - facts: Facts::default(), - memory_states: 0, - } - } - - /// Returns an NFA with a single regex that always matches at every - /// position. - #[inline] - pub fn always_match() -> NFA { - let mut nfa = NFA::empty(); - // Since we're only adding one pattern, these are guaranteed to work. - let start = nfa.add_match().unwrap(); - assert_eq!(start.as_usize(), 0); - let pid = nfa.finish_pattern(start).unwrap(); - assert_eq!(pid.as_usize(), 0); - nfa - } - - /// Returns an NFA that never matches at any position. It contains no - /// regexes. - #[inline] - pub fn never_match() -> NFA { - let mut nfa = NFA::empty(); - // Since we're only adding one state, this can never fail. - nfa.add_fail().unwrap(); - nfa - } - - /// Return the number of states in this NFA. - /// - /// This is guaranteed to be no bigger than [`StateID::LIMIT`]. - #[inline] - pub fn len(&self) -> usize { - self.states.len() - } - - /// Returns the total number of distinct match states in this NFA. - /// Stated differently, this returns the total number of regex patterns - /// used to build this NFA. - /// - /// This may return zero if the NFA was constructed with no patterns. In - /// this case, and only this case, the NFA can never produce a match for - /// any input. - /// - /// This is guaranteed to be no bigger than [`PatternID::LIMIT`]. - #[inline] - pub fn pattern_len(&self) -> usize { - self.start_pattern.len() - } - - /// Returns the pattern ID of the pattern currently being compiled by this - /// NFA. - fn current_pattern_id(&self) -> PatternID { - // This always works because we never permit more patterns in - // 'start_pattern' than can be addressed by PatternID. Also, we only - // add a new entry to 'start_pattern' once we finish compiling a - // pattern. Thus, the length refers to the ID of the current pattern - // being compiled. - PatternID::new(self.start_pattern.len()).unwrap() - } - - /// Returns the total number of capturing groups in this NFA. - /// - /// This includes the special 0th capture group that is always present and - /// captures the start and end offset of the entire match. 
- /// - /// This is a convenience routine for `nfa.capture_slot_len() / 2`. - #[inline] - pub fn capture_len(&self) -> usize { - let slots = self.capture_slot_len(); - // This assert is guaranteed to pass since the NFA construction process - // guarantees that it is always true. - assert_eq!(slots % 2, 0, "capture slots must be divisible by 2"); - slots / 2 - } - - /// Returns the total number of capturing slots in this NFA. - /// - /// This value is guaranteed to be a multiple of 2. (Where each capturing - /// group has precisely two capturing slots in the NFA.) - #[inline] - pub fn capture_slot_len(&self) -> usize { - self.patterns_to_slots.last().map_or(0, |r| r.end) - } - - /// Return a range of capture slots for the given pattern. - /// - /// The range returned is guaranteed to be contiguous with ranges for - /// adjacent patterns. - /// - /// This panics if the given pattern ID is greater than or equal to the - /// number of patterns in this NFA. - #[inline] - pub fn pattern_slots(&self, pid: PatternID) -> Range<usize> { - self.patterns_to_slots[pid].clone() - } - - /// Return the capture group index corresponding to the given name in the - /// given pattern. If no such capture group name exists in the given - /// pattern, then this returns `None`. - /// - /// If the given pattern ID is invalid, then this panics. - #[inline] - pub fn capture_name_to_index( - &self, - pid: PatternID, - name: &str, - ) -> Option<usize> { - assert!(pid.as_usize() < self.pattern_len(), "invalid pattern ID"); - self.capture_name_to_index[pid].get(name).cloned() - } - - // TODO: add iterators over capture group names. - // Do we also permit indexing? - - /// Returns an iterator over all pattern IDs in this NFA. - #[inline] - pub fn patterns(&self) -> PatternIter { - PatternIter { - it: PatternID::iter(self.pattern_len()), - _marker: core::marker::PhantomData, - } - } - - /// Return the ID of the initial anchored state of this NFA. - #[inline] - pub fn start_anchored(&self) -> StateID { - self.start_anchored - } - - /// Set the anchored starting state ID for this NFA. - #[inline] - pub fn set_start_anchored(&mut self, id: StateID) { - self.start_anchored = id; - } - - /// Return the ID of the initial unanchored state of this NFA. - #[inline] - pub fn start_unanchored(&self) -> StateID { - self.start_unanchored - } - - /// Set the unanchored starting state ID for this NFA. - #[inline] - pub fn set_start_unanchored(&mut self, id: StateID) { - self.start_unanchored = id; - } - - /// Return the ID of the initial anchored state for the given pattern. - /// - /// If the pattern doesn't exist in this NFA, then this panics. - #[inline] - pub fn start_pattern(&self, pid: PatternID) -> StateID { - self.start_pattern[pid] - } - - /// Get the byte class set for this NFA. - #[inline] - pub fn byte_class_set(&self) -> &ByteClassSet { - &self.byte_class_set - } - - /// Return a reference to the NFA state corresponding to the given ID. - #[inline] - pub fn state(&self, id: StateID) -> &State { - &self.states[id] - } - - /// Returns a slice of all states in this NFA. - /// - /// The slice returned may be indexed by a `StateID` generated by `add`. 
- #[inline] - pub fn states(&self) -> &[State] { - &self.states - } - - #[inline] - pub fn is_always_start_anchored(&self) -> bool { - self.start_anchored() == self.start_unanchored() - } - - #[inline] - pub fn has_any_look(&self) -> bool { - self.facts.has_any_look() - } - - #[inline] - pub fn has_any_anchor(&self) -> bool { - self.facts.has_any_anchor() - } - - #[inline] - pub fn has_word_boundary(&self) -> bool { - self.has_word_boundary_unicode() || self.has_word_boundary_ascii() - } - - #[inline] - pub fn has_word_boundary_unicode(&self) -> bool { - self.facts.has_word_boundary_unicode() - } - - #[inline] - pub fn has_word_boundary_ascii(&self) -> bool { - self.facts.has_word_boundary_ascii() - } - - /// Returns the memory usage, in bytes, of this NFA. - /// - /// This does **not** include the stack size used up by this NFA. To - /// compute that, use `std::mem::size_of::<NFA>()`. - #[inline] - pub fn memory_usage(&self) -> usize { - self.states.len() * mem::size_of::<State>() - + self.memory_states - + self.start_pattern.len() * mem::size_of::<StateID>() - } - - // Why do we define a bunch of 'add_*' routines below instead of just - // defining a single 'add' routine that accepts a 'State'? Indeed, for most - // of the 'add_*' routines below, such a simple API would be more than - // appropriate. Unfortunately, adding capture states and, to a lesser - // extent, match states, is a bit more complex. Namely, when we add a - // capture state, we *really* want to know the corresponding capture - // group's name and index and what not, so that we can update other state - // inside this NFA. But, e.g., the capture group name is not and should - // not be included in 'State::Capture'. So what are our choices? - // - // 1) Define one 'add' and require some additional optional parameters. - // This feels quite ugly, and adds unnecessary complexity to more common - // and simpler cases. - // - // 2) Do what we do below. The sad thing is that our API is bigger with - // more methods. But each method is very specific and hopefully simple. - // - // 3) Define a new enum, say, 'StateWithInfo', or something that permits - // providing both a State and some extra ancillary info in some cases. This - // doesn't seem too bad to me, but seems slightly worse than (2) because of - // the additional type required. - // - // 4) Abandon the idea that we have to specify things like the capture - // group name when we add the Capture state to the NFA. We would then need - // to add other methods that permit the caller to add this additional state - // "out of band." Other than it introducing some additional complexity, I - // decided against this because I wanted the NFA builder API to make it - // as hard as possible to build a bad or invalid NFA. Using the approach - // below, as you'll see, permits us to do a lot of strict checking of our - // inputs and return an error if we see something we don't expect. 
- - pub fn add_range(&mut self, range: Transition) -> Result<StateID, Error> { - self.byte_class_set.set_range(range.start, range.end); - self.add_state(State::Range { range }) - } - - pub fn add_sparse( - &mut self, - sparse: SparseTransitions, - ) -> Result<StateID, Error> { - for range in sparse.ranges.iter() { - self.byte_class_set.set_range(range.start, range.end); - } - self.add_state(State::Sparse(sparse)) - } - - pub fn add_look( - &mut self, - next: StateID, - look: Look, - ) -> Result<StateID, Error> { - self.facts.set_has_any_look(true); - look.add_to_byteset(&mut self.byte_class_set); - match look { - Look::StartLine - | Look::EndLine - | Look::StartText - | Look::EndText => { - self.facts.set_has_any_anchor(true); - } - Look::WordBoundaryUnicode | Look::WordBoundaryUnicodeNegate => { - self.facts.set_has_word_boundary_unicode(true); - } - Look::WordBoundaryAscii | Look::WordBoundaryAsciiNegate => { - self.facts.set_has_word_boundary_ascii(true); - } - } - self.add_state(State::Look { look, next }) - } - - pub fn add_union( - &mut self, - alternates: Box<[StateID]>, - ) -> Result<StateID, Error> { - self.add_state(State::Union { alternates }) - } - - pub fn add_capture_start( - &mut self, - next_id: StateID, - capture_index: u32, - name: Option<Arc<str>>, - ) -> Result<StateID, Error> { - let pid = self.current_pattern_id(); - let capture_index = match usize::try_from(capture_index) { - Err(_) => { - return Err(Error::invalid_capture_index(core::usize::MAX)) - } - Ok(capture_index) => capture_index, - }; - // Do arithmetic to find our absolute slot index first, to make sure - // the index is at least possibly valid (doesn't overflow). - let relative_slot = match capture_index.checked_mul(2) { - Some(relative_slot) => relative_slot, - None => return Err(Error::invalid_capture_index(capture_index)), - }; - let slot = match relative_slot.checked_add(self.capture_slot_len()) { - Some(slot) => slot, - None => return Err(Error::invalid_capture_index(capture_index)), - }; - // Make sure we have space to insert our (pid,index)|-->name mapping. - if pid.as_usize() >= self.capture_index_to_name.len() { - // Note that we require that if you're adding capturing groups, - // then there must be at least one capturing group per pattern. - // Moreover, whenever we expand our space here, it should always - // first be for the first capture group (at index==0). - if pid.as_usize() > self.capture_index_to_name.len() - || capture_index > 0 - { - return Err(Error::invalid_capture_index(capture_index)); - } - self.capture_name_to_index.push(CaptureNameMap::new()); - self.capture_index_to_name.push(vec![]); - } - if capture_index >= self.capture_index_to_name[pid].len() { - // We require that capturing groups are added in correspondence - // to their index. So no discontinuous indices. This is likely - // overly strict, but also makes it simpler to provide guarantees - // about our capturing group data. 
- if capture_index > self.capture_index_to_name[pid].len() { - return Err(Error::invalid_capture_index(capture_index)); - } - self.capture_index_to_name[pid].push(None); - } - if let Some(ref name) = name { - self.capture_name_to_index[pid] - .insert(Arc::clone(name), capture_index); - } - self.capture_index_to_name[pid][capture_index] = name; - self.add_state(State::Capture { next: next_id, slot }) - } - - pub fn add_capture_end( - &mut self, - next_id: StateID, - capture_index: u32, - ) -> Result<StateID, Error> { - let pid = self.current_pattern_id(); - let capture_index = match usize::try_from(capture_index) { - Err(_) => { - return Err(Error::invalid_capture_index(core::usize::MAX)) - } - Ok(capture_index) => capture_index, - }; - // If we haven't already added this capture group via a corresponding - // 'add_capture_start' call, then we consider the index given to be - // invalid. - if pid.as_usize() >= self.capture_index_to_name.len() - || capture_index >= self.capture_index_to_name[pid].len() - { - return Err(Error::invalid_capture_index(capture_index)); - } - // Since we've already confirmed that this capture index is invalid - // and has a corresponding starting slot, we know the multiplcation - // has already been done and succeeded. - let relative_slot_start = capture_index.checked_mul(2).unwrap(); - let relative_slot = match relative_slot_start.checked_add(1) { - Some(relative_slot) => relative_slot, - None => return Err(Error::invalid_capture_index(capture_index)), - }; - let slot = match relative_slot.checked_add(self.capture_slot_len()) { - Some(slot) => slot, - None => return Err(Error::invalid_capture_index(capture_index)), - }; - self.add_state(State::Capture { next: next_id, slot }) - } - - pub fn add_fail(&mut self) -> Result<StateID, Error> { - self.add_state(State::Fail) - } - - /// Add a new match state to this NFA and return its state ID. - pub fn add_match(&mut self) -> Result<StateID, Error> { - let pattern_id = self.current_pattern_id(); - let sid = self.add_state(State::Match { id: pattern_id })?; - Ok(sid) - } - - /// Finish compiling the current pattern and return its identifier. The - /// given ID should be the state ID corresponding to the anchored starting - /// state for matching this pattern. - pub fn finish_pattern( - &mut self, - start_id: StateID, - ) -> Result<PatternID, Error> { - // We've gotta make sure that we never permit the user to add more - // patterns than we can identify. So if we're already at the limit, - // then return an error. This is somewhat non-ideal since this won't - // result in an error until trying to complete the compilation of a - // pattern instead of starting it. - if self.start_pattern.len() >= PatternID::LIMIT { - return Err(Error::too_many_patterns( - self.start_pattern.len().saturating_add(1), - )); - } - let pid = self.current_pattern_id(); - self.start_pattern.push(start_id); - // Add the number of new slots created by this pattern. This is always - // equivalent to '2 * caps.len()', where 'caps.len()' is the number of - // new capturing groups introduced by the pattern we're finishing. - let new_cap_groups = self - .capture_index_to_name - .get(pid.as_usize()) - .map_or(0, |caps| caps.len()); - let new_slots = match new_cap_groups.checked_mul(2) { - Some(new_slots) => new_slots, - None => { - // Just return the biggest index that we know exists. 
- let index = new_cap_groups.saturating_sub(1); - return Err(Error::invalid_capture_index(index)); - } - }; - let slot_start = self.capture_slot_len(); - self.patterns_to_slots.push(slot_start..(slot_start + new_slots)); - Ok(pid) - } - - fn add_state(&mut self, state: State) -> Result<StateID, Error> { - let id = StateID::new(self.states.len()) - .map_err(|_| Error::too_many_states(self.states.len()))?; - self.memory_states += state.memory_usage(); - self.states.push(state); - Ok(id) - } - - /// Remap the transitions in every state of this NFA using the given map. - /// The given map should be indexed according to state ID namespace used by - /// the transitions of the states currently in this NFA. - /// - /// This may be used during the final phases of an NFA compiler, which - /// turns its intermediate NFA into the final NFA. Remapping may be - /// required to bring the state pointers from the intermediate NFA to the - /// final NFA. - pub fn remap(&mut self, old_to_new: &[StateID]) { - for state in &mut self.states { - state.remap(old_to_new); - } - self.start_anchored = old_to_new[self.start_anchored]; - self.start_unanchored = old_to_new[self.start_unanchored]; - for (pid, id) in self.start_pattern.iter_mut().with_pattern_ids() { - *id = old_to_new[*id]; - } - } - - /// Clear this NFA such that it has zero states and is otherwise "empty." - /// - /// An empty NFA is useful as a starting point for building one. It is - /// itself not intended to be used for matching. For example, its starting - /// state identifiers are configured to be `0`, but since it has no states, - /// the identifiers are invalid. - pub fn clear(&mut self) { - self.states.clear(); - self.start_anchored = StateID::ZERO; - self.start_unanchored = StateID::ZERO; - self.start_pattern.clear(); - self.patterns_to_slots.clear(); - self.capture_name_to_index.clear(); - self.capture_index_to_name.clear(); - self.byte_class_set = ByteClassSet::empty(); - self.facts = Facts::default(); - self.memory_states = 0; - } -} - -impl fmt::Debug for NFA { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - writeln!(f, "thompson::NFA(")?; - for (sid, state) in self.states.iter().with_state_ids() { - let status = if sid == self.start_anchored { - '^' - } else if sid == self.start_unanchored { - '>' - } else { - ' ' - }; - writeln!(f, "{}{:06?}: {:?}", status, sid.as_usize(), state)?; - } - if self.pattern_len() > 1 { - writeln!(f, "")?; - for pid in self.patterns() { - let sid = self.start_pattern(pid); - writeln!( - f, - "START({:06?}): {:?}", - pid.as_usize(), - sid.as_usize() - )?; - } - } - writeln!(f, "")?; - writeln!( - f, - "transition equivalence classes: {:?}", - self.byte_class_set().byte_classes() - )?; - writeln!(f, ")")?; - Ok(()) - } -} - -/// A state in a final compiled NFA. -#[derive(Clone, Eq, PartialEq)] -pub enum State { - /// A state that transitions to `next` if and only if the current input - /// byte is in the range `[start, end]` (inclusive). - /// - /// This is a special case of Sparse in that it encodes only one transition - /// (and therefore avoids the allocation). - Range { range: Transition }, - /// A state with possibly many transitions, represented in a sparse - /// fashion. Transitions are ordered lexicographically by input range. As - /// such, this may only be used when every transition has equal priority. - /// (In practice, this is only used for encoding UTF-8 automata.) - Sparse(SparseTransitions), - /// A conditional epsilon transition satisfied via some sort of - /// look-around. 
- Look { look: Look, next: StateID }, - /// An alternation such that there exists an epsilon transition to all - /// states in `alternates`, where matches found via earlier transitions - /// are preferred over later transitions. - Union { alternates: Box<[StateID]> }, - /// An empty state that records a capture location. - /// - /// From the perspective of finite automata, this is precisely equivalent - /// to an epsilon transition, but serves the purpose of instructing NFA - /// simulations to record additional state when the finite state machine - /// passes through this epsilon transition. - /// - /// These transitions are treated as epsilon transitions with no additional - /// effects in DFAs. - /// - /// 'slot' in this context refers to the specific capture group offset that - /// is being recorded. Each capturing group has two slots corresponding to - /// the start and end of the matching portion of that group. - /// A fail state. When encountered, the automaton is guaranteed to never - /// reach a match state. - Capture { next: StateID, slot: usize }, - /// A state that cannot be transitioned out of. If a search reaches this - /// state, then no match is possible and the search should terminate. - Fail, - /// A match state. There is exactly one such occurrence of this state for - /// each regex compiled into the NFA. - Match { id: PatternID }, -} - -impl State { - /// Returns true if and only if this state contains one or more epsilon - /// transitions. - #[inline] - pub fn is_epsilon(&self) -> bool { - match *self { - State::Range { .. } - | State::Sparse { .. } - | State::Fail - | State::Match { .. } => false, - State::Look { .. } - | State::Union { .. } - | State::Capture { .. } => true, - } - } - - /// Returns the heap memory usage of this NFA state in bytes. - fn memory_usage(&self) -> usize { - match *self { - State::Range { .. } - | State::Look { .. } - | State::Capture { .. } - | State::Match { .. } - | State::Fail => 0, - State::Sparse(SparseTransitions { ref ranges }) => { - ranges.len() * mem::size_of::<Transition>() - } - State::Union { ref alternates } => { - alternates.len() * mem::size_of::<StateID>() - } - } - } - - /// Remap the transitions in this state using the given map. Namely, the - /// given map should be indexed according to the transitions currently - /// in this state. - /// - /// This is used during the final phase of the NFA compiler, which turns - /// its intermediate NFA into the final NFA. - fn remap(&mut self, remap: &[StateID]) { - match *self { - State::Range { ref mut range } => range.next = remap[range.next], - State::Sparse(SparseTransitions { ref mut ranges }) => { - for r in ranges.iter_mut() { - r.next = remap[r.next]; - } - } - State::Look { ref mut next, .. } => *next = remap[*next], - State::Union { ref mut alternates } => { - for alt in alternates.iter_mut() { - *alt = remap[*alt]; - } - } - State::Capture { ref mut next, .. } => *next = remap[*next], - State::Fail => {} - State::Match { .. 
} => {} - } - } -} - -impl fmt::Debug for State { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match *self { - State::Range { ref range } => range.fmt(f), - State::Sparse(SparseTransitions { ref ranges }) => { - let rs = ranges - .iter() - .map(|t| format!("{:?}", t)) - .collect::<Vec<String>>() - .join(", "); - write!(f, "sparse({})", rs) - } - State::Look { ref look, next } => { - write!(f, "{:?} => {:?}", look, next.as_usize()) - } - State::Union { ref alternates } => { - let alts = alternates - .iter() - .map(|id| format!("{:?}", id.as_usize())) - .collect::<Vec<String>>() - .join(", "); - write!(f, "alt({})", alts) - } - State::Capture { next, slot } => { - write!(f, "capture({:?}) => {:?}", slot, next.as_usize()) - } - State::Fail => write!(f, "FAIL"), - State::Match { id } => write!(f, "MATCH({:?})", id.as_usize()), - } - } -} - -/// A collection of facts about an NFA. -/// -/// There are no real cohesive principles behind what gets put in here. For -/// the most part, it is implementation driven. -#[derive(Clone, Copy, Debug, Default)] -struct Facts { - /// Various yes/no facts about this NFA. - bools: u16, -} - -impl Facts { - define_bool!(0, has_any_look, set_has_any_look); - define_bool!(1, has_any_anchor, set_has_any_anchor); - define_bool!(2, has_word_boundary_unicode, set_has_word_boundary_unicode); - define_bool!(3, has_word_boundary_ascii, set_has_word_boundary_ascii); -} - -/// A sequence of transitions used to represent a sparse state. -#[derive(Clone, Debug, Eq, PartialEq)] -pub struct SparseTransitions { - pub ranges: Box<[Transition]>, -} - -impl SparseTransitions { - pub fn matches(&self, haystack: &[u8], at: usize) -> Option<StateID> { - haystack.get(at).and_then(|&b| self.matches_byte(b)) - } - - pub fn matches_unit(&self, unit: alphabet::Unit) -> Option<StateID> { - unit.as_u8().map_or(None, |byte| self.matches_byte(byte)) - } - - pub fn matches_byte(&self, byte: u8) -> Option<StateID> { - for t in self.ranges.iter() { - if t.start > byte { - break; - } else if t.matches_byte(byte) { - return Some(t.next); - } - } - None - - /* - // This is an alternative implementation that uses binary search. In - // some ad hoc experiments, like - // - // smallishru=OpenSubtitles2018.raw.sample.smallish.ru - // regex-cli find nfa thompson pikevm -b "@$smallishru" '\b\w+\b' - // - // I could not observe any improvement, and in fact, things seemed to - // be a bit slower. - self.ranges - .binary_search_by(|t| { - if t.end < byte { - core::cmp::Ordering::Less - } else if t.start > byte { - core::cmp::Ordering::Greater - } else { - core::cmp::Ordering::Equal - } - }) - .ok() - .map(|i| self.ranges[i].next) - */ - } -} - -/// A transition to another state, only if the given byte falls in the -/// inclusive range specified. 
-#[derive(Clone, Copy, Eq, Hash, PartialEq)] -pub struct Transition { - pub start: u8, - pub end: u8, - pub next: StateID, -} - -impl Transition { - pub fn matches(&self, haystack: &[u8], at: usize) -> bool { - haystack.get(at).map_or(false, |&b| self.matches_byte(b)) - } - - pub fn matches_unit(&self, unit: alphabet::Unit) -> bool { - unit.as_u8().map_or(false, |byte| self.matches_byte(byte)) - } - - pub fn matches_byte(&self, byte: u8) -> bool { - self.start <= byte && byte <= self.end - } -} - -impl fmt::Debug for Transition { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - use crate::util::DebugByte; - - let Transition { start, end, next } = *self; - if self.start == self.end { - write!(f, "{:?} => {:?}", DebugByte(start), next.as_usize()) - } else { - write!( - f, - "{:?}-{:?} => {:?}", - DebugByte(start), - DebugByte(end), - next.as_usize(), - ) - } - } -} - -/// A conditional NFA epsilon transition. -/// -/// A simulation of the NFA can only move through this epsilon transition if -/// the current position satisfies some look-around property. Some assertions -/// are look-behind (StartLine, StartText), some assertions are look-ahead -/// (EndLine, EndText) while other assertions are both look-behind and -/// look-ahead (WordBoundary*). -#[derive(Clone, Copy, Debug, Eq, PartialEq)] -pub enum Look { - /// The previous position is either `\n` or the current position is the - /// beginning of the haystack (i.e., at position `0`). - StartLine = 1 << 0, - /// The next position is either `\n` or the current position is the end of - /// the haystack (i.e., at position `haystack.len()`). - EndLine = 1 << 1, - /// The current position is the beginning of the haystack (i.e., at - /// position `0`). - StartText = 1 << 2, - /// The current position is the end of the haystack (i.e., at position - /// `haystack.len()`). - EndText = 1 << 3, - /// When tested at position `i`, where `p=decode_utf8_rev(&haystack[..i])` - /// and `n=decode_utf8(&haystack[i..])`, this assertion passes if and only - /// if `is_word(p) != is_word(n)`. If `i=0`, then `is_word(p)=false` and if - /// `i=haystack.len()`, then `is_word(n)=false`. - WordBoundaryUnicode = 1 << 4, - /// Same as for `WordBoundaryUnicode`, but requires that - /// `is_word(p) == is_word(n)`. - WordBoundaryUnicodeNegate = 1 << 5, - /// When tested at position `i`, where `p=haystack[i-1]` and - /// `n=haystack[i]`, this assertion passes if and only if `is_word(p) - /// != is_word(n)`. If `i=0`, then `is_word(p)=false` and if - /// `i=haystack.len()`, then `is_word(n)=false`. - WordBoundaryAscii = 1 << 6, - /// Same as for `WordBoundaryAscii`, but requires that - /// `is_word(p) == is_word(n)`. - /// - /// Note that it is possible for this assertion to match at positions that - /// split the UTF-8 encoding of a codepoint. For this reason, this may only - /// be used when UTF-8 mode is disable in the regex syntax. - WordBoundaryAsciiNegate = 1 << 7, -} - -impl Look { - #[inline(always)] - pub fn matches(&self, bytes: &[u8], at: usize) -> bool { - match *self { - Look::StartLine => at == 0 || bytes[at - 1] == b'\n', - Look::EndLine => at == bytes.len() || bytes[at] == b'\n', - Look::StartText => at == 0, - Look::EndText => at == bytes.len(), - Look::WordBoundaryUnicode => { - let word_before = is_word_char_rev(bytes, at); - let word_after = is_word_char_fwd(bytes, at); - word_before != word_after - } - Look::WordBoundaryUnicodeNegate => { - // This is pretty subtle. Why do we need to do UTF-8 decoding - // here? Well... 
at time of writing, the is_word_char_{fwd,rev} - // routines will only return true if there is a valid UTF-8 - // encoding of a "word" codepoint, and false in every other - // case (including invalid UTF-8). This means that in regions - // of invalid UTF-8 (which might be a subset of valid UTF-8!), - // it would result in \B matching. While this would be - // questionable in the context of truly invalid UTF-8, it is - // *certainly* wrong to report match boundaries that split the - // encoding of a codepoint. So to work around this, we ensure - // that we can decode a codepoint on either side of `at`. If - // either direction fails, then we don't permit \B to match at - // all. - // - // Now, this isn't exactly optimal from a perf perspective. We - // could try and detect this in is_word_char_{fwd,rev}, but - // it's not clear if it's worth it. \B is, after all, rarely - // used. - // - // And in particular, we do *not* have to do this with \b, - // because \b *requires* that at least one side of `at` be a - // "word" codepoint, which in turn implies one side of `at` - // must be valid UTF-8. This in turn implies that \b can never - // split a valid UTF-8 encoding of a codepoint. In the case - // where one side of `at` is truly invalid UTF-8 and the other - // side IS a word codepoint, then we want \b to match since it - // represents a valid UTF-8 boundary. It also makes sense. For - // example, you'd want \b\w+\b to match 'abc' in '\xFFabc\xFF'. - let word_before = at > 0 - && match decode_last_utf8(&bytes[..at]) { - None | Some(Err(_)) => return false, - Some(Ok(_)) => is_word_char_rev(bytes, at), - }; - let word_after = at < bytes.len() - && match decode_utf8(&bytes[at..]) { - None | Some(Err(_)) => return false, - Some(Ok(_)) => is_word_char_fwd(bytes, at), - }; - word_before == word_after - } - Look::WordBoundaryAscii => { - let word_before = at > 0 && is_word_byte(bytes[at - 1]); - let word_after = at < bytes.len() && is_word_byte(bytes[at]); - word_before != word_after - } - Look::WordBoundaryAsciiNegate => { - let word_before = at > 0 && is_word_byte(bytes[at - 1]); - let word_after = at < bytes.len() && is_word_byte(bytes[at]); - word_before == word_after - } - } - } - - /// Create a look-around assertion from its corresponding integer (as - /// defined in `Look`). If the given integer does not correspond to any - /// assertion, then None is returned. - fn from_int(n: u8) -> Option<Look> { - match n { - 0b0000_0001 => Some(Look::StartLine), - 0b0000_0010 => Some(Look::EndLine), - 0b0000_0100 => Some(Look::StartText), - 0b0000_1000 => Some(Look::EndText), - 0b0001_0000 => Some(Look::WordBoundaryUnicode), - 0b0010_0000 => Some(Look::WordBoundaryUnicodeNegate), - 0b0100_0000 => Some(Look::WordBoundaryAscii), - 0b1000_0000 => Some(Look::WordBoundaryAsciiNegate), - _ => None, - } - } - - /// Flip the look-around assertion to its equivalent for reverse searches. - fn reversed(&self) -> Look { - match *self { - Look::StartLine => Look::EndLine, - Look::EndLine => Look::StartLine, - Look::StartText => Look::EndText, - Look::EndText => Look::StartText, - Look::WordBoundaryUnicode => Look::WordBoundaryUnicode, - Look::WordBoundaryUnicodeNegate => Look::WordBoundaryUnicodeNegate, - Look::WordBoundaryAscii => Look::WordBoundaryAscii, - Look::WordBoundaryAsciiNegate => Look::WordBoundaryAsciiNegate, - } - } - - /// Split up the given byte classes into equivalence classes in a way that - /// is consistent with this look-around assertion. 
- fn add_to_byteset(&self, set: &mut ByteClassSet) { - match *self { - Look::StartText | Look::EndText => {} - Look::StartLine | Look::EndLine => { - set.set_range(b'\n', b'\n'); - } - Look::WordBoundaryUnicode - | Look::WordBoundaryUnicodeNegate - | Look::WordBoundaryAscii - | Look::WordBoundaryAsciiNegate => { - // We need to mark all ranges of bytes whose pairs result in - // evaluating \b differently. This isn't technically correct - // for Unicode word boundaries, but DFAs can't handle those - // anyway, and thus, the byte classes don't need to either - // since they are themselves only used in DFAs. - let iswb = regex_syntax::is_word_byte; - let mut b1: u16 = 0; - let mut b2: u16; - while b1 <= 255 { - b2 = b1 + 1; - while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) { - b2 += 1; - } - set.set_range(b1 as u8, (b2 - 1) as u8); - b1 = b2; - } - } - } - } -} - -/// LookSet is a memory-efficient set of look-around assertions. Callers may -/// idempotently insert or remove any look-around assertion from a set. -#[repr(transparent)] -#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)] -pub(crate) struct LookSet { - set: u8, -} - -impl LookSet { - /// Return a LookSet from its representation. - pub(crate) fn from_repr(repr: u8) -> LookSet { - LookSet { set: repr } - } - - /// Return a mutable LookSet from a mutable pointer to its representation. - pub(crate) fn from_repr_mut(repr: &mut u8) -> &mut LookSet { - // SAFETY: This is safe since a LookSet is repr(transparent) where its - // repr is a u8. - unsafe { core::mem::transmute::<&mut u8, &mut LookSet>(repr) } - } - - /// Return true if and only if this set is empty. - pub(crate) fn is_empty(&self) -> bool { - self.set == 0 - } - - /// Clears this set such that it has no assertions in it. - pub(crate) fn clear(&mut self) { - self.set = 0; - } - - /// Insert the given look-around assertion into this set. If the assertion - /// already exists, then this is a no-op. - pub(crate) fn insert(&mut self, look: Look) { - self.set |= look as u8; - } - - /// Remove the given look-around assertion from this set. If the assertion - /// is not in this set, then this is a no-op. - #[cfg(test)] - pub(crate) fn remove(&mut self, look: Look) { - self.set &= !(look as u8); - } - - /// Return true if and only if the given assertion is in this set. - pub(crate) fn contains(&self, look: Look) -> bool { - (look as u8) & self.set != 0 - } - - /// Subtract the given `other` set from the `self` set and return a new - /// set. - pub(crate) fn subtract(&self, other: LookSet) -> LookSet { - LookSet { set: self.set & !other.set } - } - - /// Return the intersection of the given `other` set with the `self` set - /// and return the resulting set. - pub(crate) fn intersect(&self, other: LookSet) -> LookSet { - LookSet { set: self.set & other.set } - } -} - -impl core::fmt::Debug for LookSet { - fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { - let mut members = vec![]; - for i in 0..8 { - let look = match Look::from_int(1 << i) { - None => continue, - Some(look) => look, - }; - if self.contains(look) { - members.push(look); - } - } - f.debug_tuple("LookSet").field(&members).finish() - } -} - -/// An iterator over all pattern IDs in an NFA. -pub struct PatternIter<'a> { - it: PatternIDIter, - /// We explicitly associate a lifetime with this iterator even though we - /// don't actually borrow anything from the NFA. We do this for backward - /// compatibility purposes. 
If we ever do need to borrow something from - /// the NFA, then we can and just get rid of this marker without breaking - /// the public API. - _marker: core::marker::PhantomData<&'a ()>, -} - -impl<'a> Iterator for PatternIter<'a> { - type Item = PatternID; - - fn next(&mut self) -> Option<PatternID> { - self.it.next() - } -} - -#[cfg(test)] -mod tests { - use super::*; - // TODO: Replace tests using DFA with NFA matching engine once implemented. - use crate::dfa::{dense, Automaton}; - - #[test] - fn always_match() { - let nfa = NFA::always_match(); - let dfa = dense::Builder::new().build_from_nfa(&nfa).unwrap(); - let find = |input, start, end| { - dfa.find_leftmost_fwd_at(None, None, input, start, end) - .unwrap() - .map(|m| m.offset()) - }; - - assert_eq!(Some(0), find(b"", 0, 0)); - assert_eq!(Some(0), find(b"a", 0, 1)); - assert_eq!(Some(1), find(b"a", 1, 1)); - assert_eq!(Some(0), find(b"ab", 0, 2)); - assert_eq!(Some(1), find(b"ab", 1, 2)); - assert_eq!(Some(2), find(b"ab", 2, 2)); - } - - #[test] - fn never_match() { - let nfa = NFA::never_match(); - let dfa = dense::Builder::new().build_from_nfa(&nfa).unwrap(); - let find = |input, start, end| { - dfa.find_leftmost_fwd_at(None, None, input, start, end) - .unwrap() - .map(|m| m.offset()) - }; - - assert_eq!(None, find(b"", 0, 0)); - assert_eq!(None, find(b"a", 0, 1)); - assert_eq!(None, find(b"a", 1, 1)); - assert_eq!(None, find(b"ab", 0, 2)); - assert_eq!(None, find(b"ab", 1, 2)); - assert_eq!(None, find(b"ab", 2, 2)); - } - - #[test] - fn look_set() { - let mut f = LookSet::default(); - assert!(!f.contains(Look::StartText)); - assert!(!f.contains(Look::EndText)); - assert!(!f.contains(Look::StartLine)); - assert!(!f.contains(Look::EndLine)); - assert!(!f.contains(Look::WordBoundaryUnicode)); - assert!(!f.contains(Look::WordBoundaryUnicodeNegate)); - assert!(!f.contains(Look::WordBoundaryAscii)); - assert!(!f.contains(Look::WordBoundaryAsciiNegate)); - - f.insert(Look::StartText); - assert!(f.contains(Look::StartText)); - f.remove(Look::StartText); - assert!(!f.contains(Look::StartText)); - - f.insert(Look::EndText); - assert!(f.contains(Look::EndText)); - f.remove(Look::EndText); - assert!(!f.contains(Look::EndText)); - - f.insert(Look::StartLine); - assert!(f.contains(Look::StartLine)); - f.remove(Look::StartLine); - assert!(!f.contains(Look::StartLine)); - - f.insert(Look::EndLine); - assert!(f.contains(Look::EndLine)); - f.remove(Look::EndLine); - assert!(!f.contains(Look::EndLine)); - - f.insert(Look::WordBoundaryUnicode); - assert!(f.contains(Look::WordBoundaryUnicode)); - f.remove(Look::WordBoundaryUnicode); - assert!(!f.contains(Look::WordBoundaryUnicode)); - - f.insert(Look::WordBoundaryUnicodeNegate); - assert!(f.contains(Look::WordBoundaryUnicodeNegate)); - f.remove(Look::WordBoundaryUnicodeNegate); - assert!(!f.contains(Look::WordBoundaryUnicodeNegate)); - - f.insert(Look::WordBoundaryAscii); - assert!(f.contains(Look::WordBoundaryAscii)); - f.remove(Look::WordBoundaryAscii); - assert!(!f.contains(Look::WordBoundaryAscii)); - - f.insert(Look::WordBoundaryAsciiNegate); - assert!(f.contains(Look::WordBoundaryAsciiNegate)); - f.remove(Look::WordBoundaryAsciiNegate); - assert!(!f.contains(Look::WordBoundaryAsciiNegate)); - } - - #[test] - fn look_matches_start_line() { - let look = Look::StartLine; - - assert!(look.matches(B(""), 0)); - assert!(look.matches(B("\n"), 0)); - assert!(look.matches(B("\n"), 1)); - assert!(look.matches(B("a"), 0)); - assert!(look.matches(B("\na"), 1)); - - assert!(!look.matches(B("a"), 1)); - 
assert!(!look.matches(B("a\na"), 1)); - } - - #[test] - fn look_matches_end_line() { - let look = Look::EndLine; - - assert!(look.matches(B(""), 0)); - assert!(look.matches(B("\n"), 1)); - assert!(look.matches(B("\na"), 0)); - assert!(look.matches(B("\na"), 2)); - assert!(look.matches(B("a\na"), 1)); - - assert!(!look.matches(B("a"), 0)); - assert!(!look.matches(B("\na"), 1)); - assert!(!look.matches(B("a\na"), 0)); - assert!(!look.matches(B("a\na"), 2)); - } - - #[test] - fn look_matches_start_text() { - let look = Look::StartText; - - assert!(look.matches(B(""), 0)); - assert!(look.matches(B("\n"), 0)); - assert!(look.matches(B("a"), 0)); - - assert!(!look.matches(B("\n"), 1)); - assert!(!look.matches(B("\na"), 1)); - assert!(!look.matches(B("a"), 1)); - assert!(!look.matches(B("a\na"), 1)); - } - - #[test] - fn look_matches_end_text() { - let look = Look::EndText; - - assert!(look.matches(B(""), 0)); - assert!(look.matches(B("\n"), 1)); - assert!(look.matches(B("\na"), 2)); - - assert!(!look.matches(B("\na"), 0)); - assert!(!look.matches(B("a\na"), 1)); - assert!(!look.matches(B("a"), 0)); - assert!(!look.matches(B("\na"), 1)); - assert!(!look.matches(B("a\na"), 0)); - assert!(!look.matches(B("a\na"), 2)); - } - - #[test] - fn look_matches_word_unicode() { - let look = Look::WordBoundaryUnicode; - - // \xF0\x9D\x9B\x83 = 𝛃 (in \w) - // \xF0\x90\x86\x80 = 𐆀 (not in \w) - - // Simple ASCII word boundaries. - assert!(look.matches(B("a"), 0)); - assert!(look.matches(B("a"), 1)); - assert!(look.matches(B("a "), 1)); - assert!(look.matches(B(" a "), 1)); - assert!(look.matches(B(" a "), 2)); - - // Unicode word boundaries with a non-ASCII codepoint. - assert!(look.matches(B("𝛃"), 0)); - assert!(look.matches(B("𝛃"), 4)); - assert!(look.matches(B("𝛃 "), 4)); - assert!(look.matches(B(" 𝛃 "), 1)); - assert!(look.matches(B(" 𝛃 "), 5)); - - // Unicode word boundaries between non-ASCII codepoints. - assert!(look.matches(B("𝛃𐆀"), 0)); - assert!(look.matches(B("𝛃𐆀"), 4)); - - // Non word boundaries for ASCII. - assert!(!look.matches(B(""), 0)); - assert!(!look.matches(B("ab"), 1)); - assert!(!look.matches(B("a "), 2)); - assert!(!look.matches(B(" a "), 0)); - assert!(!look.matches(B(" a "), 3)); - - // Non word boundaries with a non-ASCII codepoint. - assert!(!look.matches(B("𝛃b"), 4)); - assert!(!look.matches(B("𝛃 "), 5)); - assert!(!look.matches(B(" 𝛃 "), 0)); - assert!(!look.matches(B(" 𝛃 "), 6)); - assert!(!look.matches(B("𝛃"), 1)); - assert!(!look.matches(B("𝛃"), 2)); - assert!(!look.matches(B("𝛃"), 3)); - - // Non word boundaries with non-ASCII codepoints. - assert!(!look.matches(B("𝛃𐆀"), 1)); - assert!(!look.matches(B("𝛃𐆀"), 2)); - assert!(!look.matches(B("𝛃𐆀"), 3)); - assert!(!look.matches(B("𝛃𐆀"), 5)); - assert!(!look.matches(B("𝛃𐆀"), 6)); - assert!(!look.matches(B("𝛃𐆀"), 7)); - assert!(!look.matches(B("𝛃𐆀"), 8)); - } - - #[test] - fn look_matches_word_ascii() { - let look = Look::WordBoundaryAscii; - - // \xF0\x9D\x9B\x83 = 𝛃 (in \w) - // \xF0\x90\x86\x80 = 𐆀 (not in \w) - - // Simple ASCII word boundaries. - assert!(look.matches(B("a"), 0)); - assert!(look.matches(B("a"), 1)); - assert!(look.matches(B("a "), 1)); - assert!(look.matches(B(" a "), 1)); - assert!(look.matches(B(" a "), 2)); - - // Unicode word boundaries with a non-ASCII codepoint. Since this is - // an ASCII word boundary, none of these match. 
- assert!(!look.matches(B("𝛃"), 0)); - assert!(!look.matches(B("𝛃"), 4)); - assert!(!look.matches(B("𝛃 "), 4)); - assert!(!look.matches(B(" 𝛃 "), 1)); - assert!(!look.matches(B(" 𝛃 "), 5)); - - // Unicode word boundaries between non-ASCII codepoints. Again, since - // this is an ASCII word boundary, none of these match. - assert!(!look.matches(B("𝛃𐆀"), 0)); - assert!(!look.matches(B("𝛃𐆀"), 4)); - - // Non word boundaries for ASCII. - assert!(!look.matches(B(""), 0)); - assert!(!look.matches(B("ab"), 1)); - assert!(!look.matches(B("a "), 2)); - assert!(!look.matches(B(" a "), 0)); - assert!(!look.matches(B(" a "), 3)); - - // Non word boundaries with a non-ASCII codepoint. - assert!(look.matches(B("𝛃b"), 4)); - assert!(!look.matches(B("𝛃 "), 5)); - assert!(!look.matches(B(" 𝛃 "), 0)); - assert!(!look.matches(B(" 𝛃 "), 6)); - assert!(!look.matches(B("𝛃"), 1)); - assert!(!look.matches(B("𝛃"), 2)); - assert!(!look.matches(B("𝛃"), 3)); - - // Non word boundaries with non-ASCII codepoints. - assert!(!look.matches(B("𝛃𐆀"), 1)); - assert!(!look.matches(B("𝛃𐆀"), 2)); - assert!(!look.matches(B("𝛃𐆀"), 3)); - assert!(!look.matches(B("𝛃𐆀"), 5)); - assert!(!look.matches(B("𝛃𐆀"), 6)); - assert!(!look.matches(B("𝛃𐆀"), 7)); - assert!(!look.matches(B("𝛃𐆀"), 8)); - } - - #[test] - fn look_matches_word_unicode_negate() { - let look = Look::WordBoundaryUnicodeNegate; - - // \xF0\x9D\x9B\x83 = 𝛃 (in \w) - // \xF0\x90\x86\x80 = 𐆀 (not in \w) - - // Simple ASCII word boundaries. - assert!(!look.matches(B("a"), 0)); - assert!(!look.matches(B("a"), 1)); - assert!(!look.matches(B("a "), 1)); - assert!(!look.matches(B(" a "), 1)); - assert!(!look.matches(B(" a "), 2)); - - // Unicode word boundaries with a non-ASCII codepoint. - assert!(!look.matches(B("𝛃"), 0)); - assert!(!look.matches(B("𝛃"), 4)); - assert!(!look.matches(B("𝛃 "), 4)); - assert!(!look.matches(B(" 𝛃 "), 1)); - assert!(!look.matches(B(" 𝛃 "), 5)); - - // Unicode word boundaries between non-ASCII codepoints. - assert!(!look.matches(B("𝛃𐆀"), 0)); - assert!(!look.matches(B("𝛃𐆀"), 4)); - - // Non word boundaries for ASCII. - assert!(look.matches(B(""), 0)); - assert!(look.matches(B("ab"), 1)); - assert!(look.matches(B("a "), 2)); - assert!(look.matches(B(" a "), 0)); - assert!(look.matches(B(" a "), 3)); - - // Non word boundaries with a non-ASCII codepoint. - assert!(look.matches(B("𝛃b"), 4)); - assert!(look.matches(B("𝛃 "), 5)); - assert!(look.matches(B(" 𝛃 "), 0)); - assert!(look.matches(B(" 𝛃 "), 6)); - // These don't match because they could otherwise return an offset that - // splits the UTF-8 encoding of a codepoint. - assert!(!look.matches(B("𝛃"), 1)); - assert!(!look.matches(B("𝛃"), 2)); - assert!(!look.matches(B("𝛃"), 3)); - - // Non word boundaries with non-ASCII codepoints. These also don't - // match because they could otherwise return an offset that splits the - // UTF-8 encoding of a codepoint. - assert!(!look.matches(B("𝛃𐆀"), 1)); - assert!(!look.matches(B("𝛃𐆀"), 2)); - assert!(!look.matches(B("𝛃𐆀"), 3)); - assert!(!look.matches(B("𝛃𐆀"), 5)); - assert!(!look.matches(B("𝛃𐆀"), 6)); - assert!(!look.matches(B("𝛃𐆀"), 7)); - // But this one does, since 𐆀 isn't a word codepoint, and 8 is the end - // of the haystack. So the "end" of the haystack isn't a word and 𐆀 - // isn't a word, thus, \B matches. - assert!(look.matches(B("𝛃𐆀"), 8)); - } - - #[test] - fn look_matches_word_ascii_negate() { - let look = Look::WordBoundaryAsciiNegate; - - // \xF0\x9D\x9B\x83 = 𝛃 (in \w) - // \xF0\x90\x86\x80 = 𐆀 (not in \w) - - // Simple ASCII word boundaries. 
- assert!(!look.matches(B("a"), 0)); - assert!(!look.matches(B("a"), 1)); - assert!(!look.matches(B("a "), 1)); - assert!(!look.matches(B(" a "), 1)); - assert!(!look.matches(B(" a "), 2)); - - // Unicode word boundaries with a non-ASCII codepoint. Since this is - // an ASCII word boundary, none of these match. - assert!(look.matches(B("𝛃"), 0)); - assert!(look.matches(B("𝛃"), 4)); - assert!(look.matches(B("𝛃 "), 4)); - assert!(look.matches(B(" 𝛃 "), 1)); - assert!(look.matches(B(" 𝛃 "), 5)); - - // Unicode word boundaries between non-ASCII codepoints. Again, since - // this is an ASCII word boundary, none of these match. - assert!(look.matches(B("𝛃𐆀"), 0)); - assert!(look.matches(B("𝛃𐆀"), 4)); - - // Non word boundaries for ASCII. - assert!(look.matches(B(""), 0)); - assert!(look.matches(B("ab"), 1)); - assert!(look.matches(B("a "), 2)); - assert!(look.matches(B(" a "), 0)); - assert!(look.matches(B(" a "), 3)); - - // Non word boundaries with a non-ASCII codepoint. - assert!(!look.matches(B("𝛃b"), 4)); - assert!(look.matches(B("𝛃 "), 5)); - assert!(look.matches(B(" 𝛃 "), 0)); - assert!(look.matches(B(" 𝛃 "), 6)); - assert!(look.matches(B("𝛃"), 1)); - assert!(look.matches(B("𝛃"), 2)); - assert!(look.matches(B("𝛃"), 3)); - - // Non word boundaries with non-ASCII codepoints. - assert!(look.matches(B("𝛃𐆀"), 1)); - assert!(look.matches(B("𝛃𐆀"), 2)); - assert!(look.matches(B("𝛃𐆀"), 3)); - assert!(look.matches(B("𝛃𐆀"), 5)); - assert!(look.matches(B("𝛃𐆀"), 6)); - assert!(look.matches(B("𝛃𐆀"), 7)); - assert!(look.matches(B("𝛃𐆀"), 8)); - } - - fn B<'a, T: 'a + ?Sized + AsRef<[u8]>>(string: &'a T) -> &'a [u8] { - string.as_ref() - } -} +pub use self::{ + builder::Builder, + error::BuildError, + nfa::{ + DenseTransitions, PatternIter, SparseTransitions, State, Transition, + NFA, + }, +}; +#[cfg(feature = "syntax")] +pub use compiler::{Compiler, Config, WhichCaptures}; diff --git a/vendor/regex-automata/src/nfa/thompson/nfa.rs b/vendor/regex-automata/src/nfa/thompson/nfa.rs new file mode 100644 index 000000000..2108fa338 --- /dev/null +++ b/vendor/regex-automata/src/nfa/thompson/nfa.rs @@ -0,0 +1,2101 @@ +use core::{fmt, mem}; + +use alloc::{boxed::Box, format, string::String, sync::Arc, vec, vec::Vec}; + +#[cfg(feature = "syntax")] +use crate::nfa::thompson::{ + compiler::{Compiler, Config}, + error::BuildError, +}; +use crate::{ + nfa::thompson::builder::Builder, + util::{ + alphabet::{self, ByteClassSet, ByteClasses}, + captures::{GroupInfo, GroupInfoError}, + look::{Look, LookMatcher, LookSet}, + primitives::{ + IteratorIndexExt, PatternID, PatternIDIter, SmallIndex, StateID, + }, + sparse_set::SparseSet, + }, +}; + +/// A byte oriented Thompson non-deterministic finite automaton (NFA). +/// +/// A Thompson NFA is a finite state machine that permits unconditional epsilon +/// transitions, but guarantees that there exists at most one non-epsilon +/// transition for each element in the alphabet for each state. +/// +/// An NFA may be used directly for searching, for analysis or to build +/// a deterministic finite automaton (DFA). +/// +/// # Cheap clones +/// +/// Since an NFA is a core data type in this crate that many other regex +/// engines are based on top of, it is convenient to give ownership of an NFA +/// to said regex engines. Because of this, an NFA uses reference counting +/// internally. Therefore, it is cheap to clone and it is encouraged to do so. 
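+///
+/// As a minimal sketch of what this enables (a hedged editorial example;
+/// `BoundedBacktracker::new_from_nfa` is assumed here to mirror the
+/// `PikeVM::new_from_nfa` constructor used elsewhere in this file), two
+/// engines can share one heap-allocated NFA, with each `clone` costing only
+/// a reference count increment:
+///
+/// ```
+/// use regex_automata::nfa::thompson::{
+///     backtrack::BoundedBacktracker, pikevm::PikeVM, NFA,
+/// };
+///
+/// let nfa = NFA::new(r"[0-9]{4}")?;
+/// // Both engines share ownership of the same underlying NFA allocation.
+/// let vm = PikeVM::new_from_nfa(nfa.clone())?;
+/// let bt = BoundedBacktracker::new_from_nfa(nfa)?;
+///
+/// let mut cache = vm.create_cache();
+/// assert!(vm.is_match(&mut cache, "1234"));
+/// let mut cache = bt.create_cache();
+/// assert!(bt.try_is_match(&mut cache, "1234")?);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```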
+///
+/// # Capabilities
+///
+/// Using an NFA for searching via the
+/// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) provides the most
+/// "power" of any regex engine in this crate. Namely, it supports the
+/// following in all cases:
+///
+/// 1. Detection of a match.
+/// 2. Location of a match, including both the start and end offset, in a
+/// single pass of the haystack.
+/// 3. Location of matching capturing groups.
+/// 4. Handling of multiple patterns, including (1)-(3) when multiple patterns
+/// are present.
+///
+/// # Capturing Groups
+///
+/// Groups refer to parenthesized expressions inside a regex pattern. They look
+/// like this, where `exp` is an arbitrary regex:
+///
+/// * `(exp)` - An unnamed capturing group.
+/// * `(?P<name>exp)` or `(?<name>exp)` - A named capturing group.
+/// * `(?:exp)` - A non-capturing group.
+/// * `(?i:exp)` - A non-capturing group that sets flags.
+///
+/// Only the first two forms are said to be _capturing_. Capturing
+/// means that the last position at which they match is reportable. The
+/// [`Captures`](crate::util::captures::Captures) type provides convenient
+/// access to the match positions of capturing groups, which includes looking
+/// up capturing groups by their name.
+///
+/// # Byte oriented
+///
+/// This NFA is byte oriented, which means that all of its transitions are
+/// defined on bytes. In other words, the alphabet of an NFA consists of the
+/// 256 different byte values.
+///
+/// While DFAs nearly demand that they be byte oriented for performance
+/// reasons, an NFA could conceivably be *Unicode codepoint* oriented. Indeed,
+/// a previous version of this NFA supported both byte and codepoint oriented
+/// modes. A codepoint oriented mode can work because an NFA fundamentally uses
+/// a sparse representation of transitions, which works well with the large
+/// sparse space of Unicode codepoints.
+///
+/// Nevertheless, this NFA is only byte oriented. This choice is primarily
+/// driven by implementation simplicity, and also in part by memory usage. In
+/// practice, performance between the two is roughly comparable. However,
+/// building a DFA (including a hybrid DFA) really wants a byte oriented NFA.
+/// So if we do have a codepoint oriented NFA, then we also need to generate a
+/// byte oriented NFA in order to build a hybrid NFA/DFA. Thus, by only
+/// generating byte oriented NFAs, we can produce one less NFA. In other words,
+/// if we made our NFA codepoint oriented, we'd need to *also* make it support
+/// a byte oriented mode, which is more complicated. But a byte oriented mode
+/// can support everything.
+///
+/// # Differences with DFAs
+///
+/// At the theoretical level, the precise difference between an NFA and a DFA
+/// is that, in a DFA, for every state, an input symbol unambiguously refers
+/// to a single transition _and_ that an input symbol is required for each
+/// transition. At a practical level, this permits DFAs to be implemented at
+/// their core with a small constant number of CPU instructions for each byte
+/// of input searched. In practice, this makes them quite a bit faster than
+/// NFAs _in general_. Namely, in order to execute a search for any Thompson
+/// NFA, one needs to keep track of a _set_ of states, and execute the
+/// possible transitions on all of those states for each input symbol.
+/// Overall, this results in much more overhead. To a first approximation, one
+/// can expect DFA searches to be about an order of magnitude faster.
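+///
+/// To make the "set of states" point concrete, here is a minimal sketch of
+/// processing one haystack byte. (This is a hedged illustration; the `step`
+/// helper is hypothetical and not part of this crate.) It chases the
+/// unconditional epsilon transitions and collects every byte transition that
+/// can consume the given byte:
+///
+/// ```
+/// use regex_automata::{
+///     nfa::thompson::{State, NFA},
+///     util::primitives::StateID,
+/// };
+///
+/// fn step(nfa: &NFA, sid: StateID, byte: u8, out: &mut Vec<StateID>) {
+///     match *nfa.state(sid) {
+///         // Real transitions: take them only if they cover `byte`.
+///         State::ByteRange { trans } => {
+///             if trans.start <= byte && byte <= trans.end {
+///                 out.push(trans.next);
+///             }
+///         }
+///         State::Sparse(ref sparse) => {
+///             for trans in sparse.transitions.iter() {
+///                 if trans.start <= byte && byte <= trans.end {
+///                     out.push(trans.next);
+///                 }
+///             }
+///         }
+///         // Epsilon transitions: follow every alternative.
+///         State::Union { ref alternates } => {
+///             for &alt in alternates.iter() {
+///                 step(nfa, alt, byte, out);
+///             }
+///         }
+///         State::BinaryUnion { alt1, alt2 } => {
+///             step(nfa, alt1, byte, out);
+///             step(nfa, alt2, byte, out);
+///         }
+///         State::Capture { next, .. } => step(nfa, next, byte, out),
+///         // Dense, Look, Match and Fail handling is elided here.
+///         _ => {}
+///     }
+/// }
+///
+/// let nfa = NFA::new("a|b")?;
+/// let mut out = vec![];
+/// step(&nfa, nfa.start_anchored(), b'a', &mut out);
+/// assert!(!out.is_empty());
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```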
+///
+/// So why use an NFA at all? The main advantage of an NFA is that it takes
+/// linear time (in the size of the pattern string after repetitions have
+/// been expanded) and linear memory to build. A DFA, on the other hand, may
+/// take exponential time and/or space to build. Even in non-pathological
+/// cases, DFAs often take quite a bit more memory than their NFA counterparts,
+/// _especially_ if large Unicode character classes are involved. Of course,
+/// an NFA also provides additional capabilities. For example, it can match
+/// Unicode word boundaries on non-ASCII text and resolve the positions of
+/// capturing groups.
+///
+/// Note that a [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) strikes a
+/// good balance between an NFA and a DFA. It avoids the exponential build time
+/// of a DFA while maintaining its fast search time. The downside of a hybrid
+/// NFA/DFA is that in some cases it can be slower at search time than the NFA.
+/// (It also has less functionality than a pure NFA. It cannot handle Unicode
+/// word boundaries on non-ASCII text and cannot resolve capturing groups.)
+///
+/// # Example
+///
+/// This shows how to build an NFA with the default configuration and execute a
+/// search using the Pike VM.
+///
+/// ```
+/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
+///
+/// let re = PikeVM::new(r"foo[0-9]+")?;
+/// let mut cache = re.create_cache();
+/// let mut caps = re.create_captures();
+///
+/// let expected = Some(Match::must(0, 0..8));
+/// re.captures(&mut cache, b"foo12345", &mut caps);
+/// assert_eq!(expected, caps.get_match());
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// # Example: resolving capturing groups
+///
+/// This example shows how to parse some simple dates and extract the
+/// components of each date via capturing groups.
+///
+/// ```
+/// # if cfg!(miri) { return Ok(()); } // miri takes too long
+/// use regex_automata::{
+///     nfa::thompson::pikevm::PikeVM,
+///     util::captures::Captures,
+/// };
+///
+/// let vm = PikeVM::new(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})")?;
+/// let mut cache = vm.create_cache();
+///
+/// let haystack = "2012-03-14, 2013-01-01 and 2014-07-05";
+/// let all: Vec<Captures> = vm.captures_iter(
+///     &mut cache, haystack.as_bytes()
+/// ).collect();
+/// // There should be a total of 3 matches.
+/// assert_eq!(3, all.len());
+/// // The year from the second match is '2013'.
+/// let span = all[1].get_group_by_name("y").unwrap();
+/// assert_eq!("2013", &haystack[span]);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// This example shows that only the last match of a capturing group is
+/// reported, even if it had to match multiple times for an overall match
+/// to occur.
+///
+/// ```
+/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span};
+///
+/// let re = PikeVM::new(r"([a-z]){4}")?;
+/// let mut cache = re.create_cache();
+/// let mut caps = re.create_captures();
+///
+/// let haystack = b"quux";
+/// re.captures(&mut cache, haystack, &mut caps);
+/// assert!(caps.is_match());
+/// assert_eq!(Some(Span::from(3..4)), caps.get_group(1));
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone)]
+pub struct NFA(
+    // We make NFAs reference counted primarily for two reasons. First is that
+    // the NFA type itself is quite large (at least 0.5KB), and so it makes
+    // sense to put it on the heap by default anyway. Second is that, for Arc
+    // specifically, this enables cheap clones.
This tends to be useful because
+    // several structures (the backtracker, the Pike VM, the hybrid NFA/DFA)
+    // all want to hang on to an NFA for use during search time. We could
+    // provide the NFA at search time via a function argument, but this makes
+    // for an unnecessarily annoying API. Instead, we just let each structure
+    // share ownership of the NFA. Using a deep clone would not be smart, since
+    // the NFA can use quite a bit of heap space.
+    Arc<Inner>,
+);
+
+impl NFA {
+    /// Parse the given regular expression using a default configuration and
+    /// build an NFA from it.
+    ///
+    /// If you want a non-default configuration, then use the NFA
+    /// [`Compiler`] with a [`Config`].
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
+    ///
+    /// let re = PikeVM::new(r"foo[0-9]+")?;
+    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+    ///
+    /// let expected = Some(Match::must(0, 0..8));
+    /// re.captures(&mut cache, b"foo12345", &mut caps);
+    /// assert_eq!(expected, caps.get_match());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[cfg(feature = "syntax")]
+    pub fn new(pattern: &str) -> Result<NFA, BuildError> {
+        NFA::compiler().build(pattern)
+    }
+
+    /// Parse the given regular expressions using a default configuration and
+    /// build a multi-NFA from them.
+    ///
+    /// If you want a non-default configuration, then use the NFA
+    /// [`Compiler`] with a [`Config`].
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
+    ///
+    /// let re = PikeVM::new_many(&["[0-9]+", "[a-z]+"])?;
+    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+    ///
+    /// let expected = Some(Match::must(1, 0..3));
+    /// re.captures(&mut cache, b"foo12345bar", &mut caps);
+    /// assert_eq!(expected, caps.get_match());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[cfg(feature = "syntax")]
+    pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<NFA, BuildError> {
+        NFA::compiler().build_many(patterns)
+    }
+
+    /// Returns an NFA with a single regex pattern that always matches at every
+    /// position.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
+    ///
+    /// let re = PikeVM::new_from_nfa(NFA::always_match())?;
+    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+    ///
+    /// let expected = Some(Match::must(0, 0..0));
+    /// re.captures(&mut cache, b"", &mut caps);
+    /// assert_eq!(expected, caps.get_match());
+    /// re.captures(&mut cache, b"foo", &mut caps);
+    /// assert_eq!(expected, caps.get_match());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn always_match() -> NFA {
+        // We could use NFA::new("") here and we'd get the same semantics, but
+        // hand-assembling the NFA (as below) does the same thing with fewer
+        // states. It also avoids needing the 'syntax' feature enabled.
+        //
+        // Technically all we need is the "match" state, but we add the
+        // "capture" states so that the PikeVM can use this NFA.
+        //
+        // The unwraps below are OK because we add so few states that they will
+        // never exhaust any default limits in any environment.
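+        //
+        // Schematically, the states assembled below form the chain
+        // CaptureStart(group 0) -> CaptureEnd(group 0) -> Match, all for the
+        // single pattern with ID 0.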
+        let mut builder = Builder::new();
+        let pid = builder.start_pattern().unwrap();
+        assert_eq!(pid.as_usize(), 0);
+        let start_id =
+            builder.add_capture_start(StateID::ZERO, 0, None).unwrap();
+        let end_id = builder.add_capture_end(StateID::ZERO, 0).unwrap();
+        let match_id = builder.add_match().unwrap();
+        builder.patch(start_id, end_id).unwrap();
+        builder.patch(end_id, match_id).unwrap();
+        let pid = builder.finish_pattern(start_id).unwrap();
+        assert_eq!(pid.as_usize(), 0);
+        builder.build(start_id, start_id).unwrap()
+    }
+
+    /// Returns an NFA that never matches at any position.
+    ///
+    /// This is a convenience routine for creating an NFA with zero patterns.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex_automata::nfa::thompson::{NFA, pikevm::PikeVM};
+    ///
+    /// let re = PikeVM::new_from_nfa(NFA::never_match())?;
+    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+    ///
+    /// re.captures(&mut cache, b"", &mut caps);
+    /// assert!(!caps.is_match());
+    /// re.captures(&mut cache, b"foo", &mut caps);
+    /// assert!(!caps.is_match());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn never_match() -> NFA {
+        // This always succeeds because it only requires one NFA state, which
+        // will never exhaust any (default) limits.
+        let mut builder = Builder::new();
+        let sid = builder.add_fail().unwrap();
+        builder.build(sid, sid).unwrap()
+    }
+
+    /// Return a default configuration for an `NFA`.
+    ///
+    /// This is a convenience routine to avoid needing to import the `Config`
+    /// type when customizing the construction of an NFA.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to build an NFA with a small size limit that
+    /// results in a compilation error for any regex that tries to use more
+    /// heap memory than the configured limit.
+    ///
+    /// ```
+    /// use regex_automata::nfa::thompson::{NFA, pikevm::PikeVM};
+    ///
+    /// let result = PikeVM::builder()
+    ///     .thompson(NFA::config().nfa_size_limit(Some(1_000)))
+    ///     // Remember, \w is Unicode-aware by default and thus huge.
+    ///     .build(r"\w+");
+    /// assert!(result.is_err());
+    /// ```
+    #[cfg(feature = "syntax")]
+    pub fn config() -> Config {
+        Config::new()
+    }
+
+    /// Return a compiler for configuring the construction of an `NFA`.
+    ///
+    /// This is a convenience routine to avoid needing to import the
+    /// [`Compiler`] type in common cases.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to build an NFA that is permitted to match
+    /// invalid UTF-8. Without the additional syntax configuration here,
+    /// compilation of `(?-u:.)` would fail because it is permitted to match
+    /// invalid UTF-8.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     nfa::thompson::pikevm::PikeVM,
+    ///     util::syntax,
+    ///     Match,
+    /// };
+    ///
+    /// let re = PikeVM::builder()
+    ///     .syntax(syntax::Config::new().utf8(false))
+    ///     .build(r"[a-z]+(?-u:.)")?;
+    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+    ///
+    /// let expected = Some(Match::must(0, 1..5));
+    /// re.captures(&mut cache, b"\xFFabc\xFF", &mut caps);
+    /// assert_eq!(expected, caps.get_match());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[cfg(feature = "syntax")]
+    pub fn compiler() -> Compiler {
+        Compiler::new()
+    }
+
+    /// Returns an iterator over all pattern identifiers in this NFA.
+    ///
+    /// Pattern IDs are allocated in sequential order starting from zero,
+    /// where the order corresponds to the order of patterns provided to the
+    /// [`NFA::new_many`] constructor.
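+    ///
+    /// For illustration, this iterator pairs naturally with
+    /// [`NFA::start_pattern`]: every pattern ID yielded here has a
+    /// corresponding anchored starting state.
+    ///
+    /// ```
+    /// use regex_automata::nfa::thompson::NFA;
+    ///
+    /// let nfa = NFA::new_many(&["[0-9]+", "[a-z]+"])?;
+    /// for pid in nfa.patterns() {
+    ///     // Every yielded pattern ID has an anchored starting state.
+    ///     assert!(nfa.start_pattern(pid).is_some());
+    /// }
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```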
+ /// + /// # Example + /// + /// ``` + /// use regex_automata::{nfa::thompson::NFA, PatternID}; + /// + /// let nfa = NFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?; + /// let pids: Vec<PatternID> = nfa.patterns().collect(); + /// assert_eq!(pids, vec![ + /// PatternID::must(0), + /// PatternID::must(1), + /// PatternID::must(2), + /// ]); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn patterns(&self) -> PatternIter<'_> { + PatternIter { + it: PatternID::iter(self.pattern_len()), + _marker: core::marker::PhantomData, + } + } + + /// Returns the total number of regex patterns in this NFA. + /// + /// This may return zero if the NFA was constructed with no patterns. In + /// this case, the NFA can never produce a match for any input. + /// + /// This is guaranteed to be no bigger than [`PatternID::LIMIT`] because + /// NFA construction will fail if too many patterns are added. + /// + /// It is always true that `nfa.patterns().count() == nfa.pattern_len()`. + /// + /// # Example + /// + /// ``` + /// use regex_automata::nfa::thompson::NFA; + /// + /// let nfa = NFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?; + /// assert_eq!(3, nfa.pattern_len()); + /// + /// let nfa = NFA::never_match(); + /// assert_eq!(0, nfa.pattern_len()); + /// + /// let nfa = NFA::always_match(); + /// assert_eq!(1, nfa.pattern_len()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn pattern_len(&self) -> usize { + self.0.start_pattern.len() + } + + /// Return the state identifier of the initial anchored state of this NFA. + /// + /// The returned identifier is guaranteed to be a valid index into the + /// slice returned by [`NFA::states`], and is also a valid argument to + /// [`NFA::state`]. + /// + /// # Example + /// + /// This example shows a somewhat contrived example where we can easily + /// predict the anchored starting state. + /// + /// ``` + /// use regex_automata::nfa::thompson::{NFA, State, WhichCaptures}; + /// + /// let nfa = NFA::compiler() + /// .configure(NFA::config().which_captures(WhichCaptures::None)) + /// .build("a")?; + /// let state = nfa.state(nfa.start_anchored()); + /// match *state { + /// State::ByteRange { trans } => { + /// assert_eq!(b'a', trans.start); + /// assert_eq!(b'a', trans.end); + /// } + /// _ => unreachable!("unexpected state"), + /// } + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn start_anchored(&self) -> StateID { + self.0.start_anchored + } + + /// Return the state identifier of the initial unanchored state of this + /// NFA. + /// + /// This is equivalent to the identifier returned by + /// [`NFA::start_anchored`] when the NFA has no unanchored starting state. + /// + /// The returned identifier is guaranteed to be a valid index into the + /// slice returned by [`NFA::states`], and is also a valid argument to + /// [`NFA::state`]. + /// + /// # Example + /// + /// This example shows that the anchored and unanchored starting states + /// are equivalent when an anchored NFA is built. + /// + /// ``` + /// use regex_automata::nfa::thompson::NFA; + /// + /// let nfa = NFA::new("^a")?; + /// assert_eq!(nfa.start_anchored(), nfa.start_unanchored()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn start_unanchored(&self) -> StateID { + self.0.start_unanchored + } + + /// Return the state identifier of the initial anchored state for the given + /// pattern, or `None` if there is no pattern corresponding to the given + /// identifier. 
+    ///
+    /// If one uses the starting state for a particular pattern, then the only
+    /// match that can be returned is for the corresponding pattern.
+    ///
+    /// The returned identifier is guaranteed to be a valid index into the
+    /// slice returned by [`NFA::states`], and is also a valid argument to
+    /// [`NFA::state`].
+    ///
+    /// If the pattern doesn't exist in this NFA, then this returns `None`.
+    /// This occurs when `pid.as_usize() >= nfa.pattern_len()`.
+    ///
+    /// # Example
+    ///
+    /// This example shows that the starting states for each pattern are
+    /// distinct from the NFA-wide starting states, even when every pattern
+    /// is anchored.
+    ///
+    /// ```
+    /// use regex_automata::{nfa::thompson::NFA, PatternID};
+    ///
+    /// let nfa = NFA::new_many(&["^a", "^b"])?;
+    /// // The anchored and unanchored states for the entire NFA are the same,
+    /// // since all of the patterns are anchored.
+    /// assert_eq!(nfa.start_anchored(), nfa.start_unanchored());
+    /// // But the anchored starting states for each pattern are distinct,
+    /// // because these starting states can only lead to matches for the
+    /// // corresponding pattern.
+    /// let anchored = Some(nfa.start_anchored());
+    /// assert_ne!(anchored, nfa.start_pattern(PatternID::must(0)));
+    /// assert_ne!(anchored, nfa.start_pattern(PatternID::must(1)));
+    /// // Requesting a pattern not in the NFA will result in None:
+    /// assert_eq!(None, nfa.start_pattern(PatternID::must(2)));
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    pub fn start_pattern(&self, pid: PatternID) -> Option<StateID> {
+        self.0.start_pattern.get(pid.as_usize()).copied()
+    }
+
+    /// Get the byte class set for this NFA.
+    ///
+    /// A byte class set is a partitioning of this NFA's alphabet into
+    /// equivalence classes. Any two bytes in the same equivalence class are
+    /// guaranteed to never discriminate between a match and a non-match. (The
+    /// partitioning may not be minimal.)
+    ///
+    /// Byte classes are used internally by this crate when building DFAs.
+    /// Namely, among other optimizations, they enable a space optimization
+    /// where the DFA's internal alphabet is defined over the equivalence
+    /// classes of bytes instead of all possible byte values. The former is
+    /// often quite a bit smaller than the latter, which permits the DFA to use
+    /// less space for its transition table.
+    #[inline]
+    pub(crate) fn byte_class_set(&self) -> &ByteClassSet {
+        &self.0.byte_class_set
+    }
+
+    /// Get the byte classes for this NFA.
+    ///
+    /// Byte classes represent a partitioning of this NFA's alphabet into
+    /// equivalence classes. Any two bytes in the same equivalence class are
+    /// guaranteed to never discriminate between a match and a non-match. (The
+    /// partitioning may not be minimal.)
+    ///
+    /// Byte classes are used internally by this crate when building DFAs.
+    /// Namely, among other optimizations, they enable a space optimization
+    /// where the DFA's internal alphabet is defined over the equivalence
+    /// classes of bytes instead of all possible byte values. The former is
+    /// often quite a bit smaller than the latter, which permits the DFA to use
+    /// less space for its transition table.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to query the class of various bytes.
+    ///
+    /// ```
+    /// use regex_automata::nfa::thompson::NFA;
+    ///
+    /// let nfa = NFA::new("[a-z]+")?;
+    /// let classes = nfa.byte_classes();
+    /// // 'a' and 'z' are in the same class for this regex.
+    /// assert_eq!(classes.get(b'a'), classes.get(b'z'));
+    /// // But 'a' and 'A' are not.
+    /// assert_ne!(classes.get(b'a'), classes.get(b'A'));
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    pub fn byte_classes(&self) -> &ByteClasses {
+        &self.0.byte_classes
+    }
+
+    /// Return a reference to the NFA state corresponding to the given ID.
+    ///
+    /// This is a convenience routine for `nfa.states()[id]`.
+    ///
+    /// # Panics
+    ///
+    /// This panics when the given identifier does not reference a valid state.
+    /// That is, when `id.as_usize() >= nfa.states().len()`.
+    ///
+    /// # Example
+    ///
+    /// The anchored state for a pattern will typically correspond to a
+    /// capturing state for that pattern. (Although this is not an API
+    /// guarantee!)
+    ///
+    /// ```
+    /// use regex_automata::{nfa::thompson::{NFA, State}, PatternID};
+    ///
+    /// let nfa = NFA::new("a")?;
+    /// let state = nfa.state(nfa.start_pattern(PatternID::ZERO).unwrap());
+    /// match *state {
+    ///     State::Capture { slot, .. } => {
+    ///         assert_eq!(0, slot.as_usize());
+    ///     }
+    ///     _ => unreachable!("unexpected state"),
+    /// }
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    pub fn state(&self, id: StateID) -> &State {
+        &self.states()[id]
+    }
+
+    /// Returns a slice of all states in this NFA.
+    ///
+    /// The slice returned is indexed by `StateID`. This provides a convenient
+    /// way to access states while following transitions among those states.
+    ///
+    /// # Example
+    ///
+    /// This demonstrates that disabling Unicode mode can shrink the size of
+    /// the NFA considerably in some cases, especially when using Unicode
+    /// character classes.
+    ///
+    /// ```
+    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+    /// use regex_automata::nfa::thompson::NFA;
+    ///
+    /// let nfa_unicode = NFA::new(r"\w")?;
+    /// let nfa_ascii = NFA::new(r"(?-u)\w")?;
+    /// // Yes, a factor of 45 difference. No lie.
+    /// assert!(40 * nfa_ascii.states().len() < nfa_unicode.states().len());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    pub fn states(&self) -> &[State] {
+        &self.0.states
+    }
+
+    /// Returns the capturing group info for this NFA.
+    ///
+    /// The [`GroupInfo`] provides a way to map to and from capture index
+    /// and capture name for each pattern. It also provides a mapping from
+    /// each of the capturing groups in every pattern to their corresponding
+    /// slot offsets encoded in [`State::Capture`] states.
+    ///
+    /// Note that `GroupInfo` uses reference counting internally, such that
+    /// cloning a `GroupInfo` is very cheap.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to get a list of all capture group names for
+    /// a particular pattern.
+    ///
+    /// ```
+    /// use regex_automata::{nfa::thompson::NFA, PatternID};
+    ///
+    /// let nfa = NFA::new(r"(a)(?P<foo>b)(c)(d)(?P<bar>e)")?;
+    /// // The first is the implicit group that is always unnamed. The next
+    /// // 5 groups are the explicit groups found in the concrete syntax above.
+    /// let expected = vec![None, None, Some("foo"), None, None, Some("bar")];
+    /// let got: Vec<Option<&str>> =
+    ///     nfa.group_info().pattern_names(PatternID::ZERO).collect();
+    /// assert_eq!(expected, got);
+    ///
+    /// // Using an invalid pattern ID will result in nothing yielded.
+    /// let got = nfa.group_info().pattern_names(PatternID::must(999)).count();
+    /// assert_eq!(0, got);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    pub fn group_info(&self) -> &GroupInfo {
+        self.0.group_info()
+    }
+
+    /// Returns true if and only if this NFA has at least one
+    /// [`Capture`](State::Capture) in its sequence of states.
+    ///
+    /// This is useful as a way to perform a quick test before attempting
+    /// something that does or does not require capture states. For example,
+    /// some regex engines (like the PikeVM) require capture states in order to
+    /// work at all.
+    ///
+    /// # Example
+    ///
+    /// This example shows a few different NFAs and whether they have captures
+    /// or not.
+    ///
+    /// ```
+    /// use regex_automata::nfa::thompson::{NFA, WhichCaptures};
+    ///
+    /// // Obviously has capture states.
+    /// let nfa = NFA::new("(a)")?;
+    /// assert!(nfa.has_capture());
+    ///
+    /// // Less obviously has capture states, because every pattern has at
+    /// // least one anonymous capture group corresponding to the match for the
+    /// // entire pattern.
+    /// let nfa = NFA::new("a")?;
+    /// assert!(nfa.has_capture());
+    ///
+    /// // Other than hand building your own NFA, this is the only way to build
+    /// // an NFA without capturing groups. In general, you should only do this
+    /// // if you don't intend to use any of the NFA-oriented regex engines.
+    /// // Overall, capturing groups don't have many downsides, although they
+    /// // can add a bit of noise to simple NFAs, so it can be nice to disable
+    /// // them for debugging purposes.
+    /// //
+    /// // Notice that 'has_capture' is false here even when we have an
+    /// // explicit capture group in the pattern.
+    /// let nfa = NFA::compiler()
+    ///     .configure(NFA::config().which_captures(WhichCaptures::None))
+    ///     .build("(a)")?;
+    /// assert!(!nfa.has_capture());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    pub fn has_capture(&self) -> bool {
+        self.0.has_capture
+    }
+
+    /// Returns true if and only if this NFA can match the empty string.
+    /// When it returns false, all possible matches are guaranteed to have a
+    /// non-zero length.
+    ///
+    /// This is useful as a cheap way to know whether code needs to handle the
+    /// case of a zero length match. This is particularly important when UTF-8
+    /// mode is enabled, since then empty matches that split a codepoint must
+    /// never be reported. This extra handling can sometimes be costly, and
+    /// since regexes matching an empty string are somewhat rare, it can be
+    /// beneficial to treat such regexes specially.
+    ///
+    /// # Example
+    ///
+    /// This example shows a few different NFAs and whether they match the
+    /// empty string or not. Notice the empty string isn't merely a matter
+    /// of a string of length literally `0`, but rather, whether a match can
+    /// occur between specific pairs of bytes.
+    ///
+    /// ```
+    /// use regex_automata::{nfa::thompson::NFA, util::syntax};
+    ///
+    /// // The empty regex matches the empty string.
+    /// let nfa = NFA::new("")?;
+    /// assert!(nfa.has_empty(), "empty matches empty");
+    /// // The '+' repetition operator requires at least one match, and so
+    /// // does not match the empty string.
+    /// let nfa = NFA::new("a+")?;
+    /// assert!(!nfa.has_empty(), "+ does not match empty");
+    /// // But the '*' repetition operator does.
+ /// let nfa = NFA::new("a*")?; + /// assert!(nfa.has_empty(), "* does match empty"); + /// // And wrapping '+' in an operator that can match an empty string also + /// // causes it to match the empty string too. + /// let nfa = NFA::new("(a+)*")?; + /// assert!(nfa.has_empty(), "+ inside of * matches empty"); + /// + /// // If a regex is just made of a look-around assertion, even if the + /// // assertion requires some kind of non-empty string around it (such as + /// // \b), then it is still treated as if it matches the empty string. + /// // Namely, if a match occurs of just a look-around assertion, then the + /// // match returned is empty. + /// let nfa = NFA::compiler() + /// .syntax(syntax::Config::new().utf8(false)) + /// .build(r"^$\A\z\b\B(?-u:\b\B)")?; + /// assert!(nfa.has_empty(), "assertions match empty"); + /// // Even when an assertion is wrapped in a '+', it still matches the + /// // empty string. + /// let nfa = NFA::new(r"\b+")?; + /// assert!(nfa.has_empty(), "+ of an assertion matches empty"); + /// + /// // An alternation with even one branch that can match the empty string + /// // is also said to match the empty string overall. + /// let nfa = NFA::new("foo|(bar)?|quux")?; + /// assert!(nfa.has_empty(), "alternations can match empty"); + /// + /// // An NFA that matches nothing does not match the empty string. + /// let nfa = NFA::new("[a&&b]")?; + /// assert!(!nfa.has_empty(), "never matching means not matching empty"); + /// // But if it's wrapped in something that doesn't require a match at + /// // all, then it can match the empty string! + /// let nfa = NFA::new("[a&&b]*")?; + /// assert!(nfa.has_empty(), "* on never-match still matches empty"); + /// // Since a '+' requires a match, using it on something that can never + /// // match will itself produce a regex that can never match anything, + /// // and thus does not match the empty string. + /// let nfa = NFA::new("[a&&b]+")?; + /// assert!(!nfa.has_empty(), "+ on never-match still matches nothing"); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn has_empty(&self) -> bool { + self.0.has_empty + } + + /// Whether UTF-8 mode is enabled for this NFA or not. + /// + /// When UTF-8 mode is enabled, all matches reported by a regex engine + /// derived from this NFA are guaranteed to correspond to spans of valid + /// UTF-8. This includes zero-width matches. For example, the regex engine + /// must guarantee that the empty regex will not match at the positions + /// between code units in the UTF-8 encoding of a single codepoint. + /// + /// See [`Config::utf8`] for more information. + /// + /// This is enabled by default. + /// + /// # Example + /// + /// This example shows how UTF-8 mode can impact the match spans that may + /// be reported in certain cases. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::{self, pikevm::PikeVM}, + /// Match, Input, + /// }; + /// + /// let re = PikeVM::new("")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// // UTF-8 mode is enabled by default. + /// let mut input = Input::new("☃"); + /// re.search(&mut cache, &input, &mut caps); + /// assert_eq!(Some(Match::must(0, 0..0)), caps.get_match()); + /// + /// // Even though an empty regex matches at 1..1, our next match is + /// // 3..3 because 1..1 and 2..2 split the snowman codepoint (which is + /// // three bytes long). 
+    /// input.set_start(1);
+    /// re.search(&mut cache, &input, &mut caps);
+    /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
+    ///
+    /// // But if we disable UTF-8, then we'll get matches at 1..1 and 2..2:
+    /// let re = PikeVM::builder()
+    ///     .thompson(thompson::Config::new().utf8(false))
+    ///     .build("")?;
+    /// re.search(&mut cache, &input, &mut caps);
+    /// assert_eq!(Some(Match::must(0, 1..1)), caps.get_match());
+    ///
+    /// input.set_start(2);
+    /// re.search(&mut cache, &input, &mut caps);
+    /// assert_eq!(Some(Match::must(0, 2..2)), caps.get_match());
+    ///
+    /// input.set_start(3);
+    /// re.search(&mut cache, &input, &mut caps);
+    /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
+    ///
+    /// input.set_start(4);
+    /// re.search(&mut cache, &input, &mut caps);
+    /// assert_eq!(None, caps.get_match());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    pub fn is_utf8(&self) -> bool {
+        self.0.utf8
+    }
+
+    /// Returns true when this NFA is meant to be matched in reverse.
+    ///
+    /// Generally speaking, when this is true, it means the NFA is supposed to
+    /// be used in conjunction with moving backwards through the haystack. That
+    /// is, from a higher memory address to a lower memory address.
+    ///
+    /// It is often the case that lower level routines dealing with an NFA
+    /// don't need to care about whether it is "meant" to be matched in reverse
+    /// or not. However, there are some specific cases where it matters. For
+    /// example, the implementation of CRLF-aware `^` and `$` line anchors
+    /// needs to know whether the search is in the forward or reverse
+    /// direction. In the forward direction, neither `^` nor `$` should match
+    /// when a `\r` has been seen previously and a `\n` is next. However, in
+    /// the reverse direction, neither `^` nor `$` should match when a `\n`
+    /// has been seen previously and a `\r` is next. This fundamentally changes
+    /// how the state machine is constructed, and thus needs to be altered
+    /// based on the direction of the search.
+    ///
+    /// This is automatically set when using a [`Compiler`] with a configuration
+    /// where [`Config::reverse`] is enabled. If you're building your own NFA
+    /// by hand via a [`Builder`], then it is up to you to set it via
+    /// [`Builder::set_reverse`].
+    #[inline]
+    pub fn is_reverse(&self) -> bool {
+        self.0.reverse
+    }
+
+    /// Returns true if and only if all starting states for this NFA correspond
+    /// to the beginning of an anchored search.
+    ///
+    /// Typically, an NFA will have both an anchored and an unanchored starting
+    /// state. Namely, it tends to be useful to have both, and the cost of
+    /// having an unanchored starting state is almost zero (for an NFA).
+    /// However, if all patterns in the NFA are themselves anchored, then even
+    /// the unanchored starting state will correspond to an anchored search
+    /// since the pattern doesn't permit anything else.
+    ///
+    /// # Example
+    ///
+    /// This example shows a few different scenarios where this method's
+    /// return value varies.
+    ///
+    /// ```
+    /// use regex_automata::nfa::thompson::NFA;
+    ///
+    /// // The unanchored starting state permits matching this pattern anywhere
+    /// // in a haystack, instead of just at the beginning.
+    /// let nfa = NFA::new("a")?;
+    /// assert!(!nfa.is_always_start_anchored());
+    ///
+    /// // In this case, the pattern is itself anchored, so there is no way
+    /// // to run an unanchored search.
+    /// let nfa = NFA::new("^a")?;
+    /// assert!(nfa.is_always_start_anchored());
+    ///
+    /// // When multiline mode is enabled, '^' can match at the start of a line
+    /// // in addition to the start of a haystack, so an unanchored search is
+    /// // actually possible.
+    /// let nfa = NFA::new("(?m)^a")?;
+    /// assert!(!nfa.is_always_start_anchored());
+    ///
+    /// // Weird cases also work. A pattern is only considered anchored if all
+    /// // matches may only occur at the start of a haystack.
+    /// let nfa = NFA::new("(^a)|a")?;
+    /// assert!(!nfa.is_always_start_anchored());
+    ///
+    /// // When multiple patterns are present, if they are all anchored, then
+    /// // the NFA is always anchored too.
+    /// let nfa = NFA::new_many(&["^a", "^b", "^c"])?;
+    /// assert!(nfa.is_always_start_anchored());
+    ///
+    /// // But if one pattern is unanchored, then the NFA must permit an
+    /// // unanchored search.
+    /// let nfa = NFA::new_many(&["^a", "b", "^c"])?;
+    /// assert!(!nfa.is_always_start_anchored());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    pub fn is_always_start_anchored(&self) -> bool {
+        self.start_anchored() == self.start_unanchored()
+    }
+
+    /// Returns the look-around matcher associated with this NFA.
+    ///
+    /// A look-around matcher determines how to match look-around assertions.
+    /// In particular, some assertions are configurable. For example, the
+    /// `(?m:^)` and `(?m:$)` assertions can have their line terminator changed
+    /// from the default of `\n` to any other byte.
+    ///
+    /// If the NFA was built using a [`Compiler`], then this matcher
+    /// can be set via the [`Config::look_matcher`] configuration
+    /// knob. Otherwise, if you've built an NFA by hand, it is set via
+    /// [`Builder::set_look_matcher`].
+    ///
+    /// # Example
+    ///
+    /// This shows how to change the line terminator for multi-line assertions.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     nfa::thompson::{self, pikevm::PikeVM},
+    ///     util::look::LookMatcher,
+    ///     Match, Input,
+    /// };
+    ///
+    /// let mut lookm = LookMatcher::new();
+    /// lookm.set_line_terminator(b'\x00');
+    ///
+    /// let re = PikeVM::builder()
+    ///     .thompson(thompson::Config::new().look_matcher(lookm))
+    ///     .build(r"(?m)^[a-z]+$")?;
+    /// let mut cache = re.create_cache();
+    ///
+    /// // Multi-line assertions now use NUL as a terminator.
+    /// assert_eq!(
+    ///     Some(Match::must(0, 1..4)),
+    ///     re.find(&mut cache, b"\x00abc\x00"),
+    /// );
+    /// // ... and \n is no longer recognized as a terminator.
+    /// assert_eq!(
+    ///     None,
+    ///     re.find(&mut cache, b"\nabc\n"),
+    /// );
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    pub fn look_matcher(&self) -> &LookMatcher {
+        &self.0.look_matcher
+    }
+
+    /// Returns the union of all look-around assertions used throughout this
+    /// NFA. When the returned set is empty, it implies that the NFA has no
+    /// look-around assertions and thus zero conditional epsilon transitions.
+    ///
+    /// This is useful in some cases for enabling optimizations. It is not
+    /// unusual, for example, for optimizations to be of the form, "for any
+    /// regex with zero conditional epsilon transitions, do ..." where "..."
+    /// is some kind of optimization.
+    ///
+    /// This isn't only helpful for optimizations either. Sometimes look-around
+    /// assertions are difficult to support. For example, many of the DFAs in
+    /// this crate don't support Unicode word boundaries or handle them using
+    /// heuristics.
Handling that correctly typically requires some kind of + /// cheap check of whether the NFA has a Unicode word boundary in the first + /// place. + /// + /// # Example + /// + /// This example shows how this routine varies based on the regex pattern: + /// + /// ``` + /// use regex_automata::{nfa::thompson::NFA, util::look::Look}; + /// + /// // No look-around at all. + /// let nfa = NFA::new("a")?; + /// assert!(nfa.look_set_any().is_empty()); + /// + /// // When multiple patterns are present, since this returns the union, + /// // it will include look-around assertions that only appear in one + /// // pattern. + /// let nfa = NFA::new_many(&["a", "b", "a^b", "c"])?; + /// assert!(nfa.look_set_any().contains(Look::Start)); + /// + /// // Some groups of assertions have various shortcuts. For example: + /// let nfa = NFA::new(r"(?-u:\b)")?; + /// assert!(nfa.look_set_any().contains_word()); + /// assert!(!nfa.look_set_any().contains_word_unicode()); + /// assert!(nfa.look_set_any().contains_word_ascii()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn look_set_any(&self) -> LookSet { + self.0.look_set_any + } + + /// Returns the union of all prefix look-around assertions for every + /// pattern in this NFA. When the returned set is empty, it implies none of + /// the patterns require moving through a conditional epsilon transition + /// before inspecting the first byte in the haystack. + /// + /// This can be useful for determining what kinds of assertions need to be + /// satisfied at the beginning of a search. For example, typically DFAs + /// in this crate will build a distinct starting state for each possible + /// starting configuration that might result in look-around assertions + /// being satisfied differently. However, if the set returned here is + /// empty, then you know that the start state is invariant because there + /// are no conditional epsilon transitions to consider. + /// + /// # Example + /// + /// This example shows how this routine varies based on the regex pattern: + /// + /// ``` + /// use regex_automata::{nfa::thompson::NFA, util::look::Look}; + /// + /// // No look-around at all. + /// let nfa = NFA::new("a")?; + /// assert!(nfa.look_set_prefix_any().is_empty()); + /// + /// // When multiple patterns are present, since this returns the union, + /// // it will include look-around assertions that only appear in one + /// // pattern. But it will only include assertions that are in the prefix + /// // of a pattern. For example, this includes '^' but not '$' even though + /// // '$' does appear. + /// let nfa = NFA::new_many(&["a", "b", "^ab$", "c"])?; + /// assert!(nfa.look_set_prefix_any().contains(Look::Start)); + /// assert!(!nfa.look_set_prefix_any().contains(Look::End)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn look_set_prefix_any(&self) -> LookSet { + self.0.look_set_prefix_any + } + + // FIXME: The `look_set_prefix_all` computation was not correct, and it + // seemed a little tricky to fix it. Since I wasn't actually using it for + // anything, I just decided to remove it in the run up to the regex 1.9 + // release. If you need this, please file an issue. + /* + /// Returns the intersection of all prefix look-around assertions for every + /// pattern in this NFA. When the returned set is empty, it implies at + /// least one of the patterns does not require moving through a conditional + /// epsilon transition before inspecting the first byte in the haystack. 
+ /// Conversely, when the set contains an assertion, it implies that every + /// pattern in the NFA also contains that assertion in its prefix. + /// + /// This can be useful for determining what kinds of assertions need to be + /// satisfied at the beginning of a search. For example, if you know that + /// [`Look::Start`] is in the prefix intersection set returned here, then + /// you know that all searches, regardless of input configuration, will be + /// anchored. + /// + /// # Example + /// + /// This example shows how this routine varies based on the regex pattern: + /// + /// ``` + /// use regex_automata::{nfa::thompson::NFA, util::look::Look}; + /// + /// // No look-around at all. + /// let nfa = NFA::new("a")?; + /// assert!(nfa.look_set_prefix_all().is_empty()); + /// + /// // When multiple patterns are present, since this returns the + /// // intersection, it will only include assertions present in every + /// // prefix, and only the prefix. + /// let nfa = NFA::new_many(&["^a$", "^b$", "$^ab$", "^c$"])?; + /// assert!(nfa.look_set_prefix_all().contains(Look::Start)); + /// assert!(!nfa.look_set_prefix_all().contains(Look::End)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn look_set_prefix_all(&self) -> LookSet { + self.0.look_set_prefix_all + } + */ + + /// Returns the memory usage, in bytes, of this NFA. + /// + /// This does **not** include the stack size used up by this NFA. To + /// compute that, use `std::mem::size_of::<NFA>()`. + /// + /// # Example + /// + /// This example shows that large Unicode character classes can use quite + /// a bit of memory. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::nfa::thompson::NFA; + /// + /// let nfa_unicode = NFA::new(r"\w")?; + /// let nfa_ascii = NFA::new(r"(?-u:\w)")?; + /// + /// assert!(10 * nfa_ascii.memory_usage() < nfa_unicode.memory_usage()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn memory_usage(&self) -> usize { + use core::mem::size_of; + + size_of::<Inner>() // allocated on the heap via Arc + + self.0.states.len() * size_of::<State>() + + self.0.start_pattern.len() * size_of::<StateID>() + + self.0.group_info.memory_usage() + + self.0.memory_extra + } +} + +impl fmt::Debug for NFA { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.0.fmt(f) + } +} + +/// The "inner" part of the NFA. We split this part out so that we can easily +/// wrap it in an `Arc` above in the definition of `NFA`. +/// +/// See builder.rs for the code that actually builds this type. This module +/// does provide (internal) mutable methods for adding things to this +/// NFA before finalizing it, but the high level construction process is +/// controlled by the builder abstraction. (Which is complicated enough to +/// get its own module.) +#[derive(Default)] +pub(super) struct Inner { + /// The state sequence. This sequence is guaranteed to be indexable by all + /// starting state IDs, and it is also guaranteed to contain at most one + /// `Match` state for each pattern compiled into this NFA. (A pattern may + /// not have a corresponding `Match` state if a `Match` state is impossible + /// to reach.) + states: Vec<State>, + /// The anchored starting state of this NFA. + start_anchored: StateID, + /// The unanchored starting state of this NFA. + start_unanchored: StateID, + /// The starting states for each individual pattern. 
Starting at any
+    /// of these states will result in only an anchored search for the
+    /// corresponding pattern. The vec is indexed by pattern ID. When the NFA
+    /// contains a single regex, then `start_pattern[0]` and `start_anchored`
+    /// are always equivalent.
+    start_pattern: Vec<StateID>,
+    /// Info about the capturing groups in this NFA. This is responsible for
+    /// mapping groups to slots, mapping groups to names and names to groups.
+    group_info: GroupInfo,
+    /// A representation of equivalence classes over the transitions in this
+    /// NFA. Two bytes in the same equivalence class must not discriminate
+    /// between a match and a non-match. This map can be used to shrink the
+    /// total size of a DFA's transition table with a small match-time cost.
+    ///
+    /// Note that the NFA's transitions are *not* defined in terms of these
+    /// equivalence classes. The NFA's transitions are defined on the original
+    /// byte values. For the most part, this is because they wouldn't really
+    /// help the NFA much since the NFA already uses a sparse representation
+    /// to represent transitions. Byte classes are most effective in a dense
+    /// representation.
+    byte_class_set: ByteClassSet,
+    /// This is generated from `byte_class_set`, and essentially represents the
+    /// same thing but supports different access patterns. Namely, this permits
+    /// looking up the equivalence class of a byte very cheaply.
+    ///
+    /// Ideally we would just store this, but because of annoying code
+    /// structure reasons, we keep both this and `byte_class_set` around for
+    /// now. I think I would prefer that `byte_class_set` were computed in the
+    /// `Builder`, but right now, we compute it as states are added to the
+    /// `NFA`.
+    byte_classes: ByteClasses,
+    /// Whether this NFA has a `Capture` state anywhere.
+    has_capture: bool,
+    /// Whether the empty string is in the language matched by this NFA.
+    has_empty: bool,
+    /// Whether UTF-8 mode is enabled for this NFA. Briefly, this means that
+    /// all non-empty matches produced by this NFA correspond to spans of valid
+    /// UTF-8, and any empty matches produced by this NFA that split a UTF-8
+    /// encoded codepoint should be filtered out by the corresponding regex
+    /// engine.
+    utf8: bool,
+    /// Whether this NFA is meant to be matched in reverse or not.
+    reverse: bool,
+    /// The matcher to be used for look-around assertions.
+    look_matcher: LookMatcher,
+    /// The union of all look-around assertions that occur anywhere within
+    /// this NFA. If this set is empty, then it means there are precisely zero
+    /// conditional epsilon transitions in the NFA.
+    look_set_any: LookSet,
+    /// The union of all look-around assertions that occur as a zero-length
+    /// prefix for any of the patterns in this NFA.
+    look_set_prefix_any: LookSet,
+    /*
+    /// The intersection of all look-around assertions that occur as a
+    /// zero-length prefix for any of the patterns in this NFA.
+    look_set_prefix_all: LookSet,
+    */
+    /// Heap memory used indirectly by NFA states and other things (like the
+    /// various capturing group representations above). Since each state
+    /// might use a different amount of heap, we need to keep track of this
+    /// incrementally.
+    memory_extra: usize,
+}
+
+impl Inner {
+    /// Runs any last finalization bits and turns this into a full NFA.
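+    ///
+    /// Currently, finalization consists of computing the byte classes from
+    /// the accumulated `byte_class_set`, and then performing an epsilon
+    /// closure from each pattern's start state in order to compute
+    /// `has_empty` and `look_set_prefix_any`.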
+ pub(super) fn into_nfa(mut self) -> NFA { + self.byte_classes = self.byte_class_set.byte_classes(); + // Do epsilon closure from the start state of every pattern in order + // to compute various properties such as look-around assertions and + // whether the empty string can be matched. + let mut stack = vec![]; + let mut seen = SparseSet::new(self.states.len()); + for &start_id in self.start_pattern.iter() { + stack.push(start_id); + seen.clear(); + // let mut prefix_all = LookSet::full(); + let mut prefix_any = LookSet::empty(); + while let Some(sid) = stack.pop() { + if !seen.insert(sid) { + continue; + } + match self.states[sid] { + State::ByteRange { .. } + | State::Dense { .. } + | State::Fail => continue, + State::Sparse(_) => { + // This snippet below will rewrite this sparse state + // as a dense state. By doing it here, we apply this + // optimization to all hot "sparse" states since these + // are the states that are reachable from the start + // state via an epsilon closure. + // + // Unfortunately, this optimization did not seem to + // help much in some very limited ad hoc benchmarking. + // + // I left the 'Dense' state type in place in case we + // want to revisit this, but I suspect the real way + // to make forward progress is a more fundamental + // rearchitecting of how data in the NFA is laid out. + // I think we should consider a single contiguous + // allocation instead of all this indirection and + // potential heap allocations for every state. But this + // is a large re-design and will require API breaking + // changes. + // self.memory_extra -= self.states[sid].memory_usage(); + // let trans = DenseTransitions::from_sparse(sparse); + // self.states[sid] = State::Dense(trans); + // self.memory_extra += self.states[sid].memory_usage(); + continue; + } + State::Match { .. } => self.has_empty = true, + State::Look { look, next } => { + prefix_any = prefix_any.insert(look); + stack.push(next); + } + State::Union { ref alternates } => { + // Order doesn't matter here, since we're just dealing + // with look-around sets. But if we do richer analysis + // here that needs to care about preference order, then + // this should be done in reverse. + stack.extend(alternates.iter()); + } + State::BinaryUnion { alt1, alt2 } => { + stack.push(alt2); + stack.push(alt1); + } + State::Capture { next, .. } => { + stack.push(next); + } + } + } + self.look_set_prefix_any = + self.look_set_prefix_any.union(prefix_any); + } + NFA(Arc::new(self)) + } + + /// Returns the capturing group info for this NFA. + pub(super) fn group_info(&self) -> &GroupInfo { + &self.group_info + } + + /// Add the given state to this NFA after allocating a fresh identifier for + /// it. + /// + /// This panics if too many states are added such that a fresh identifier + /// could not be created. (Currently, the only caller of this routine is + /// a `Builder`, and it upholds this invariant.) + pub(super) fn add(&mut self, state: State) -> StateID { + match state { + State::ByteRange { ref trans } => { + self.byte_class_set.set_range(trans.start, trans.end); + } + State::Sparse(ref sparse) => { + for trans in sparse.transitions.iter() { + self.byte_class_set.set_range(trans.start, trans.end); + } + } + State::Dense { .. } => unreachable!(), + State::Look { look, .. } => { + self.look_matcher + .add_to_byteset(look, &mut self.byte_class_set); + self.look_set_any = self.look_set_any.insert(look); + } + State::Capture { .. } => { + self.has_capture = true; + } + State::Union { .. } + | State::BinaryUnion { .. 
}
+            | State::Fail
+            | State::Match { .. } => {}
+        }
+
+        let id = StateID::new(self.states.len()).unwrap();
+        self.memory_extra += state.memory_usage();
+        self.states.push(state);
+        id
+    }
+
+    /// Set the starting state identifiers for this NFA.
+    ///
+    /// `start_anchored` and `start_unanchored` may be equivalent. When they
+    /// are, then the NFA can only execute anchored searches. This might
+    /// occur, for example, for patterns that are unconditionally anchored,
+    /// e.g., `^foo`.
+    pub(super) fn set_starts(
+        &mut self,
+        start_anchored: StateID,
+        start_unanchored: StateID,
+        start_pattern: &[StateID],
+    ) {
+        self.start_anchored = start_anchored;
+        self.start_unanchored = start_unanchored;
+        self.start_pattern = start_pattern.to_vec();
+    }
+
+    /// Sets the UTF-8 mode of this NFA.
+    pub(super) fn set_utf8(&mut self, yes: bool) {
+        self.utf8 = yes;
+    }
+
+    /// Sets the reverse mode of this NFA.
+    pub(super) fn set_reverse(&mut self, yes: bool) {
+        self.reverse = yes;
+    }
+
+    /// Sets the look-around assertion matcher for this NFA.
+    pub(super) fn set_look_matcher(&mut self, m: LookMatcher) {
+        self.look_matcher = m;
+    }
+
+    /// Set the capturing groups for this NFA.
+    ///
+    /// The given slice should contain the capturing groups for each pattern.
+    /// The capturing groups in turn should correspond to the total number of
+    /// capturing groups in each pattern, including the anonymous first
+    /// capture group for each pattern. If a capturing group does have a
+    /// name, then it should be provided as an `Arc<str>`.
+    ///
+    /// This returns an error if a corresponding `GroupInfo` could not be
+    /// built.
+    pub(super) fn set_captures(
+        &mut self,
+        captures: &[Vec<Option<Arc<str>>>],
+    ) -> Result<(), GroupInfoError> {
+        self.group_info = GroupInfo::new(
+            captures.iter().map(|x| x.iter().map(|y| y.as_ref())),
+        )?;
+        Ok(())
+    }
+
+    /// Remap the transitions in every state of this NFA using the given map.
+    /// The given map should be indexed according to the state ID namespace
+    /// used by the transitions of the states currently in this NFA.
+    ///
+    /// This is particularly useful to the NFA builder, since it is convenient
+    /// to add NFA states in order to produce their final IDs. Then, after all
+    /// of the intermediate "empty" states (unconditional epsilon transitions)
+    /// have been removed from the builder's representation, we can re-map all
+    /// of the transitions in the states already added to their final IDs.
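+    ///
+    /// For example, if `old_to_new` maps state `5` to state `2`, then any
+    /// transition that previously pointed at state `5` will point at state
+    /// `2` after this call. The anchored, unanchored and per-pattern start
+    /// states are remapped in the same way.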
+ pub(super) fn remap(&mut self, old_to_new: &[StateID]) { + for state in &mut self.states { + state.remap(old_to_new); + } + self.start_anchored = old_to_new[self.start_anchored]; + self.start_unanchored = old_to_new[self.start_unanchored]; + for id in self.start_pattern.iter_mut() { + *id = old_to_new[*id]; + } + } +} + +impl fmt::Debug for Inner { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + writeln!(f, "thompson::NFA(")?; + for (sid, state) in self.states.iter().with_state_ids() { + let status = if sid == self.start_anchored { + '^' + } else if sid == self.start_unanchored { + '>' + } else { + ' ' + }; + writeln!(f, "{}{:06?}: {:?}", status, sid.as_usize(), state)?; + } + let pattern_len = self.start_pattern.len(); + if pattern_len > 1 { + writeln!(f, "")?; + for pid in 0..pattern_len { + let sid = self.start_pattern[pid]; + writeln!(f, "START({:06?}): {:?}", pid, sid.as_usize())?; + } + } + writeln!(f, "")?; + writeln!( + f, + "transition equivalence classes: {:?}", + self.byte_classes, + )?; + writeln!(f, ")")?; + Ok(()) + } +} + +/// A state in an NFA. +/// +/// In theory, it can help to conceptualize an `NFA` as a graph consisting of +/// `State`s. Each `State` contains its complete set of outgoing transitions. +/// +/// In practice, it can help to conceptualize an `NFA` as a sequence of +/// instructions for a virtual machine. Each `State` says what to do and where +/// to go next. +/// +/// Strictly speaking, the practical interpretation is the most correct one, +/// because of the [`Capture`](State::Capture) state. Namely, a `Capture` +/// state always forwards execution to another state unconditionally. Its only +/// purpose is to cause a side effect: the recording of the current input +/// position at a particular location in memory. In this sense, an `NFA` +/// has more power than a theoretical non-deterministic finite automaton. +/// +/// For most uses of this crate, it is likely that one may never even need to +/// be aware of this type at all. The main use cases for looking at `State`s +/// directly are if you need to write your own search implementation or if you +/// need to do some kind of analysis on the NFA. +#[derive(Clone, Eq, PartialEq)] +pub enum State { + /// A state with a single transition that can only be taken if the current + /// input symbol is in a particular range of bytes. + ByteRange { + /// The transition from this state to the next. + trans: Transition, + }, + /// A state with possibly many transitions represented in a sparse fashion. + /// Transitions are non-overlapping and ordered lexicographically by input + /// range. + /// + /// In practice, this is used for encoding UTF-8 automata. Its presence is + /// primarily an optimization that avoids many additional unconditional + /// epsilon transitions (via [`Union`](State::Union) states), and thus + /// decreases the overhead of traversing the NFA. This can improve both + /// matching time and DFA construction time. + Sparse(SparseTransitions), + /// A dense representation of a state with multiple transitions. + Dense(DenseTransitions), + /// A conditional epsilon transition satisfied via some sort of + /// look-around. Look-around is limited to anchor and word boundary + /// assertions. + /// + /// Look-around states are meant to be evaluated while performing epsilon + /// closure (computing the set of states reachable from a particular state + /// via only epsilon transitions). 
If the current position in the haystack
+    /// satisfies the look-around assertion, then you're permitted to follow
+    /// that epsilon transition.
+    Look {
+        /// The look-around assertion that must be satisfied before moving
+        /// to `next`.
+        look: Look,
+        /// The state to transition to if the look-around assertion is
+        /// satisfied.
+        next: StateID,
+    },
+    /// An alternation such that there exists an epsilon transition to all
+    /// states in `alternates`, where matches found via earlier transitions
+    /// are preferred over later transitions.
+    Union {
+        /// An ordered sequence of unconditional epsilon transitions to other
+        /// states. Transitions earlier in the sequence are preferred over
+        /// transitions later in the sequence.
+        alternates: Box<[StateID]>,
+    },
+    /// An alternation such that there exist precisely two unconditional
+    /// epsilon transitions, where matches found via `alt1` are preferred over
+    /// matches found via `alt2`.
+    ///
+    /// This state exists as a common special case of Union where there are
+    /// only two alternates. In this case, we don't need any allocations to
+    /// represent the state. This saves a bit of memory and also saves an
+    /// additional memory access when traversing the NFA.
+    BinaryUnion {
+        /// An unconditional epsilon transition to another NFA state. This
+        /// is preferred over `alt2`.
+        alt1: StateID,
+        /// An unconditional epsilon transition to another NFA state. Matches
+        /// reported via this transition should only be reported if no matches
+        /// were found by following `alt1`.
+        alt2: StateID,
+    },
+    /// An empty state that records a capture location.
+    ///
+    /// From the perspective of finite automata, this is precisely equivalent
+    /// to an unconditional epsilon transition, but serves the purpose of
+    /// instructing NFA simulations to record additional state when the finite
+    /// state machine passes through this epsilon transition.
+    ///
+    /// `slot` in this context refers to the specific capture group slot
+    /// offset that is being recorded. Each capturing group has two slots
+    /// corresponding to the start and end of the matching portion of that
+    /// group.
+    ///
+    /// The pattern ID and capture group index are also included in this state
+    /// in case they are useful. But mostly, all you'll need is `next` and
+    /// `slot`.
+    Capture {
+        /// The state to transition to, unconditionally.
+        next: StateID,
+        /// The pattern ID that this capture belongs to.
+        pattern_id: PatternID,
+        /// The capture group index that this capture belongs to. Capture group
+        /// indices are local to each pattern. For example, when capturing
+        /// groups are enabled, every pattern has a capture group at index
+        /// `0`.
+        group_index: SmallIndex,
+        /// The slot index for this capture. Every capturing group has two
+        /// slots: one for the start haystack offset and one for the end
+        /// haystack offset. Unlike capture group indices, slot indices are
+        /// global across all patterns in this NFA. That is, each slot belongs
+        /// to a single pattern, but there is only one slot at index `i`.
+        slot: SmallIndex,
+    },
+    /// A state that cannot be transitioned out of. This is useful for cases
+    /// where you want to prevent matching from occurring. For example, if your
+    /// regex parser permits empty character classes, then one could choose
+    /// a `Fail` state to represent them. (An empty character class can be
+    /// thought of as an empty set. Since nothing is in an empty set, they can
+    /// never match anything.)
+    Fail,
+    /// A match state.
There is at least one such occurrence of this state for + /// each regex that can match that is in this NFA. + Match { + /// The matching pattern ID. + pattern_id: PatternID, + }, +} + +impl State { + /// Returns true if and only if this state contains one or more epsilon + /// transitions. + /// + /// In practice, a state has no outgoing transitions (like `Match`), has + /// only non-epsilon transitions (like `ByteRange`) or has only epsilon + /// transitions (like `Union`). + /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::{State, Transition}, + /// util::primitives::{PatternID, StateID, SmallIndex}, + /// }; + /// + /// // Capture states are epsilon transitions. + /// let state = State::Capture { + /// next: StateID::ZERO, + /// pattern_id: PatternID::ZERO, + /// group_index: SmallIndex::ZERO, + /// slot: SmallIndex::ZERO, + /// }; + /// assert!(state.is_epsilon()); + /// + /// // ByteRange states are not. + /// let state = State::ByteRange { + /// trans: Transition { start: b'a', end: b'z', next: StateID::ZERO }, + /// }; + /// assert!(!state.is_epsilon()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn is_epsilon(&self) -> bool { + match *self { + State::ByteRange { .. } + | State::Sparse { .. } + | State::Dense { .. } + | State::Fail + | State::Match { .. } => false, + State::Look { .. } + | State::Union { .. } + | State::BinaryUnion { .. } + | State::Capture { .. } => true, + } + } + + /// Returns the heap memory usage of this NFA state in bytes. + fn memory_usage(&self) -> usize { + match *self { + State::ByteRange { .. } + | State::Look { .. } + | State::BinaryUnion { .. } + | State::Capture { .. } + | State::Match { .. } + | State::Fail => 0, + State::Sparse(SparseTransitions { ref transitions }) => { + transitions.len() * mem::size_of::<Transition>() + } + State::Dense { .. } => 256 * mem::size_of::<StateID>(), + State::Union { ref alternates } => { + alternates.len() * mem::size_of::<StateID>() + } + } + } + + /// Remap the transitions in this state using the given map. Namely, the + /// given map should be indexed according to the transitions currently + /// in this state. + /// + /// This is used during the final phase of the NFA compiler, which turns + /// its intermediate NFA into the final NFA. + fn remap(&mut self, remap: &[StateID]) { + match *self { + State::ByteRange { ref mut trans } => { + trans.next = remap[trans.next] + } + State::Sparse(SparseTransitions { ref mut transitions }) => { + for t in transitions.iter_mut() { + t.next = remap[t.next]; + } + } + State::Dense(DenseTransitions { ref mut transitions }) => { + for sid in transitions.iter_mut() { + *sid = remap[*sid]; + } + } + State::Look { ref mut next, .. } => *next = remap[*next], + State::Union { ref mut alternates } => { + for alt in alternates.iter_mut() { + *alt = remap[*alt]; + } + } + State::BinaryUnion { ref mut alt1, ref mut alt2 } => { + *alt1 = remap[*alt1]; + *alt2 = remap[*alt2]; + } + State::Capture { ref mut next, .. } => *next = remap[*next], + State::Fail => {} + State::Match { .. 
} => {}
+        }
+    }
+}
+
+impl fmt::Debug for State {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match *self {
+            State::ByteRange { ref trans } => trans.fmt(f),
+            State::Sparse(SparseTransitions { ref transitions }) => {
+                let rs = transitions
+                    .iter()
+                    .map(|t| format!("{:?}", t))
+                    .collect::<Vec<String>>()
+                    .join(", ");
+                write!(f, "sparse({})", rs)
+            }
+            State::Dense(ref dense) => {
+                write!(f, "dense(")?;
+                for (i, t) in dense.iter().enumerate() {
+                    if i > 0 {
+                        write!(f, ", ")?;
+                    }
+                    write!(f, "{:?}", t)?;
+                }
+                write!(f, ")")
+            }
+            State::Look { ref look, next } => {
+                write!(f, "{:?} => {:?}", look, next.as_usize())
+            }
+            State::Union { ref alternates } => {
+                let alts = alternates
+                    .iter()
+                    .map(|id| format!("{:?}", id.as_usize()))
+                    .collect::<Vec<String>>()
+                    .join(", ");
+                write!(f, "union({})", alts)
+            }
+            State::BinaryUnion { alt1, alt2 } => {
+                write!(
+                    f,
+                    "binary-union({}, {})",
+                    alt1.as_usize(),
+                    alt2.as_usize()
+                )
+            }
+            State::Capture { next, pattern_id, group_index, slot } => {
+                write!(
+                    f,
+                    "capture(pid={:?}, group={:?}, slot={:?}) => {:?}",
+                    pattern_id.as_usize(),
+                    group_index.as_usize(),
+                    slot.as_usize(),
+                    next.as_usize(),
+                )
+            }
+            State::Fail => write!(f, "FAIL"),
+            State::Match { pattern_id } => {
+                write!(f, "MATCH({:?})", pattern_id.as_usize())
+            }
+        }
+    }
+}
+
+/// A sequence of transitions used to represent a sparse state.
+///
+/// This is the primary representation of a [`Sparse`](State::Sparse) state.
+/// It corresponds to a sorted sequence of transitions with non-overlapping
+/// byte ranges. If the byte at the current position in the haystack matches
+/// one of the byte ranges, then the finite state machine should take the
+/// corresponding transition.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct SparseTransitions {
+    /// The sorted sequence of non-overlapping transitions.
+    pub transitions: Box<[Transition]>,
+}
+
+impl SparseTransitions {
+    /// This follows the matching transition for a particular byte.
+    ///
+    /// The matching transition is found by looking for a matching byte
+    /// range (there is at most one) corresponding to the position `at` in
+    /// `haystack`.
+    ///
+    /// If `at >= haystack.len()`, then this returns `None`.
+    #[inline]
+    pub fn matches(&self, haystack: &[u8], at: usize) -> Option<StateID> {
+        haystack.get(at).and_then(|&b| self.matches_byte(b))
+    }
+
+    /// This follows the matching transition for any member of the alphabet.
+    ///
+    /// The matching transition is found by looking for a matching byte
+    /// range (there is at most one) corresponding to the byte given by the
+    /// alphabet unit. If the given alphabet unit is
+    /// [`EOI`](alphabet::Unit::eoi), then this always returns `None`.
+    #[inline]
+    pub(crate) fn matches_unit(
+        &self,
+        unit: alphabet::Unit,
+    ) -> Option<StateID> {
+        unit.as_u8().map_or(None, |byte| self.matches_byte(byte))
+    }
+
+    /// This follows the matching transition for a particular byte.
+    ///
+    /// The matching transition is found by looking for a matching byte range
+    /// (there is at most one) corresponding to the byte given.
+    #[inline]
+    pub fn matches_byte(&self, byte: u8) -> Option<StateID> {
+        for t in self.transitions.iter() {
+            if t.start > byte {
+                break;
+            } else if t.matches_byte(byte) {
+                return Some(t.next);
+            }
+        }
+        None
+
+        /*
+        // This is an alternative implementation that uses binary search. In
+        // some ad hoc experiments, like
+        //
+        //   smallishru=OpenSubtitles2018.raw.sample.smallish.ru
+        //   regex-cli find nfa thompson pikevm -b "@$smallishru" '\b\w+\b'
+        //
+        // I could not observe any improvement, and in fact, things seemed to
+        // be a bit slower. I can see an improvement in at least one benchmark:
+        //
+        //   allcpssmall=all-codepoints-utf8-10x
+        //   regex-cli find nfa thompson pikevm @$allcpssmall '\pL{100}'
+        //
+        // Where total search time goes from 3.2s to 2.4s when using binary
+        // search.
+        self.transitions
+            .binary_search_by(|t| {
+                if t.end < byte {
+                    core::cmp::Ordering::Less
+                } else if t.start > byte {
+                    core::cmp::Ordering::Greater
+                } else {
+                    core::cmp::Ordering::Equal
+                }
+            })
+            .ok()
+            .map(|i| self.transitions[i].next)
+        */
+    }
+}
+
+/// A sequence of transitions used to represent a dense state.
+///
+/// This is the primary representation of a [`Dense`](State::Dense) state. It
+/// provides constant time matching. That is, given a byte in a haystack and
+/// a `DenseTransitions`, one can determine if the state matches in constant
+/// time.
+///
+/// This is in contrast to `SparseTransitions`, whose time complexity is
+/// necessarily bigger than constant time. Also in contrast, `DenseTransitions`
+/// usually requires (much) more heap memory.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct DenseTransitions {
+    /// A dense representation of this state's transitions on the heap. This
+    /// always has length 256.
+    pub transitions: Box<[StateID]>,
+}
+
+impl DenseTransitions {
+    /// This follows the matching transition for a particular byte.
+    ///
+    /// The matching transition is found by looking for a transition that
+    /// doesn't correspond to `StateID::ZERO` for the byte `at` the given
+    /// position in `haystack`.
+    ///
+    /// If `at >= haystack.len()`, then this returns `None`.
+    #[inline]
+    pub fn matches(&self, haystack: &[u8], at: usize) -> Option<StateID> {
+        haystack.get(at).and_then(|&b| self.matches_byte(b))
+    }
+
+    /// This follows the matching transition for any member of the alphabet.
+    ///
+    /// The matching transition is found by looking for a transition that
+    /// doesn't correspond to `StateID::ZERO` for the byte corresponding to
+    /// the given alphabet unit.
+    ///
+    /// If the given alphabet unit is [`EOI`](alphabet::Unit::eoi), then this
+    /// returns `None`.
+    #[inline]
+    pub(crate) fn matches_unit(
+        &self,
+        unit: alphabet::Unit,
+    ) -> Option<StateID> {
+        unit.as_u8().map_or(None, |byte| self.matches_byte(byte))
+    }
+
+    /// This follows the matching transition for a particular byte.
+    ///
+    /// The matching transition is found by looking for a transition that
+    /// doesn't correspond to `StateID::ZERO` for the given `byte`.
+    ///
+    /// If the transition for the given `byte` is `StateID::ZERO`, then this
+    /// returns `None`.
+    #[inline]
+    pub fn matches_byte(&self, byte: u8) -> Option<StateID> {
+        let next = self.transitions[usize::from(byte)];
+        if next == StateID::ZERO {
+            None
+        } else {
+            Some(next)
+        }
+    }
+
+    /*
+    /// The dense state optimization isn't currently enabled, so permit a
+    /// little bit of dead code.
+    pub(crate) fn from_sparse(sparse: &SparseTransitions) -> DenseTransitions {
+        let mut dense = vec![StateID::ZERO; 256];
+        for t in sparse.transitions.iter() {
+            for b in t.start..=t.end {
+                dense[usize::from(b)] = t.next;
+            }
+        }
+        DenseTransitions { transitions: dense.into_boxed_slice() }
+    }
+    */
+
+    /// Returns an iterator over all transitions that don't point to
+    /// `StateID::ZERO`.
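+    ///
+    /// Note that each transition yielded covers exactly one byte. Adjacent
+    /// bytes that map to the same state are not coalesced into a single
+    /// range.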
+ pub(crate) fn iter(&self) -> impl Iterator<Item = Transition> + '_ { + use crate::util::int::Usize; + self.transitions + .iter() + .enumerate() + .filter(|&(_, &sid)| sid != StateID::ZERO) + .map(|(byte, &next)| Transition { + start: byte.as_u8(), + end: byte.as_u8(), + next, + }) + } +} + +/// A single transition to another state. +/// +/// This transition may only be followed if the current byte in the haystack +/// falls in the inclusive range of bytes specified. +#[derive(Clone, Copy, Eq, Hash, PartialEq)] +pub struct Transition { + /// The inclusive start of the byte range. + pub start: u8, + /// The inclusive end of the byte range. + pub end: u8, + /// The identifier of the state to transition to. + pub next: StateID, +} + +impl Transition { + /// Returns true if the position `at` in `haystack` falls in this + /// transition's range of bytes. + /// + /// If `at >= haystack.len()`, then this returns `false`. + pub fn matches(&self, haystack: &[u8], at: usize) -> bool { + haystack.get(at).map_or(false, |&b| self.matches_byte(b)) + } + + /// Returns true if the given alphabet unit falls in this transition's + /// range of bytes. If the given unit is [`EOI`](alphabet::Unit::eoi), then + /// this returns `false`. + pub fn matches_unit(&self, unit: alphabet::Unit) -> bool { + unit.as_u8().map_or(false, |byte| self.matches_byte(byte)) + } + + /// Returns true if the given byte falls in this transition's range of + /// bytes. + pub fn matches_byte(&self, byte: u8) -> bool { + self.start <= byte && byte <= self.end + } +} + +impl fmt::Debug for Transition { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use crate::util::escape::DebugByte; + + let Transition { start, end, next } = *self; + if self.start == self.end { + write!(f, "{:?} => {:?}", DebugByte(start), next.as_usize()) + } else { + write!( + f, + "{:?}-{:?} => {:?}", + DebugByte(start), + DebugByte(end), + next.as_usize(), + ) + } + } +} + +/// An iterator over all pattern IDs in an NFA. +/// +/// This iterator is created by [`NFA::patterns`]. +/// +/// The lifetime parameter `'a` refers to the lifetime of the NFA from which +/// this pattern iterator was created. +#[derive(Debug)] +pub struct PatternIter<'a> { + it: PatternIDIter, + /// We explicitly associate a lifetime with this iterator even though we + /// don't actually borrow anything from the NFA. We do this for backward + /// compatibility purposes. If we ever do need to borrow something from + /// the NFA, then we can and just get rid of this marker without breaking + /// the public API. + _marker: core::marker::PhantomData<&'a ()>, +} + +impl<'a> Iterator for PatternIter<'a> { + type Item = PatternID; + + fn next(&mut self) -> Option<PatternID> { + self.it.next() + } +} + +#[cfg(all(test, feature = "nfa-pikevm"))] +mod tests { + use super::*; + use crate::{nfa::thompson::pikevm::PikeVM, Input}; + + // This asserts that an NFA state doesn't have its size changed. It is + // *really* easy to accidentally increase the size, and thus potentially + // dramatically increase the memory usage of every NFA. + // + // This assert doesn't mean we absolutely cannot increase the size of an + // NFA state. We can. It's just here to make sure we do it knowingly and + // intentionally. 
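+    //
+    // For reference (a rough sketch rather than a guarantee): the size is
+    // driven by the largest variants. `Capture` holds four 32-bit indices
+    // and `Union` holds a boxed slice (a pointer and a length), and either
+    // of those plus the discriminant accounts for the sizes asserted below.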
+ #[test] + fn state_has_small_size() { + #[cfg(target_pointer_width = "64")] + assert_eq!(24, core::mem::size_of::<State>()); + #[cfg(target_pointer_width = "32")] + assert_eq!(20, core::mem::size_of::<State>()); + } + + #[test] + fn always_match() { + let re = PikeVM::new_from_nfa(NFA::always_match()).unwrap(); + let mut cache = re.create_cache(); + let mut caps = re.create_captures(); + let mut find = |haystack, start, end| { + let input = Input::new(haystack).range(start..end); + re.search(&mut cache, &input, &mut caps); + caps.get_match().map(|m| m.end()) + }; + + assert_eq!(Some(0), find("", 0, 0)); + assert_eq!(Some(0), find("a", 0, 1)); + assert_eq!(Some(1), find("a", 1, 1)); + assert_eq!(Some(0), find("ab", 0, 2)); + assert_eq!(Some(1), find("ab", 1, 2)); + assert_eq!(Some(2), find("ab", 2, 2)); + } + + #[test] + fn never_match() { + let re = PikeVM::new_from_nfa(NFA::never_match()).unwrap(); + let mut cache = re.create_cache(); + let mut caps = re.create_captures(); + let mut find = |haystack, start, end| { + let input = Input::new(haystack).range(start..end); + re.search(&mut cache, &input, &mut caps); + caps.get_match().map(|m| m.end()) + }; + + assert_eq!(None, find("", 0, 0)); + assert_eq!(None, find("a", 0, 1)); + assert_eq!(None, find("a", 1, 1)); + assert_eq!(None, find("ab", 0, 2)); + assert_eq!(None, find("ab", 1, 2)); + assert_eq!(None, find("ab", 2, 2)); + } +} diff --git a/vendor/regex-automata/src/nfa/thompson/pikevm.rs b/vendor/regex-automata/src/nfa/thompson/pikevm.rs index 7572f9f10..0128c151a 100644 --- a/vendor/regex-automata/src/nfa/thompson/pikevm.rs +++ b/vendor/regex-automata/src/nfa/thompson/pikevm.rs @@ -1,18 +1,71 @@ -use alloc::{sync::Arc, vec, vec::Vec}; +/*! +An NFA backed Pike VM for executing regex searches with capturing groups. + +This module provides a [`PikeVM`] that works by simulating an NFA and +resolving all spans of capturing groups that participate in a match. +*/ + +#[cfg(feature = "internal-instrument-pikevm")] +use core::cell::RefCell; + +use alloc::{vec, vec::Vec}; use crate::{ - nfa::thompson::{self, Error, State, NFA}, + nfa::thompson::{self, BuildError, State, NFA}, util::{ - id::{PatternID, StateID}, - matchtypes::MultiMatch, + captures::Captures, + empty, iter, + prefilter::Prefilter, + primitives::{NonMaxUsize, PatternID, SmallIndex, StateID}, + search::{ + Anchored, HalfMatch, Input, Match, MatchKind, PatternSet, Span, + }, sparse_set::SparseSet, }, }; -#[derive(Clone, Copy, Debug, Default)] +/// A simple macro for conditionally executing instrumentation logic when +/// the 'trace' log level is enabled. This is a compile-time no-op when the +/// 'internal-instrument-pikevm' feature isn't enabled. The intent here is that +/// this makes it easier to avoid doing extra work when instrumentation isn't +/// enabled. +/// +/// This macro accepts a closure of type `|&mut Counters|`. The closure can +/// then increment counters (or whatever) in accordance with what one wants +/// to track. +macro_rules! instrument { + ($fun:expr) => { + #[cfg(feature = "internal-instrument-pikevm")] + { + let fun: &mut dyn FnMut(&mut Counters) = &mut $fun; + COUNTERS.with(|c: &RefCell<Counters>| fun(&mut *c.borrow_mut())); + } + }; +} + +#[cfg(feature = "internal-instrument-pikevm")] +std::thread_local! { + /// Effectively global state used to keep track of instrumentation + /// counters. The "proper" way to do this is to thread it through the + /// PikeVM, but it makes the code quite icky. 
Since this is just a
+    /// debugging feature, we're content to relegate it to thread local
+    /// state. When instrumentation is enabled, the counters are reset at the
+    /// beginning of every search and printed (with the 'trace' log level) at
+    /// the end of every search.
+    static COUNTERS: RefCell<Counters> = RefCell::new(Counters::empty());
+}
+
+/// The configuration used for building a [`PikeVM`].
+///
+/// A PikeVM configuration is a simple data object that is typically used with
+/// [`Builder::configure`]. It can be cheaply cloned.
+///
+/// A default configuration can be created either with `Config::new`, or
+/// perhaps more conveniently, with [`PikeVM::config`].
+#[derive(Clone, Debug, Default)]
 pub struct Config {
-    anchored: Option<bool>,
-    utf8: Option<bool>,
+    match_kind: Option<MatchKind>,
+    pre: Option<Option<Prefilter>>,
 }
 
 impl Config {
@@ -21,37 +74,172 @@ impl Config {
         Config::default()
     }
 
-    pub fn anchored(mut self, yes: bool) -> Config {
-        self.anchored = Some(yes);
+    /// Set the desired match semantics.
+    ///
+    /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the
+    /// match semantics of Perl-like regex engines. That is, when multiple
+    /// patterns would match at the same leftmost position, the pattern that
+    /// appears first in the concrete syntax is chosen.
+    ///
+    /// Currently, the only other kind of match semantics supported is
+    /// [`MatchKind::All`]. This corresponds to "classical DFA" construction
+    /// where all possible matches are visited in the NFA by the `PikeVM`.
+    ///
+    /// Typically, `All` is used when one wants to execute an overlapping
+    /// search and `LeftmostFirst` otherwise. In particular, it rarely makes
+    /// sense to use `All` with the various "leftmost" find routines, since the
+    /// leftmost routines depend on the `LeftmostFirst` automata construction
+    /// strategy. Specifically, `LeftmostFirst` results in the `PikeVM`
+    /// simulating dead states as a way to terminate the search and report a
+    /// match. `LeftmostFirst` also supports non-greedy matches using this
+    /// strategy whereas `All` does not.
+    pub fn match_kind(mut self, kind: MatchKind) -> Config {
+        self.match_kind = Some(kind);
         self
     }
 
-    pub fn utf8(mut self, yes: bool) -> Config {
-        self.utf8 = Some(yes);
+    /// Set a prefilter to be used whenever a start state is entered.
+    ///
+    /// A [`Prefilter`] in this context is meant to accelerate searches by
+    /// looking for literal prefixes that every match for the corresponding
+    /// pattern (or patterns) must start with. Once a prefilter produces a
+    /// match, the underlying search routine continues on to try and confirm
+    /// the match.
+    ///
+    /// Be warned that setting a prefilter does not guarantee that the search
+    /// will be faster. While it's usually a good bet, if the prefilter
+    /// produces a lot of false positive candidates (i.e., positions matched
+    /// by the prefilter but not by the regex), then the overall result can
+    /// be slower than if you had just executed the regex engine without any
+    /// prefilters.
+    ///
+    /// By default no prefilter is set.
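+    ///
+    /// Note that [`Prefilter::new`] returns an `Option<Prefilter>`, which is
+    /// why the examples below pass its result to this method without
+    /// unwrapping it first.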
+ /// + /// # Example + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// util::prefilter::Prefilter, + /// Input, Match, MatchKind, + /// }; + /// + /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]); + /// let re = PikeVM::builder() + /// .configure(PikeVM::config().prefilter(pre)) + /// .build(r"(foo|bar)[a-z]+")?; + /// let mut cache = re.create_cache(); + /// let input = Input::new("foo1 barfox bar"); + /// assert_eq!(Some(Match::must(0, 5..11)), re.find(&mut cache, input)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Be warned though that an incorrect prefilter can lead to incorrect + /// results! + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// util::prefilter::Prefilter, + /// Input, HalfMatch, MatchKind, + /// }; + /// + /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]); + /// let re = PikeVM::builder() + /// .configure(PikeVM::config().prefilter(pre)) + /// .build(r"(foo|bar)[a-z]+")?; + /// let mut cache = re.create_cache(); + /// let input = Input::new("foo1 barfox bar"); + /// // No match reported even though there clearly is one! + /// assert_eq!(None, re.find(&mut cache, input)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config { + self.pre = Some(pre); self } - pub fn get_anchored(&self) -> bool { - self.anchored.unwrap_or(false) + /// Returns the match semantics set in this configuration. + pub fn get_match_kind(&self) -> MatchKind { + self.match_kind.unwrap_or(MatchKind::LeftmostFirst) } - pub fn get_utf8(&self) -> bool { - self.utf8.unwrap_or(true) + /// Returns the prefilter set in this configuration, if one at all. + pub fn get_prefilter(&self) -> Option<&Prefilter> { + self.pre.as_ref().unwrap_or(&None).as_ref() } - pub(crate) fn overwrite(self, o: Config) -> Config { + /// Overwrite the default configuration such that the options in `o` are + /// always used. If an option in `o` is not set, then the corresponding + /// option in `self` is used. If it's not set in `self` either, then it + /// remains not set. + pub(crate) fn overwrite(&self, o: Config) -> Config { Config { - anchored: o.anchored.or(self.anchored), - utf8: o.utf8.or(self.utf8), + match_kind: o.match_kind.or(self.match_kind), + pre: o.pre.or_else(|| self.pre.clone()), } } } -/// A builder for a PikeVM. +/// A builder for a `PikeVM`. +/// +/// This builder permits configuring options for the syntax of a pattern, +/// the NFA construction and the `PikeVM` construction. This builder is +/// different from a general purpose regex builder in that it permits fine +/// grain configuration of the construction process. The trade off for this is +/// complexity, and the possibility of setting a configuration that might not +/// make sense. For example, there are two different UTF-8 modes: +/// +/// * [`util::syntax::Config::utf8`](crate::util::syntax::Config::utf8) +/// controls whether the pattern itself can contain sub-expressions that match +/// invalid UTF-8. +/// * [`thompson::Config::utf8`] controls whether empty matches that split a +/// Unicode codepoint are reported or not. +/// +/// Generally speaking, callers will want to either enable all of these or +/// disable all of these. +/// +/// # Example +/// +/// This example shows how to disable UTF-8 mode in the syntax and the regex +/// itself. This is generally what you want for matching on arbitrary bytes. 
+/// +/// ``` +/// use regex_automata::{ +/// nfa::thompson::{self, pikevm::PikeVM}, +/// util::syntax, +/// Match, +/// }; +/// +/// let re = PikeVM::builder() +/// .syntax(syntax::Config::new().utf8(false)) +/// .thompson(thompson::Config::new().utf8(false)) +/// .build(r"foo(?-u:[^b])ar.*")?; +/// let mut cache = re.create_cache(); +/// +/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; +/// let expected = Some(Match::must(0, 1..9)); +/// let got = re.find_iter(&mut cache, haystack).next(); +/// assert_eq!(expected, got); +/// // Notice that `(?-u:[^b])` matches invalid UTF-8, +/// // but the subsequent `.*` does not! Disabling UTF-8 +/// // on the syntax permits this. +/// // +/// // N.B. This example does not show the impact of +/// // disabling UTF-8 mode on a PikeVM Config, since that +/// // only impacts regexes that can produce matches of +/// // length 0. +/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` #[derive(Clone, Debug)] pub struct Builder { config: Config, - thompson: thompson::Builder, + #[cfg(feature = "syntax")] + thompson: thompson::Compiler, } impl Builder { @@ -59,53 +247,58 @@ impl Builder { pub fn new() -> Builder { Builder { config: Config::default(), - thompson: thompson::Builder::new(), + #[cfg(feature = "syntax")] + thompson: thompson::Compiler::new(), } } - pub fn build(&self, pattern: &str) -> Result<PikeVM, Error> { + /// Build a `PikeVM` from the given pattern. + /// + /// If there was a problem parsing or compiling the pattern, then an error + /// is returned. + #[cfg(feature = "syntax")] + pub fn build(&self, pattern: &str) -> Result<PikeVM, BuildError> { self.build_many(&[pattern]) } + /// Build a `PikeVM` from the given patterns. + #[cfg(feature = "syntax")] pub fn build_many<P: AsRef<str>>( &self, patterns: &[P], - ) -> Result<PikeVM, Error> { + ) -> Result<PikeVM, BuildError> { let nfa = self.thompson.build_many(patterns)?; - self.build_from_nfa(Arc::new(nfa)) - } - - pub fn build_from_nfa(&self, nfa: Arc<NFA>) -> Result<PikeVM, Error> { - // TODO: Check that this is correct. - // if !cfg!(all( - // feature = "dfa", - // feature = "syntax", - // feature = "unicode-perl" - // )) { - if !cfg!(feature = "syntax") { - if nfa.has_word_boundary_unicode() { - return Err(Error::unicode_word_unavailable()); - } - } - Ok(PikeVM { config: self.config, nfa }) + self.build_from_nfa(nfa) + } + + /// Build a `PikeVM` directly from its NFA. + /// + /// Note that when using this method, any configuration that applies to the + /// construction of the NFA itself will of course be ignored, since the NFA + /// given here is already built. + pub fn build_from_nfa(&self, nfa: NFA) -> Result<PikeVM, BuildError> { + nfa.look_set_any().available().map_err(BuildError::word)?; + Ok(PikeVM { config: self.config.clone(), nfa }) } + /// Apply the given `PikeVM` configuration options to this builder. pub fn configure(&mut self, config: Config) -> &mut Builder { self.config = self.config.overwrite(config); self } /// Set the syntax configuration for this builder using - /// [`SyntaxConfig`](crate::SyntaxConfig). + /// [`syntax::Config`](crate::util::syntax::Config). /// /// This permits setting things like case insensitivity, Unicode and multi /// line mode. /// /// These settings only apply when constructing a PikeVM directly from a /// pattern. 
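+    ///
+    /// # Example
+    ///
+    /// As a brief sketch (this assumes the `case_insensitive` knob on
+    /// [`syntax::Config`](crate::util::syntax::Config)), this shows how one
+    /// such setting changes what a pattern matches:
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     nfa::thompson::pikevm::PikeVM,
+    ///     util::syntax,
+    ///     Match,
+    /// };
+    ///
+    /// let re = PikeVM::builder()
+    ///     .syntax(syntax::Config::new().case_insensitive(true))
+    ///     .build(r"foo")?;
+    /// let mut cache = re.create_cache();
+    ///
+    /// // Without case insensitivity, this would not match.
+    /// assert_eq!(
+    ///     Some(Match::must(0, 0..3)),
+    ///     re.find_iter(&mut cache, "FoO").next(),
+    /// );
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```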
+    #[cfg(feature = "syntax")]
     pub fn syntax(
         &mut self,
-        config: crate::util::syntax::SyntaxConfig,
+        config: crate::util::syntax::Config,
     ) -> &mut Builder {
         self.thompson.syntax(config);
         self
     }
@@ -119,259 +312,1395 @@ impl Builder {
     ///
     /// These settings only apply when constructing a PikeVM directly from a
     /// pattern.
+    #[cfg(feature = "syntax")]
     pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
         self.thompson.configure(config);
         self
     }
 }
 
+/// A virtual machine for executing regex searches with capturing groups.
+///
+/// # Infallible APIs
+///
+/// Unlike most other regex engines in this crate, a `PikeVM` never returns an
+/// error at search time. It supports all [`Anchored`] configurations, never
+/// quits and works on haystacks of arbitrary length.
+///
+/// There are two caveats to mention though:
+///
+/// * If an invalid pattern ID is given to a search via [`Anchored::Pattern`],
+/// then the PikeVM will report "no match." This is consistent with all other
+/// regex engines in this crate.
+/// * When using [`PikeVM::which_overlapping_matches`] with a [`PatternSet`]
+/// that has insufficient capacity to store all valid pattern IDs, then if a
+/// match occurs for a `PatternID` that cannot be inserted, it is silently
+/// dropped as if it did not match.
+///
+/// # Advice
+///
+/// The `PikeVM` is generally the most "powerful" regex engine in this crate.
+/// "Powerful" in this context means that it can handle any regular expression
+/// that is parseable by `regex-syntax` and any size haystack. Regrettably,
+/// the `PikeVM` is also simultaneously often the _slowest_ regex engine in
+/// practice. This results in an annoying situation where one generally tries
+/// to pick any other regex engine (or perhaps none at all) before being
+/// forced to fall back to a `PikeVM`.
+///
+/// For example, a common strategy for dealing with capturing groups is to
+/// actually look for the overall match of the regex using a faster regex
+/// engine, like a [lazy DFA](crate::hybrid::regex::Regex). Once the overall
+/// match is found, one can then run the `PikeVM` on just the match span to
+/// find the spans of the capturing groups. In this way, the faster regex
+/// engine does the majority of the work, while the `PikeVM` only lends its
+/// power in a more limited role.
+///
+/// Unfortunately, this isn't always possible because the faster regex engines
+/// don't support all of the regex features in `regex-syntax`. This notably
+/// includes (and is currently limited to) Unicode word boundaries. So if
+/// your pattern has Unicode word boundaries, you typically can't use a
+/// DFA-based regex engine at all (unless you [enable heuristic support for
+/// it](crate::hybrid::dfa::Config::unicode_word_boundary)). (The [one-pass
+/// DFA](crate::dfa::onepass::DFA) can handle Unicode word boundaries for
+/// anchored searches only, but in a cruel sort of joke, many Unicode features
+/// tend to result in making the regex _not_ one-pass.)
+///
+/// # Example
+///
+/// This example shows that the `PikeVM` implements Unicode word boundaries
+/// correctly by default.
+/// +/// ``` +/// # if cfg!(miri) { return Ok(()); } // miri takes too long +/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; +/// +/// let re = PikeVM::new(r"\b\w+\b")?; +/// let mut cache = re.create_cache(); +/// +/// let mut it = re.find_iter(&mut cache, "Шерлок Холмс"); +/// assert_eq!(Some(Match::must(0, 0..12)), it.next()); +/// assert_eq!(Some(Match::must(0, 13..23)), it.next()); +/// assert_eq!(None, it.next()); +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` #[derive(Clone, Debug)] pub struct PikeVM { config: Config, - nfa: Arc<NFA>, + nfa: NFA, } impl PikeVM { - pub fn new(pattern: &str) -> Result<PikeVM, Error> { + /// Parse the given regular expression using the default configuration and + /// return the corresponding `PikeVM`. + /// + /// If you want a non-default configuration, then use the [`Builder`] to + /// set your own configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; + /// + /// let re = PikeVM::new("foo[0-9]+bar")?; + /// let mut cache = re.create_cache(); + /// assert_eq!( + /// Some(Match::must(0, 3..14)), + /// re.find_iter(&mut cache, "zzzfoo12345barzzz").next(), + /// ); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "syntax")] + pub fn new(pattern: &str) -> Result<PikeVM, BuildError> { PikeVM::builder().build(pattern) } - pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<PikeVM, Error> { + /// Like `new`, but parses multiple patterns into a single "multi regex." + /// This similarly uses the default regex configuration. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; + /// + /// let re = PikeVM::new_many(&["[a-z]+", "[0-9]+"])?; + /// let mut cache = re.create_cache(); + /// + /// let mut it = re.find_iter(&mut cache, "abc 1 foo 4567 0 quux"); + /// assert_eq!(Some(Match::must(0, 0..3)), it.next()); + /// assert_eq!(Some(Match::must(1, 4..5)), it.next()); + /// assert_eq!(Some(Match::must(0, 6..9)), it.next()); + /// assert_eq!(Some(Match::must(1, 10..14)), it.next()); + /// assert_eq!(Some(Match::must(1, 15..16)), it.next()); + /// assert_eq!(Some(Match::must(0, 17..21)), it.next()); + /// assert_eq!(None, it.next()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[cfg(feature = "syntax")] + pub fn new_many<P: AsRef<str>>( + patterns: &[P], + ) -> Result<PikeVM, BuildError> { PikeVM::builder().build_many(patterns) } + /// Like `new`, but builds a PikeVM directly from an NFA. This is useful + /// if you already have an NFA, or even if you hand-assembled the NFA. + /// + /// # Example + /// + /// This shows how to hand assemble a regular expression via its HIR, + /// compile an NFA from it and build a PikeVM from the NFA. 
+ /// + /// ``` + /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; + /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange}; + /// + /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![ + /// ClassBytesRange::new(b'0', b'9'), + /// ClassBytesRange::new(b'A', b'Z'), + /// ClassBytesRange::new(b'_', b'_'), + /// ClassBytesRange::new(b'a', b'z'), + /// ]))); + /// + /// let config = NFA::config().nfa_size_limit(Some(1_000)); + /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?; + /// + /// let re = PikeVM::new_from_nfa(nfa)?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let expected = Some(Match::must(0, 3..4)); + /// re.captures(&mut cache, "!@#A#@!", &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn new_from_nfa(nfa: NFA) -> Result<PikeVM, BuildError> { + PikeVM::builder().build_from_nfa(nfa) + } + + /// Create a new `PikeVM` that matches every input. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; + /// + /// let re = PikeVM::always_match()?; + /// let mut cache = re.create_cache(); + /// + /// let expected = Match::must(0, 0..0); + /// assert_eq!(Some(expected), re.find_iter(&mut cache, "").next()); + /// assert_eq!(Some(expected), re.find_iter(&mut cache, "foo").next()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn always_match() -> Result<PikeVM, BuildError> { + let nfa = thompson::NFA::always_match(); + PikeVM::new_from_nfa(nfa) + } + + /// Create a new `PikeVM` that never matches any input. + /// + /// # Example + /// + /// ``` + /// use regex_automata::nfa::thompson::pikevm::PikeVM; + /// + /// let re = PikeVM::never_match()?; + /// let mut cache = re.create_cache(); + /// + /// assert_eq!(None, re.find_iter(&mut cache, "").next()); + /// assert_eq!(None, re.find_iter(&mut cache, "foo").next()); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn never_match() -> Result<PikeVM, BuildError> { + let nfa = thompson::NFA::never_match(); + PikeVM::new_from_nfa(nfa) + } + + /// Return a default configuration for a `PikeVM`. + /// + /// This is a convenience routine to avoid needing to import the `Config` + /// type when customizing the construction of a `PikeVM`. + /// + /// # Example + /// + /// This example shows how to disable UTF-8 mode. When UTF-8 mode is + /// disabled, zero-width matches that split a codepoint are allowed. + /// Otherwise they are never reported. + /// + /// In the code below, notice that `""` is permitted to match positions + /// that split the encoding of a codepoint. 
+    ///
+    /// ```
+    /// use regex_automata::{nfa::thompson::{self, pikevm::PikeVM}, Match};
+    ///
+    /// let re = PikeVM::builder()
+    ///     .thompson(thompson::Config::new().utf8(false))
+    ///     .build(r"")?;
+    /// let mut cache = re.create_cache();
+    ///
+    /// let haystack = "a☃z";
+    /// let mut it = re.find_iter(&mut cache, haystack);
+    /// assert_eq!(Some(Match::must(0, 0..0)), it.next());
+    /// assert_eq!(Some(Match::must(0, 1..1)), it.next());
+    /// assert_eq!(Some(Match::must(0, 2..2)), it.next());
+    /// assert_eq!(Some(Match::must(0, 3..3)), it.next());
+    /// assert_eq!(Some(Match::must(0, 4..4)), it.next());
+    /// assert_eq!(Some(Match::must(0, 5..5)), it.next());
+    /// assert_eq!(None, it.next());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
     pub fn config() -> Config {
         Config::new()
     }
 
+    /// Return a builder for configuring the construction of a `PikeVM`.
+    ///
+    /// This is a convenience routine to avoid needing to import the
+    /// [`Builder`] type in common cases.
+    ///
+    /// # Example
+    ///
+    /// This example shows how to use the builder to disable UTF-8 mode
+    /// everywhere.
+    ///
+    /// ```
+    /// use regex_automata::{
+    ///     nfa::thompson::{self, pikevm::PikeVM},
+    ///     util::syntax,
+    ///     Match,
+    /// };
+    ///
+    /// let re = PikeVM::builder()
+    ///     .syntax(syntax::Config::new().utf8(false))
+    ///     .thompson(thompson::Config::new().utf8(false))
+    ///     .build(r"foo(?-u:[^b])ar.*")?;
+    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+    ///
+    /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
+    /// let expected = Some(Match::must(0, 1..9));
+    /// re.captures(&mut cache, haystack, &mut caps);
+    /// assert_eq!(expected, caps.get_match());
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
     pub fn builder() -> Builder {
         Builder::new()
     }
 
+    /// Create a new empty set of capturing groups that is guaranteed to be
+    /// valid for the search APIs on this `PikeVM`.
+    ///
+    /// A `Captures` value created for a specific `PikeVM` cannot be used with
+    /// any other `PikeVM`.
+    ///
+    /// This is a convenience function for [`Captures::all`]. See the
+    /// [`Captures`] documentation for an explanation of its alternative
+    /// constructors that permit the `PikeVM` to do less work during a search,
+    /// and thus might make it faster.
+    pub fn create_captures(&self) -> Captures {
+        Captures::all(self.get_nfa().group_info().clone())
+    }
+
+    /// Create a new cache for this `PikeVM`.
+    ///
+    /// The cache returned should only be used for searches for this
+    /// `PikeVM`. If you want to reuse the cache for another `PikeVM`, then
+    /// you must call [`Cache::reset`] with that `PikeVM` (or, equivalently,
+    /// [`PikeVM::reset_cache`]).
     pub fn create_cache(&self) -> Cache {
-        Cache::new(self.nfa())
+        Cache::new(self)
     }
 
-    pub fn create_captures(&self) -> Captures {
-        Captures::new(self.nfa())
+    /// Reset the given cache such that it can be used for searching with
+    /// this `PikeVM` (and only this `PikeVM`).
+    ///
+    /// A cache reset permits reusing memory already allocated in this cache
+    /// with a different `PikeVM`.
+    ///
+    /// # Example
+    ///
+    /// This shows how to re-purpose a cache for use with a different `PikeVM`.
+ /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; + /// + /// let re1 = PikeVM::new(r"\w")?; + /// let re2 = PikeVM::new(r"\W")?; + /// + /// let mut cache = re1.create_cache(); + /// assert_eq!( + /// Some(Match::must(0, 0..2)), + /// re1.find_iter(&mut cache, "Δ").next(), + /// ); + /// + /// // Using 'cache' with re2 is not allowed. It may result in panics or + /// // incorrect results. In order to re-purpose the cache, we must reset + /// // it with the PikeVM we'd like to use it with. + /// // + /// // Similarly, after this reset, using the cache with 're1' is also not + /// // allowed. + /// re2.reset_cache(&mut cache); + /// assert_eq!( + /// Some(Match::must(0, 0..3)), + /// re2.find_iter(&mut cache, "☃").next(), + /// ); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn reset_cache(&self, cache: &mut Cache) { + cache.reset(self); } - pub fn nfa(&self) -> &Arc<NFA> { + /// Returns the total number of patterns compiled into this `PikeVM`. + /// + /// In the case of a `PikeVM` that contains no patterns, this returns `0`. + /// + /// # Example + /// + /// This example shows the pattern length for a `PikeVM` that never + /// matches: + /// + /// ``` + /// use regex_automata::nfa::thompson::pikevm::PikeVM; + /// + /// let re = PikeVM::never_match()?; + /// assert_eq!(re.pattern_len(), 0); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// And another example for a `PikeVM` that matches at every position: + /// + /// ``` + /// use regex_automata::nfa::thompson::pikevm::PikeVM; + /// + /// let re = PikeVM::always_match()?; + /// assert_eq!(re.pattern_len(), 1); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// And finally, a `PikeVM` that was constructed from multiple patterns: + /// + /// ``` + /// use regex_automata::nfa::thompson::pikevm::PikeVM; + /// + /// let re = PikeVM::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?; + /// assert_eq!(re.pattern_len(), 3); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + pub fn pattern_len(&self) -> usize { + self.nfa.pattern_len() + } + + /// Return the config for this `PikeVM`. + #[inline] + pub fn get_config(&self) -> &Config { + &self.config + } + + /// Returns a reference to the underlying NFA. + #[inline] + pub fn get_nfa(&self) -> &NFA { &self.nfa } +} - pub fn find_leftmost_iter<'r, 'c, 't>( +impl PikeVM { + /// Returns true if and only if this `PikeVM` matches the given haystack. + /// + /// This routine may short circuit if it knows that scanning future + /// input will never lead to a different result. In particular, if the + /// underlying NFA enters a match state, then this routine will return + /// `true` immediately without inspecting any future input. (Consider how + /// this might make a difference given the regex `a+` on the haystack + /// `aaaaaaaaaaaaaaa`. This routine can stop after it sees the first `a`, + /// but routines like `find` need to continue searching because `+` is + /// greedy by default.) 
+ /// + /// # Example + /// + /// This shows basic usage: + /// + /// ``` + /// use regex_automata::nfa::thompson::pikevm::PikeVM; + /// + /// let re = PikeVM::new("foo[0-9]+bar")?; + /// let mut cache = re.create_cache(); + /// + /// assert!(re.is_match(&mut cache, "foo12345bar")); + /// assert!(!re.is_match(&mut cache, "foobar")); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: consistency with search APIs + /// + /// `is_match` is guaranteed to return `true` whenever `find` returns a + /// match. This includes searches that are executed entirely within a + /// codepoint: + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Input}; + /// + /// let re = PikeVM::new("a*")?; + /// let mut cache = re.create_cache(); + /// + /// assert!(!re.is_match(&mut cache, Input::new("☃").span(1..2))); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Notice that when UTF-8 mode is disabled, then the above reports a + /// match because the restriction against zero-width matches that split a + /// codepoint has been lifted: + /// + /// ``` + /// use regex_automata::{nfa::thompson::{pikevm::PikeVM, NFA}, Input}; + /// + /// let re = PikeVM::builder() + /// .thompson(NFA::config().utf8(false)) + /// .build("a*")?; + /// let mut cache = re.create_cache(); + /// + /// assert!(re.is_match(&mut cache, Input::new("☃").span(1..2))); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn is_match<'h, I: Into<Input<'h>>>( + &self, + cache: &mut Cache, + input: I, + ) -> bool { + let input = input.into().earliest(true); + self.search_slots(cache, &input, &mut []).is_some() + } + + /// Executes a leftmost forward search and returns a `Match` if one exists. + /// + /// This routine only includes the overall match span. To get access to the + /// individual spans of each capturing group, use [`PikeVM::captures`]. + /// + /// # Example + /// + /// Leftmost first match semantics corresponds to the match with the + /// smallest starting offset, but where the end offset is determined by + /// preferring earlier branches in the original regular expression. For + /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` + /// will match `Samwise` in `Samwise`. + /// + /// Generally speaking, the "leftmost first" match is how most backtracking + /// regular expressions tend to work. This is in contrast to POSIX-style + /// regular expressions that yield "leftmost longest" matches. Namely, + /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using + /// leftmost longest semantics. (This crate does not currently support + /// leftmost longest semantics.) + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; + /// + /// let re = PikeVM::new("foo[0-9]+")?; + /// let mut cache = re.create_cache(); + /// let expected = Match::must(0, 0..8); + /// assert_eq!(Some(expected), re.find(&mut cache, "foo12345")); + /// + /// // Even though a match is found after reading the first byte (`a`), + /// // the leftmost first match semantics demand that we find the earliest + /// // match that prefers earlier parts of the pattern over later parts. 
+ /// let re = PikeVM::new("abc|a")?; + /// let mut cache = re.create_cache(); + /// let expected = Match::must(0, 0..3); + /// assert_eq!(Some(expected), re.find(&mut cache, "abc")); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn find<'h, I: Into<Input<'h>>>( + &self, + cache: &mut Cache, + input: I, + ) -> Option<Match> { + let input = input.into(); + if self.get_nfa().pattern_len() == 1 { + let mut slots = [None, None]; + let pid = self.search_slots(cache, &input, &mut slots)?; + let start = slots[0]?.get(); + let end = slots[1]?.get(); + return Some(Match::new(pid, Span { start, end })); + } + let ginfo = self.get_nfa().group_info(); + let slots_len = ginfo.implicit_slot_len(); + let mut slots = vec![None; slots_len]; + let pid = self.search_slots(cache, &input, &mut slots)?; + let start = slots[pid.as_usize() * 2]?.get(); + let end = slots[pid.as_usize() * 2 + 1]?.get(); + Some(Match::new(pid, Span { start, end })) + } + + /// Executes a leftmost forward search and writes the spans of capturing + /// groups that participated in a match into the provided [`Captures`] + /// value. If no match was found, then [`Captures::is_match`] is guaranteed + /// to return `false`. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; + /// + /// let re = PikeVM::new(r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// + /// re.captures(&mut cache, "2010-03-14", &mut caps); + /// assert!(caps.is_match()); + /// assert_eq!(Some(Span::from(0..4)), caps.get_group(1)); + /// assert_eq!(Some(Span::from(5..7)), caps.get_group(2)); + /// assert_eq!(Some(Span::from(8..10)), caps.get_group(3)); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn captures<'h, I: Into<Input<'h>>>( + &self, + cache: &mut Cache, + input: I, + caps: &mut Captures, + ) { + self.search(cache, &input.into(), caps) + } + + /// Returns an iterator over all non-overlapping leftmost matches in the + /// given bytes. If no match exists, then the iterator yields no elements. + /// + /// # Example + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; + /// + /// let re = PikeVM::new("foo[0-9]+")?; + /// let mut cache = re.create_cache(); + /// + /// let text = "foo1 foo12 foo123"; + /// let matches: Vec<Match> = re.find_iter(&mut cache, text).collect(); + /// assert_eq!(matches, vec![ + /// Match::must(0, 0..4), + /// Match::must(0, 5..10), + /// Match::must(0, 11..17), + /// ]); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn find_iter<'r, 'c, 'h, I: Into<Input<'h>>>( + &'r self, + cache: &'c mut Cache, + input: I, + ) -> FindMatches<'r, 'c, 'h> { + let caps = Captures::matches(self.get_nfa().group_info().clone()); + let it = iter::Searcher::new(input.into()); + FindMatches { re: self, cache, caps, it } + } + + /// Returns an iterator over all non-overlapping `Captures` values. If no + /// match exists, then the iterator yields no elements. + /// + /// This yields the same matches as [`PikeVM::find_iter`], but it includes + /// the spans of all capturing groups that participate in each match. + /// + /// **Tip:** See [`util::iter::Searcher`](crate::util::iter::Searcher) for + /// how to correctly iterate over all matches in a haystack while avoiding + /// the creation of a new `Captures` value for every match. (Which you are + /// forced to do with an `Iterator`.) 
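The tip above can be made concrete: driving `util::iter::Searcher` by hand lets one `Captures` value be reused for the whole iteration. A sketch, modeled on the iterator implementations that appear later in this file (the closure passed to `advance` mirrors their use of `search`):

```
use regex_automata::{nfa::thompson::pikevm::PikeVM, util::iter::Searcher, Input};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let re = PikeVM::new(r"foo(?P<numbers>[0-9]+)")?;
    let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
    let mut it = Searcher::new(Input::new("foo1 foo12 foo123"));
    // Unlike 'captures_iter', which must hand out a fresh 'Captures' for
    // every match it yields, this loop reuses a single allocation.
    while let Some(m) = it.advance(|input| {
        re.search(&mut cache, input, &mut caps);
        Ok(caps.get_match())
    }) {
        println!("{:?}: numbers={:?}", m, caps.get_group_by_name("numbers"));
    }
    Ok(())
}
```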
+ /// + /// # Example + /// + /// ``` + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; + /// + /// let re = PikeVM::new("foo(?P<numbers>[0-9]+)")?; + /// let mut cache = re.create_cache(); + /// + /// let text = "foo1 foo12 foo123"; + /// let matches: Vec<Span> = re + /// .captures_iter(&mut cache, text) + /// // The unwrap is OK since 'numbers' matches if the pattern matches. + /// .map(|caps| caps.get_group_by_name("numbers").unwrap()) + /// .collect(); + /// assert_eq!(matches, vec![ + /// Span::from(3..4), + /// Span::from(8..10), + /// Span::from(14..17), + /// ]); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn captures_iter<'r, 'c, 'h, I: Into<Input<'h>>>( &'r self, cache: &'c mut Cache, - haystack: &'t [u8], - ) -> FindLeftmostMatches<'r, 'c, 't> { - FindLeftmostMatches::new(self, cache, haystack) - } - - // BREADCRUMBS: - // - // 1) Don't forget about prefilters. - // - // 2) Consider the case of using a PikeVM with an NFA that has Capture - // states, but where we don't want to track capturing groups (other than - // group 0). This potentially saves a lot of copying around and what not. I - // believe the current regex crate does this, for example. The interesting - // bit here is how to handle the case of multiple patterns... - // - // 3) Permit the caller to specify a pattern ID to run an anchored-only - // search on. - // - // 4) How to do overlapping? The way multi-regex support works in the regex - // crate currently is to run the PikeVM until either we reach the end of - // the haystack or when we know all regexes have matched. The latter case - // is probably quite rare, so the common case is likely that we're always - // searching the entire input. The question is: can we emulate that with - // our typical 'overlapping' APIs on DFAs? I believe we can. If so, then - // all we need to do is provide an overlapping API on the PikeVM that - // roughly matches the ones we provide on DFAs. For those APIs, the only - // thing they need over non-overlapping APIs is "caller state." For DFAs, - // the caller state is simple: it contains the last state visited and the - // last match reported. For the PikeVM (and NFAs in general), the "last - // state" is actually a *set* of NFA states. So I think what happens here - // is that we can just force the `Cache` to subsume this role. We'll still - // need some additional state to track the last match reported though. - // Because when two or more patterns match at the same location, we need a - // way to know to iterate over them. Although maybe it's not match index we - // need, but the state index of the last NFA state processed in the cache. - // Then we just pick up where we left off. There might be another match - // state, in which case, we report it. - - pub fn find_leftmost_at( + input: I, + ) -> CapturesMatches<'r, 'c, 'h> { + let caps = self.create_captures(); + let it = iter::Searcher::new(input.into()); + CapturesMatches { re: self, cache, caps, it } + } +} + +impl PikeVM { + /// Executes a leftmost forward search and writes the spans of capturing + /// groups that participated in a match into the provided [`Captures`] + /// value. If no match was found, then [`Captures::is_match`] is guaranteed + /// to return `false`. + /// + /// This is like [`PikeVM::captures`], but it accepts a concrete `&Input` + /// instead of an `Into<Input>`. 
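A brief sketch of that equivalence; both calls below perform the same search, and only the construction of the `Input` differs:

```
use regex_automata::{nfa::thompson::pikevm::PikeVM, Input};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let re = PikeVM::new("[0-9]{4}")?;
    let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
    // 'captures' builds the 'Input' for you from anything implementing
    // 'Into<Input>'...
    re.captures(&mut cache, "year: 2010", &mut caps);
    assert!(caps.is_match());
    // ... while 'search' takes a concrete '&Input' you've already built.
    re.search(&mut cache, &Input::new("year: 2010"), &mut caps);
    assert!(caps.is_match());
    Ok(())
}
```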
+ /// + /// # Example: specific pattern search + /// + /// This example shows how to build a multi-PikeVM that permits searching + /// for specific patterns. + /// + /// ``` + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// Anchored, Match, PatternID, Input, + /// }; + /// + /// let re = PikeVM::new_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let haystack = "foo123"; + /// + /// // Since we are using the default leftmost-first match and both + /// // patterns match at the same starting position, only the first pattern + /// // will be returned in this case when doing a search for any of the + /// // patterns. + /// let expected = Some(Match::must(0, 0..6)); + /// re.search(&mut cache, &Input::new(haystack), &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// // But if we want to check whether some other pattern matches, then we + /// // can provide its pattern ID. + /// let expected = Some(Match::must(1, 0..6)); + /// let input = Input::new(haystack) + /// .anchored(Anchored::Pattern(PatternID::must(1))); + /// re.search(&mut cache, &input, &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// # Example: specifying the bounds of a search + /// + /// This example shows how providing the bounds of a search can produce + /// different results than simply sub-slicing the haystack. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match, Input}; + /// + /// let re = PikeVM::new(r"\b[0-9]{3}\b")?; + /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); + /// let haystack = "foo123bar"; + /// + /// // Since we sub-slice the haystack, the search doesn't know about + /// // the larger context and assumes that `123` is surrounded by word + /// // boundaries. And of course, the match position is reported relative + /// // to the sub-slice as well, which means we get `0..3` instead of + /// // `3..6`. + /// let expected = Some(Match::must(0, 0..3)); + /// re.search(&mut cache, &Input::new(&haystack[3..6]), &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// // But if we provide the bounds of the search within the context of the + /// // entire haystack, then the search can take the surrounding context + /// // into account. (And if we did find a match, it would be reported + /// // as a valid offset into `haystack` instead of its sub-slice.) 
+ /// let expected = None; + /// let input = Input::new(haystack).range(3..6); + /// re.search(&mut cache, &input, &mut caps); + /// assert_eq!(expected, caps.get_match()); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn search( &self, cache: &mut Cache, - haystack: &[u8], - start: usize, - end: usize, + input: &Input<'_>, caps: &mut Captures, - ) -> Option<MultiMatch> { - let anchored = - self.config.get_anchored() || self.nfa.is_always_start_anchored(); - let mut at = start; - let mut matched_pid = None; - cache.clear(); - 'LOOP: loop { - if cache.clist.set.is_empty() { - if matched_pid.is_some() || (anchored && at > start) { - break 'LOOP; + ) { + caps.set_pattern(None); + let pid = self.search_slots(cache, input, caps.slots_mut()); + caps.set_pattern(pid); + } + + /// Executes a leftmost forward search and writes the spans of capturing + /// groups that participated in a match into the provided `slots`, and + /// returns the matching pattern ID. The contents of the slots for patterns + /// other than the matching pattern are unspecified. If no match was found, + /// then `None` is returned and the contents of `slots` is unspecified. + /// + /// This is like [`PikeVM::search`], but it accepts a raw slots slice + /// instead of a `Captures` value. This is useful in contexts where you + /// don't want or need to allocate a `Captures`. + /// + /// It is legal to pass _any_ number of slots to this routine. If the regex + /// engine would otherwise write a slot offset that doesn't fit in the + /// provided slice, then it is simply skipped. In general though, there are + /// usually three slice lengths you might want to use: + /// + /// * An empty slice, if you only care about which pattern matched. + /// * A slice with + /// [`pattern_len() * 2`](crate::nfa::thompson::NFA::pattern_len) + /// slots, if you only care about the overall match spans for each matching + /// pattern. + /// * A slice with + /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which + /// permits recording match offsets for every capturing group in every + /// pattern. + /// + /// # Example + /// + /// This example shows how to find the overall match offsets in a + /// multi-pattern search without allocating a `Captures` value. Indeed, we + /// can put our slots right on the stack. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID, Input}; + /// + /// let re = PikeVM::new_many(&[ + /// r"\pL+", + /// r"\d+", + /// ])?; + /// let mut cache = re.create_cache(); + /// let input = Input::new("!@#123"); + /// + /// // We only care about the overall match offsets here, so we just + /// // allocate two slots for each pattern. Each slot records the start + /// // and end of the match. + /// let mut slots = [None; 4]; + /// let pid = re.search_slots(&mut cache, &input, &mut slots); + /// assert_eq!(Some(PatternID::must(1)), pid); + /// + /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'. + /// // See 'GroupInfo' for more details on the mapping between groups and + /// // slot indices. 
+    /// let slot_start = pid.unwrap().as_usize() * 2;
+    /// let slot_end = slot_start + 1;
+    /// assert_eq!(Some(3), slots[slot_start].map(|s| s.get()));
+    /// assert_eq!(Some(6), slots[slot_end].map(|s| s.get()));
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    #[inline]
+    pub fn search_slots(
+        &self,
+        cache: &mut Cache,
+        input: &Input<'_>,
+        slots: &mut [Option<NonMaxUsize>],
+    ) -> Option<PatternID> {
+        let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
+        if !utf8empty {
+            let hm = self.search_slots_imp(cache, input, slots)?;
+            return Some(hm.pattern());
+        }
+        // There is an unfortunate special case where if the regex can
+        // match the empty string and UTF-8 mode is enabled, the search
+        // implementation requires that the slots have at least enough space
+        // to report the bounds of any match. This is so zero-width matches
+        // that split a codepoint can be filtered out.
+        //
+        // Note that if utf8empty is true, we specialize the case for when
+        // the number of patterns is 1. In that case, we can just use a stack
+        // allocation. Otherwise we resort to a heap allocation, which we
+        // convince ourselves we're fine with due to the pathological nature of
+        // this case.
+        let min = self.get_nfa().group_info().implicit_slot_len();
+        if slots.len() >= min {
+            let hm = self.search_slots_imp(cache, input, slots)?;
+            return Some(hm.pattern());
+        }
+        if self.get_nfa().pattern_len() == 1 {
+            let mut enough = [None, None];
+            let got = self.search_slots_imp(cache, input, &mut enough);
+            // This is OK because we know `enough` is strictly bigger than
+            // `slots`, otherwise this special case isn't reached.
+            slots.copy_from_slice(&enough[..slots.len()]);
+            return got.map(|hm| hm.pattern());
+        }
+        let mut enough = vec![None; min];
+        let got = self.search_slots_imp(cache, input, &mut enough);
+        // This is OK because we know `enough` is strictly bigger than `slots`,
+        // otherwise this special case isn't reached.
+        slots.copy_from_slice(&enough[..slots.len()]);
+        got.map(|hm| hm.pattern())
+    }
+
+    /// This is the actual implementation of `search_slots` that
+    /// doesn't account for the special case when 1) the NFA has UTF-8 mode
+    /// enabled, 2) the NFA can match the empty string and 3) the caller has
+    /// provided an insufficient number of slots to record match offsets.
+    #[inline(never)]
+    fn search_slots_imp(
+        &self,
+        cache: &mut Cache,
+        input: &Input<'_>,
+        slots: &mut [Option<NonMaxUsize>],
+    ) -> Option<HalfMatch> {
+        let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
+        let hm = match self.search_imp(cache, input, slots) {
+            None => return None,
+            Some(hm) if !utf8empty => return Some(hm),
+            Some(hm) => hm,
+        };
+        empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
+            Ok(self
+                .search_imp(cache, input, slots)
+                .map(|hm| (hm, hm.offset())))
+        })
+        // OK because the PikeVM never errors.
+        .unwrap()
+    }
+
+    /// Writes the set of patterns that match anywhere in the given search
+    /// configuration to `patset`. If multiple patterns match at the same
+    /// position and this `PikeVM` was configured with [`MatchKind::All`]
+    /// semantics, then all matching patterns are written to the given set.
+    ///
+    /// Unless all of the patterns in this `PikeVM` are anchored, this will,
+    /// generally speaking, visit every byte in the haystack.
+    ///
+    /// This search routine *does not* clear the pattern set.
This gives some + /// flexibility to the caller (e.g., running multiple searches with the + /// same pattern set), but does make the API bug-prone if you're reusing + /// the same pattern set for multiple searches but intended them to be + /// independent. + /// + /// If a pattern ID matched but the given `PatternSet` does not have + /// sufficient capacity to store it, then it is not inserted and silently + /// dropped. + /// + /// # Example + /// + /// This example shows how to find all matching patterns in a haystack, + /// even when some patterns match at the same position as other patterns. + /// + /// ``` + /// # if cfg!(miri) { return Ok(()); } // miri takes too long + /// use regex_automata::{ + /// nfa::thompson::pikevm::PikeVM, + /// Input, MatchKind, PatternSet, + /// }; + /// + /// let patterns = &[ + /// r"\w+", r"\d+", r"\pL+", r"foo", r"bar", r"barfoo", r"foobar", + /// ]; + /// let re = PikeVM::builder() + /// .configure(PikeVM::config().match_kind(MatchKind::All)) + /// .build_many(patterns)?; + /// let mut cache = re.create_cache(); + /// + /// let input = Input::new("foobar"); + /// let mut patset = PatternSet::new(re.pattern_len()); + /// re.which_overlapping_matches(&mut cache, &input, &mut patset); + /// let expected = vec![0, 2, 3, 4, 6]; + /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect(); + /// assert_eq!(expected, got); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn which_overlapping_matches( + &self, + cache: &mut Cache, + input: &Input<'_>, + patset: &mut PatternSet, + ) { + self.which_overlapping_imp(cache, input, patset) + } +} + +impl PikeVM { + /// The implementation of standard leftmost search. + /// + /// Capturing group spans are written to `slots`, but only if requested. + /// `slots` can be any length. Any slot in the NFA that is activated but + /// which is out of bounds for the given `slots` is ignored. + fn search_imp( + &self, + cache: &mut Cache, + input: &Input<'_>, + slots: &mut [Option<NonMaxUsize>], + ) -> Option<HalfMatch> { + cache.setup_search(slots.len()); + if input.is_done() { + return None; + } + // Why do we even care about this? Well, in our 'Captures' + // representation, we use usize::MAX as a sentinel to indicate "no + // match." This isn't problematic so long as our haystack doesn't have + // a maximal length. Byte slices are guaranteed by Rust to have a + // length that fits into isize, and so this assert should always pass. + // But we put it here to make our assumption explicit. + assert!( + input.haystack().len() < core::usize::MAX, + "byte slice lengths must be less than usize MAX", + ); + instrument!(|c| c.reset(&self.nfa)); + + // Whether we want to visit all match states instead of emulating the + // 'leftmost' semantics of typical backtracking regex engines. + let allmatches = + self.config.get_match_kind().continue_past_first_match(); + let (anchored, start_id) = match self.start_config(input) { + None => return None, + Some(config) => config, + }; + + let pre = + if anchored { None } else { self.get_config().get_prefilter() }; + let Cache { ref mut stack, ref mut curr, ref mut next } = cache; + let mut hm = None; + // Yes, our search doesn't end at input.end(), but includes it. This + // is necessary because matches are delayed by one byte, just like + // how the DFA engines work. The delay is used to handle look-behind + // assertions. 
In the case of the PikeVM, the delay is implemented
+        // by not considering a match to exist until it is visited in
+        // 'nexts'. Technically, we know a match exists in the previous
+        // iteration via 'epsilon_closure'. (It's the same thing in NFA-to-DFA
+        // determinization. We don't mark a DFA state as a match state if it
+        // contains an NFA match state, but rather, when the DFA state was
+        // generated by a transition from a DFA state that contains an NFA
+        // match state.)
+        let mut at = input.start();
+        while at <= input.end() {
+            // If we have no states left to visit, then there are some cases
+            // where we know we can quit early or even skip ahead.
+            if curr.set.is_empty() {
+                // We have a match and we haven't been instructed to continue
+                // on even after finding a match, so we can quit.
+                if hm.is_some() && !allmatches {
+                    break;
+                }
+                // If we're running an anchored search and we've advanced
+                // beyond the start position with no other states to try, then
+                // we will never observe a match and thus can stop.
+                if anchored && at > input.start() {
+                    break;
+                }
+                // If there are no states left to explore at this position and
+                // we know we can't terminate early, then we are effectively at
+                // the starting state of the NFA. If we fell through here,
+                // we'd end up adding our '(?s-u:.)*?' prefix and it would be
+                // the only thing in 'curr'. So we might as well just skip
+                // ahead until we find something that we know might advance us
+                // forward.
+                if let Some(ref pre) = pre {
+                    let span = Span::from(at..input.end());
+                    match pre.find(input.haystack(), span) {
+                        None => break,
+                        Some(ref span) => at = span.start,
+                    }
                 }
-                // TODO: prefilter
             }
-            if (!anchored && matched_pid.is_none())
-                || cache.clist.set.is_empty()
+            // Instead of using the NFA's unanchored start state, we actually
+            // always use its anchored starting state. As a result, when doing
+            // an unanchored search, we need to simulate our own '(?s-u:.)*?'
+            // prefix, to permit a match to appear anywhere.
+            //
+            // Now, we don't *have* to do things this way. We could use the
+            // NFA's unanchored starting state and do one 'epsilon_closure'
+            // call from that starting state before the main loop here. And
+            // that is just as correct. However, it turns out to be slower
+            // than our approach here because it slightly increases the cost
+            // of processing each byte by requiring us to visit more NFA
+            // states to deal with the additional NFA states in the unanchored
+            // prefix. By simulating it explicitly here, we lower those costs
+            // substantially. The cost is itself small, but it adds up for
+            // large haystacks.
+            //
+            // In order to simulate the '(?s-u:.)*?' prefix---which is not
+            // greedy---we are careful not to perform an epsilon closure on
+            // the start state if we already have a match. Namely, if we
+            // did otherwise, we would never reach a terminating condition
+            // because there would always be additional states to process.
+            // In effect, the exclusion of running 'epsilon_closure' when
+            // we have a match corresponds to the "dead" states we have in
+            // our DFA regex engines. Namely, in a DFA, match states merely
+            // instruct the search execution to record the current offset as
+            // the most recently seen match. It is the dead state that actually
+            // indicates when to stop the search (other than EOF or quit
+            // states).
+            //
+            // However, when 'allmatches' is true, the caller has asked us to
+            // leave in every possible match state.
This tends not to make a + // whole lot of sense in unanchored searches, because it means the + // search really cannot terminate until EOF. And often, in that + // case, you wind up skipping over a bunch of matches and are left + // with the "last" match. Arguably, it just doesn't make a lot of + // sense to run a 'leftmost' search (which is what this routine is) + // with 'allmatches' set to true. But the DFAs support it and this + // matches their behavior. (Generally, 'allmatches' is useful for + // overlapping searches or leftmost anchored searches to find the + // longest possible match by ignoring match priority.) + // + // Additionally, when we're running an anchored search, this + // epsilon closure should only be computed at the beginning of the + // search. If we re-computed it at every position, we would be + // simulating an unanchored search when we were tasked to perform + // an anchored search. + if (!hm.is_some() || allmatches) + && (!anchored || at == input.start()) { - self.epsilon_closure( - &mut cache.clist, - &mut caps.slots, - &mut cache.stack, - self.nfa.start_anchored(), - haystack, - at, - ); + // Since we are adding to the 'curr' active states and since + // this is for the start ID, we use a slots slice that is + // guaranteed to have the right length but where every element + // is absent. This is exactly what we want, because this + // epsilon closure is responsible for simulating an unanchored + // '(?s:.)*?' prefix. It is specifically outside of any + // capturing groups, and thus, using slots that are always + // absent is correct. + // + // Note though that we can't just use '&mut []' here, since + // this epsilon closure may traverse through 'Captures' epsilon + // transitions, and thus must be able to write offsets to the + // slots given which are later copied to slot values in 'curr'. + let slots = next.slot_table.all_absent(); + self.epsilon_closure(stack, slots, curr, input, at, start_id); } - for i in 0..cache.clist.set.len() { - let sid = cache.clist.set.get(i); - let pid = match self.step( - &mut cache.nlist, - &mut caps.slots, - cache.clist.caps(sid), - &mut cache.stack, - sid, - haystack, - at, - ) { - None => continue, - Some(pid) => pid, - }; - matched_pid = Some(pid); - break; + if let Some(pid) = self.nexts(stack, curr, next, input, at, slots) + { + hm = Some(HalfMatch::new(pid, at)); } - if at >= end { + // Unless the caller asked us to return early, we need to mush on + // to see if we can extend our match. (But note that 'nexts' will + // quit right after seeing a match when match_kind==LeftmostFirst, + // as is consistent with leftmost-first match priority.) + if input.get_earliest() && hm.is_some() { break; } + core::mem::swap(curr, next); + next.set.clear(); at += 1; - cache.swap(); - cache.nlist.set.clear(); } - matched_pid.map(|pid| { - let slots = self.nfa.pattern_slots(pid); - let (start, end) = (slots.start, slots.start + 1); - MultiMatch::new( - pid, - caps.slots[start].unwrap(), - caps.slots[end].unwrap(), - ) - }) + instrument!(|c| c.eprint(&self.nfa)); + hm + } + + /// The implementation for the 'which_overlapping_matches' API. Basically, + /// we do a single scan through the entire haystack (unless our regex + /// or search is anchored) and record every pattern that matched. In + /// particular, when MatchKind::All is used, this supports overlapping + /// matches. So if we have the regexes 'sam' and 'samwise', they will + /// *both* be reported in the pattern set when searching the haystack + /// 'samwise'. 
+    fn which_overlapping_imp(
+        &self,
+        cache: &mut Cache,
+        input: &Input<'_>,
+        patset: &mut PatternSet,
+    ) {
+        // NOTE: This is effectively a copy of 'search_imp' above, but with no
+        // captures support and instead writes patterns that matched directly
+        // to 'patset'. See that routine for better commentary about what's
+        // going on in this routine. We probably could unify the routines using
+        // generics or more helper routines, but I'm not sure it's worth it.
+        //
+        // NOTE: We somewhat go out of our way here to support things like
+        // 'input.get_earliest()' and 'leftmost-first' match semantics. Neither
+        // of those seems particularly relevant to this routine, but they are
+        // both supported by the DFA analogs of this routine by construction
+        // and composition, so it seems like good sense to have the PikeVM
+        // match that behavior.
+
+        cache.setup_search(0);
+        if input.is_done() {
+            return;
+        }
+        assert!(
+            input.haystack().len() < core::usize::MAX,
+            "byte slice lengths must be less than usize MAX",
+        );
+        instrument!(|c| c.reset(&self.nfa));
+
+        let allmatches =
+            self.config.get_match_kind().continue_past_first_match();
+        let (anchored, start_id) = match self.start_config(input) {
+            None => return,
+            Some(config) => config,
+        };
+
+        let Cache { ref mut stack, ref mut curr, ref mut next } = cache;
+        for at in input.start()..=input.end() {
+            let any_matches = !patset.is_empty();
+            if curr.set.is_empty() {
+                if any_matches && !allmatches {
+                    break;
+                }
+                if anchored && at > input.start() {
+                    break;
+                }
+            }
+            if !any_matches || allmatches {
+                let slots = &mut [];
+                self.epsilon_closure(stack, slots, curr, input, at, start_id);
+            }
+            self.nexts_overlapping(stack, curr, next, input, at, patset);
+            // If we found a match and filled our set, then there is no more
+            // additional info that we can provide. Thus, we can quit. We also
+            // quit if the caller asked us to stop at the earliest point that
+            // we know a match exists.
+            if patset.is_full() || input.get_earliest() {
+                break;
+            }
+            core::mem::swap(curr, next);
+            next.set.clear();
+        }
+        instrument!(|c| c.eprint(&self.nfa));
+    }
+
+    /// Process the active states in 'curr' to find the states (written to
+    /// 'next') we should process for the next byte in the haystack.
+    ///
+    /// 'stack' is used to perform a depth first traversal of the NFA when
+    /// computing an epsilon closure.
+    ///
+    /// When a match is found, the slots for that match state (in 'curr') are
+    /// copied to 'slots'. Moreover, once a match is seen, processing for
+    /// 'curr' stops (unless the PikeVM was configured with MatchKind::All
+    /// semantics).
+    #[cfg_attr(feature = "perf-inline", inline(always))]
+    fn nexts(
+        &self,
+        stack: &mut Vec<FollowEpsilon>,
+        curr: &mut ActiveStates,
+        next: &mut ActiveStates,
+        input: &Input<'_>,
+        at: usize,
+        slots: &mut [Option<NonMaxUsize>],
+    ) -> Option<PatternID> {
+        instrument!(|c| c.record_state_set(&curr.set));
+        let mut pid = None;
+        let ActiveStates { ref set, ref mut slot_table } = *curr;
+        for sid in set.iter() {
+            pid = match self.next(stack, slot_table, next, input, at, sid) {
+                None => continue,
+                Some(pid) => Some(pid),
+            };
+            slots.copy_from_slice(slot_table.for_state(sid));
+            if !self.config.get_match_kind().continue_past_first_match() {
+                break;
+            }
+        }
+        pid
     }
 
-    #[inline(always)]
-    fn step(
+    /// Like 'nexts', but for the overlapping case. This doesn't write any
+    /// slots, and instead just writes which pattern matched in 'patset'.
+ #[cfg_attr(feature = "perf-inline", inline(always))] + fn nexts_overlapping( &self, - nlist: &mut Threads, - slots: &mut [Slot], - thread_caps: &mut [Slot], stack: &mut Vec<FollowEpsilon>, - sid: StateID, - haystack: &[u8], + curr: &mut ActiveStates, + next: &mut ActiveStates, + input: &Input<'_>, at: usize, + patset: &mut PatternSet, + ) { + instrument!(|c| c.record_state_set(&curr.set)); + let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); + let ActiveStates { ref set, ref mut slot_table } = *curr; + for sid in set.iter() { + let pid = match self.next(stack, slot_table, next, input, at, sid) + { + None => continue, + Some(pid) => pid, + }; + // This handles the case of finding a zero-width match that splits + // a codepoint. Namely, if we're in UTF-8 mode AND we know we can + // match the empty string, then the only valid way of getting to + // this point with an offset that splits a codepoint is when we + // have an empty match. Such matches, in UTF-8 mode, must not be + // reported. So we just skip them here and pretend as if we did + // not see a match. + if utf8empty && !input.is_char_boundary(at) { + continue; + } + let _ = patset.try_insert(pid); + if !self.config.get_match_kind().continue_past_first_match() { + break; + } + } + } + + /// Starting from 'sid', if the position 'at' in the 'input' haystack has a + /// transition defined out of 'sid', then add the state transitioned to and + /// its epsilon closure to the 'next' set of states to explore. + /// + /// 'stack' is used by the epsilon closure computation to perform a depth + /// first traversal of the NFA. + /// + /// 'curr_slot_table' should be the table of slots for the current set of + /// states being explored. If there is a transition out of 'sid', then + /// sid's row in the slot table is used to perform the epsilon closure. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn next( + &self, + stack: &mut Vec<FollowEpsilon>, + curr_slot_table: &mut SlotTable, + next: &mut ActiveStates, + input: &Input<'_>, + at: usize, + sid: StateID, ) -> Option<PatternID> { + instrument!(|c| c.record_step(sid)); match *self.nfa.state(sid) { State::Fail | State::Look { .. } | State::Union { .. } + | State::BinaryUnion { .. } | State::Capture { .. } => None, - State::Range { ref range } => { - if range.matches(haystack, at) { + State::ByteRange { ref trans } => { + if trans.matches(input.haystack(), at) { + let slots = curr_slot_table.for_state(sid); + // OK because 'at <= haystack.len() < usize::MAX', so + // adding 1 will never wrap. + let at = at.wrapping_add(1); self.epsilon_closure( - nlist, - thread_caps, - stack, - range.next, - haystack, - at + 1, + stack, slots, next, input, at, trans.next, ); } None } State::Sparse(ref sparse) => { - if let Some(next) = sparse.matches(haystack, at) { + if let Some(next_sid) = sparse.matches(input.haystack(), at) { + let slots = curr_slot_table.for_state(sid); + // OK because 'at <= haystack.len() < usize::MAX', so + // adding 1 will never wrap. + let at = at.wrapping_add(1); self.epsilon_closure( - nlist, - thread_caps, - stack, - next, - haystack, - at + 1, + stack, slots, next, input, at, next_sid, ); } None } - State::Match { id } => { - slots.copy_from_slice(thread_caps); - Some(id) + State::Dense(ref dense) => { + if let Some(next_sid) = dense.matches(input.haystack(), at) { + let slots = curr_slot_table.for_state(sid); + // OK because 'at <= haystack.len() < usize::MAX', so + // adding 1 will never wrap. 
+ let at = at.wrapping_add(1); + self.epsilon_closure( + stack, slots, next, input, at, next_sid, + ); + } + None } + State::Match { pattern_id } => Some(pattern_id), } } - #[inline(always)] + /// Compute the epsilon closure of 'sid', writing the closure into 'next' + /// while copying slot values from 'curr_slots' into corresponding states + /// in 'next'. 'curr_slots' should be the slot values corresponding to + /// 'sid'. + /// + /// The given 'stack' is used to perform a depth first traversal of the + /// NFA by recursively following all epsilon transitions out of 'sid'. + /// Conditional epsilon transitions are followed if and only if they are + /// satisfied for the position 'at' in the 'input' haystack. + /// + /// While this routine may write to 'curr_slots', once it returns, any + /// writes are undone and the original values (even if absent) are + /// restored. + #[cfg_attr(feature = "perf-inline", inline(always))] fn epsilon_closure( &self, - nlist: &mut Threads, - thread_caps: &mut [Slot], stack: &mut Vec<FollowEpsilon>, - sid: StateID, - haystack: &[u8], + curr_slots: &mut [Option<NonMaxUsize>], + next: &mut ActiveStates, + input: &Input<'_>, at: usize, + sid: StateID, ) { - stack.push(FollowEpsilon::StateID(sid)); + instrument!(|c| { + c.record_closure(sid); + c.record_stack_push(sid); + }); + stack.push(FollowEpsilon::Explore(sid)); while let Some(frame) = stack.pop() { match frame { - FollowEpsilon::StateID(sid) => { - self.epsilon_closure_step( - nlist, - thread_caps, - stack, - sid, - haystack, - at, - ); + FollowEpsilon::RestoreCapture { slot, offset: pos } => { + curr_slots[slot] = pos; } - FollowEpsilon::Capture { slot, pos } => { - thread_caps[slot] = pos; + FollowEpsilon::Explore(sid) => { + self.epsilon_closure_explore( + stack, curr_slots, next, input, at, sid, + ); } } } } - #[inline(always)] - fn epsilon_closure_step( + /// Explore all of the epsilon transitions out of 'sid'. This is mostly + /// split out from 'epsilon_closure' in order to clearly delineate + /// the actual work of computing an epsilon closure from the stack + /// book-keeping. + /// + /// This will push any additional explorations needed on to 'stack'. + /// + /// 'curr_slots' should refer to the slots for the currently active NFA + /// state. That is, the current state we are stepping through. These + /// slots are mutated in place as new 'Captures' states are traversed + /// during epsilon closure, but the slots are restored to their original + /// values once the full epsilon closure is completed. The ultimate use of + /// 'curr_slots' is to copy them to the corresponding 'next_slots', so that + /// the capturing group spans are forwarded from the currently active state + /// to the next. + /// + /// 'next' refers to the next set of active states. Computing an epsilon + /// closure may increase the next set of active states. + /// + /// 'input' refers to the caller's input configuration and 'at' refers to + /// the current position in the haystack. These are used to check whether + /// conditional epsilon transitions (like look-around) are satisfied at + /// the current position. If they aren't, then the epsilon closure won't + /// include them. 
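The recursion-to-explicit-stack transformation used by these routines can be illustrated on a toy epsilon-transition table. This sketch uses simplified types rather than the crate's internals; note the reversed push order, which matches the `alternates[1..].iter().rev()` trick below and preserves depth first priority:

```
// Collects the epsilon closure of 'start' using an explicit heap stack
// instead of recursion, visiting states in depth first (priority) order.
fn epsilon_closure(epsilon: &[Vec<usize>], start: usize) -> Vec<usize> {
    let mut seen = vec![false; epsilon.len()];
    let mut closure = Vec::new();
    let mut stack = vec![start];
    while let Some(sid) = stack.pop() {
        if seen[sid] {
            continue;
        }
        seen[sid] = true;
        closure.push(sid);
        // Push alternatives in reverse so that the first (highest priority)
        // alternative is popped, and therefore explored, first.
        for &next in epsilon[sid].iter().rev() {
            stack.push(next);
        }
    }
    closure
}

fn main() {
    // A tiny union: state 0 has epsilon transitions to states 1 and 2.
    let epsilon = vec![vec![1, 2], vec![], vec![]];
    assert_eq!(vec![0, 1, 2], epsilon_closure(&epsilon, 0));
}
```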
+ #[cfg_attr(feature = "perf-inline", inline(always))] + fn epsilon_closure_explore( &self, - nlist: &mut Threads, - thread_caps: &mut [Slot], stack: &mut Vec<FollowEpsilon>, - mut sid: StateID, - haystack: &[u8], + curr_slots: &mut [Option<NonMaxUsize>], + next: &mut ActiveStates, + input: &Input<'_>, at: usize, + mut sid: StateID, ) { + // We can avoid pushing some state IDs on to our stack in precisely + // the cases where a 'push(x)' would be immediately followed by a 'x + // = pop()'. This is achieved by this outer-loop. We simply set 'sid' + // to be the next state ID we want to explore once we're done with + // our initial exploration. In practice, this avoids a lot of stack + // thrashing. loop { - if !nlist.set.insert(sid) { + instrument!(|c| c.record_set_insert(sid)); + // Record this state as part of our next set of active states. If + // we've already explored it, then no need to do it again. + if !next.set.insert(sid) { return; } match *self.nfa.state(sid) { State::Fail - | State::Range { .. } + | State::Match { .. } + | State::ByteRange { .. } | State::Sparse { .. } - | State::Match { .. } => { - let t = &mut nlist.caps(sid); - t.copy_from_slice(thread_caps); + | State::Dense { .. } => { + next.slot_table.for_state(sid).copy_from_slice(curr_slots); return; } State::Look { look, next } => { - if !look.matches(haystack, at) { + // OK because we don't permit building a searcher with a + // Unicode word boundary if the requisite Unicode data is + // unavailable. + if !self.nfa.look_matcher().matches_inline( + look, + input.haystack(), + at, + ) { return; } sid = next; @@ -381,174 +1710,650 @@ impl PikeVM { None => return, Some(&sid) => sid, }; + instrument!(|c| { + for &alt in &alternates[1..] { + c.record_stack_push(alt); + } + }); stack.extend( alternates[1..] .iter() .copied() .rev() - .map(FollowEpsilon::StateID), + .map(FollowEpsilon::Explore), ); } - State::Capture { next, slot } => { - if slot < thread_caps.len() { - stack.push(FollowEpsilon::Capture { + State::BinaryUnion { alt1, alt2 } => { + sid = alt1; + instrument!(|c| c.record_stack_push(sid)); + stack.push(FollowEpsilon::Explore(alt2)); + } + State::Capture { next, slot, .. } => { + // There's no need to do anything with slots that + // ultimately won't be copied into the caller-provided + // 'Captures' value. So we just skip dealing with them at + // all. + if slot.as_usize() < curr_slots.len() { + instrument!(|c| c.record_stack_push(sid)); + stack.push(FollowEpsilon::RestoreCapture { slot, - pos: thread_caps[slot], + offset: curr_slots[slot], }); - thread_caps[slot] = Some(at); + // OK because length of a slice must fit into an isize. + curr_slots[slot] = Some(NonMaxUsize::new(at).unwrap()); } sid = next; } } } } + + /// Return the starting configuration of a PikeVM search. + /// + /// The "start config" is basically whether the search should be anchored + /// or not and the NFA state ID at which to begin the search. The state ID + /// returned always corresponds to an anchored starting state even when the + /// search is unanchored. This is because the PikeVM search loop deals with + /// unanchored searches with an explicit epsilon closure out of the start + /// state. + /// + /// This routine accounts for both the caller's `Input` configuration + /// and the pattern itself. For example, even if the caller asks for an + /// unanchored search, if the pattern itself is anchored, then this will + /// always return 'true' because implementing an unanchored search in that + /// case would be incorrect. 
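A short sketch of how the anchoring decision described above surfaces through the public API (`Anchored` comes from the crate root, as in the earlier examples in this file):

```
use regex_automata::{nfa::thompson::pikevm::PikeVM, Anchored, Input, Match};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let re = PikeVM::new("[0-9]+")?;
    let mut cache = re.create_cache();
    // Unanchored (the default): the simulated '(?s-u:.)*?' prefix lets the
    // match begin anywhere in the haystack.
    assert_eq!(Some(Match::must(0, 3..5)), re.find(&mut cache, "abc12"));
    // Anchored: the match must begin at the start of the search span, so
    // there is no match here.
    let input = Input::new("abc12").anchored(Anchored::Yes);
    assert_eq!(None, re.find(&mut cache, input));
    Ok(())
}
```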
+ /// + /// Similarly, if the caller requests an anchored search for a particular + /// pattern, then the starting state ID returned will reflect that. + /// + /// If a pattern ID is given in the input configuration that is not in + /// this regex, then `None` is returned. + fn start_config(&self, input: &Input<'_>) -> Option<(bool, StateID)> { + match input.get_anchored() { + // Only way we're unanchored is if both the caller asked for an + // unanchored search *and* the pattern is itself not anchored. + Anchored::No => Some(( + self.nfa.is_always_start_anchored(), + self.nfa.start_anchored(), + )), + Anchored::Yes => Some((true, self.nfa.start_anchored())), + Anchored::Pattern(pid) => { + Some((true, self.nfa.start_pattern(pid)?)) + } + } + } } -/// An iterator over all non-overlapping leftmost matches for a particular -/// infallible search. +/// An iterator over all non-overlapping matches for a particular search. /// -/// The iterator yields a [`MultiMatch`] value until no more matches could be -/// found. If the underlying search returns an error, then this panics. +/// The iterator yields a [`Match`] value until no more matches could be found. /// -/// The lifetime variables are as follows: +/// The lifetime parameters are as follows: /// -/// * `'r` is the lifetime of the regular expression itself. -/// * `'c` is the lifetime of the mutable cache used during search. -/// * `'t` is the lifetime of the text being searched. +/// * `'r` represents the lifetime of the PikeVM. +/// * `'c` represents the lifetime of the PikeVM's cache. +/// * `'h` represents the lifetime of the haystack being searched. +/// +/// This iterator can be created with the [`PikeVM::find_iter`] method. #[derive(Debug)] -pub struct FindLeftmostMatches<'r, 'c, 't> { - vm: &'r PikeVM, +pub struct FindMatches<'r, 'c, 'h> { + re: &'r PikeVM, cache: &'c mut Cache, - // scanner: Option<prefilter::Scanner<'r>>, - text: &'t [u8], - last_end: usize, - last_match: Option<usize>, + caps: Captures, + it: iter::Searcher<'h>, } -impl<'r, 'c, 't> FindLeftmostMatches<'r, 'c, 't> { - fn new( - vm: &'r PikeVM, - cache: &'c mut Cache, - text: &'t [u8], - ) -> FindLeftmostMatches<'r, 'c, 't> { - FindLeftmostMatches { vm, cache, text, last_end: 0, last_match: None } +impl<'r, 'c, 'h> Iterator for FindMatches<'r, 'c, 'h> { + type Item = Match; + + #[inline] + fn next(&mut self) -> Option<Match> { + // Splitting 'self' apart seems necessary to appease borrowck. + let FindMatches { re, ref mut cache, ref mut caps, ref mut it } = + *self; + // 'advance' converts errors into panics, which is OK here because + // the PikeVM can never return an error. + it.advance(|input| { + re.search(cache, input, caps); + Ok(caps.get_match()) + }) } } -impl<'r, 'c, 't> Iterator for FindLeftmostMatches<'r, 'c, 't> { - // type Item = Captures; - type Item = MultiMatch; +/// An iterator over all non-overlapping leftmost matches, with their capturing +/// groups, for a particular search. +/// +/// The iterator yields a [`Captures`] value until no more matches could be +/// found. +/// +/// The lifetime parameters are as follows: +/// +/// * `'r` represents the lifetime of the PikeVM. +/// * `'c` represents the lifetime of the PikeVM's cache. +/// * `'h` represents the lifetime of the haystack being searched. +/// +/// This iterator can be created with the [`PikeVM::captures_iter`] method. 
+#[derive(Debug)] +pub struct CapturesMatches<'r, 'c, 'h> { + re: &'r PikeVM, + cache: &'c mut Cache, + caps: Captures, + it: iter::Searcher<'h>, +} - // fn next(&mut self) -> Option<Captures> { - fn next(&mut self) -> Option<MultiMatch> { - if self.last_end > self.text.len() { - return None; - } - let mut caps = self.vm.create_captures(); - let m = self.vm.find_leftmost_at( - self.cache, - self.text, - self.last_end, - self.text.len(), - &mut caps, - )?; - if m.is_empty() { - // This is an empty match. To ensure we make progress, start - // the next search at the smallest possible starting position - // of the next match following this one. - self.last_end = if self.vm.config.get_utf8() { - crate::util::next_utf8(self.text, m.end()) - } else { - m.end() + 1 - }; - // Don't accept empty matches immediately following a match. - // Just move on to the next match. - if Some(m.end()) == self.last_match { - return self.next(); - } +impl<'r, 'c, 'h> Iterator for CapturesMatches<'r, 'c, 'h> { + type Item = Captures; + + #[inline] + fn next(&mut self) -> Option<Captures> { + // Splitting 'self' apart seems necessary to appease borrowck. + let CapturesMatches { re, ref mut cache, ref mut caps, ref mut it } = + *self; + // 'advance' converts errors into panics, which is OK here because + // the PikeVM can never return an error. + it.advance(|input| { + re.search(cache, input, caps); + Ok(caps.get_match()) + }); + if caps.is_match() { + Some(caps.clone()) } else { - self.last_end = m.end(); + None } - self.last_match = Some(m.end()); - Some(m) } } +/// A cache represents mutable state that a [`PikeVM`] requires during a +/// search. +/// +/// For a given [`PikeVM`], its corresponding cache may be created either via +/// [`PikeVM::create_cache`], or via [`Cache::new`]. They are equivalent in +/// every way, except the former does not require explicitly importing `Cache`. +/// +/// A particular `Cache` is coupled with the [`PikeVM`] from which it +/// was created. It may only be used with that `PikeVM`. A cache and its +/// allocations may be re-purposed via [`Cache::reset`], in which case, it can +/// only be used with the new `PikeVM` (and not the old one). #[derive(Clone, Debug)] -pub struct Captures { - slots: Vec<Slot>, +pub struct Cache { + /// Stack used while computing epsilon closure. This effectively lets us + /// move what is more naturally expressed through recursion to a stack + /// on the heap. + stack: Vec<FollowEpsilon>, + /// The current active states being explored for the current byte in the + /// haystack. + curr: ActiveStates, + /// The next set of states we're building that will be explored for the + /// next byte in the haystack. + next: ActiveStates, } -impl Captures { - pub fn new(nfa: &NFA) -> Captures { - Captures { slots: vec![None; nfa.capture_slot_len()] } +impl Cache { + /// Create a new [`PikeVM`] cache. + /// + /// A potentially more convenient routine to create a cache is + /// [`PikeVM::create_cache`], as it does not require also importing the + /// `Cache` type. + /// + /// If you want to reuse the returned `Cache` with some other `PikeVM`, + /// then you must call [`Cache::reset`] with the desired `PikeVM`. + pub fn new(re: &PikeVM) -> Cache { + Cache { + stack: vec![], + curr: ActiveStates::new(re), + next: ActiveStates::new(re), + } + } + + /// Reset this cache such that it can be used for searching with a + /// different [`PikeVM`]. + /// + /// A cache reset permits reusing memory already allocated in this cache + /// with a different `PikeVM`. 
+    ///
+    /// # Example
+    ///
+    /// This shows how to re-purpose a cache for use with a different `PikeVM`.
+    ///
+    /// ```
+    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+    /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
+    ///
+    /// let re1 = PikeVM::new(r"\w")?;
+    /// let re2 = PikeVM::new(r"\W")?;
+    ///
+    /// let mut cache = re1.create_cache();
+    /// assert_eq!(
+    ///     Some(Match::must(0, 0..2)),
+    ///     re1.find_iter(&mut cache, "Δ").next(),
+    /// );
+    ///
+    /// // Using 'cache' with re2 is not allowed. It may result in panics or
+    /// // incorrect results. In order to re-purpose the cache, we must reset
+    /// // it with the PikeVM we'd like to use it with.
+    /// //
+    /// // Similarly, after this reset, using the cache with 're1' is also not
+    /// // allowed.
+    /// cache.reset(&re2);
+    /// assert_eq!(
+    ///     Some(Match::must(0, 0..3)),
+    ///     re2.find_iter(&mut cache, "☃").next(),
+    /// );
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn reset(&mut self, re: &PikeVM) {
+        self.curr.reset(re);
+        self.next.reset(re);
+    }
+
+    /// Returns the heap memory usage, in bytes, of this cache.
+    ///
+    /// This does **not** include the stack size used up by this cache. To
+    /// compute that, use `std::mem::size_of::<Cache>()`.
+    pub fn memory_usage(&self) -> usize {
+        use core::mem::size_of;
+        (self.stack.len() * size_of::<FollowEpsilon>())
+            + self.curr.memory_usage()
+            + self.next.memory_usage()
+    }
+
+    /// Clears this cache. This should be called at the start of every search
+    /// to ensure we start with a clean slate.
+    ///
+    /// This also sets the number of capturing group slots used in the current
+    /// search. This permits an optimization whereby 'SlotTable::for_state'
+    /// only returns the number of slots equivalent to the number of slots
+    /// given in the 'Captures' value. This may be less than the total number
+    /// of possible slots, e.g., when one only wants to track overall match
+    /// offsets. This in turn permits less copying of capturing group spans
+    /// in the PikeVM.
+    fn setup_search(&mut self, captures_slot_len: usize) {
+        self.stack.clear();
+        self.curr.setup_search(captures_slot_len);
+        self.next.setup_search(captures_slot_len);
     }
 }
 
+/// A set of active states used to "simulate" the execution of an NFA via the
+/// PikeVM.
+///
+/// There are two sets of these used during NFA simulation. One set corresponds
+/// to the "current" set of states being traversed for the current position
+/// in a haystack. The other set corresponds to the "next" set of states being
+/// built, which will become the new "current" set for the next position in the
+/// haystack. These two sets correspond to CLIST and NLIST in Thompson's
+/// original paper on regexes: https://dl.acm.org/doi/pdf/10.1145/363347.363387
+///
+/// In addition to representing a set of NFA states, this also maintains slot
+/// values for each state. These slot values are what turn the NFA simulation
+/// into the "Pike VM." Namely, they track capturing group values for each
+/// state. During the computation of epsilon closure, we copy slot values from
+/// states in the "current" set to the "next" set. Eventually, once a match
+/// is found, the slot values for that match state are what we write to the
+/// caller provided 'Captures' value.
 #[derive(Clone, Debug)]
-pub struct Cache {
-    stack: Vec<FollowEpsilon>,
-    clist: Threads,
-    nlist: Threads,
+struct ActiveStates {
+    /// The set of active NFA states.
This set preserves insertion order, which
+    /// is critical for simulating the match semantics of backtracking regex
+    /// engines.
+    set: SparseSet,
+    /// The slots for every NFA state, where each slot stores a (possibly
+    /// absent) offset. Every capturing group has two slots. One for a start
+    /// offset and one for an end offset.
+    slot_table: SlotTable,
 }
 
-type Slot = Option<usize>;
+impl ActiveStates {
+    /// Create a new set of active states for the given PikeVM. The active
+    /// states returned may only be used with the given PikeVM. (Use 'reset'
+    /// to re-purpose the allocation for a different PikeVM.)
+    fn new(re: &PikeVM) -> ActiveStates {
+        let mut active = ActiveStates {
+            set: SparseSet::new(0),
+            slot_table: SlotTable::new(),
+        };
+        active.reset(re);
+        active
+    }
+
+    /// Reset this set of active states such that it can be used with the given
+    /// PikeVM (and only that PikeVM).
+    fn reset(&mut self, re: &PikeVM) {
+        self.set.resize(re.get_nfa().states().len());
+        self.slot_table.reset(re);
+    }
+
+    /// Return the heap memory usage, in bytes, used by this set of active
+    /// states.
+    ///
+    /// This does not include the stack size of this value.
+    fn memory_usage(&self) -> usize {
+        self.set.memory_usage() + self.slot_table.memory_usage()
+    }
+
+    /// Setup this set of active states for a new search. The given slot
+    /// length should be the number of slots in a caller provided 'Captures'
+    /// (and may be zero).
+    fn setup_search(&mut self, captures_slot_len: usize) {
+        self.set.clear();
+        self.slot_table.setup_search(captures_slot_len);
+    }
+}
+
+/// A table of slots, where each row represents a state in an NFA. Thus, the
+/// table has room for storing slots for every single state in an NFA.
+///
+/// This table is represented with a single contiguous allocation. In general,
+/// the notion of "capturing group" doesn't really exist at this level of
+/// abstraction, hence the name "slot" instead. (Indeed, every capturing group
+/// maps to a pair of slots, one for the start offset and one for the end
+/// offset.) Slots are indexed by the 'Capture' NFA state.
+///
+/// N.B. Not every state actually needs a row of slots. Namely, states that
+/// only have epsilon transitions currently never have anything written to
+/// their rows in this table. Thus, the table is somewhat wasteful in its heap
+/// usage. However, it is important to maintain fast random access by state
+/// ID, which means one giant table tends to work well. RE2 takes a different
+/// approach here and allocates each row as its own reference counted thing.
+/// I explored such a strategy at one point here, but couldn't get it to work
+/// well using entirely safe code. (To the ambitious reader: I encourage you to
+/// re-litigate that experiment.) I very much wanted to stick to safe code, but
+/// could be convinced otherwise if there was a solid argument and the safety
+/// was encapsulated well.
 #[derive(Clone, Debug)]
-struct Threads {
-    set: SparseSet,
-    caps: Vec<Slot>,
-    slots_per_thread: usize,
+struct SlotTable {
+    /// The actual table of offsets.
+    table: Vec<Option<NonMaxUsize>>,
+    /// The number of slots per state, i.e., the table's stride or the length
+    /// of each row.
+    slots_per_state: usize,
+    /// The number of slots in the caller-provided 'Captures' value for the
+    /// current search. Setting this to 'slots_per_state' is always correct,
+    /// but may be wasteful.
+    slots_for_captures: usize,
+}
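The flat "row per state" layout described above boils down to simple stride arithmetic: slot `i` of state `sid` lives at `sid * slots_per_state + i`. A toy sketch with simplified types, not the crate's internals (the real `for_state` below also truncates each row to the active `slots_for_captures` length):

```
struct Table {
    table: Vec<Option<usize>>,
    slots_per_state: usize,
}

impl Table {
    // Returns the row of slots for state 'sid' out of one contiguous
    // allocation, preserving fast random access by state ID.
    fn row(&mut self, sid: usize) -> &mut [Option<usize>] {
        let start = sid * self.slots_per_state;
        &mut self.table[start..start + self.slots_per_state]
    }
}

fn main() {
    let mut t = Table { table: vec![None; 6], slots_per_state: 2 };
    t.row(1)[0] = Some(3);
    t.row(1)[1] = Some(6);
    assert_eq!(vec![None, None, Some(3), Some(6), None, None], t.table);
}
```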
+
+impl SlotTable {
+    /// Create a new slot table.
+    ///
+    /// One should call 'reset' with the corresponding PikeVM before use.
+    fn new() -> SlotTable {
+        SlotTable { table: vec![], slots_for_captures: 0, slots_per_state: 0 }
+    }
+
+    /// Reset this slot table such that it can be used with the given PikeVM
+    /// (and only that PikeVM).
+    fn reset(&mut self, re: &PikeVM) {
+        let nfa = re.get_nfa();
+        self.slots_per_state = nfa.group_info().slot_len();
+        // This is always correct, but may be reduced for a particular search
+        // if a 'Captures' has fewer slots, e.g., none at all or only slots
+        // for tracking the overall match instead of all slots for every
+        // group.
+        self.slots_for_captures = core::cmp::max(
+            self.slots_per_state,
+            nfa.pattern_len().checked_mul(2).unwrap(),
+        );
+        let len = nfa
+            .states()
+            .len()
+            .checked_mul(self.slots_per_state)
+            // Add space to account for scratch space used during a search.
+            .and_then(|x| x.checked_add(self.slots_for_captures))
+            // It seems like this could actually panic on legitimate inputs
+            // on 32-bit targets, and is very likely to panic on 16-bit.
+            // Should we somehow convert this to an error? What about
+            // something similar for the lazy DFA cache? If you're tripping
+            // this assert, please file a bug.
+            .expect("slot table length doesn't overflow");
+        // This happens about as often as a regex is compiled, so it probably
+        // should be at debug level, but I found it quite distracting and not
+        // particularly useful.
+        trace!(
+            "resizing PikeVM active states table to {} entries \
+             (slots_per_state={})",
+            len,
+            self.slots_per_state,
+        );
+        self.table.resize(len, None);
+    }
+
+    /// Return the heap memory usage, in bytes, used by this slot table.
+    ///
+    /// This does not include the stack size of this value.
+    fn memory_usage(&self) -> usize {
+        self.table.len() * core::mem::size_of::<Option<NonMaxUsize>>()
+    }
+
+    /// Perform any per-search setup for this slot table.
+    ///
+    /// In particular, this sets the number of slots used in the 'Captures'
+    /// given by the caller (if any at all). This number may be smaller than
+    /// the total number of slots available, e.g., when the caller is only
+    /// interested in tracking the overall match and not the spans of every
+    /// matching capturing group. Only tracking the overall match can save a
+    /// substantial amount of time copying capturing spans during a search.
+    fn setup_search(&mut self, captures_slot_len: usize) {
+        self.slots_for_captures = captures_slot_len;
+    }
+
+    /// Return a mutable slice of the slots for the given state.
+    ///
+    /// Note that the length of the slice returned may be less than the total
+    /// number of slots available for this state. In particular, the length
+    /// always matches the number of slots indicated via 'setup_search'.
+    fn for_state(&mut self, sid: StateID) -> &mut [Option<NonMaxUsize>] {
+        let i = sid.as_usize() * self.slots_per_state;
+        &mut self.table[i..i + self.slots_for_captures]
+    }
+
+    /// Return a slice of slots of appropriate length where every slot offset
+    /// is guaranteed to be absent. This is useful in cases where you need to
+    /// compute an epsilon closure outside of the user-supplied regex, and thus
+    /// never want it to have any capturing slots set.
+    fn all_absent(&mut self) -> &mut [Option<NonMaxUsize>] {
+        let i = self.table.len() - self.slots_for_captures;
+        &mut self.table[i..i + self.slots_for_captures]
+    }
 }
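As a worked example of the overflow-checked sizing in 'reset' above, with invented numbers (an NFA of 100 states, 4 slots per state, and one scratch row appended for the search itself):

```
fn main() {
    let (states, slots_per_state, slots_for_captures) = (100usize, 4, 4);
    let len = states
        .checked_mul(slots_per_state)
        // One extra row of scratch space for a search, as in 'reset'.
        .and_then(|x| x.checked_add(slots_for_captures))
        .expect("slot table length doesn't overflow");
    assert_eq!(len, 404);
}
```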

+/// Represents a stack frame for use while computing an epsilon closure.
+///
+/// (An "epsilon closure" refers to the set of NFA states reachable from a
+/// single state without consuming any input. That is, the set of all epsilon
+/// transitions not only from that single state, but from every other state
+/// reachable by an epsilon transition as well. This is why it's called a
+/// "closure." Computing an epsilon closure is also done during DFA
+/// determinization! Compare and contrast the epsilon closure here in this
+/// PikeVM and the one used for determinization in crate::util::determinize.)
+///
+/// Computing the epsilon closure in a Thompson NFA proceeds via a depth
+/// first traversal over all epsilon transitions from a particular state.
+/// (A depth first traversal is important because it emulates the same priority
+/// of matches that is typically found in backtracking regex engines.) This
+/// depth first traversal is naturally expressed using recursion, but to avoid
+/// a call stack size proportional to the size of a regex, we put our stack on
+/// the heap instead.
+///
+/// This stack thus consists of call frames. The typical call frame is
+/// `Explore`, which instructs epsilon closure to explore the epsilon
+/// transitions from that state. (Subsequent epsilon transitions are then
+/// pushed on to the stack as more `Explore` frames.) If the state ID being
+/// explored has no epsilon transitions, then the capturing group slots are
+/// copied from the original state that sparked the epsilon closure (from the
+/// 'step' routine) to the state ID being explored. This way, capturing group
+/// slots are forwarded from the previous state to the next.
+///
+/// The other stack frame, `RestoreCapture`, instructs the epsilon closure to
+/// set the position for a particular slot back to some particular offset. This
+/// frame is pushed when `Explore` sees a `Capture` transition. `Explore` will
+/// set the offset of the slot indicated in `Capture` to the current offset,
+/// and then push the old offset on to the stack as a `RestoreCapture` frame.
+/// Thus, the new offset is only used until the epsilon closure reverts to
+/// the `RestoreCapture` frame. In effect, this gives the `Capture` epsilon
+/// transition a "scope" covering only the states that come "after" it during
+/// depth first traversal.
 #[derive(Clone, Debug)]
 enum FollowEpsilon {
-    StateID(StateID),
-    Capture { slot: usize, pos: Slot },
+    /// Explore the epsilon transitions from a state ID.
+    Explore(StateID),
+    /// Reset the given `slot` to the given `offset` (which might be `None`).
+    RestoreCapture { slot: SmallIndex, offset: Option<NonMaxUsize> },
 }
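A self-contained sketch of the stack discipline described above, using a plain adjacency list as a stand-in for real NFA states. This is not the crate's implementation, but it shows how a `RestoreCapture`-style frame scopes a slot write to exactly the subtree explored after the capture:

```
use std::collections::HashSet;

#[derive(Clone, Copy)]
enum Frame {
    /// Visit a state's epsilon transitions.
    Explore(usize),
    /// Undo a slot write once the DFS backtracks past this frame.
    RestoreCapture { slot: usize, offset: Option<usize> },
}

/// Depth first epsilon closure with an explicit heap-allocated stack.
/// `epsilon[s]` lists s's epsilon transitions, `capture[s]` is `Some(slot)`
/// if s records the current haystack offset `at` into `slot`, and states
/// with no epsilon transitions are reported through `reached` along with
/// the slot value they observed.
fn epsilon_closure(
    start: usize,
    at: usize,
    epsilon: &[Vec<usize>],
    capture: &[Option<usize>],
    slots: &mut [Option<usize>],
    reached: &mut Vec<(usize, Option<usize>)>,
) {
    let mut seen = HashSet::new();
    let mut stack = vec![Frame::Explore(start)];
    while let Some(frame) = stack.pop() {
        match frame {
            Frame::Explore(sid) => {
                if !seen.insert(sid) {
                    continue;
                }
                if let Some(slot) = capture[sid] {
                    // Scope the write: the old offset is restored only after
                    // this state's entire subtree has been explored.
                    stack.push(Frame::RestoreCapture {
                        slot,
                        offset: slots[slot],
                    });
                    slots[slot] = Some(at);
                }
                if epsilon[sid].is_empty() {
                    reached.push((sid, slots[0]));
                    continue;
                }
                // Push in reverse so the leftmost transition pops first,
                // emulating the match priority of a backtracker.
                for &next in epsilon[sid].iter().rev() {
                    stack.push(Frame::Explore(next));
                }
            }
            Frame::RestoreCapture { slot, offset } => slots[slot] = offset,
        }
    }
}

fn main() {
    // State 1 is a capture state for slot 0: 0 -> {1, 3}, 1 -> 2.
    let epsilon = vec![vec![1, 3], vec![2], vec![], vec![]];
    let capture = vec![None, Some(0), None, None];
    let (mut slots, mut reached) = (vec![None], vec![]);
    epsilon_closure(0, 7, &epsilon, &capture, &mut slots, &mut reached);
    // State 2 (below the capture) sees offset 7; its sibling 3 does not.
    assert_eq!(reached, vec![(2, Some(7)), (3, None)]);
}
```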

-impl Cache {
-    pub fn new(nfa: &NFA) -> Cache {
-        Cache {
-            stack: vec![],
-            clist: Threads::new(nfa),
-            nlist: Threads::new(nfa),
+/// A set of counters that "instruments" a PikeVM search. To enable this, you
+/// must enable the 'internal-instrument-pikevm' feature. Then run your Rust
+/// program with RUST_LOG=regex_automata::nfa::thompson::pikevm=trace set in
+/// the environment. The metrics collected will be dumped automatically for
+/// every search executed by the PikeVM.
+///
+/// NOTE: When 'internal-instrument-pikevm' is enabled, it will likely cause an
+/// absolute decrease in wall-clock performance, even if the 'trace' log level
+/// isn't enabled. (Although, we do try to avoid extra costs when 'trace' isn't
+/// enabled.) The main point of instrumentation is to get counts of various
+/// events that occur during the PikeVM's execution.
+///
+/// This is a somewhat hacked together collection of metrics that are useful
+/// to gather from a PikeVM search. In particular, it lets us scrutinize the
+/// performance profile of a search beyond what general purpose profiling tools
+/// give us. Namely, we orient the profiling data around the specific states of
+/// the NFA.
+///
+/// In other words, this lets us see which parts of the NFA graph are most
+/// frequently activated. This then provides direction for optimization
+/// opportunities.
+///
+/// The really sad part about this is that it absolutely clutters up the PikeVM
+/// implementation. :'( Another approach would be to just manually add this
+/// code in whenever I want this kind of profiling data, but it's complicated
+/// and tedious enough that I went with this approach... for now.
+///
+/// When instrumentation is enabled (which also turns on 'logging'), then a
+/// `Counters` is initialized for every search and `trace`'d just before the
+/// search returns to the caller.
+///
+/// Tip: When debugging performance problems with the PikeVM, it's best to try
+/// to work with an NFA that is as small as possible. Otherwise the state graph
+/// is likely to be too big to digest.
+#[cfg(feature = "internal-instrument-pikevm")]
+#[derive(Clone, Debug)]
+struct Counters {
+    /// The number of times the NFA is in a particular permutation of states.
+    state_sets: alloc::collections::BTreeMap<Vec<StateID>, u64>,
+    /// The number of times 'step' is called for a particular state ID (which
+    /// indexes this array).
+    steps: Vec<u64>,
+    /// The number of times an epsilon closure was computed for a state.
+    closures: Vec<u64>,
+    /// The number of times a particular state ID is pushed on to a stack while
+    /// computing an epsilon closure.
+    stack_pushes: Vec<u64>,
+    /// The number of times a particular state ID is inserted into a sparse set
+    /// while computing an epsilon closure.
+    set_inserts: Vec<u64>,
+}
+
+#[cfg(feature = "internal-instrument-pikevm")]
+impl Counters {
+    fn empty() -> Counters {
+        Counters {
+            state_sets: alloc::collections::BTreeMap::new(),
+            steps: vec![],
+            closures: vec![],
+            stack_pushes: vec![],
+            set_inserts: vec![],
         }
     }

-    fn clear(&mut self) {
-        self.stack.clear();
-        self.clist.set.clear();
-        self.nlist.set.clear();
+    fn reset(&mut self, nfa: &NFA) {
+        let len = nfa.states().len();
+
+        self.state_sets.clear();
+
+        self.steps.clear();
+        self.steps.resize(len, 0);
+
+        self.closures.clear();
+        self.closures.resize(len, 0);
+
+        self.stack_pushes.clear();
+        self.stack_pushes.resize(len, 0);
+
+        self.set_inserts.clear();
+        self.set_inserts.resize(len, 0);
+    }
+
+    fn eprint(&self, nfa: &NFA) {
+        trace!("===== START PikeVM Instrumentation Output =====");
+        // We take the top-K most occurring state sets. Otherwise the output
+        // is likely to be overwhelming. And we probably only care about the
+        // most frequently occurring ones anyway.
+        const LIMIT: usize = 20;
+        let mut set_counts =
+            self.state_sets.iter().collect::<Vec<(&Vec<StateID>, &u64)>>();
+        set_counts.sort_by_key(|(_, &count)| core::cmp::Reverse(count));
+        trace!("## PikeVM frequency of state sets (top {})", LIMIT);
+        for (set, count) in set_counts.iter().take(LIMIT) {
+            trace!("{:?}: {}", set, count);
+        }
+        if set_counts.len() > LIMIT {
+            trace!(
+                "... 
{} sets omitted (out of {} total)", + set_counts.len() - LIMIT, + set_counts.len(), + ); + } + + trace!(""); + trace!("## PikeVM total frequency of events"); + trace!( + "steps: {}, closures: {}, stack-pushes: {}, set-inserts: {}", + self.steps.iter().copied().sum::<u64>(), + self.closures.iter().copied().sum::<u64>(), + self.stack_pushes.iter().copied().sum::<u64>(), + self.set_inserts.iter().copied().sum::<u64>(), + ); + + trace!(""); + trace!("## PikeVM frequency of events broken down by state"); + for sid in 0..self.steps.len() { + trace!( + "{:06}: steps: {}, closures: {}, \ + stack-pushes: {}, set-inserts: {}", + sid, + self.steps[sid], + self.closures[sid], + self.stack_pushes[sid], + self.set_inserts[sid], + ); + } + + trace!(""); + trace!("## NFA debug display"); + trace!("{:?}", nfa); + trace!("===== END PikeVM Instrumentation Output ====="); } - fn swap(&mut self) { - core::mem::swap(&mut self.clist, &mut self.nlist); + fn record_state_set(&mut self, set: &SparseSet) { + let set = set.iter().collect::<Vec<StateID>>(); + *self.state_sets.entry(set).or_insert(0) += 1; } -} -impl Threads { - fn new(nfa: &NFA) -> Threads { - let mut threads = Threads { - set: SparseSet::new(0), - caps: vec![], - slots_per_thread: 0, - }; - threads.resize(nfa); - threads + fn record_step(&mut self, sid: StateID) { + self.steps[sid] += 1; } - fn resize(&mut self, nfa: &NFA) { - if nfa.states().len() == self.set.capacity() { - return; - } - self.slots_per_thread = nfa.capture_slot_len(); - self.set.resize(nfa.states().len()); - self.caps.resize(self.slots_per_thread * nfa.states().len(), None); + fn record_closure(&mut self, sid: StateID) { + self.closures[sid] += 1; + } + + fn record_stack_push(&mut self, sid: StateID) { + self.stack_pushes[sid] += 1; } - fn caps(&mut self, sid: StateID) -> &mut [Slot] { - let i = sid.as_usize() * self.slots_per_thread; - &mut self.caps[i..i + self.slots_per_thread] + fn record_set_insert(&mut self, sid: StateID) { + self.set_inserts[sid] += 1; } } diff --git a/vendor/regex-automata/src/nfa/thompson/range_trie.rs b/vendor/regex-automata/src/nfa/thompson/range_trie.rs index 92f36ce3a..2d43a5b6f 100644 --- a/vendor/regex-automata/src/nfa/thompson/range_trie.rs +++ b/vendor/regex-automata/src/nfa/thompson/range_trie.rs @@ -1,165 +1,160 @@ -// I've called the primary data structure in this module a "range trie." As far -// as I can tell, there is no prior art on a data structure like this, however, -// it's likely someone somewhere has built something like it. Searching for -// "range trie" turns up the paper "Range Tries for Scalable Address Lookup," -// but it does not appear relevant. -// -// The range trie is just like a trie in that it is a special case of a -// deterministic finite state machine. It has states and each state has a set -// of transitions to other states. It is acyclic, and, like a normal trie, -// it makes no attempt to reuse common suffixes among its elements. The key -// difference between a normal trie and a range trie below is that a range trie -// operates on *contiguous sequences* of bytes instead of singleton bytes. -// One could say say that our alphabet is ranges of bytes instead of bytes -// themselves, except a key part of range trie construction is splitting ranges -// apart to ensure there is at most one transition that can be taken for any -// byte in a given state. -// -// I've tried to explain the details of how the range trie works below, so -// for now, we are left with trying to understand what problem we're trying to -// solve. 
Which is itself fairly involved! -// -// At the highest level, here's what we want to do. We want to convert a -// sequence of Unicode codepoints into a finite state machine whose transitions -// are over *bytes* and *not* Unicode codepoints. We want this because it makes -// said finite state machines much smaller and much faster to execute. As a -// simple example, consider a byte oriented automaton for all Unicode scalar -// values (0x00 through 0x10FFFF, not including surrogate codepoints): -// -// [00-7F] -// [C2-DF][80-BF] -// [E0-E0][A0-BF][80-BF] -// [E1-EC][80-BF][80-BF] -// [ED-ED][80-9F][80-BF] -// [EE-EF][80-BF][80-BF] -// [F0-F0][90-BF][80-BF][80-BF] -// [F1-F3][80-BF][80-BF][80-BF] -// [F4-F4][80-8F][80-BF][80-BF] -// -// (These byte ranges are generated via the regex-syntax::utf8 module, which -// was based on Russ Cox's code in RE2, which was in turn based on Ken -// Thompson's implementation of the same idea in his Plan9 implementation of -// grep.) -// -// It should be fairly straight-forward to see how one could compile this into -// a DFA. The sequences are sorted and non-overlapping. Essentially, you could -// build a trie from this fairly easy. The problem comes when your initial -// range (in this case, 0x00-0x10FFFF) isn't so nice. For example, the class -// represented by '\w' contains only a tenth of the codepoints that -// 0x00-0x10FFFF contains, but if we were to write out the byte based ranges -// as we did above, the list would stretch to 892 entries! This turns into -// quite a large NFA with a few thousand states. Turning this beast into a DFA -// takes quite a bit of time. We are thus left with trying to trim down the -// number of states we produce as early as possible. -// -// One approach (used by RE2 and still by the regex crate, at time of writing) -// is to try to find common suffixes while building NFA states for the above -// and reuse them. This is very cheap to do and one can control precisely how -// much extra memory you want to use for the cache. -// -// Another approach, however, is to reuse an algorithm for constructing a -// *minimal* DFA from a sorted sequence of inputs. I don't want to go into -// the full details here, but I explain it in more depth in my blog post on -// FSTs[1]. Note that the algorithm was not invented by me, but was published -// in paper by Daciuk et al. in 2000 called "Incremental Construction of -// MinimalAcyclic Finite-State Automata." Like the suffix cache approach above, -// it is also possible to control the amount of extra memory one uses, although -// this usually comes with the cost of sacrificing true minimality. (But it's -// typically close enough with a reasonably sized cache of states.) -// -// The catch is that Daciuk's algorithm only works if you add your keys in -// lexicographic ascending order. In our case, since we're dealing with ranges, -// we also need the additional requirement that ranges are either equivalent -// or do not overlap at all. For example, if one were given the following byte -// ranges: -// -// [BC-BF][80-BF] -// [BC-BF][90-BF] -// -// Then Daciuk's algorithm would not work, since there is nothing to handle the -// fact that the ranges overlap. They would need to be split apart. Thankfully, -// Thompson's algorithm for producing byte ranges for Unicode codepoint ranges -// meets both of our requirements. (A proof for this eludes me, but it appears -// true.) -// -// ... however, we would also like to be able to compile UTF-8 automata in -// reverse. 
We want this because in order to find the starting location of a
-// match using a DFA, we need to run a second DFA---a reversed version of the
-// forward DFA---backwards to discover the match location. Unfortunately, if
-// we reverse our byte sequences for 0x00-0x10FFFF, we get sequences that are
-// can overlap, even if they are sorted:
-//
-//     [00-7F]
-//     [80-BF][80-9F][ED-ED]
-//     [80-BF][80-BF][80-8F][F4-F4]
-//     [80-BF][80-BF][80-BF][F1-F3]
-//     [80-BF][80-BF][90-BF][F0-F0]
-//     [80-BF][80-BF][E1-EC]
-//     [80-BF][80-BF][EE-EF]
-//     [80-BF][A0-BF][E0-E0]
-//     [80-BF][C2-DF]
-//
-// For example, '[80-BF][80-BF][EE-EF]' and '[80-BF][A0-BF][E0-E0]' have
-// overlapping ranges between '[80-BF]' and '[A0-BF]'. Thus, there is no
-// simple way to apply Daciuk's algorithm.
-//
-// And thus, the range trie was born. The range trie's only purpose is to take
-// sequences of byte ranges like the ones above, collect them into a trie and
-// then spit them in a sorted fashion with no overlapping ranges. For example,
-// 0x00-0x10FFFF gets translated to:
-//
-//     [0-7F]
-//     [80-BF][80-9F][80-8F][F1-F3]
-//     [80-BF][80-9F][80-8F][F4]
-//     [80-BF][80-9F][90-BF][F0]
-//     [80-BF][80-9F][90-BF][F1-F3]
-//     [80-BF][80-9F][E1-EC]
-//     [80-BF][80-9F][ED]
-//     [80-BF][80-9F][EE-EF]
-//     [80-BF][A0-BF][80-8F][F1-F3]
-//     [80-BF][A0-BF][80-8F][F4]
-//     [80-BF][A0-BF][90-BF][F0]
-//     [80-BF][A0-BF][90-BF][F1-F3]
-//     [80-BF][A0-BF][E0]
-//     [80-BF][A0-BF][E1-EC]
-//     [80-BF][A0-BF][EE-EF]
-//     [80-BF][C2-DF]
-//
-// We've thus satisfied our requirements for running Daciuk's algorithm. All
-// sequences of ranges are sorted, and any corresponding ranges are either
-// exactly equivalent or non-overlapping.
-//
-// In effect, a range trie is building a DFA from a sequence of arbitrary
-// byte ranges. But it uses an algoritm custom tailored to its input, so it
-// is not as costly as traditional DFA construction. While it is still quite
-// a bit more costly than the forward's case (which only needs Daciuk's
-// algorithm), it winds up saving a substantial amount of time if one is doing
-// a full DFA powerset construction later by virtue of producing a much much
-// smaller NFA.
-//
-// [1] - https://blog.burntsushi.net/transducers/
-// [2] - https://www.mitpressjournals.org/doi/pdfplus/10.1162/089120100561601
-
-use core::{cell::RefCell, fmt, mem, ops::RangeInclusive, u32};
+/*
+I've called the primary data structure in this module a "range trie." As far
+as I can tell, there is no prior art on a data structure like this, however,
+it's likely someone somewhere has built something like it. Searching for
+"range trie" turns up the paper "Range Tries for Scalable Address Lookup,"
+but it does not appear relevant.
+
+The range trie is just like a trie in that it is a special case of a
+deterministic finite state machine. It has states and each state has a set
+of transitions to other states. It is acyclic, and, like a normal trie,
+it makes no attempt to reuse common suffixes among its elements. The key
+difference between a normal trie and a range trie below is that a range trie
+operates on *contiguous sequences* of bytes instead of singleton bytes.
+One could say that our alphabet is ranges of bytes instead of bytes
+themselves, except a key part of range trie construction is splitting ranges
+apart to ensure there is at most one transition that can be taken for any
+byte in a given state.
+
+I've tried to explain the details of how the range trie works below, so
+for now, we are left with trying to understand what problem we're trying to
+solve. Which is itself fairly involved!
+
+At the highest level, here's what we want to do. We want to convert a
+sequence of Unicode codepoints into a finite state machine whose transitions
+are over *bytes* and *not* Unicode codepoints. We want this because it makes
+said finite state machines much smaller and much faster to execute. As a
+simple example, consider a byte oriented automaton for all Unicode scalar
+values (0x00 through 0x10FFFF, not including surrogate codepoints):
+
+    [00-7F]
+    [C2-DF][80-BF]
+    [E0-E0][A0-BF][80-BF]
+    [E1-EC][80-BF][80-BF]
+    [ED-ED][80-9F][80-BF]
+    [EE-EF][80-BF][80-BF]
+    [F0-F0][90-BF][80-BF][80-BF]
+    [F1-F3][80-BF][80-BF][80-BF]
+    [F4-F4][80-8F][80-BF][80-BF]
+
+(These byte ranges are generated via the regex-syntax::utf8 module, which
+was based on Russ Cox's code in RE2, which was in turn based on Ken
+Thompson's implementation of the same idea in his Plan9 implementation of
+grep.)
+
+It should be fairly straightforward to see how one could compile this into
+a DFA. The sequences are sorted and non-overlapping. Essentially, you could
+build a trie from this fairly easily. The problem comes when your initial
+range (in this case, 0x00-0x10FFFF) isn't so nice. For example, the class
+represented by '\w' contains only a tenth of the codepoints that
+0x00-0x10FFFF contains, but if we were to write out the byte based ranges
+as we did above, the list would stretch to 892 entries! This turns into
+quite a large NFA with a few thousand states. Turning this beast into a DFA
+takes quite a bit of time. We are thus left with trying to trim down the
+number of states we produce as early as possible.
+
+One approach (used by RE2 and still by the regex crate, at time of writing)
+is to try to find common suffixes while building NFA states for the above
+and reuse them. This is very cheap to do and one can control precisely how
+much extra memory you want to use for the cache.
+
+Another approach, however, is to reuse an algorithm for constructing a
+*minimal* DFA from a sorted sequence of inputs. I don't want to go into
+the full details here, but I explain it in more depth in my blog post on
+FSTs[1]. Note that the algorithm was not invented by me, but was published
+in a paper by Daciuk et al. in 2000 called "Incremental Construction of
+Minimal Acyclic Finite-State Automata." Like the suffix cache approach above,
+it is also possible to control the amount of extra memory one uses, although
+this usually comes with the cost of sacrificing true minimality. (But it's
+typically close enough with a reasonably sized cache of states.)
+
+The catch is that Daciuk's algorithm only works if you add your keys in
+lexicographic ascending order. In our case, since we're dealing with ranges,
+we also need the additional requirement that ranges are either equivalent
+or do not overlap at all. For example, if one were given the following byte
+ranges:
+
+    [BC-BF][80-BF]
+    [BC-BF][90-BF]
+
+Then Daciuk's algorithm would not work, since there is nothing to handle the
+fact that the ranges overlap. They would need to be split apart. Thankfully,
+Thompson's algorithm for producing byte ranges for Unicode codepoint ranges
+meets both of our requirements. (A proof for this eludes me, but it appears
+true.)
+
+... however, we would also like to be able to compile UTF-8 automata in
+reverse. We want this because in order to find the starting location of a
+match using a DFA, we need to run a second DFA---a reversed version of the
+forward DFA---backwards to discover the match location. Unfortunately, if
+we reverse our byte sequences for 0x00-0x10FFFF, we get sequences that can
+overlap, even if they are sorted:
+
+    [00-7F]
+    [80-BF][80-9F][ED-ED]
+    [80-BF][80-BF][80-8F][F4-F4]
+    [80-BF][80-BF][80-BF][F1-F3]
+    [80-BF][80-BF][90-BF][F0-F0]
+    [80-BF][80-BF][E1-EC]
+    [80-BF][80-BF][EE-EF]
+    [80-BF][A0-BF][E0-E0]
+    [80-BF][C2-DF]
+
+For example, '[80-BF][80-BF][EE-EF]' and '[80-BF][A0-BF][E0-E0]' have
+overlapping ranges between '[80-BF]' and '[A0-BF]'. Thus, there is no
+simple way to apply Daciuk's algorithm.
+
+And thus, the range trie was born. The range trie's only purpose is to take
+sequences of byte ranges like the ones above, collect them into a trie and then
+spit them out in a sorted fashion with no overlapping ranges. For example,
+0x00-0x10FFFF gets translated to:
+
+    [0-7F]
+    [80-BF][80-9F][80-8F][F1-F3]
+    [80-BF][80-9F][80-8F][F4]
+    [80-BF][80-9F][90-BF][F0]
+    [80-BF][80-9F][90-BF][F1-F3]
+    [80-BF][80-9F][E1-EC]
+    [80-BF][80-9F][ED]
+    [80-BF][80-9F][EE-EF]
+    [80-BF][A0-BF][80-8F][F1-F3]
+    [80-BF][A0-BF][80-8F][F4]
+    [80-BF][A0-BF][90-BF][F0]
+    [80-BF][A0-BF][90-BF][F1-F3]
+    [80-BF][A0-BF][E0]
+    [80-BF][A0-BF][E1-EC]
+    [80-BF][A0-BF][EE-EF]
+    [80-BF][C2-DF]
+
+We've thus satisfied our requirements for running Daciuk's algorithm. All
+sequences of ranges are sorted, and any corresponding ranges are either
+exactly equivalent or non-overlapping.
+
+In effect, a range trie is building a DFA from a sequence of arbitrary byte
+ranges. But it uses an algorithm custom tailored to its input, so it is not as
+costly as traditional DFA construction. While it is still quite a bit more
+costly than the forward case (which only needs Daciuk's algorithm), it winds
+up saving a substantial amount of time if one is doing a full DFA powerset
+construction later by virtue of producing a much much smaller NFA.
+
+[1] - https://blog.burntsushi.net/transducers/
+[2] - https://www.mitpressjournals.org/doi/pdfplus/10.1162/089120100561601
+*/
+
+use core::{cell::RefCell, convert::TryFrom, fmt, mem, ops::RangeInclusive};
 
 use alloc::{format, string::String, vec, vec::Vec};
 
 use regex_syntax::utf8::Utf8Range;
 
-/// A smaller state ID means more effective use of the CPU cache and less
-/// time spent copying. The implementation below will panic if the state ID
-/// space is exhausted, but in order for that to happen, the range trie itself
-/// would use well over 100GB of memory. Moreover, it's likely impossible
-/// for the state ID space to get that big. In fact, it's likely that even a
-/// u16 would be good enough here. But it's not quite clear how to prove this.
-type StateID = u32;
+use crate::util::primitives::StateID;
 
 /// There is only one final state in this trie. Every sequence of byte ranges
 /// added shares the same final state.
-const FINAL: StateID = 0;
+const FINAL: StateID = StateID::ZERO;
 
 /// The root state of the trie.
-const ROOT: StateID = 1;
+const ROOT: StateID = StateID::new_unchecked(1);
 
 /// A range trie represents an ordered set of sequences of bytes.
 ///
@@ -193,7 +188,7 @@ pub struct RangeTrie {
     /// A stack for traversing this trie to yield sequences of byte ranges in
     /// lexicographic order.
     iter_stack: RefCell<Vec<NextIter>>,
-    /// A bufer that stores the current sequence during iteration.
+    /// A buffer that stores the current sequence during iteration.
     iter_ranges: RefCell<Vec<Utf8Range>>,
     /// A stack used for traversing the trie in order to (deeply) duplicate
     /// a state. States are recursively duplicated when ranges are split.
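To see the problem input described in the comment above, one can use regex-syntax's public `utf8` module. This sketch prints the reversed (and thus overlapping) byte-range sequences for all of Unicode; the sorting and formatting are incidental:

```
use regex_syntax::utf8::Utf8Sequences;

fn main() {
    let mut lines: Vec<String> = Utf8Sequences::new('\u{0}', '\u{10FFFF}')
        .map(|seq| {
            // Reverse each sequence, as a reverse UTF-8 automaton needs.
            let mut ranges = seq.as_slice().to_vec();
            ranges.reverse();
            ranges
                .iter()
                .map(|r| format!("[{:02X}-{:02X}]", r.start, r.end))
                .collect()
        })
        .collect();
    // Even after sorting, adjacent sequences overlap (e.g., [80-BF] vs
    // [A0-BF]), which is what rules out applying Daciuk's algorithm
    // directly and motivates the range trie.
    lines.sort();
    for line in &lines {
        println!("{}", line);
    }
}
```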
@@ -431,14 +426,16 @@ impl RangeTrie {
     }
 
     pub fn add_empty(&mut self) -> StateID {
-        if self.states.len() as u64 > u32::MAX as u64 {
-            // This generally should not happen since a range trie is only
-            // ever used to compile a single sequence of Unicode scalar values.
-            // If we ever got to this point, we would, at *minimum*, be using
-            // 96GB in just the range trie alone.
-            panic!("too many sequences added to range trie");
-        }
-        let id = self.states.len() as StateID;
+        let id = match StateID::try_from(self.states.len()) {
+            Ok(id) => id,
+            Err(_) => {
+                // This generally should not happen since a range trie is
+                // only ever used to compile a single sequence of Unicode
+                // scalar values. If we ever got to this point, we would, at
+                // *minimum*, be using 96GB in just the range trie alone.
+                panic!("too many sequences added to range trie");
+            }
+        };
         // If we have some free states available, then use them to avoid
         // more allocations.
         if let Some(mut state) = self.free.pop() {
@@ -542,12 +539,12 @@ impl RangeTrie {
 
     /// Return an immutable borrow for the state with the given ID.
     fn state(&self, id: StateID) -> &State {
-        &self.states[id as usize]
+        &self.states[id]
     }
 
     /// Return a mutable borrow for the state with the given ID.
     fn state_mut(&mut self, id: StateID) -> &mut State {
-        &mut self.states[id as usize]
+        &mut self.states[id]
     }
 }
 
@@ -625,7 +622,7 @@ struct NextIter {
 }
 
 /// The next state to process during insertion and any remaining ranges that we
-/// want to add for a partcular sequence of ranges. The first such instance
+/// want to add for a particular sequence of ranges. The first such instance
 /// is always the root state along with all ranges given.
 #[derive(Clone, Debug)]
 struct NextInsert {
@@ -651,7 +648,7 @@ impl NextInsert {
         let mut tmp = [Utf8Range { start: 0, end: 0 }; 4];
         tmp[..len].copy_from_slice(ranges);
 
-        NextInsert { state_id, ranges: tmp, len: len as u8 }
+        NextInsert { state_id, ranges: tmp, len: u8::try_from(len).unwrap() }
     }
 
     /// Push a new empty state to visit along with any remaining ranges that
@@ -679,7 +676,7 @@ impl NextInsert {
 
     /// Return the remaining ranges to insert.
    fn ranges(&self) -> &[Utf8Range] {
-        &self.ranges[..self.len as usize]
+        &self.ranges[..usize::try_from(self.len).unwrap()]
     }
 }
 
@@ -871,7 +868,7 @@ impl fmt::Debug for RangeTrie {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         writeln!(f, "")?;
         for (i, state) in self.states.iter().enumerate() {
-            let status = if i == FINAL as usize { '*' } else { ' ' };
+            let status = if i == FINAL.as_usize() { '*' } else { ' ' };
             writeln!(f, "{}{:06}: {:?}", status, i, state)?;
         }
         Ok(())
@@ -893,12 +890,19 @@ impl fmt::Debug for State {
 
 impl fmt::Debug for Transition {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         if self.range.start == self.range.end {
-            write!(f, "{:02X} => {:02X}", self.range.start, self.next_id)
+            write!(
+                f,
+                "{:02X} => {:02X}",
+                self.range.start,
+                self.next_id.as_usize(),
+            )
         } else {
             write!(
                 f,
                 "{:02X}-{:02X} => {:02X}",
-                self.range.start, self.range.end, self.next_id
+                self.range.start,
+                self.range.end,
+                self.next_id.as_usize(),
             )
         }
     }
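The hunks above replace the module-local `type StateID = u32` with the crate-wide `StateID` primitive, which supports indexing collections directly. A minimal sketch of that newtype-index pattern (the `Id` and `States` types here are hypothetical, for illustration only):

```
use std::ops::Index;

// Hypothetical mini version of the newtype-index pattern used above.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
struct Id(u32);

impl Id {
    fn as_usize(self) -> usize {
        self.0 as usize
    }
}

struct States<T>(Vec<T>);

impl<T> Index<Id> for States<T> {
    type Output = T;
    fn index(&self, id: Id) -> &T {
        // Indexing by the newtype removes the scattered `as usize` casts
        // at call sites, as in `&self.states[id]` above.
        &self.0[id.as_usize()]
    }
}

fn main() {
    let states = States(vec!["final", "root"]);
    assert_eq!(states[Id(1)], "root");
}
```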