summaryrefslogtreecommitdiffstats
path: root/third_party/rust/regex-automata/src/nfa/thompson/nfa.rs
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/rust/regex-automata/src/nfa/thompson/nfa.rs')
-rw-r--r--third_party/rust/regex-automata/src/nfa/thompson/nfa.rs2101
1 files changed, 2101 insertions, 0 deletions
diff --git a/third_party/rust/regex-automata/src/nfa/thompson/nfa.rs b/third_party/rust/regex-automata/src/nfa/thompson/nfa.rs
new file mode 100644
index 0000000000..2108fa3381
--- /dev/null
+++ b/third_party/rust/regex-automata/src/nfa/thompson/nfa.rs
@@ -0,0 +1,2101 @@
+use core::{fmt, mem};
+
+use alloc::{boxed::Box, format, string::String, sync::Arc, vec, vec::Vec};
+
+#[cfg(feature = "syntax")]
+use crate::nfa::thompson::{
+ compiler::{Compiler, Config},
+ error::BuildError,
+};
+use crate::{
+ nfa::thompson::builder::Builder,
+ util::{
+ alphabet::{self, ByteClassSet, ByteClasses},
+ captures::{GroupInfo, GroupInfoError},
+ look::{Look, LookMatcher, LookSet},
+ primitives::{
+ IteratorIndexExt, PatternID, PatternIDIter, SmallIndex, StateID,
+ },
+ sparse_set::SparseSet,
+ },
+};
+
+/// A byte oriented Thompson non-deterministic finite automaton (NFA).
+///
+/// A Thompson NFA is a finite state machine that permits unconditional epsilon
+/// transitions, but guarantees that there exists at most one non-epsilon
+/// transition for each element in the alphabet for each state.
+///
+/// An NFA may be used directly for searching, for analysis or to build
+/// a deterministic finite automaton (DFA).
+///
+/// # Cheap clones
+///
+/// Since an NFA is a core data type in this crate that many other regex
+/// engines are based on top of, it is convenient to give ownership of an NFA
+/// to said regex engines. Because of this, an NFA uses reference counting
+/// internally. Therefore, it is cheap to clone and it is encouraged to do so.
+///
+/// # Capabilities
+///
+/// Using an NFA for searching via the
+/// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) provides the most amount
+/// of "power" of any regex engine in this crate. Namely, it supports the
+/// following in all cases:
+///
+/// 1. Detection of a match.
+/// 2. Location of a match, including both the start and end offset, in a
+/// single pass of the haystack.
+/// 3. Location of matching capturing groups.
+/// 4. Handles multiple patterns, including (1)-(3) when multiple patterns are
+/// present.
+///
+/// # Capturing Groups
+///
+/// Groups refer to parenthesized expressions inside a regex pattern. They look
+/// like this, where `exp` is an arbitrary regex:
+///
+/// * `(exp)` - An unnamed capturing group.
+/// * `(?P<name>exp)` or `(?<name>exp)` - A named capturing group.
+/// * `(?:exp)` - A non-capturing group.
+/// * `(?i:exp)` - A non-capturing group that sets flags.
+///
+/// Only the first two forms are said to be _capturing_. Capturing
+/// means that the last position at which they match is reportable. The
+/// [`Captures`](crate::util::captures::Captures) type provides convenient
+/// access to the match positions of capturing groups, which includes looking
+/// up capturing groups by their name.
+///
+/// # Byte oriented
+///
+/// This NFA is byte oriented, which means that all of its transitions are
+/// defined on bytes. In other words, the alphabet of an NFA consists of the
+/// 256 different byte values.
+///
+/// While DFAs nearly demand that they be byte oriented for performance
+/// reasons, an NFA could conceivably be *Unicode codepoint* oriented. Indeed,
+/// a previous version of this NFA supported both byte and codepoint oriented
+/// modes. A codepoint oriented mode can work because an NFA fundamentally uses
+/// a sparse representation of transitions, which works well with the large
+/// sparse space of Unicode codepoints.
+///
+/// Nevertheless, this NFA is only byte oriented. This choice is primarily
+/// driven by implementation simplicity, and also in part memory usage. In
+/// practice, performance between the two is roughly comparable. However,
+/// building a DFA (including a hybrid DFA) really wants a byte oriented NFA.
+/// So if we do have a codepoint oriented NFA, then we also need to generate
+/// byte oriented NFA in order to build an hybrid NFA/DFA. Thus, by only
+/// generating byte oriented NFAs, we can produce one less NFA. In other words,
+/// if we made our NFA codepoint oriented, we'd need to *also* make it support
+/// a byte oriented mode, which is more complicated. But a byte oriented mode
+/// can support everything.
+///
+/// # Differences with DFAs
+///
+/// At the theoretical level, the precise difference between an NFA and a DFA
+/// is that, in a DFA, for every state, an input symbol unambiguously refers
+/// to a single transition _and_ that an input symbol is required for each
+/// transition. At a practical level, this permits DFA implementations to be
+/// implemented at their core with a small constant number of CPU instructions
+/// for each byte of input searched. In practice, this makes them quite a bit
+/// faster than NFAs _in general_. Namely, in order to execute a search for any
+/// Thompson NFA, one needs to keep track of a _set_ of states, and execute
+/// the possible transitions on all of those states for each input symbol.
+/// Overall, this results in much more overhead. To a first approximation, one
+/// can expect DFA searches to be about an order of magnitude faster.
+///
+/// So why use an NFA at all? The main advantage of an NFA is that it takes
+/// linear time (in the size of the pattern string after repetitions have been
+/// expanded) to build and linear memory usage. A DFA, on the other hand, may
+/// take exponential time and/or space to build. Even in non-pathological
+/// cases, DFAs often take quite a bit more memory than their NFA counterparts,
+/// _especially_ if large Unicode character classes are involved. Of course,
+/// an NFA also provides additional capabilities. For example, it can match
+/// Unicode word boundaries on non-ASCII text and resolve the positions of
+/// capturing groups.
+///
+/// Note that a [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) strikes a
+/// good balance between an NFA and a DFA. It avoids the exponential build time
+/// of a DFA while maintaining its fast search time. The downside of a hybrid
+/// NFA/DFA is that in some cases it can be slower at search time than the NFA.
+/// (It also has less functionality than a pure NFA. It cannot handle Unicode
+/// word boundaries on non-ASCII text and cannot resolve capturing groups.)
+///
+/// # Example
+///
+/// This shows how to build an NFA with the default configuration and execute a
+/// search using the Pike VM.
+///
+/// ```
+/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
+///
+/// let re = PikeVM::new(r"foo[0-9]+")?;
+/// let mut cache = re.create_cache();
+/// let mut caps = re.create_captures();
+///
+/// let expected = Some(Match::must(0, 0..8));
+/// re.captures(&mut cache, b"foo12345", &mut caps);
+/// assert_eq!(expected, caps.get_match());
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// # Example: resolving capturing groups
+///
+/// This example shows how to parse some simple dates and extract the
+/// components of each date via capturing groups.
+///
+/// ```
+/// # if cfg!(miri) { return Ok(()); } // miri takes too long
+/// use regex_automata::{
+/// nfa::thompson::pikevm::PikeVM,
+/// util::captures::Captures,
+/// };
+///
+/// let vm = PikeVM::new(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})")?;
+/// let mut cache = vm.create_cache();
+///
+/// let haystack = "2012-03-14, 2013-01-01 and 2014-07-05";
+/// let all: Vec<Captures> = vm.captures_iter(
+/// &mut cache, haystack.as_bytes()
+/// ).collect();
+/// // There should be a total of 3 matches.
+/// assert_eq!(3, all.len());
+/// // The year from the second match is '2013'.
+/// let span = all[1].get_group_by_name("y").unwrap();
+/// assert_eq!("2013", &haystack[span]);
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// This example shows that only the last match of a capturing group is
+/// reported, even if it had to match multiple times for an overall match
+/// to occur.
+///
+/// ```
+/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span};
+///
+/// let re = PikeVM::new(r"([a-z]){4}")?;
+/// let mut cache = re.create_cache();
+/// let mut caps = re.create_captures();
+///
+/// let haystack = b"quux";
+/// re.captures(&mut cache, haystack, &mut caps);
+/// assert!(caps.is_match());
+/// assert_eq!(Some(Span::from(3..4)), caps.get_group(1));
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone)]
+pub struct NFA(
+ // We make NFAs reference counted primarily for two reasons. First is that
+ // the NFA type itself is quite large (at least 0.5KB), and so it makes
+ // sense to put it on the heap by default anyway. Second is that, for Arc
+ // specifically, this enables cheap clones. This tends to be useful because
+ // several structures (the backtracker, the Pike VM, the hybrid NFA/DFA)
+ // all want to hang on to an NFA for use during search time. We could
+ // provide the NFA at search time via a function argument, but this makes
+ // for an unnecessarily annoying API. Instead, we just let each structure
+ // share ownership of the NFA. Using a deep clone would not be smart, since
+ // the NFA can use quite a bit of heap space.
+ Arc<Inner>,
+);
+
+impl NFA {
+ /// Parse the given regular expression using a default configuration and
+ /// build an NFA from it.
+ ///
+ /// If you want a non-default configuration, then use the NFA
+ /// [`Compiler`] with a [`Config`].
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
+ ///
+ /// let re = PikeVM::new(r"foo[0-9]+")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// let expected = Some(Match::must(0, 0..8));
+ /// re.captures(&mut cache, b"foo12345", &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "syntax")]
+ pub fn new(pattern: &str) -> Result<NFA, BuildError> {
+ NFA::compiler().build(pattern)
+ }
+
+ /// Parse the given regular expressions using a default configuration and
+ /// build a multi-NFA from them.
+ ///
+ /// If you want a non-default configuration, then use the NFA
+ /// [`Compiler`] with a [`Config`].
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
+ ///
+ /// let re = PikeVM::new_many(&["[0-9]+", "[a-z]+"])?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// let expected = Some(Match::must(1, 0..3));
+ /// re.captures(&mut cache, b"foo12345bar", &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "syntax")]
+ pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<NFA, BuildError> {
+ NFA::compiler().build_many(patterns)
+ }
+
+ /// Returns an NFA with a single regex pattern that always matches at every
+ /// position.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
+ ///
+ /// let re = PikeVM::new_from_nfa(NFA::always_match())?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// let expected = Some(Match::must(0, 0..0));
+ /// re.captures(&mut cache, b"", &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ /// re.captures(&mut cache, b"foo", &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn always_match() -> NFA {
+ // We could use NFA::new("") here and we'd get the same semantics, but
+ // hand-assembling the NFA (as below) does the same thing with a fewer
+ // number of states. It also avoids needing the 'syntax' feature
+ // enabled.
+ //
+ // Technically all we need is the "match" state, but we add the
+ // "capture" states so that the PikeVM can use this NFA.
+ //
+ // The unwraps below are OK because we add so few states that they will
+ // never exhaust any default limits in any environment.
+ let mut builder = Builder::new();
+ let pid = builder.start_pattern().unwrap();
+ assert_eq!(pid.as_usize(), 0);
+ let start_id =
+ builder.add_capture_start(StateID::ZERO, 0, None).unwrap();
+ let end_id = builder.add_capture_end(StateID::ZERO, 0).unwrap();
+ let match_id = builder.add_match().unwrap();
+ builder.patch(start_id, end_id).unwrap();
+ builder.patch(end_id, match_id).unwrap();
+ let pid = builder.finish_pattern(start_id).unwrap();
+ assert_eq!(pid.as_usize(), 0);
+ builder.build(start_id, start_id).unwrap()
+ }
+
+ /// Returns an NFA that never matches at any position.
+ ///
+ /// This is a convenience routine for creating an NFA with zero patterns.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::{NFA, pikevm::PikeVM};
+ ///
+ /// let re = PikeVM::new_from_nfa(NFA::never_match())?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// re.captures(&mut cache, b"", &mut caps);
+ /// assert!(!caps.is_match());
+ /// re.captures(&mut cache, b"foo", &mut caps);
+ /// assert!(!caps.is_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn never_match() -> NFA {
+ // This always succeeds because it only requires one NFA state, which
+ // will never exhaust any (default) limits.
+ let mut builder = Builder::new();
+ let sid = builder.add_fail().unwrap();
+ builder.build(sid, sid).unwrap()
+ }
+
+ /// Return a default configuration for an `NFA`.
+ ///
+ /// This is a convenience routine to avoid needing to import the `Config`
+ /// type when customizing the construction of an NFA.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to build an NFA with a small size limit that
+ /// results in a compilation error for any regex that tries to use more
+ /// heap memory than the configured limit.
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::{NFA, pikevm::PikeVM};
+ ///
+ /// let result = PikeVM::builder()
+ /// .thompson(NFA::config().nfa_size_limit(Some(1_000)))
+ /// // Remember, \w is Unicode-aware by default and thus huge.
+ /// .build(r"\w+");
+ /// assert!(result.is_err());
+ /// ```
+ #[cfg(feature = "syntax")]
+ pub fn config() -> Config {
+ Config::new()
+ }
+
+ /// Return a compiler for configuring the construction of an `NFA`.
+ ///
+ /// This is a convenience routine to avoid needing to import the
+ /// [`Compiler`] type in common cases.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to build an NFA that is permitted match invalid
+ /// UTF-8. Without the additional syntax configuration here, compilation of
+ /// `(?-u:.)` would fail because it is permitted to match invalid UTF-8.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::pikevm::PikeVM,
+ /// util::syntax,
+ /// Match,
+ /// };
+ ///
+ /// let re = PikeVM::builder()
+ /// .syntax(syntax::Config::new().utf8(false))
+ /// .build(r"[a-z]+(?-u:.)")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// let expected = Some(Match::must(0, 1..5));
+ /// re.captures(&mut cache, b"\xFFabc\xFF", &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[cfg(feature = "syntax")]
+ pub fn compiler() -> Compiler {
+ Compiler::new()
+ }
+
+ /// Returns an iterator over all pattern identifiers in this NFA.
+ ///
+ /// Pattern IDs are allocated in sequential order starting from zero,
+ /// where the order corresponds to the order of patterns provided to the
+ /// [`NFA::new_many`] constructor.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::NFA, PatternID};
+ ///
+ /// let nfa = NFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
+ /// let pids: Vec<PatternID> = nfa.patterns().collect();
+ /// assert_eq!(pids, vec![
+ /// PatternID::must(0),
+ /// PatternID::must(1),
+ /// PatternID::must(2),
+ /// ]);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn patterns(&self) -> PatternIter<'_> {
+ PatternIter {
+ it: PatternID::iter(self.pattern_len()),
+ _marker: core::marker::PhantomData,
+ }
+ }
+
+ /// Returns the total number of regex patterns in this NFA.
+ ///
+ /// This may return zero if the NFA was constructed with no patterns. In
+ /// this case, the NFA can never produce a match for any input.
+ ///
+ /// This is guaranteed to be no bigger than [`PatternID::LIMIT`] because
+ /// NFA construction will fail if too many patterns are added.
+ ///
+ /// It is always true that `nfa.patterns().count() == nfa.pattern_len()`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::NFA;
+ ///
+ /// let nfa = NFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
+ /// assert_eq!(3, nfa.pattern_len());
+ ///
+ /// let nfa = NFA::never_match();
+ /// assert_eq!(0, nfa.pattern_len());
+ ///
+ /// let nfa = NFA::always_match();
+ /// assert_eq!(1, nfa.pattern_len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn pattern_len(&self) -> usize {
+ self.0.start_pattern.len()
+ }
+
+ /// Return the state identifier of the initial anchored state of this NFA.
+ ///
+ /// The returned identifier is guaranteed to be a valid index into the
+ /// slice returned by [`NFA::states`], and is also a valid argument to
+ /// [`NFA::state`].
+ ///
+ /// # Example
+ ///
+ /// This example shows a somewhat contrived example where we can easily
+ /// predict the anchored starting state.
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::{NFA, State, WhichCaptures};
+ ///
+ /// let nfa = NFA::compiler()
+ /// .configure(NFA::config().which_captures(WhichCaptures::None))
+ /// .build("a")?;
+ /// let state = nfa.state(nfa.start_anchored());
+ /// match *state {
+ /// State::ByteRange { trans } => {
+ /// assert_eq!(b'a', trans.start);
+ /// assert_eq!(b'a', trans.end);
+ /// }
+ /// _ => unreachable!("unexpected state"),
+ /// }
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn start_anchored(&self) -> StateID {
+ self.0.start_anchored
+ }
+
+ /// Return the state identifier of the initial unanchored state of this
+ /// NFA.
+ ///
+ /// This is equivalent to the identifier returned by
+ /// [`NFA::start_anchored`] when the NFA has no unanchored starting state.
+ ///
+ /// The returned identifier is guaranteed to be a valid index into the
+ /// slice returned by [`NFA::states`], and is also a valid argument to
+ /// [`NFA::state`].
+ ///
+ /// # Example
+ ///
+ /// This example shows that the anchored and unanchored starting states
+ /// are equivalent when an anchored NFA is built.
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::NFA;
+ ///
+ /// let nfa = NFA::new("^a")?;
+ /// assert_eq!(nfa.start_anchored(), nfa.start_unanchored());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn start_unanchored(&self) -> StateID {
+ self.0.start_unanchored
+ }
+
+ /// Return the state identifier of the initial anchored state for the given
+ /// pattern, or `None` if there is no pattern corresponding to the given
+ /// identifier.
+ ///
+ /// If one uses the starting state for a particular pattern, then the only
+ /// match that can be returned is for the corresponding pattern.
+ ///
+ /// The returned identifier is guaranteed to be a valid index into the
+ /// slice returned by [`NFA::states`], and is also a valid argument to
+ /// [`NFA::state`].
+ ///
+ /// # Errors
+ ///
+ /// If the pattern doesn't exist in this NFA, then this returns an error.
+ /// This occurs when `pid.as_usize() >= nfa.pattern_len()`.
+ ///
+ /// # Example
+ ///
+ /// This example shows that the anchored and unanchored starting states
+ /// are equivalent when an anchored NFA is built.
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::NFA, PatternID};
+ ///
+ /// let nfa = NFA::new_many(&["^a", "^b"])?;
+ /// // The anchored and unanchored states for the entire NFA are the same,
+ /// // since all of the patterns are anchored.
+ /// assert_eq!(nfa.start_anchored(), nfa.start_unanchored());
+ /// // But the anchored starting states for each pattern are distinct,
+ /// // because these starting states can only lead to matches for the
+ /// // corresponding pattern.
+ /// let anchored = Some(nfa.start_anchored());
+ /// assert_ne!(anchored, nfa.start_pattern(PatternID::must(0)));
+ /// assert_ne!(anchored, nfa.start_pattern(PatternID::must(1)));
+ /// // Requesting a pattern not in the NFA will result in None:
+ /// assert_eq!(None, nfa.start_pattern(PatternID::must(2)));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn start_pattern(&self, pid: PatternID) -> Option<StateID> {
+ self.0.start_pattern.get(pid.as_usize()).copied()
+ }
+
+ /// Get the byte class set for this NFA.
+ ///
+ /// A byte class set is a partitioning of this NFA's alphabet into
+ /// equivalence classes. Any two bytes in the same equivalence class are
+ /// guaranteed to never discriminate between a match or a non-match. (The
+ /// partitioning may not be minimal.)
+ ///
+ /// Byte classes are used internally by this crate when building DFAs.
+ /// Namely, among other optimizations, they enable a space optimization
+ /// where the DFA's internal alphabet is defined over the equivalence
+ /// classes of bytes instead of all possible byte values. The former is
+ /// often quite a bit smaller than the latter, which permits the DFA to use
+ /// less space for its transition table.
+ #[inline]
+ pub(crate) fn byte_class_set(&self) -> &ByteClassSet {
+ &self.0.byte_class_set
+ }
+
+ /// Get the byte classes for this NFA.
+ ///
+ /// Byte classes represent a partitioning of this NFA's alphabet into
+ /// equivalence classes. Any two bytes in the same equivalence class are
+ /// guaranteed to never discriminate between a match or a non-match. (The
+ /// partitioning may not be minimal.)
+ ///
+ /// Byte classes are used internally by this crate when building DFAs.
+ /// Namely, among other optimizations, they enable a space optimization
+ /// where the DFA's internal alphabet is defined over the equivalence
+ /// classes of bytes instead of all possible byte values. The former is
+ /// often quite a bit smaller than the latter, which permits the DFA to use
+ /// less space for its transition table.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to query the class of various bytes.
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::NFA;
+ ///
+ /// let nfa = NFA::new("[a-z]+")?;
+ /// let classes = nfa.byte_classes();
+ /// // 'a' and 'z' are in the same class for this regex.
+ /// assert_eq!(classes.get(b'a'), classes.get(b'z'));
+ /// // But 'a' and 'A' are not.
+ /// assert_ne!(classes.get(b'a'), classes.get(b'A'));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn byte_classes(&self) -> &ByteClasses {
+ &self.0.byte_classes
+ }
+
+ /// Return a reference to the NFA state corresponding to the given ID.
+ ///
+ /// This is a convenience routine for `nfa.states()[id]`.
+ ///
+ /// # Panics
+ ///
+ /// This panics when the given identifier does not reference a valid state.
+ /// That is, when `id.as_usize() >= nfa.states().len()`.
+ ///
+ /// # Example
+ ///
+ /// The anchored state for a pattern will typically correspond to a
+ /// capturing state for that pattern. (Although, this is not an API
+ /// guarantee!)
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::{NFA, State}, PatternID};
+ ///
+ /// let nfa = NFA::new("a")?;
+ /// let state = nfa.state(nfa.start_pattern(PatternID::ZERO).unwrap());
+ /// match *state {
+ /// State::Capture { slot, .. } => {
+ /// assert_eq!(0, slot.as_usize());
+ /// }
+ /// _ => unreachable!("unexpected state"),
+ /// }
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn state(&self, id: StateID) -> &State {
+ &self.states()[id]
+ }
+
+ /// Returns a slice of all states in this NFA.
+ ///
+ /// The slice returned is indexed by `StateID`. This provides a convenient
+ /// way to access states while following transitions among those states.
+ ///
+ /// # Example
+ ///
+ /// This demonstrates that disabling UTF-8 mode can shrink the size of the
+ /// NFA considerably in some cases, especially when using Unicode character
+ /// classes.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::nfa::thompson::NFA;
+ ///
+ /// let nfa_unicode = NFA::new(r"\w")?;
+ /// let nfa_ascii = NFA::new(r"(?-u)\w")?;
+ /// // Yes, a factor of 45 difference. No lie.
+ /// assert!(40 * nfa_ascii.states().len() < nfa_unicode.states().len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn states(&self) -> &[State] {
+ &self.0.states
+ }
+
+ /// Returns the capturing group info for this NFA.
+ ///
+ /// The [`GroupInfo`] provides a way to map to and from capture index
+ /// and capture name for each pattern. It also provides a mapping from
+ /// each of the capturing groups in every pattern to their corresponding
+ /// slot offsets encoded in [`State::Capture`] states.
+ ///
+ /// Note that `GroupInfo` uses reference counting internally, such that
+ /// cloning a `GroupInfo` is very cheap.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to get a list of all capture group names for
+ /// a particular pattern.
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::NFA, PatternID};
+ ///
+ /// let nfa = NFA::new(r"(a)(?P<foo>b)(c)(d)(?P<bar>e)")?;
+ /// // The first is the implicit group that is always unnammed. The next
+ /// // 5 groups are the explicit groups found in the concrete syntax above.
+ /// let expected = vec![None, None, Some("foo"), None, None, Some("bar")];
+ /// let got: Vec<Option<&str>> =
+ /// nfa.group_info().pattern_names(PatternID::ZERO).collect();
+ /// assert_eq!(expected, got);
+ ///
+ /// // Using an invalid pattern ID will result in nothing yielded.
+ /// let got = nfa.group_info().pattern_names(PatternID::must(999)).count();
+ /// assert_eq!(0, got);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn group_info(&self) -> &GroupInfo {
+ &self.0.group_info()
+ }
+
+ /// Returns true if and only if this NFA has at least one
+ /// [`Capture`](State::Capture) in its sequence of states.
+ ///
+ /// This is useful as a way to perform a quick test before attempting
+ /// something that does or does not require capture states. For example,
+ /// some regex engines (like the PikeVM) require capture states in order to
+ /// work at all.
+ ///
+ /// # Example
+ ///
+ /// This example shows a few different NFAs and whether they have captures
+ /// or not.
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::{NFA, WhichCaptures};
+ ///
+ /// // Obviously has capture states.
+ /// let nfa = NFA::new("(a)")?;
+ /// assert!(nfa.has_capture());
+ ///
+ /// // Less obviously has capture states, because every pattern has at
+ /// // least one anonymous capture group corresponding to the match for the
+ /// // entire pattern.
+ /// let nfa = NFA::new("a")?;
+ /// assert!(nfa.has_capture());
+ ///
+ /// // Other than hand building your own NFA, this is the only way to build
+ /// // an NFA without capturing groups. In general, you should only do this
+ /// // if you don't intend to use any of the NFA-oriented regex engines.
+ /// // Overall, capturing groups don't have many downsides. Although they
+ /// // can add a bit of noise to simple NFAs, so it can be nice to disable
+ /// // them for debugging purposes.
+ /// //
+ /// // Notice that 'has_capture' is false here even when we have an
+ /// // explicit capture group in the pattern.
+ /// let nfa = NFA::compiler()
+ /// .configure(NFA::config().which_captures(WhichCaptures::None))
+ /// .build("(a)")?;
+ /// assert!(!nfa.has_capture());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn has_capture(&self) -> bool {
+ self.0.has_capture
+ }
+
+ /// Returns true if and only if this NFA can match the empty string.
+ /// When it returns false, all possible matches are guaranteed to have a
+ /// non-zero length.
+ ///
+ /// This is useful as cheap way to know whether code needs to handle the
+ /// case of a zero length match. This is particularly important when UTF-8
+ /// modes are enabled, as when UTF-8 mode is enabled, empty matches that
+ /// split a codepoint must never be reported. This extra handling can
+ /// sometimes be costly, and since regexes matching an empty string are
+ /// somewhat rare, it can be beneficial to treat such regexes specially.
+ ///
+ /// # Example
+ ///
+ /// This example shows a few different NFAs and whether they match the
+ /// empty string or not. Notice the empty string isn't merely a matter
+ /// of a string of length literally `0`, but rather, whether a match can
+ /// occur between specific pairs of bytes.
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::NFA, util::syntax};
+ ///
+ /// // The empty regex matches the empty string.
+ /// let nfa = NFA::new("")?;
+ /// assert!(nfa.has_empty(), "empty matches empty");
+ /// // The '+' repetition operator requires at least one match, and so
+ /// // does not match the empty string.
+ /// let nfa = NFA::new("a+")?;
+ /// assert!(!nfa.has_empty(), "+ does not match empty");
+ /// // But the '*' repetition operator does.
+ /// let nfa = NFA::new("a*")?;
+ /// assert!(nfa.has_empty(), "* does match empty");
+ /// // And wrapping '+' in an operator that can match an empty string also
+ /// // causes it to match the empty string too.
+ /// let nfa = NFA::new("(a+)*")?;
+ /// assert!(nfa.has_empty(), "+ inside of * matches empty");
+ ///
+ /// // If a regex is just made of a look-around assertion, even if the
+ /// // assertion requires some kind of non-empty string around it (such as
+ /// // \b), then it is still treated as if it matches the empty string.
+ /// // Namely, if a match occurs of just a look-around assertion, then the
+ /// // match returned is empty.
+ /// let nfa = NFA::compiler()
+ /// .syntax(syntax::Config::new().utf8(false))
+ /// .build(r"^$\A\z\b\B(?-u:\b\B)")?;
+ /// assert!(nfa.has_empty(), "assertions match empty");
+ /// // Even when an assertion is wrapped in a '+', it still matches the
+ /// // empty string.
+ /// let nfa = NFA::new(r"\b+")?;
+ /// assert!(nfa.has_empty(), "+ of an assertion matches empty");
+ ///
+ /// // An alternation with even one branch that can match the empty string
+ /// // is also said to match the empty string overall.
+ /// let nfa = NFA::new("foo|(bar)?|quux")?;
+ /// assert!(nfa.has_empty(), "alternations can match empty");
+ ///
+ /// // An NFA that matches nothing does not match the empty string.
+ /// let nfa = NFA::new("[a&&b]")?;
+ /// assert!(!nfa.has_empty(), "never matching means not matching empty");
+ /// // But if it's wrapped in something that doesn't require a match at
+ /// // all, then it can match the empty string!
+ /// let nfa = NFA::new("[a&&b]*")?;
+ /// assert!(nfa.has_empty(), "* on never-match still matches empty");
+ /// // Since a '+' requires a match, using it on something that can never
+ /// // match will itself produce a regex that can never match anything,
+ /// // and thus does not match the empty string.
+ /// let nfa = NFA::new("[a&&b]+")?;
+ /// assert!(!nfa.has_empty(), "+ on never-match still matches nothing");
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn has_empty(&self) -> bool {
+ self.0.has_empty
+ }
+
+ /// Whether UTF-8 mode is enabled for this NFA or not.
+ ///
+ /// When UTF-8 mode is enabled, all matches reported by a regex engine
+ /// derived from this NFA are guaranteed to correspond to spans of valid
+ /// UTF-8. This includes zero-width matches. For example, the regex engine
+ /// must guarantee that the empty regex will not match at the positions
+ /// between code units in the UTF-8 encoding of a single codepoint.
+ ///
+ /// See [`Config::utf8`] for more information.
+ ///
+ /// This is enabled by default.
+ ///
+ /// # Example
+ ///
+ /// This example shows how UTF-8 mode can impact the match spans that may
+ /// be reported in certain cases.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::{self, pikevm::PikeVM},
+ /// Match, Input,
+ /// };
+ ///
+ /// let re = PikeVM::new("")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// // UTF-8 mode is enabled by default.
+ /// let mut input = Input::new("☃");
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(Some(Match::must(0, 0..0)), caps.get_match());
+ ///
+ /// // Even though an empty regex matches at 1..1, our next match is
+ /// // 3..3 because 1..1 and 2..2 split the snowman codepoint (which is
+ /// // three bytes long).
+ /// input.set_start(1);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
+ ///
+ /// // But if we disable UTF-8, then we'll get matches at 1..1 and 2..2:
+ /// let re = PikeVM::builder()
+ /// .thompson(thompson::Config::new().utf8(false))
+ /// .build("")?;
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(Some(Match::must(0, 1..1)), caps.get_match());
+ ///
+ /// input.set_start(2);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(Some(Match::must(0, 2..2)), caps.get_match());
+ ///
+ /// input.set_start(3);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
+ ///
+ /// input.set_start(4);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(None, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn is_utf8(&self) -> bool {
+ self.0.utf8
+ }
+
+ /// Returns true when this NFA is meant to be matched in reverse.
+ ///
+ /// Generally speaking, when this is true, it means the NFA is supposed to
+ /// be used in conjunction with moving backwards through the haystack. That
+ /// is, from a higher memory address to a lower memory address.
+ ///
+ /// It is often the case that lower level routines dealing with an NFA
+ /// don't need to care about whether it is "meant" to be matched in reverse
+ /// or not. However, there are some specific cases where it matters. For
+ /// example, the implementation of CRLF-aware `^` and `$` line anchors
+ /// needs to know whether the search is in the forward or reverse
+ /// direction. In the forward direction, neither `^` nor `$` should match
+ /// when a `\r` has been seen previously and a `\n` is next. However, in
+ /// the reverse direction, neither `^` nor `$` should match when a `\n`
+ /// has been seen previously and a `\r` is next. This fundamentally changes
+ /// how the state machine is constructed, and thus needs to be altered
+ /// based on the direction of the search.
+ ///
+ /// This is automatically set when using a [`Compiler`] with a configuration
+ /// where [`Config::reverse`] is enabled. If you're building your own NFA
+ /// by hand via a [`Builder`]
+ #[inline]
+ pub fn is_reverse(&self) -> bool {
+ self.0.reverse
+ }
+
+ /// Returns true if and only if all starting states for this NFA correspond
+ /// to the beginning of an anchored search.
+ ///
+ /// Typically, an NFA will have both an anchored and an unanchored starting
+ /// state. Namely, because it tends to be useful to have both and the cost
+ /// of having an unanchored starting state is almost zero (for an NFA).
+ /// However, if all patterns in the NFA are themselves anchored, then even
+ /// the unanchored starting state will correspond to an anchored search
+ /// since the pattern doesn't permit anything else.
+ ///
+ /// # Example
+ ///
+ /// This example shows a few different scenarios where this method's
+ /// return value varies.
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::NFA;
+ ///
+ /// // The unanchored starting state permits matching this pattern anywhere
+ /// // in a haystack, instead of just at the beginning.
+ /// let nfa = NFA::new("a")?;
+ /// assert!(!nfa.is_always_start_anchored());
+ ///
+ /// // In this case, the pattern is itself anchored, so there is no way
+ /// // to run an unanchored search.
+ /// let nfa = NFA::new("^a")?;
+ /// assert!(nfa.is_always_start_anchored());
+ ///
+ /// // When multiline mode is enabled, '^' can match at the start of a line
+ /// // in addition to the start of a haystack, so an unanchored search is
+ /// // actually possible.
+ /// let nfa = NFA::new("(?m)^a")?;
+ /// assert!(!nfa.is_always_start_anchored());
+ ///
+ /// // Weird cases also work. A pattern is only considered anchored if all
+ /// // matches may only occur at the start of a haystack.
+ /// let nfa = NFA::new("(^a)|a")?;
+ /// assert!(!nfa.is_always_start_anchored());
+ ///
+ /// // When multiple patterns are present, if they are all anchored, then
+ /// // the NFA is always anchored too.
+ /// let nfa = NFA::new_many(&["^a", "^b", "^c"])?;
+ /// assert!(nfa.is_always_start_anchored());
+ ///
+ /// // But if one pattern is unanchored, then the NFA must permit an
+ /// // unanchored search.
+ /// let nfa = NFA::new_many(&["^a", "b", "^c"])?;
+ /// assert!(!nfa.is_always_start_anchored());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn is_always_start_anchored(&self) -> bool {
+ self.start_anchored() == self.start_unanchored()
+ }
+
+ /// Returns the look-around matcher associated with this NFA.
+ ///
+ /// A look-around matcher determines how to match look-around assertions.
+ /// In particular, some assertions are configurable. For example, the
+ /// `(?m:^)` and `(?m:$)` assertions can have their line terminator changed
+ /// from the default of `\n` to any other byte.
+ ///
+ /// If the NFA was built using a [`Compiler`], then this matcher
+ /// can be set via the [`Config::look_matcher`] configuration
+ /// knob. Otherwise, if you've built an NFA by hand, it is set via
+ /// [`Builder::set_look_matcher`].
+ ///
+ /// # Example
+ ///
+ /// This shows how to change the line terminator for multi-line assertions.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::{self, pikevm::PikeVM},
+ /// util::look::LookMatcher,
+ /// Match, Input,
+ /// };
+ ///
+ /// let mut lookm = LookMatcher::new();
+ /// lookm.set_line_terminator(b'\x00');
+ ///
+ /// let re = PikeVM::builder()
+ /// .thompson(thompson::Config::new().look_matcher(lookm))
+ /// .build(r"(?m)^[a-z]+$")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// // Multi-line assertions now use NUL as a terminator.
+ /// assert_eq!(
+ /// Some(Match::must(0, 1..4)),
+ /// re.find(&mut cache, b"\x00abc\x00"),
+ /// );
+ /// // ... and \n is no longer recognized as a terminator.
+ /// assert_eq!(
+ /// None,
+ /// re.find(&mut cache, b"\nabc\n"),
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn look_matcher(&self) -> &LookMatcher {
+ &self.0.look_matcher
+ }
+
+ /// Returns the union of all look-around assertions used throughout this
+ /// NFA. When the returned set is empty, it implies that the NFA has no
+ /// look-around assertions and thus zero conditional epsilon transitions.
+ ///
+ /// This is useful in some cases enabling optimizations. It is not
+ /// unusual, for example, for optimizations to be of the form, "for any
+ /// regex with zero conditional epsilon transitions, do ..." where "..."
+ /// is some kind of optimization.
+ ///
+ /// This isn't only helpful for optimizations either. Sometimes look-around
+ /// assertions are difficult to support. For example, many of the DFAs in
+ /// this crate don't support Unicode word boundaries or handle them using
+ /// heuristics. Handling that correctly typically requires some kind of
+ /// cheap check of whether the NFA has a Unicode word boundary in the first
+ /// place.
+ ///
+ /// # Example
+ ///
+ /// This example shows how this routine varies based on the regex pattern:
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::NFA, util::look::Look};
+ ///
+ /// // No look-around at all.
+ /// let nfa = NFA::new("a")?;
+ /// assert!(nfa.look_set_any().is_empty());
+ ///
+ /// // When multiple patterns are present, since this returns the union,
+ /// // it will include look-around assertions that only appear in one
+ /// // pattern.
+ /// let nfa = NFA::new_many(&["a", "b", "a^b", "c"])?;
+ /// assert!(nfa.look_set_any().contains(Look::Start));
+ ///
+ /// // Some groups of assertions have various shortcuts. For example:
+ /// let nfa = NFA::new(r"(?-u:\b)")?;
+ /// assert!(nfa.look_set_any().contains_word());
+ /// assert!(!nfa.look_set_any().contains_word_unicode());
+ /// assert!(nfa.look_set_any().contains_word_ascii());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn look_set_any(&self) -> LookSet {
+ self.0.look_set_any
+ }
+
+ /// Returns the union of all prefix look-around assertions for every
+ /// pattern in this NFA. When the returned set is empty, it implies none of
+ /// the patterns require moving through a conditional epsilon transition
+ /// before inspecting the first byte in the haystack.
+ ///
+ /// This can be useful for determining what kinds of assertions need to be
+ /// satisfied at the beginning of a search. For example, typically DFAs
+ /// in this crate will build a distinct starting state for each possible
+ /// starting configuration that might result in look-around assertions
+ /// being satisfied differently. However, if the set returned here is
+ /// empty, then you know that the start state is invariant because there
+ /// are no conditional epsilon transitions to consider.
+ ///
+ /// # Example
+ ///
+ /// This example shows how this routine varies based on the regex pattern:
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::NFA, util::look::Look};
+ ///
+ /// // No look-around at all.
+ /// let nfa = NFA::new("a")?;
+ /// assert!(nfa.look_set_prefix_any().is_empty());
+ ///
+ /// // When multiple patterns are present, since this returns the union,
+ /// // it will include look-around assertions that only appear in one
+ /// // pattern. But it will only include assertions that are in the prefix
+ /// // of a pattern. For example, this includes '^' but not '$' even though
+ /// // '$' does appear.
+ /// let nfa = NFA::new_many(&["a", "b", "^ab$", "c"])?;
+ /// assert!(nfa.look_set_prefix_any().contains(Look::Start));
+ /// assert!(!nfa.look_set_prefix_any().contains(Look::End));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn look_set_prefix_any(&self) -> LookSet {
+ self.0.look_set_prefix_any
+ }
+
+ // FIXME: The `look_set_prefix_all` computation was not correct, and it
+ // seemed a little tricky to fix it. Since I wasn't actually using it for
+ // anything, I just decided to remove it in the run up to the regex 1.9
+ // release. If you need this, please file an issue.
+ /*
+ /// Returns the intersection of all prefix look-around assertions for every
+ /// pattern in this NFA. When the returned set is empty, it implies at
+ /// least one of the patterns does not require moving through a conditional
+ /// epsilon transition before inspecting the first byte in the haystack.
+ /// Conversely, when the set contains an assertion, it implies that every
+ /// pattern in the NFA also contains that assertion in its prefix.
+ ///
+ /// This can be useful for determining what kinds of assertions need to be
+ /// satisfied at the beginning of a search. For example, if you know that
+ /// [`Look::Start`] is in the prefix intersection set returned here, then
+ /// you know that all searches, regardless of input configuration, will be
+ /// anchored.
+ ///
+ /// # Example
+ ///
+ /// This example shows how this routine varies based on the regex pattern:
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::NFA, util::look::Look};
+ ///
+ /// // No look-around at all.
+ /// let nfa = NFA::new("a")?;
+ /// assert!(nfa.look_set_prefix_all().is_empty());
+ ///
+ /// // When multiple patterns are present, since this returns the
+ /// // intersection, it will only include assertions present in every
+ /// // prefix, and only the prefix.
+ /// let nfa = NFA::new_many(&["^a$", "^b$", "$^ab$", "^c$"])?;
+ /// assert!(nfa.look_set_prefix_all().contains(Look::Start));
+ /// assert!(!nfa.look_set_prefix_all().contains(Look::End));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn look_set_prefix_all(&self) -> LookSet {
+ self.0.look_set_prefix_all
+ }
+ */
+
+ /// Returns the memory usage, in bytes, of this NFA.
+ ///
+ /// This does **not** include the stack size used up by this NFA. To
+ /// compute that, use `std::mem::size_of::<NFA>()`.
+ ///
+ /// # Example
+ ///
+ /// This example shows that large Unicode character classes can use quite
+ /// a bit of memory.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::nfa::thompson::NFA;
+ ///
+ /// let nfa_unicode = NFA::new(r"\w")?;
+ /// let nfa_ascii = NFA::new(r"(?-u:\w)")?;
+ ///
+ /// assert!(10 * nfa_ascii.memory_usage() < nfa_unicode.memory_usage());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn memory_usage(&self) -> usize {
+ use core::mem::size_of;
+
+ size_of::<Inner>() // allocated on the heap via Arc
+ + self.0.states.len() * size_of::<State>()
+ + self.0.start_pattern.len() * size_of::<StateID>()
+ + self.0.group_info.memory_usage()
+ + self.0.memory_extra
+ }
+}
+
+impl fmt::Debug for NFA {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ self.0.fmt(f)
+ }
+}
+
+/// The "inner" part of the NFA. We split this part out so that we can easily
+/// wrap it in an `Arc` above in the definition of `NFA`.
+///
+/// See builder.rs for the code that actually builds this type. This module
+/// does provide (internal) mutable methods for adding things to this
+/// NFA before finalizing it, but the high level construction process is
+/// controlled by the builder abstraction. (Which is complicated enough to
+/// get its own module.)
+#[derive(Default)]
+pub(super) struct Inner {
+ /// The state sequence. This sequence is guaranteed to be indexable by all
+ /// starting state IDs, and it is also guaranteed to contain at most one
+ /// `Match` state for each pattern compiled into this NFA. (A pattern may
+ /// not have a corresponding `Match` state if a `Match` state is impossible
+ /// to reach.)
+ states: Vec<State>,
+ /// The anchored starting state of this NFA.
+ start_anchored: StateID,
+ /// The unanchored starting state of this NFA.
+ start_unanchored: StateID,
+ /// The starting states for each individual pattern. Starting at any
+ /// of these states will result in only an anchored search for the
+ /// corresponding pattern. The vec is indexed by pattern ID. When the NFA
+ /// contains a single regex, then `start_pattern[0]` and `start_anchored`
+ /// are always equivalent.
+ start_pattern: Vec<StateID>,
+ /// Info about the capturing groups in this NFA. This is responsible for
+ /// mapping groups to slots, mapping groups to names and names to groups.
+ group_info: GroupInfo,
+ /// A representation of equivalence classes over the transitions in this
+ /// NFA. Two bytes in the same equivalence class must not discriminate
+ /// between a match or a non-match. This map can be used to shrink the
+ /// total size of a DFA's transition table with a small match-time cost.
+ ///
+ /// Note that the NFA's transitions are *not* defined in terms of these
+ /// equivalence classes. The NFA's transitions are defined on the original
+ /// byte values. For the most part, this is because they wouldn't really
+ /// help the NFA much since the NFA already uses a sparse representation
+ /// to represent transitions. Byte classes are most effective in a dense
+ /// representation.
+ byte_class_set: ByteClassSet,
+ /// This is generated from `byte_class_set`, and essentially represents the
+ /// same thing but supports different access patterns. Namely, this permits
+ /// looking up the equivalence class of a byte very cheaply.
+ ///
+ /// Ideally we would just store this, but because of annoying code
+ /// structure reasons, we keep both this and `byte_class_set` around for
+ /// now. I think I would prefer that `byte_class_set` were computed in the
+ /// `Builder`, but right now, we compute it as states are added to the
+ /// `NFA`.
+ byte_classes: ByteClasses,
+ /// Whether this NFA has a `Capture` state anywhere.
+ has_capture: bool,
+ /// When the empty string is in the language matched by this NFA.
+ has_empty: bool,
+ /// Whether UTF-8 mode is enabled for this NFA. Briefly, this means that
+ /// all non-empty matches produced by this NFA correspond to spans of valid
+ /// UTF-8, and any empty matches produced by this NFA that split a UTF-8
+ /// encoded codepoint should be filtered out by the corresponding regex
+ /// engine.
+ utf8: bool,
+ /// Whether this NFA is meant to be matched in reverse or not.
+ reverse: bool,
+ /// The matcher to be used for look-around assertions.
+ look_matcher: LookMatcher,
+ /// The union of all look-around assertions that occur anywhere within
+ /// this NFA. If this set is empty, then it means there are precisely zero
+ /// conditional epsilon transitions in the NFA.
+ look_set_any: LookSet,
+ /// The union of all look-around assertions that occur as a zero-length
+ /// prefix for any of the patterns in this NFA.
+ look_set_prefix_any: LookSet,
+ /*
+ /// The intersection of all look-around assertions that occur as a
+ /// zero-length prefix for any of the patterns in this NFA.
+ look_set_prefix_all: LookSet,
+ */
+ /// Heap memory used indirectly by NFA states and other things (like the
+ /// various capturing group representations above). Since each state
+ /// might use a different amount of heap, we need to keep track of this
+ /// incrementally.
+ memory_extra: usize,
+}
+
+impl Inner {
+ /// Runs any last finalization bits and turns this into a full NFA.
+ pub(super) fn into_nfa(mut self) -> NFA {
+ self.byte_classes = self.byte_class_set.byte_classes();
+ // Do epsilon closure from the start state of every pattern in order
+ // to compute various properties such as look-around assertions and
+ // whether the empty string can be matched.
+ let mut stack = vec![];
+ let mut seen = SparseSet::new(self.states.len());
+ for &start_id in self.start_pattern.iter() {
+ stack.push(start_id);
+ seen.clear();
+ // let mut prefix_all = LookSet::full();
+ let mut prefix_any = LookSet::empty();
+ while let Some(sid) = stack.pop() {
+ if !seen.insert(sid) {
+ continue;
+ }
+ match self.states[sid] {
+ State::ByteRange { .. }
+ | State::Dense { .. }
+ | State::Fail => continue,
+ State::Sparse(_) => {
+ // This snippet below will rewrite this sparse state
+ // as a dense state. By doing it here, we apply this
+ // optimization to all hot "sparse" states since these
+ // are the states that are reachable from the start
+ // state via an epsilon closure.
+ //
+ // Unfortunately, this optimization did not seem to
+ // help much in some very limited ad hoc benchmarking.
+ //
+ // I left the 'Dense' state type in place in case we
+ // want to revisit this, but I suspect the real way
+ // to make forward progress is a more fundamental
+ // rearchitecting of how data in the NFA is laid out.
+ // I think we should consider a single contiguous
+ // allocation instead of all this indirection and
+ // potential heap allocations for every state. But this
+ // is a large re-design and will require API breaking
+ // changes.
+ // self.memory_extra -= self.states[sid].memory_usage();
+ // let trans = DenseTransitions::from_sparse(sparse);
+ // self.states[sid] = State::Dense(trans);
+ // self.memory_extra += self.states[sid].memory_usage();
+ continue;
+ }
+ State::Match { .. } => self.has_empty = true,
+ State::Look { look, next } => {
+ prefix_any = prefix_any.insert(look);
+ stack.push(next);
+ }
+ State::Union { ref alternates } => {
+ // Order doesn't matter here, since we're just dealing
+ // with look-around sets. But if we do richer analysis
+ // here that needs to care about preference order, then
+ // this should be done in reverse.
+ stack.extend(alternates.iter());
+ }
+ State::BinaryUnion { alt1, alt2 } => {
+ stack.push(alt2);
+ stack.push(alt1);
+ }
+ State::Capture { next, .. } => {
+ stack.push(next);
+ }
+ }
+ }
+ self.look_set_prefix_any =
+ self.look_set_prefix_any.union(prefix_any);
+ }
+ NFA(Arc::new(self))
+ }
+
+ /// Returns the capturing group info for this NFA.
+ pub(super) fn group_info(&self) -> &GroupInfo {
+ &self.group_info
+ }
+
+ /// Add the given state to this NFA after allocating a fresh identifier for
+ /// it.
+ ///
+ /// This panics if too many states are added such that a fresh identifier
+ /// could not be created. (Currently, the only caller of this routine is
+ /// a `Builder`, and it upholds this invariant.)
+ pub(super) fn add(&mut self, state: State) -> StateID {
+ match state {
+ State::ByteRange { ref trans } => {
+ self.byte_class_set.set_range(trans.start, trans.end);
+ }
+ State::Sparse(ref sparse) => {
+ for trans in sparse.transitions.iter() {
+ self.byte_class_set.set_range(trans.start, trans.end);
+ }
+ }
+ State::Dense { .. } => unreachable!(),
+ State::Look { look, .. } => {
+ self.look_matcher
+ .add_to_byteset(look, &mut self.byte_class_set);
+ self.look_set_any = self.look_set_any.insert(look);
+ }
+ State::Capture { .. } => {
+ self.has_capture = true;
+ }
+ State::Union { .. }
+ | State::BinaryUnion { .. }
+ | State::Fail
+ | State::Match { .. } => {}
+ }
+
+ let id = StateID::new(self.states.len()).unwrap();
+ self.memory_extra += state.memory_usage();
+ self.states.push(state);
+ id
+ }
+
+ /// Set the starting state identifiers for this NFA.
+ ///
+ /// `start_anchored` and `start_unanchored` may be equivalent. When they
+ /// are, then the NFA can only execute anchored searches. This might
+ /// occur, for example, for patterns that are unconditionally anchored.
+ /// e.g., `^foo`.
+ pub(super) fn set_starts(
+ &mut self,
+ start_anchored: StateID,
+ start_unanchored: StateID,
+ start_pattern: &[StateID],
+ ) {
+ self.start_anchored = start_anchored;
+ self.start_unanchored = start_unanchored;
+ self.start_pattern = start_pattern.to_vec();
+ }
+
+ /// Sets the UTF-8 mode of this NFA.
+ pub(super) fn set_utf8(&mut self, yes: bool) {
+ self.utf8 = yes;
+ }
+
+ /// Sets the reverse mode of this NFA.
+ pub(super) fn set_reverse(&mut self, yes: bool) {
+ self.reverse = yes;
+ }
+
+ /// Sets the look-around assertion matcher for this NFA.
+ pub(super) fn set_look_matcher(&mut self, m: LookMatcher) {
+ self.look_matcher = m;
+ }
+
+ /// Set the capturing groups for this NFA.
+ ///
+ /// The given slice should contain the capturing groups for each pattern,
+ /// The capturing groups in turn should correspond to the total number of
+ /// capturing groups in the pattern, including the anonymous first capture
+ /// group for each pattern. If a capturing group does have a name, then it
+ /// should be provided as a Arc<str>.
+ ///
+ /// This returns an error if a corresponding `GroupInfo` could not be
+ /// built.
+ pub(super) fn set_captures(
+ &mut self,
+ captures: &[Vec<Option<Arc<str>>>],
+ ) -> Result<(), GroupInfoError> {
+ self.group_info = GroupInfo::new(
+ captures.iter().map(|x| x.iter().map(|y| y.as_ref())),
+ )?;
+ Ok(())
+ }
+
+ /// Remap the transitions in every state of this NFA using the given map.
+ /// The given map should be indexed according to state ID namespace used by
+ /// the transitions of the states currently in this NFA.
+ ///
+ /// This is particularly useful to the NFA builder, since it is convenient
+ /// to add NFA states in order to produce their final IDs. Then, after all
+ /// of the intermediate "empty" states (unconditional epsilon transitions)
+ /// have been removed from the builder's representation, we can re-map all
+ /// of the transitions in the states already added to their final IDs.
+ pub(super) fn remap(&mut self, old_to_new: &[StateID]) {
+ for state in &mut self.states {
+ state.remap(old_to_new);
+ }
+ self.start_anchored = old_to_new[self.start_anchored];
+ self.start_unanchored = old_to_new[self.start_unanchored];
+ for id in self.start_pattern.iter_mut() {
+ *id = old_to_new[*id];
+ }
+ }
+}
+
+impl fmt::Debug for Inner {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ writeln!(f, "thompson::NFA(")?;
+ for (sid, state) in self.states.iter().with_state_ids() {
+ let status = if sid == self.start_anchored {
+ '^'
+ } else if sid == self.start_unanchored {
+ '>'
+ } else {
+ ' '
+ };
+ writeln!(f, "{}{:06?}: {:?}", status, sid.as_usize(), state)?;
+ }
+ let pattern_len = self.start_pattern.len();
+ if pattern_len > 1 {
+ writeln!(f, "")?;
+ for pid in 0..pattern_len {
+ let sid = self.start_pattern[pid];
+ writeln!(f, "START({:06?}): {:?}", pid, sid.as_usize())?;
+ }
+ }
+ writeln!(f, "")?;
+ writeln!(
+ f,
+ "transition equivalence classes: {:?}",
+ self.byte_classes,
+ )?;
+ writeln!(f, ")")?;
+ Ok(())
+ }
+}
+
+/// A state in an NFA.
+///
+/// In theory, it can help to conceptualize an `NFA` as a graph consisting of
+/// `State`s. Each `State` contains its complete set of outgoing transitions.
+///
+/// In practice, it can help to conceptualize an `NFA` as a sequence of
+/// instructions for a virtual machine. Each `State` says what to do and where
+/// to go next.
+///
+/// Strictly speaking, the practical interpretation is the most correct one,
+/// because of the [`Capture`](State::Capture) state. Namely, a `Capture`
+/// state always forwards execution to another state unconditionally. Its only
+/// purpose is to cause a side effect: the recording of the current input
+/// position at a particular location in memory. In this sense, an `NFA`
+/// has more power than a theoretical non-deterministic finite automaton.
+///
+/// For most uses of this crate, it is likely that one may never even need to
+/// be aware of this type at all. The main use cases for looking at `State`s
+/// directly are if you need to write your own search implementation or if you
+/// need to do some kind of analysis on the NFA.
+#[derive(Clone, Eq, PartialEq)]
+pub enum State {
+ /// A state with a single transition that can only be taken if the current
+ /// input symbol is in a particular range of bytes.
+ ByteRange {
+ /// The transition from this state to the next.
+ trans: Transition,
+ },
+ /// A state with possibly many transitions represented in a sparse fashion.
+ /// Transitions are non-overlapping and ordered lexicographically by input
+ /// range.
+ ///
+ /// In practice, this is used for encoding UTF-8 automata. Its presence is
+ /// primarily an optimization that avoids many additional unconditional
+ /// epsilon transitions (via [`Union`](State::Union) states), and thus
+ /// decreases the overhead of traversing the NFA. This can improve both
+ /// matching time and DFA construction time.
+ Sparse(SparseTransitions),
+ /// A dense representation of a state with multiple transitions.
+ Dense(DenseTransitions),
+ /// A conditional epsilon transition satisfied via some sort of
+ /// look-around. Look-around is limited to anchor and word boundary
+ /// assertions.
+ ///
+ /// Look-around states are meant to be evaluated while performing epsilon
+ /// closure (computing the set of states reachable from a particular state
+ /// via only epsilon transitions). If the current position in the haystack
+ /// satisfies the look-around assertion, then you're permitted to follow
+ /// that epsilon transition.
+ Look {
+ /// The look-around assertion that must be satisfied before moving
+ /// to `next`.
+ look: Look,
+ /// The state to transition to if the look-around assertion is
+ /// satisfied.
+ next: StateID,
+ },
+ /// An alternation such that there exists an epsilon transition to all
+ /// states in `alternates`, where matches found via earlier transitions
+ /// are preferred over later transitions.
+ Union {
+ /// An ordered sequence of unconditional epsilon transitions to other
+ /// states. Transitions earlier in the sequence are preferred over
+ /// transitions later in the sequence.
+ alternates: Box<[StateID]>,
+ },
+ /// An alternation such that there exists precisely two unconditional
+ /// epsilon transitions, where matches found via `alt1` are preferred over
+ /// matches found via `alt2`.
+ ///
+ /// This state exists as a common special case of Union where there are
+ /// only two alternates. In this case, we don't need any allocations to
+ /// represent the state. This saves a bit of memory and also saves an
+ /// additional memory access when traversing the NFA.
+ BinaryUnion {
+ /// An unconditional epsilon transition to another NFA state. This
+ /// is preferred over `alt2`.
+ alt1: StateID,
+ /// An unconditional epsilon transition to another NFA state. Matches
+ /// reported via this transition should only be reported if no matches
+ /// were found by following `alt1`.
+ alt2: StateID,
+ },
+ /// An empty state that records a capture location.
+ ///
+ /// From the perspective of finite automata, this is precisely equivalent
+ /// to an unconditional epsilon transition, but serves the purpose of
+ /// instructing NFA simulations to record additional state when the finite
+ /// state machine passes through this epsilon transition.
+ ///
+ /// `slot` in this context refers to the specific capture group slot
+ /// offset that is being recorded. Each capturing group has two slots
+ /// corresponding to the start and end of the matching portion of that
+ /// group.
+ ///
+ /// The pattern ID and capture group index are also included in this state
+ /// in case they are useful. But mostly, all you'll need is `next` and
+ /// `slot`.
+ Capture {
+ /// The state to transition to, unconditionally.
+ next: StateID,
+ /// The pattern ID that this capture belongs to.
+ pattern_id: PatternID,
+ /// The capture group index that this capture belongs to. Capture group
+ /// indices are local to each pattern. For example, when capturing
+ /// groups are enabled, every pattern has a capture group at index
+ /// `0`.
+ group_index: SmallIndex,
+ /// The slot index for this capture. Every capturing group has two
+ /// slots: one for the start haystack offset and one for the end
+ /// haystack offset. Unlike capture group indices, slot indices are
+ /// global across all patterns in this NFA. That is, each slot belongs
+ /// to a single pattern, but there is only one slot at index `i`.
+ slot: SmallIndex,
+ },
+ /// A state that cannot be transitioned out of. This is useful for cases
+ /// where you want to prevent matching from occurring. For example, if your
+ /// regex parser permits empty character classes, then one could choose
+ /// a `Fail` state to represent them. (An empty character class can be
+ /// thought of as an empty set. Since nothing is in an empty set, they can
+ /// never match anything.)
+ Fail,
+ /// A match state. There is at least one such occurrence of this state for
+ /// each regex that can match that is in this NFA.
+ Match {
+ /// The matching pattern ID.
+ pattern_id: PatternID,
+ },
+}
+
+impl State {
+ /// Returns true if and only if this state contains one or more epsilon
+ /// transitions.
+ ///
+ /// In practice, a state has no outgoing transitions (like `Match`), has
+ /// only non-epsilon transitions (like `ByteRange`) or has only epsilon
+ /// transitions (like `Union`).
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::{State, Transition},
+ /// util::primitives::{PatternID, StateID, SmallIndex},
+ /// };
+ ///
+ /// // Capture states are epsilon transitions.
+ /// let state = State::Capture {
+ /// next: StateID::ZERO,
+ /// pattern_id: PatternID::ZERO,
+ /// group_index: SmallIndex::ZERO,
+ /// slot: SmallIndex::ZERO,
+ /// };
+ /// assert!(state.is_epsilon());
+ ///
+ /// // ByteRange states are not.
+ /// let state = State::ByteRange {
+ /// trans: Transition { start: b'a', end: b'z', next: StateID::ZERO },
+ /// };
+ /// assert!(!state.is_epsilon());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[inline]
+ pub fn is_epsilon(&self) -> bool {
+ match *self {
+ State::ByteRange { .. }
+ | State::Sparse { .. }
+ | State::Dense { .. }
+ | State::Fail
+ | State::Match { .. } => false,
+ State::Look { .. }
+ | State::Union { .. }
+ | State::BinaryUnion { .. }
+ | State::Capture { .. } => true,
+ }
+ }
+
+ /// Returns the heap memory usage of this NFA state in bytes.
+ fn memory_usage(&self) -> usize {
+ match *self {
+ State::ByteRange { .. }
+ | State::Look { .. }
+ | State::BinaryUnion { .. }
+ | State::Capture { .. }
+ | State::Match { .. }
+ | State::Fail => 0,
+ State::Sparse(SparseTransitions { ref transitions }) => {
+ transitions.len() * mem::size_of::<Transition>()
+ }
+ State::Dense { .. } => 256 * mem::size_of::<StateID>(),
+ State::Union { ref alternates } => {
+ alternates.len() * mem::size_of::<StateID>()
+ }
+ }
+ }
+
+ /// Remap the transitions in this state using the given map. Namely, the
+ /// given map should be indexed according to the transitions currently
+ /// in this state.
+ ///
+ /// This is used during the final phase of the NFA compiler, which turns
+ /// its intermediate NFA into the final NFA.
+ fn remap(&mut self, remap: &[StateID]) {
+ match *self {
+ State::ByteRange { ref mut trans } => {
+ trans.next = remap[trans.next]
+ }
+ State::Sparse(SparseTransitions { ref mut transitions }) => {
+ for t in transitions.iter_mut() {
+ t.next = remap[t.next];
+ }
+ }
+ State::Dense(DenseTransitions { ref mut transitions }) => {
+ for sid in transitions.iter_mut() {
+ *sid = remap[*sid];
+ }
+ }
+ State::Look { ref mut next, .. } => *next = remap[*next],
+ State::Union { ref mut alternates } => {
+ for alt in alternates.iter_mut() {
+ *alt = remap[*alt];
+ }
+ }
+ State::BinaryUnion { ref mut alt1, ref mut alt2 } => {
+ *alt1 = remap[*alt1];
+ *alt2 = remap[*alt2];
+ }
+ State::Capture { ref mut next, .. } => *next = remap[*next],
+ State::Fail => {}
+ State::Match { .. } => {}
+ }
+ }
+}
+
+impl fmt::Debug for State {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ match *self {
+ State::ByteRange { ref trans } => trans.fmt(f),
+ State::Sparse(SparseTransitions { ref transitions }) => {
+ let rs = transitions
+ .iter()
+ .map(|t| format!("{:?}", t))
+ .collect::<Vec<String>>()
+ .join(", ");
+ write!(f, "sparse({})", rs)
+ }
+ State::Dense(ref dense) => {
+ write!(f, "dense(")?;
+ for (i, t) in dense.iter().enumerate() {
+ if i > 0 {
+ write!(f, ", ")?;
+ }
+ write!(f, "{:?}", t)?;
+ }
+ write!(f, ")")
+ }
+ State::Look { ref look, next } => {
+ write!(f, "{:?} => {:?}", look, next.as_usize())
+ }
+ State::Union { ref alternates } => {
+ let alts = alternates
+ .iter()
+ .map(|id| format!("{:?}", id.as_usize()))
+ .collect::<Vec<String>>()
+ .join(", ");
+ write!(f, "union({})", alts)
+ }
+ State::BinaryUnion { alt1, alt2 } => {
+ write!(
+ f,
+ "binary-union({}, {})",
+ alt1.as_usize(),
+ alt2.as_usize()
+ )
+ }
+ State::Capture { next, pattern_id, group_index, slot } => {
+ write!(
+ f,
+ "capture(pid={:?}, group={:?}, slot={:?}) => {:?}",
+ pattern_id.as_usize(),
+ group_index.as_usize(),
+ slot.as_usize(),
+ next.as_usize(),
+ )
+ }
+ State::Fail => write!(f, "FAIL"),
+ State::Match { pattern_id } => {
+ write!(f, "MATCH({:?})", pattern_id.as_usize())
+ }
+ }
+ }
+}
+
+/// A sequence of transitions used to represent a sparse state.
+///
+/// This is the primary representation of a [`Sparse`](State::Sparse) state.
+/// It corresponds to a sorted sequence of transitions with non-overlapping
+/// byte ranges. If the byte at the current position in the haystack matches
+/// one of the byte ranges, then the finite state machine should take the
+/// corresponding transition.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct SparseTransitions {
+ /// The sorted sequence of non-overlapping transitions.
+ pub transitions: Box<[Transition]>,
+}
+
+impl SparseTransitions {
+ /// This follows the matching transition for a particular byte.
+ ///
+ /// The matching transition is found by looking for a matching byte
+ /// range (there is at most one) corresponding to the position `at` in
+ /// `haystack`.
+ ///
+ /// If `at >= haystack.len()`, then this returns `None`.
+ #[inline]
+ pub fn matches(&self, haystack: &[u8], at: usize) -> Option<StateID> {
+ haystack.get(at).and_then(|&b| self.matches_byte(b))
+ }
+
+ /// This follows the matching transition for any member of the alphabet.
+ ///
+ /// The matching transition is found by looking for a matching byte
+ /// range (there is at most one) corresponding to the position `at` in
+ /// `haystack`. If the given alphabet unit is [`EOI`](alphabet::Unit::eoi),
+ /// then this always returns `None`.
+ #[inline]
+ pub(crate) fn matches_unit(
+ &self,
+ unit: alphabet::Unit,
+ ) -> Option<StateID> {
+ unit.as_u8().map_or(None, |byte| self.matches_byte(byte))
+ }
+
+ /// This follows the matching transition for a particular byte.
+ ///
+ /// The matching transition is found by looking for a matching byte range
+ /// (there is at most one) corresponding to the byte given.
+ #[inline]
+ pub fn matches_byte(&self, byte: u8) -> Option<StateID> {
+ for t in self.transitions.iter() {
+ if t.start > byte {
+ break;
+ } else if t.matches_byte(byte) {
+ return Some(t.next);
+ }
+ }
+ None
+
+ /*
+ // This is an alternative implementation that uses binary search. In
+ // some ad hoc experiments, like
+ //
+ // smallishru=OpenSubtitles2018.raw.sample.smallish.ru
+ // regex-cli find nfa thompson pikevm -b "@$smallishru" '\b\w+\b'
+ //
+ // I could not observe any improvement, and in fact, things seemed to
+ // be a bit slower. I can see an improvement in at least one benchmark:
+ //
+ // allcpssmall=all-codepoints-utf8-10x
+ // regex-cli find nfa thompson pikevm @$allcpssmall '\pL{100}'
+ //
+ // Where total search time goes from 3.2s to 2.4s when using binary
+ // search.
+ self.transitions
+ .binary_search_by(|t| {
+ if t.end < byte {
+ core::cmp::Ordering::Less
+ } else if t.start > byte {
+ core::cmp::Ordering::Greater
+ } else {
+ core::cmp::Ordering::Equal
+ }
+ })
+ .ok()
+ .map(|i| self.transitions[i].next)
+ */
+ }
+}
+
+/// A sequence of transitions used to represent a dense state.
+///
+/// This is the primary representation of a [`Dense`](State::Dense) state. It
+/// provides constant time matching. That is, given a byte in a haystack and
+/// a `DenseTransitions`, one can determine if the state matches in constant
+/// time.
+///
+/// This is in contrast to `SparseTransitions`, whose time complexity is
+/// necessarily bigger than constant time. Also in contrast, `DenseTransitions`
+/// usually requires (much) more heap memory.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub struct DenseTransitions {
+ /// A dense representation of this state's transitions on the heap. This
+ /// always has length 256.
+ pub transitions: Box<[StateID]>,
+}
+
+impl DenseTransitions {
+ /// This follows the matching transition for a particular byte.
+ ///
+ /// The matching transition is found by looking for a transition that
+ /// doesn't correspond to `StateID::ZERO` for the byte `at` the given
+ /// position in `haystack`.
+ ///
+ /// If `at >= haystack.len()`, then this returns `None`.
+ #[inline]
+ pub fn matches(&self, haystack: &[u8], at: usize) -> Option<StateID> {
+ haystack.get(at).and_then(|&b| self.matches_byte(b))
+ }
+
+ /// This follows the matching transition for any member of the alphabet.
+ ///
+ /// The matching transition is found by looking for a transition that
+ /// doesn't correspond to `StateID::ZERO` for the byte `at` the given
+ /// position in `haystack`.
+ ///
+ /// If `at >= haystack.len()` or if the given alphabet unit is
+ /// [`EOI`](alphabet::Unit::eoi), then this returns `None`.
+ #[inline]
+ pub(crate) fn matches_unit(
+ &self,
+ unit: alphabet::Unit,
+ ) -> Option<StateID> {
+ unit.as_u8().map_or(None, |byte| self.matches_byte(byte))
+ }
+
+ /// This follows the matching transition for a particular byte.
+ ///
+ /// The matching transition is found by looking for a transition that
+ /// doesn't correspond to `StateID::ZERO` for the given `byte`.
+ ///
+ /// If `at >= haystack.len()`, then this returns `None`.
+ #[inline]
+ pub fn matches_byte(&self, byte: u8) -> Option<StateID> {
+ let next = self.transitions[usize::from(byte)];
+ if next == StateID::ZERO {
+ None
+ } else {
+ Some(next)
+ }
+ }
+
+ /*
+ /// The dense state optimization isn't currently enabled, so permit a
+ /// little bit of dead code.
+ pub(crate) fn from_sparse(sparse: &SparseTransitions) -> DenseTransitions {
+ let mut dense = vec![StateID::ZERO; 256];
+ for t in sparse.transitions.iter() {
+ for b in t.start..=t.end {
+ dense[usize::from(b)] = t.next;
+ }
+ }
+ DenseTransitions { transitions: dense.into_boxed_slice() }
+ }
+ */
+
+ /// Returns an iterator over all transitions that don't point to
+ /// `StateID::ZERO`.
+ pub(crate) fn iter(&self) -> impl Iterator<Item = Transition> + '_ {
+ use crate::util::int::Usize;
+ self.transitions
+ .iter()
+ .enumerate()
+ .filter(|&(_, &sid)| sid != StateID::ZERO)
+ .map(|(byte, &next)| Transition {
+ start: byte.as_u8(),
+ end: byte.as_u8(),
+ next,
+ })
+ }
+}
+
+/// A single transition to another state.
+///
+/// This transition may only be followed if the current byte in the haystack
+/// falls in the inclusive range of bytes specified.
+#[derive(Clone, Copy, Eq, Hash, PartialEq)]
+pub struct Transition {
+ /// The inclusive start of the byte range.
+ pub start: u8,
+ /// The inclusive end of the byte range.
+ pub end: u8,
+ /// The identifier of the state to transition to.
+ pub next: StateID,
+}
+
+impl Transition {
+ /// Returns true if the position `at` in `haystack` falls in this
+ /// transition's range of bytes.
+ ///
+ /// If `at >= haystack.len()`, then this returns `false`.
+ pub fn matches(&self, haystack: &[u8], at: usize) -> bool {
+ haystack.get(at).map_or(false, |&b| self.matches_byte(b))
+ }
+
+ /// Returns true if the given alphabet unit falls in this transition's
+ /// range of bytes. If the given unit is [`EOI`](alphabet::Unit::eoi), then
+ /// this returns `false`.
+ pub fn matches_unit(&self, unit: alphabet::Unit) -> bool {
+ unit.as_u8().map_or(false, |byte| self.matches_byte(byte))
+ }
+
+ /// Returns true if the given byte falls in this transition's range of
+ /// bytes.
+ pub fn matches_byte(&self, byte: u8) -> bool {
+ self.start <= byte && byte <= self.end
+ }
+}
+
+impl fmt::Debug for Transition {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ use crate::util::escape::DebugByte;
+
+ let Transition { start, end, next } = *self;
+ if self.start == self.end {
+ write!(f, "{:?} => {:?}", DebugByte(start), next.as_usize())
+ } else {
+ write!(
+ f,
+ "{:?}-{:?} => {:?}",
+ DebugByte(start),
+ DebugByte(end),
+ next.as_usize(),
+ )
+ }
+ }
+}
+
+/// An iterator over all pattern IDs in an NFA.
+///
+/// This iterator is created by [`NFA::patterns`].
+///
+/// The lifetime parameter `'a` refers to the lifetime of the NFA from which
+/// this pattern iterator was created.
+#[derive(Debug)]
+pub struct PatternIter<'a> {
+ it: PatternIDIter,
+ /// We explicitly associate a lifetime with this iterator even though we
+ /// don't actually borrow anything from the NFA. We do this for backward
+ /// compatibility purposes. If we ever do need to borrow something from
+ /// the NFA, then we can and just get rid of this marker without breaking
+ /// the public API.
+ _marker: core::marker::PhantomData<&'a ()>,
+}
+
+impl<'a> Iterator for PatternIter<'a> {
+ type Item = PatternID;
+
+ fn next(&mut self) -> Option<PatternID> {
+ self.it.next()
+ }
+}
+
+#[cfg(all(test, feature = "nfa-pikevm"))]
+mod tests {
+ use super::*;
+ use crate::{nfa::thompson::pikevm::PikeVM, Input};
+
+ // This asserts that an NFA state doesn't have its size changed. It is
+ // *really* easy to accidentally increase the size, and thus potentially
+ // dramatically increase the memory usage of every NFA.
+ //
+ // This assert doesn't mean we absolutely cannot increase the size of an
+ // NFA state. We can. It's just here to make sure we do it knowingly and
+ // intentionally.
+ #[test]
+ fn state_has_small_size() {
+ #[cfg(target_pointer_width = "64")]
+ assert_eq!(24, core::mem::size_of::<State>());
+ #[cfg(target_pointer_width = "32")]
+ assert_eq!(20, core::mem::size_of::<State>());
+ }
+
+ #[test]
+ fn always_match() {
+ let re = PikeVM::new_from_nfa(NFA::always_match()).unwrap();
+ let mut cache = re.create_cache();
+ let mut caps = re.create_captures();
+ let mut find = |haystack, start, end| {
+ let input = Input::new(haystack).range(start..end);
+ re.search(&mut cache, &input, &mut caps);
+ caps.get_match().map(|m| m.end())
+ };
+
+ assert_eq!(Some(0), find("", 0, 0));
+ assert_eq!(Some(0), find("a", 0, 1));
+ assert_eq!(Some(1), find("a", 1, 1));
+ assert_eq!(Some(0), find("ab", 0, 2));
+ assert_eq!(Some(1), find("ab", 1, 2));
+ assert_eq!(Some(2), find("ab", 2, 2));
+ }
+
+ #[test]
+ fn never_match() {
+ let re = PikeVM::new_from_nfa(NFA::never_match()).unwrap();
+ let mut cache = re.create_cache();
+ let mut caps = re.create_captures();
+ let mut find = |haystack, start, end| {
+ let input = Input::new(haystack).range(start..end);
+ re.search(&mut cache, &input, &mut caps);
+ caps.get_match().map(|m| m.end())
+ };
+
+ assert_eq!(None, find("", 0, 0));
+ assert_eq!(None, find("a", 0, 1));
+ assert_eq!(None, find("a", 1, 1));
+ assert_eq!(None, find("ab", 0, 2));
+ assert_eq!(None, find("ab", 1, 2));
+ assert_eq!(None, find("ab", 2, 2));
+ }
+}