summaryrefslogtreecommitdiffstats
path: root/third_party/rust/regex/src/dfa.rs
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-28 14:29:10 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-28 14:29:10 +0000
commit2aa4a82499d4becd2284cdb482213d541b8804dd (patch)
treeb80bf8bf13c3766139fbacc530efd0dd9d54394c /third_party/rust/regex/src/dfa.rs
parentInitial commit. (diff)
downloadfirefox-2aa4a82499d4becd2284cdb482213d541b8804dd.tar.xz
firefox-2aa4a82499d4becd2284cdb482213d541b8804dd.zip
Adding upstream version 86.0.1.upstream/86.0.1upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/regex/src/dfa.rs')
-rw-r--r--third_party/rust/regex/src/dfa.rs1942
1 files changed, 1942 insertions, 0 deletions
diff --git a/third_party/rust/regex/src/dfa.rs b/third_party/rust/regex/src/dfa.rs
new file mode 100644
index 0000000000..decc3b9874
--- /dev/null
+++ b/third_party/rust/regex/src/dfa.rs
@@ -0,0 +1,1942 @@
+/*!
+The DFA matching engine.
+
+A DFA provides faster matching because the engine is in exactly one state at
+any point in time. In the NFA, there may be multiple active states, and
+considerable CPU cycles are spent shuffling them around. In finite automata
+speak, the DFA follows epsilon transitions in the regex far less than the NFA.
+
+A DFA is a classic trade off between time and space. The NFA is slower, but
+its memory requirements are typically small and predictable. The DFA is faster,
+but given the right regex and the right input, the number of states in the
+DFA can grow exponentially. To mitigate this space problem, we do two things:
+
+1. We implement an *online* DFA. That is, the DFA is constructed from the NFA
+ during a search. When a new state is computed, it is stored in a cache so
+ that it may be reused. An important consequence of this implementation
+ is that states that are never reached for a particular input are never
+ computed. (This is impossible in an "offline" DFA which needs to compute
+ all possible states up front.)
+2. If the cache gets too big, we wipe it and continue matching.
+
+In pathological cases, a new state can be created for every byte of input.
+(e.g., The regex `(a|b)*a(a|b){20}` on a long sequence of a's and b's.)
+In this case, performance regresses to slightly slower than the full NFA
+simulation, in large part because the cache becomes useless. If the cache
+is wiped too frequently, the DFA quits and control falls back to one of the
+NFA simulations.
+
+Because of the "lazy" nature of this DFA, the inner matching loop is
+considerably more complex than one might expect out of a DFA. A number of
+tricks are employed to make it fast. Tread carefully.
+
+N.B. While this implementation is heavily commented, Russ Cox's series of
+articles on regexes is strongly recommended: https://swtch.com/~rsc/regexp/
+(As is the DFA implementation in RE2, which heavily influenced this
+implementation.)
+*/
+
+use std::collections::HashMap;
+use std::fmt;
+use std::iter::repeat;
+use std::mem;
+use std::sync::Arc;
+
+use exec::ProgramCache;
+use prog::{Inst, Program};
+use sparse::SparseSet;
+
+/// Return true if and only if the given program can be executed by a DFA.
+///
+/// Generally, a DFA is always possible. A pathological case where it is not
+/// possible is if the number of NFA states exceeds `u32::MAX`, in which case,
+/// this function will return false.
+///
+/// This function will also return false if the given program has any Unicode
+/// instructions (Char or Ranges) since the DFA operates on bytes only.
+pub fn can_exec(insts: &Program) -> bool {
+ use prog::Inst::*;
+ // If for some reason we manage to allocate a regex program with more
+ // than i32::MAX instructions, then we can't execute the DFA because we
+ // use 32 bit instruction pointer deltas for memory savings.
+ // If i32::MAX is the largest positive delta,
+ // then -i32::MAX == i32::MIN + 1 is the largest negative delta,
+ // and we are OK to use 32 bits.
+ if insts.dfa_size_limit == 0 || insts.len() > ::std::i32::MAX as usize {
+ return false;
+ }
+ for inst in insts {
+ match *inst {
+ Char(_) | Ranges(_) => return false,
+ EmptyLook(_) | Match(_) | Save(_) | Split(_) | Bytes(_) => {}
+ }
+ }
+ true
+}
+
+/// A reusable cache of DFA states.
+///
+/// This cache is reused between multiple invocations of the same regex
+/// program. (It is not shared simultaneously between threads. If there is
+/// contention, then new caches are created.)
+#[derive(Debug)]
+pub struct Cache {
+ /// Group persistent DFA related cache state together. The sparse sets
+ /// listed below are used as scratch space while computing uncached states.
+ inner: CacheInner,
+ /// qcur and qnext are ordered sets with constant time
+ /// addition/membership/clearing-whole-set and linear time iteration. They
+ /// are used to manage the sets of NFA states in DFA states when computing
+ /// cached DFA states. In particular, the order of the NFA states matters
+ /// for leftmost-first style matching. Namely, when computing a cached
+ /// state, the set of NFA states stops growing as soon as the first Match
+ /// instruction is observed.
+ qcur: SparseSet,
+ qnext: SparseSet,
+}
+
+/// `CacheInner` is logically just a part of Cache, but groups together fields
+/// that aren't passed as function parameters throughout search. (This split
+/// is mostly an artifact of the borrow checker. It is happily paid.)
+#[derive(Debug)]
+struct CacheInner {
+ /// A cache of pre-compiled DFA states, keyed by the set of NFA states
+ /// and the set of empty-width flags set at the byte in the input when the
+ /// state was observed.
+ ///
+ /// A StatePtr is effectively a `*State`, but to avoid various inconvenient
+ /// things, we just pass indexes around manually. The performance impact of
+ /// this is probably an instruction or two in the inner loop. However, on
+ /// 64 bit, each StatePtr is half the size of a *State.
+ compiled: StateMap,
+ /// The transition table.
+ ///
+ /// The transition table is laid out in row-major order, where states are
+ /// rows and the transitions for each state are columns. At a high level,
+ /// given state `s` and byte `b`, the next state can be found at index
+ /// `s * 256 + b`.
+ ///
+ /// This is, of course, a lie. A StatePtr is actually a pointer to the
+ /// *start* of a row in this table. When indexing in the DFA's inner loop,
+ /// this removes the need to multiply the StatePtr by the stride. Yes, it
+ /// matters. This reduces the number of states we can store, but: the
+ /// stride is rarely 256 since we define transitions in terms of
+ /// *equivalence classes* of bytes. Each class corresponds to a set of
+ /// bytes that never discriminate a distinct path through the DFA from each
+ /// other.
+ trans: Transitions,
+ /// A set of cached start states, which are limited to the number of
+ /// permutations of flags set just before the initial byte of input. (The
+ /// index into this vec is a `EmptyFlags`.)
+ ///
+ /// N.B. A start state can be "dead" (i.e., no possible match), so we
+ /// represent it with a StatePtr.
+ start_states: Vec<StatePtr>,
+ /// Stack scratch space used to follow epsilon transitions in the NFA.
+ /// (This permits us to avoid recursion.)
+ ///
+ /// The maximum stack size is the number of NFA states.
+ stack: Vec<InstPtr>,
+ /// The total number of times this cache has been flushed by the DFA
+ /// because of space constraints.
+ flush_count: u64,
+ /// The total heap size of the DFA's cache. We use this to determine when
+ /// we should flush the cache.
+ size: usize,
+ /// Scratch space used when building instruction pointer lists for new
+ /// states. This helps amortize allocation.
+ insts_scratch_space: Vec<u8>,
+}
+
+/// The transition table.
+///
+/// It is laid out in row-major order, with states as rows and byte class
+/// transitions as columns.
+///
+/// The transition table is responsible for producing valid `StatePtrs`. A
+/// `StatePtr` points to the start of a particular row in this table. When
+/// indexing to find the next state this allows us to avoid a multiplication
+/// when computing an index into the table.
+#[derive(Clone)]
+struct Transitions {
+ /// The table.
+ table: Vec<StatePtr>,
+ /// The stride.
+ num_byte_classes: usize,
+}
+
+/// Fsm encapsulates the actual execution of the DFA.
+#[derive(Debug)]
+pub struct Fsm<'a> {
+ /// prog contains the NFA instruction opcodes. DFA execution uses either
+ /// the `dfa` instructions or the `dfa_reverse` instructions from
+ /// `exec::ExecReadOnly`. (It never uses `ExecReadOnly.nfa`, which may have
+ /// Unicode opcodes that cannot be executed by the DFA.)
+ prog: &'a Program,
+ /// The start state. We record it here because the pointer may change
+ /// when the cache is wiped.
+ start: StatePtr,
+ /// The current position in the input.
+ at: usize,
+ /// Should we quit after seeing the first match? e.g., When the caller
+ /// uses `is_match` or `shortest_match`.
+ quit_after_match: bool,
+ /// The last state that matched.
+ ///
+ /// When no match has occurred, this is set to STATE_UNKNOWN.
+ ///
+ /// This is only useful when matching regex sets. The last match state
+ /// is useful because it contains all of the match instructions seen,
+ /// thereby allowing us to enumerate which regexes in the set matched.
+ last_match_si: StatePtr,
+ /// The input position of the last cache flush. We use this to determine
+ /// if we're thrashing in the cache too often. If so, the DFA quits so
+ /// that we can fall back to the NFA algorithm.
+ last_cache_flush: usize,
+ /// All cached DFA information that is persisted between searches.
+ cache: &'a mut CacheInner,
+}
+
+/// The result of running the DFA.
+///
+/// Generally, the result is either a match or not a match, but sometimes the
+/// DFA runs too slowly because the cache size is too small. In that case, it
+/// gives up with the intent of falling back to the NFA algorithm.
+///
+/// The DFA can also give up if it runs out of room to create new states, or if
+/// it sees non-ASCII bytes in the presence of a Unicode word boundary.
+#[derive(Clone, Debug)]
+pub enum Result<T> {
+ Match(T),
+ NoMatch(usize),
+ Quit,
+}
+
+impl<T> Result<T> {
+ /// Returns true if this result corresponds to a match.
+ pub fn is_match(&self) -> bool {
+ match *self {
+ Result::Match(_) => true,
+ Result::NoMatch(_) | Result::Quit => false,
+ }
+ }
+
+ /// Maps the given function onto T and returns the result.
+ ///
+ /// If this isn't a match, then this is a no-op.
+ #[cfg(feature = "perf-literal")]
+ pub fn map<U, F: FnMut(T) -> U>(self, mut f: F) -> Result<U> {
+ match self {
+ Result::Match(t) => Result::Match(f(t)),
+ Result::NoMatch(x) => Result::NoMatch(x),
+ Result::Quit => Result::Quit,
+ }
+ }
+
+ /// Sets the non-match position.
+ ///
+ /// If this isn't a non-match, then this is a no-op.
+ fn set_non_match(self, at: usize) -> Result<T> {
+ match self {
+ Result::NoMatch(_) => Result::NoMatch(at),
+ r => r,
+ }
+ }
+}
+
+/// `State` is a DFA state. It contains an ordered set of NFA states (not
+/// necessarily complete) and a smattering of flags.
+///
+/// The flags are packed into the first byte of data.
+///
+/// States don't carry their transitions. Instead, transitions are stored in
+/// a single row-major table.
+///
+/// Delta encoding is used to store the instruction pointers.
+/// The first instruction pointer is stored directly starting
+/// at data[1], and each following pointer is stored as an offset
+/// to the previous one. If a delta is in the range -127..127,
+/// it is packed into a single byte; Otherwise the byte 128 (-128 as an i8)
+/// is coded as a flag, followed by 4 bytes encoding the delta.
+#[derive(Clone, Eq, Hash, PartialEq)]
+struct State {
+ data: Arc<[u8]>,
+}
+
+/// `InstPtr` is a 32 bit pointer into a sequence of opcodes (i.e., it indexes
+/// an NFA state).
+///
+/// Throughout this library, this is usually set to `usize`, but we force a
+/// `u32` here for the DFA to save on space.
+type InstPtr = u32;
+
+/// Adds ip to data using delta encoding with respect to prev.
+///
+/// After completion, `data` will contain `ip` and `prev` will be set to `ip`.
+fn push_inst_ptr(data: &mut Vec<u8>, prev: &mut InstPtr, ip: InstPtr) {
+ let delta = (ip as i32) - (*prev as i32);
+ write_vari32(data, delta);
+ *prev = ip;
+}
+
+struct InstPtrs<'a> {
+ base: usize,
+ data: &'a [u8],
+}
+
+impl<'a> Iterator for InstPtrs<'a> {
+ type Item = usize;
+
+ fn next(&mut self) -> Option<usize> {
+ if self.data.is_empty() {
+ return None;
+ }
+ let (delta, nread) = read_vari32(self.data);
+ let base = self.base as i32 + delta;
+ debug_assert!(base >= 0);
+ debug_assert!(nread > 0);
+ self.data = &self.data[nread..];
+ self.base = base as usize;
+ Some(self.base)
+ }
+}
+
+impl State {
+ fn flags(&self) -> StateFlags {
+ StateFlags(self.data[0])
+ }
+
+ fn inst_ptrs(&self) -> InstPtrs {
+ InstPtrs { base: 0, data: &self.data[1..] }
+ }
+}
+
+/// `StatePtr` is a 32 bit pointer to the start of a row in the transition
+/// table.
+///
+/// It has many special values. There are two types of special values:
+/// sentinels and flags.
+///
+/// Sentinels corresponds to special states that carry some kind of
+/// significance. There are three such states: unknown, dead and quit states.
+///
+/// Unknown states are states that haven't been computed yet. They indicate
+/// that a transition should be filled in that points to either an existing
+/// cached state or a new state altogether. In general, an unknown state means
+/// "follow the NFA's epsilon transitions."
+///
+/// Dead states are states that can never lead to a match, no matter what
+/// subsequent input is observed. This means that the DFA should quit
+/// immediately and return the longest match it has found thus far.
+///
+/// Quit states are states that imply the DFA is not capable of matching the
+/// regex correctly. Currently, this is only used when a Unicode word boundary
+/// exists in the regex *and* a non-ASCII byte is observed.
+///
+/// The other type of state pointer is a state pointer with special flag bits.
+/// There are two flags: a start flag and a match flag. The lower bits of both
+/// kinds always contain a "valid" `StatePtr` (indicated by the `STATE_MAX`
+/// mask).
+///
+/// The start flag means that the state is a start state, and therefore may be
+/// subject to special prefix scanning optimizations.
+///
+/// The match flag means that the state is a match state, and therefore the
+/// current position in the input (while searching) should be recorded.
+///
+/// The above exists mostly in the service of making the inner loop fast.
+/// In particular, the inner *inner* loop looks something like this:
+///
+/// ```ignore
+/// while state <= STATE_MAX and i < len(text):
+/// state = state.next[i]
+/// ```
+///
+/// This is nice because it lets us execute a lazy DFA as if it were an
+/// entirely offline DFA (i.e., with very few instructions). The loop will
+/// quit only when we need to examine a case that needs special attention.
+type StatePtr = u32;
+
+/// An unknown state means that the state has not been computed yet, and that
+/// the only way to progress is to compute it.
+const STATE_UNKNOWN: StatePtr = 1 << 31;
+
+/// A dead state means that the state has been computed and it is known that
+/// once it is entered, no future match can ever occur.
+const STATE_DEAD: StatePtr = STATE_UNKNOWN + 1;
+
+/// A quit state means that the DFA came across some input that it doesn't
+/// know how to process correctly. The DFA should quit and another matching
+/// engine should be run in its place.
+const STATE_QUIT: StatePtr = STATE_DEAD + 1;
+
+/// A start state is a state that the DFA can start in.
+///
+/// Note that start states have their lower bits set to a state pointer.
+const STATE_START: StatePtr = 1 << 30;
+
+/// A match state means that the regex has successfully matched.
+///
+/// Note that match states have their lower bits set to a state pointer.
+const STATE_MATCH: StatePtr = 1 << 29;
+
+/// The maximum state pointer. This is useful to mask out the "valid" state
+/// pointer from a state with the "start" or "match" bits set.
+///
+/// It doesn't make sense to use this with unknown, dead or quit state
+/// pointers, since those pointers are sentinels and never have their lower
+/// bits set to anything meaningful.
+const STATE_MAX: StatePtr = STATE_MATCH - 1;
+
+/// Byte is a u8 in spirit, but a u16 in practice so that we can represent the
+/// special EOF sentinel value.
+#[derive(Copy, Clone, Debug)]
+struct Byte(u16);
+
+/// A set of flags for zero-width assertions.
+#[derive(Clone, Copy, Eq, Debug, Default, Hash, PartialEq)]
+struct EmptyFlags {
+ start: bool,
+ end: bool,
+ start_line: bool,
+ end_line: bool,
+ word_boundary: bool,
+ not_word_boundary: bool,
+}
+
+/// A set of flags describing various configurations of a DFA state. This is
+/// represented by a `u8` so that it is compact.
+#[derive(Clone, Copy, Eq, Default, Hash, PartialEq)]
+struct StateFlags(u8);
+
+impl Cache {
+ /// Create new empty cache for the DFA engine.
+ pub fn new(prog: &Program) -> Self {
+ // We add 1 to account for the special EOF byte.
+ let num_byte_classes = (prog.byte_classes[255] as usize + 1) + 1;
+ let starts = vec![STATE_UNKNOWN; 256];
+ let mut cache = Cache {
+ inner: CacheInner {
+ compiled: StateMap::new(num_byte_classes),
+ trans: Transitions::new(num_byte_classes),
+ start_states: starts,
+ stack: vec![],
+ flush_count: 0,
+ size: 0,
+ insts_scratch_space: vec![],
+ },
+ qcur: SparseSet::new(prog.insts.len()),
+ qnext: SparseSet::new(prog.insts.len()),
+ };
+ cache.inner.reset_size();
+ cache
+ }
+}
+
+impl CacheInner {
+ /// Resets the cache size to account for fixed costs, such as the program
+ /// and stack sizes.
+ fn reset_size(&mut self) {
+ self.size = (self.start_states.len() * mem::size_of::<StatePtr>())
+ + (self.stack.len() * mem::size_of::<InstPtr>());
+ }
+}
+
+impl<'a> Fsm<'a> {
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub fn forward(
+ prog: &'a Program,
+ cache: &ProgramCache,
+ quit_after_match: bool,
+ text: &[u8],
+ at: usize,
+ ) -> Result<usize> {
+ let mut cache = cache.borrow_mut();
+ let cache = &mut cache.dfa;
+ let mut dfa = Fsm {
+ prog: prog,
+ start: 0, // filled in below
+ at: at,
+ quit_after_match: quit_after_match,
+ last_match_si: STATE_UNKNOWN,
+ last_cache_flush: at,
+ cache: &mut cache.inner,
+ };
+ let (empty_flags, state_flags) = dfa.start_flags(text, at);
+ dfa.start =
+ match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) {
+ None => return Result::Quit,
+ Some(STATE_DEAD) => return Result::NoMatch(at),
+ Some(si) => si,
+ };
+ debug_assert!(dfa.start != STATE_UNKNOWN);
+ dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub fn reverse(
+ prog: &'a Program,
+ cache: &ProgramCache,
+ quit_after_match: bool,
+ text: &[u8],
+ at: usize,
+ ) -> Result<usize> {
+ let mut cache = cache.borrow_mut();
+ let cache = &mut cache.dfa_reverse;
+ let mut dfa = Fsm {
+ prog: prog,
+ start: 0, // filled in below
+ at: at,
+ quit_after_match: quit_after_match,
+ last_match_si: STATE_UNKNOWN,
+ last_cache_flush: at,
+ cache: &mut cache.inner,
+ };
+ let (empty_flags, state_flags) = dfa.start_flags_reverse(text, at);
+ dfa.start =
+ match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) {
+ None => return Result::Quit,
+ Some(STATE_DEAD) => return Result::NoMatch(at),
+ Some(si) => si,
+ };
+ debug_assert!(dfa.start != STATE_UNKNOWN);
+ dfa.exec_at_reverse(&mut cache.qcur, &mut cache.qnext, text)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub fn forward_many(
+ prog: &'a Program,
+ cache: &ProgramCache,
+ matches: &mut [bool],
+ text: &[u8],
+ at: usize,
+ ) -> Result<usize> {
+ debug_assert!(matches.len() == prog.matches.len());
+ let mut cache = cache.borrow_mut();
+ let cache = &mut cache.dfa;
+ let mut dfa = Fsm {
+ prog: prog,
+ start: 0, // filled in below
+ at: at,
+ quit_after_match: false,
+ last_match_si: STATE_UNKNOWN,
+ last_cache_flush: at,
+ cache: &mut cache.inner,
+ };
+ let (empty_flags, state_flags) = dfa.start_flags(text, at);
+ dfa.start =
+ match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) {
+ None => return Result::Quit,
+ Some(STATE_DEAD) => return Result::NoMatch(at),
+ Some(si) => si,
+ };
+ debug_assert!(dfa.start != STATE_UNKNOWN);
+ let result = dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text);
+ if result.is_match() {
+ if matches.len() == 1 {
+ matches[0] = true;
+ } else {
+ debug_assert!(dfa.last_match_si != STATE_UNKNOWN);
+ debug_assert!(dfa.last_match_si != STATE_DEAD);
+ for ip in dfa.state(dfa.last_match_si).inst_ptrs() {
+ if let Inst::Match(slot) = dfa.prog[ip] {
+ matches[slot] = true;
+ }
+ }
+ }
+ }
+ result
+ }
+
+ /// Executes the DFA on a forward NFA.
+ ///
+ /// {qcur,qnext} are scratch ordered sets which may be non-empty.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn exec_at(
+ &mut self,
+ qcur: &mut SparseSet,
+ qnext: &mut SparseSet,
+ text: &[u8],
+ ) -> Result<usize> {
+ // For the most part, the DFA is basically:
+ //
+ // last_match = null
+ // while current_byte != EOF:
+ // si = current_state.next[current_byte]
+ // if si is match
+ // last_match = si
+ // return last_match
+ //
+ // However, we need to deal with a few things:
+ //
+ // 1. This is an *online* DFA, so the current state's next list
+ // may not point to anywhere yet, so we must go out and compute
+ // them. (They are then cached into the current state's next list
+ // to avoid re-computation.)
+ // 2. If we come across a state that is known to be dead (i.e., never
+ // leads to a match), then we can quit early.
+ // 3. If the caller just wants to know if a match occurs, then we
+ // can quit as soon as we know we have a match. (Full leftmost
+ // first semantics require continuing on.)
+ // 4. If we're in the start state, then we can use a pre-computed set
+ // of prefix literals to skip quickly along the input.
+ // 5. After the input is exhausted, we run the DFA on one symbol
+ // that stands for EOF. This is useful for handling empty width
+ // assertions.
+ // 6. We can't actually do state.next[byte]. Instead, we have to do
+ // state.next[byte_classes[byte]], which permits us to keep the
+ // 'next' list very small.
+ //
+ // Since there's a bunch of extra stuff we need to consider, we do some
+ // pretty hairy tricks to get the inner loop to run as fast as
+ // possible.
+ debug_assert!(!self.prog.is_reverse);
+
+ // The last match is the currently known ending match position. It is
+ // reported as an index to the most recent byte that resulted in a
+ // transition to a match state and is always stored in capture slot `1`
+ // when searching forwards. Its maximum value is `text.len()`.
+ let mut result = Result::NoMatch(self.at);
+ let (mut prev_si, mut next_si) = (self.start, self.start);
+ let mut at = self.at;
+ while at < text.len() {
+ // This is the real inner loop. We take advantage of special bits
+ // set in the state pointer to determine whether a state is in the
+ // "common" case or not. Specifically, the common case is a
+ // non-match non-start non-dead state that has already been
+ // computed. So long as we remain in the common case, this inner
+ // loop will chew through the input.
+ //
+ // We also unroll the loop 4 times to amortize the cost of checking
+ // whether we've consumed the entire input. We are also careful
+ // to make sure that `prev_si` always represents the previous state
+ // and `next_si` always represents the next state after the loop
+ // exits, even if it isn't always true inside the loop.
+ while next_si <= STATE_MAX && at < text.len() {
+ // Argument for safety is in the definition of next_si.
+ prev_si = unsafe { self.next_si(next_si, text, at) };
+ at += 1;
+ if prev_si > STATE_MAX || at + 2 >= text.len() {
+ mem::swap(&mut prev_si, &mut next_si);
+ break;
+ }
+ next_si = unsafe { self.next_si(prev_si, text, at) };
+ at += 1;
+ if next_si > STATE_MAX {
+ break;
+ }
+ prev_si = unsafe { self.next_si(next_si, text, at) };
+ at += 1;
+ if prev_si > STATE_MAX {
+ mem::swap(&mut prev_si, &mut next_si);
+ break;
+ }
+ next_si = unsafe { self.next_si(prev_si, text, at) };
+ at += 1;
+ }
+ if next_si & STATE_MATCH > 0 {
+ // A match state is outside of the common case because it needs
+ // special case analysis. In particular, we need to record the
+ // last position as having matched and possibly quit the DFA if
+ // we don't need to keep matching.
+ next_si &= !STATE_MATCH;
+ result = Result::Match(at - 1);
+ if self.quit_after_match {
+ return result;
+ }
+ self.last_match_si = next_si;
+ prev_si = next_si;
+
+ // This permits short-circuiting when matching a regex set.
+ // In particular, if this DFA state contains only match states,
+ // then it's impossible to extend the set of matches since
+ // match states are final. Therefore, we can quit.
+ if self.prog.matches.len() > 1 {
+ let state = self.state(next_si);
+ let just_matches =
+ state.inst_ptrs().all(|ip| self.prog[ip].is_match());
+ if just_matches {
+ return result;
+ }
+ }
+
+ // Another inner loop! If the DFA stays in this particular
+ // match state, then we can rip through all of the input
+ // very quickly, and only recording the match location once
+ // we've left this particular state.
+ let cur = at;
+ while (next_si & !STATE_MATCH) == prev_si
+ && at + 2 < text.len()
+ {
+ // Argument for safety is in the definition of next_si.
+ next_si = unsafe {
+ self.next_si(next_si & !STATE_MATCH, text, at)
+ };
+ at += 1;
+ }
+ if at > cur {
+ result = Result::Match(at - 2);
+ }
+ } else if next_si & STATE_START > 0 {
+ // A start state isn't in the common case because we may
+ // what to do quick prefix scanning. If the program doesn't
+ // have a detected prefix, then start states are actually
+ // considered common and this case is never reached.
+ debug_assert!(self.has_prefix());
+ next_si &= !STATE_START;
+ prev_si = next_si;
+ at = match self.prefix_at(text, at) {
+ None => return Result::NoMatch(text.len()),
+ Some(i) => i,
+ };
+ } else if next_si >= STATE_UNKNOWN {
+ if next_si == STATE_QUIT {
+ return Result::Quit;
+ }
+ // Finally, this corresponds to the case where the transition
+ // entered a state that can never lead to a match or a state
+ // that hasn't been computed yet. The latter being the "slow"
+ // path.
+ let byte = Byte::byte(text[at - 1]);
+ // We no longer care about the special bits in the state
+ // pointer.
+ prev_si &= STATE_MAX;
+ // Record where we are. This is used to track progress for
+ // determining whether we should quit if we've flushed the
+ // cache too much.
+ self.at = at;
+ next_si = match self.next_state(qcur, qnext, prev_si, byte) {
+ None => return Result::Quit,
+ Some(STATE_DEAD) => return result.set_non_match(at),
+ Some(si) => si,
+ };
+ debug_assert!(next_si != STATE_UNKNOWN);
+ if next_si & STATE_MATCH > 0 {
+ next_si &= !STATE_MATCH;
+ result = Result::Match(at - 1);
+ if self.quit_after_match {
+ return result;
+ }
+ self.last_match_si = next_si;
+ }
+ prev_si = next_si;
+ } else {
+ prev_si = next_si;
+ }
+ }
+
+ // Run the DFA once more on the special EOF senitnel value.
+ // We don't care about the special bits in the state pointer any more,
+ // so get rid of them.
+ prev_si &= STATE_MAX;
+ prev_si = match self.next_state(qcur, qnext, prev_si, Byte::eof()) {
+ None => return Result::Quit,
+ Some(STATE_DEAD) => return result.set_non_match(text.len()),
+ Some(si) => si & !STATE_START,
+ };
+ debug_assert!(prev_si != STATE_UNKNOWN);
+ if prev_si & STATE_MATCH > 0 {
+ prev_si &= !STATE_MATCH;
+ self.last_match_si = prev_si;
+ result = Result::Match(text.len());
+ }
+ result
+ }
+
+ /// Executes the DFA on a reverse NFA.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn exec_at_reverse(
+ &mut self,
+ qcur: &mut SparseSet,
+ qnext: &mut SparseSet,
+ text: &[u8],
+ ) -> Result<usize> {
+ // The comments in `exec_at` above mostly apply here too. The main
+ // difference is that we move backwards over the input and we look for
+ // the longest possible match instead of the leftmost-first match.
+ //
+ // N.B. The code duplication here is regrettable. Efforts to improve
+ // it without sacrificing performance are welcome. ---AG
+ debug_assert!(self.prog.is_reverse);
+ let mut result = Result::NoMatch(self.at);
+ let (mut prev_si, mut next_si) = (self.start, self.start);
+ let mut at = self.at;
+ while at > 0 {
+ while next_si <= STATE_MAX && at > 0 {
+ // Argument for safety is in the definition of next_si.
+ at -= 1;
+ prev_si = unsafe { self.next_si(next_si, text, at) };
+ if prev_si > STATE_MAX || at <= 4 {
+ mem::swap(&mut prev_si, &mut next_si);
+ break;
+ }
+ at -= 1;
+ next_si = unsafe { self.next_si(prev_si, text, at) };
+ if next_si > STATE_MAX {
+ break;
+ }
+ at -= 1;
+ prev_si = unsafe { self.next_si(next_si, text, at) };
+ if prev_si > STATE_MAX {
+ mem::swap(&mut prev_si, &mut next_si);
+ break;
+ }
+ at -= 1;
+ next_si = unsafe { self.next_si(prev_si, text, at) };
+ }
+ if next_si & STATE_MATCH > 0 {
+ next_si &= !STATE_MATCH;
+ result = Result::Match(at + 1);
+ if self.quit_after_match {
+ return result;
+ }
+ self.last_match_si = next_si;
+ prev_si = next_si;
+ let cur = at;
+ while (next_si & !STATE_MATCH) == prev_si && at >= 2 {
+ // Argument for safety is in the definition of next_si.
+ at -= 1;
+ next_si = unsafe {
+ self.next_si(next_si & !STATE_MATCH, text, at)
+ };
+ }
+ if at < cur {
+ result = Result::Match(at + 2);
+ }
+ } else if next_si >= STATE_UNKNOWN {
+ if next_si == STATE_QUIT {
+ return Result::Quit;
+ }
+ let byte = Byte::byte(text[at]);
+ prev_si &= STATE_MAX;
+ self.at = at;
+ next_si = match self.next_state(qcur, qnext, prev_si, byte) {
+ None => return Result::Quit,
+ Some(STATE_DEAD) => return result.set_non_match(at),
+ Some(si) => si,
+ };
+ debug_assert!(next_si != STATE_UNKNOWN);
+ if next_si & STATE_MATCH > 0 {
+ next_si &= !STATE_MATCH;
+ result = Result::Match(at + 1);
+ if self.quit_after_match {
+ return result;
+ }
+ self.last_match_si = next_si;
+ }
+ prev_si = next_si;
+ } else {
+ prev_si = next_si;
+ }
+ }
+
+ // Run the DFA once more on the special EOF senitnel value.
+ prev_si = match self.next_state(qcur, qnext, prev_si, Byte::eof()) {
+ None => return Result::Quit,
+ Some(STATE_DEAD) => return result.set_non_match(0),
+ Some(si) => si,
+ };
+ debug_assert!(prev_si != STATE_UNKNOWN);
+ if prev_si & STATE_MATCH > 0 {
+ prev_si &= !STATE_MATCH;
+ self.last_match_si = prev_si;
+ result = Result::Match(0);
+ }
+ result
+ }
+
+ /// next_si transitions to the next state, where the transition input
+ /// corresponds to text[i].
+ ///
+ /// This elides bounds checks, and is therefore unsafe.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ unsafe fn next_si(&self, si: StatePtr, text: &[u8], i: usize) -> StatePtr {
+ // What is the argument for safety here?
+ // We have three unchecked accesses that could possibly violate safety:
+ //
+ // 1. The given byte of input (`text[i]`).
+ // 2. The class of the byte of input (`classes[text[i]]`).
+ // 3. The transition for the class (`trans[si + cls]`).
+ //
+ // (1) is only safe when calling next_si is guarded by
+ // `i < text.len()`.
+ //
+ // (2) is the easiest case to guarantee since `text[i]` is always a
+ // `u8` and `self.prog.byte_classes` always has length `u8::MAX`.
+ // (See `ByteClassSet.byte_classes` in `compile.rs`.)
+ //
+ // (3) is only safe if (1)+(2) are safe. Namely, the transitions
+ // of every state are defined to have length equal to the number of
+ // byte classes in the program. Therefore, a valid class leads to a
+ // valid transition. (All possible transitions are valid lookups, even
+ // if it points to a state that hasn't been computed yet.) (3) also
+ // relies on `si` being correct, but StatePtrs should only ever be
+ // retrieved from the transition table, which ensures they are correct.
+ debug_assert!(i < text.len());
+ let b = *text.get_unchecked(i);
+ debug_assert!((b as usize) < self.prog.byte_classes.len());
+ let cls = *self.prog.byte_classes.get_unchecked(b as usize);
+ self.cache.trans.next_unchecked(si, cls as usize)
+ }
+
+ /// Computes the next state given the current state and the current input
+ /// byte (which may be EOF).
+ ///
+ /// If STATE_DEAD is returned, then there is no valid state transition.
+ /// This implies that no permutation of future input can lead to a match
+ /// state.
+ ///
+ /// STATE_UNKNOWN can never be returned.
+ fn exec_byte(
+ &mut self,
+ qcur: &mut SparseSet,
+ qnext: &mut SparseSet,
+ mut si: StatePtr,
+ b: Byte,
+ ) -> Option<StatePtr> {
+ use prog::Inst::*;
+
+ // Initialize a queue with the current DFA state's NFA states.
+ qcur.clear();
+ for ip in self.state(si).inst_ptrs() {
+ qcur.insert(ip);
+ }
+
+ // Before inspecting the current byte, we may need to also inspect
+ // whether the position immediately preceding the current byte
+ // satisfies the empty assertions found in the current state.
+ //
+ // We only need to do this step if there are any empty assertions in
+ // the current state.
+ let is_word_last = self.state(si).flags().is_word();
+ let is_word = b.is_ascii_word();
+ if self.state(si).flags().has_empty() {
+ // Compute the flags immediately preceding the current byte.
+ // This means we only care about the "end" or "end line" flags.
+ // (The "start" flags are computed immediately proceding the
+ // current byte and is handled below.)
+ let mut flags = EmptyFlags::default();
+ if b.is_eof() {
+ flags.end = true;
+ flags.end_line = true;
+ } else if b.as_byte().map_or(false, |b| b == b'\n') {
+ flags.end_line = true;
+ }
+ if is_word_last == is_word {
+ flags.not_word_boundary = true;
+ } else {
+ flags.word_boundary = true;
+ }
+ // Now follow epsilon transitions from every NFA state, but make
+ // sure we only follow transitions that satisfy our flags.
+ qnext.clear();
+ for &ip in &*qcur {
+ self.follow_epsilons(usize_to_u32(ip), qnext, flags);
+ }
+ mem::swap(qcur, qnext);
+ }
+
+ // Now we set flags for immediately after the current byte. Since start
+ // states are processed separately, and are the only states that can
+ // have the StartText flag set, we therefore only need to worry about
+ // the StartLine flag here.
+ //
+ // We do also keep track of whether this DFA state contains a NFA state
+ // that is a matching state. This is precisely how we delay the DFA
+ // matching by one byte in order to process the special EOF sentinel
+ // byte. Namely, if this DFA state containing a matching NFA state,
+ // then it is the *next* DFA state that is marked as a match.
+ let mut empty_flags = EmptyFlags::default();
+ let mut state_flags = StateFlags::default();
+ empty_flags.start_line = b.as_byte().map_or(false, |b| b == b'\n');
+ if b.is_ascii_word() {
+ state_flags.set_word();
+ }
+ // Now follow all epsilon transitions again, but only after consuming
+ // the current byte.
+ qnext.clear();
+ for &ip in &*qcur {
+ match self.prog[ip as usize] {
+ // These states never happen in a byte-based program.
+ Char(_) | Ranges(_) => unreachable!(),
+ // These states are handled when following epsilon transitions.
+ Save(_) | Split(_) | EmptyLook(_) => {}
+ Match(_) => {
+ state_flags.set_match();
+ if !self.continue_past_first_match() {
+ break;
+ } else if self.prog.matches.len() > 1
+ && !qnext.contains(ip as usize)
+ {
+ // If we are continuing on to find other matches,
+ // then keep a record of the match states we've seen.
+ qnext.insert(ip);
+ }
+ }
+ Bytes(ref inst) => {
+ if b.as_byte().map_or(false, |b| inst.matches(b)) {
+ self.follow_epsilons(
+ inst.goto as InstPtr,
+ qnext,
+ empty_flags,
+ );
+ }
+ }
+ }
+ }
+
+ let cache = if b.is_eof() && self.prog.matches.len() > 1 {
+ // If we're processing the last byte of the input and we're
+ // matching a regex set, then make the next state contain the
+ // previous states transitions. We do this so that the main
+ // matching loop can extract all of the match instructions.
+ mem::swap(qcur, qnext);
+ // And don't cache this state because it's totally bunk.
+ false
+ } else {
+ true
+ };
+
+ // We've now built up the set of NFA states that ought to comprise the
+ // next DFA state, so try to find it in the cache, and if it doesn't
+ // exist, cache it.
+ //
+ // N.B. We pass `&mut si` here because the cache may clear itself if
+ // it has gotten too full. When that happens, the location of the
+ // current state may change.
+ let mut next =
+ match self.cached_state(qnext, state_flags, Some(&mut si)) {
+ None => return None,
+ Some(next) => next,
+ };
+ if (self.start & !STATE_START) == next {
+ // Start states can never be match states since all matches are
+ // delayed by one byte.
+ debug_assert!(!self.state(next).flags().is_match());
+ next = self.start_ptr(next);
+ }
+ if next <= STATE_MAX && self.state(next).flags().is_match() {
+ next |= STATE_MATCH;
+ }
+ debug_assert!(next != STATE_UNKNOWN);
+ // And now store our state in the current state's next list.
+ if cache {
+ let cls = self.byte_class(b);
+ self.cache.trans.set_next(si, cls, next);
+ }
+ Some(next)
+ }
+
+ /// Follows the epsilon transitions starting at (and including) `ip`. The
+ /// resulting states are inserted into the ordered set `q`.
+ ///
+ /// Conditional epsilon transitions (i.e., empty width assertions) are only
+ /// followed if they are satisfied by the given flags, which should
+ /// represent the flags set at the current location in the input.
+ ///
+ /// If the current location corresponds to the empty string, then only the
+ /// end line and/or end text flags may be set. If the current location
+ /// corresponds to a real byte in the input, then only the start line
+ /// and/or start text flags may be set.
+ ///
+ /// As an exception to the above, when finding the initial state, any of
+ /// the above flags may be set:
+ ///
+ /// If matching starts at the beginning of the input, then start text and
+ /// start line should be set. If the input is empty, then end text and end
+ /// line should also be set.
+ ///
+ /// If matching starts after the beginning of the input, then only start
+ /// line should be set if the preceding byte is `\n`. End line should never
+ /// be set in this case. (Even if the proceding byte is a `\n`, it will
+ /// be handled in a subsequent DFA state.)
+ fn follow_epsilons(
+ &mut self,
+ ip: InstPtr,
+ q: &mut SparseSet,
+ flags: EmptyFlags,
+ ) {
+ use prog::EmptyLook::*;
+ use prog::Inst::*;
+
+ // We need to traverse the NFA to follow epsilon transitions, so avoid
+ // recursion with an explicit stack.
+ self.cache.stack.push(ip);
+ while let Some(mut ip) = self.cache.stack.pop() {
+ // Try to munch through as many states as possible without
+ // pushes/pops to the stack.
+ loop {
+ // Don't visit states we've already added.
+ if q.contains(ip as usize) {
+ break;
+ }
+ q.insert(ip as usize);
+ match self.prog[ip as usize] {
+ Char(_) | Ranges(_) => unreachable!(),
+ Match(_) | Bytes(_) => {
+ break;
+ }
+ EmptyLook(ref inst) => {
+ // Only follow empty assertion states if our flags
+ // satisfy the assertion.
+ match inst.look {
+ StartLine if flags.start_line => {
+ ip = inst.goto as InstPtr;
+ }
+ EndLine if flags.end_line => {
+ ip = inst.goto as InstPtr;
+ }
+ StartText if flags.start => {
+ ip = inst.goto as InstPtr;
+ }
+ EndText if flags.end => {
+ ip = inst.goto as InstPtr;
+ }
+ WordBoundaryAscii if flags.word_boundary => {
+ ip = inst.goto as InstPtr;
+ }
+ NotWordBoundaryAscii
+ if flags.not_word_boundary =>
+ {
+ ip = inst.goto as InstPtr;
+ }
+ WordBoundary if flags.word_boundary => {
+ ip = inst.goto as InstPtr;
+ }
+ NotWordBoundary if flags.not_word_boundary => {
+ ip = inst.goto as InstPtr;
+ }
+ StartLine | EndLine | StartText | EndText
+ | WordBoundaryAscii | NotWordBoundaryAscii
+ | WordBoundary | NotWordBoundary => {
+ break;
+ }
+ }
+ }
+ Save(ref inst) => {
+ ip = inst.goto as InstPtr;
+ }
+ Split(ref inst) => {
+ self.cache.stack.push(inst.goto2 as InstPtr);
+ ip = inst.goto1 as InstPtr;
+ }
+ }
+ }
+ }
+ }
+
+ /// Find a previously computed state matching the given set of instructions
+ /// and is_match bool.
+ ///
+ /// The given set of instructions should represent a single state in the
+ /// NFA along with all states reachable without consuming any input.
+ ///
+ /// The is_match bool should be true if and only if the preceding DFA state
+ /// contains an NFA matching state. The cached state produced here will
+ /// then signify a match. (This enables us to delay a match by one byte,
+ /// in order to account for the EOF sentinel byte.)
+ ///
+ /// If the cache is full, then it is wiped before caching a new state.
+ ///
+ /// The current state should be specified if it exists, since it will need
+ /// to be preserved if the cache clears itself. (Start states are
+ /// always saved, so they should not be passed here.) It takes a mutable
+ /// pointer to the index because if the cache is cleared, the state's
+ /// location may change.
+ fn cached_state(
+ &mut self,
+ q: &SparseSet,
+ mut state_flags: StateFlags,
+ current_state: Option<&mut StatePtr>,
+ ) -> Option<StatePtr> {
+ // If we couldn't come up with a non-empty key to represent this state,
+ // then it is dead and can never lead to a match.
+ //
+ // Note that inst_flags represent the set of empty width assertions
+ // in q. We use this as an optimization in exec_byte to determine when
+ // we should follow epsilon transitions at the empty string preceding
+ // the current byte.
+ let key = match self.cached_state_key(q, &mut state_flags) {
+ None => return Some(STATE_DEAD),
+ Some(v) => v,
+ };
+ // In the cache? Cool. Done.
+ if let Some(si) = self.cache.compiled.get_ptr(&key) {
+ return Some(si);
+ }
+ // If the cache has gotten too big, wipe it.
+ if self.approximate_size() > self.prog.dfa_size_limit
+ && !self.clear_cache_and_save(current_state)
+ {
+ // Ooops. DFA is giving up.
+ return None;
+ }
+ // Allocate room for our state and add it.
+ self.add_state(key)
+ }
+
+ /// Produces a key suitable for describing a state in the DFA cache.
+ ///
+ /// The key invariant here is that equivalent keys are produced for any two
+ /// sets of ordered NFA states (and toggling of whether the previous NFA
+ /// states contain a match state) that do not discriminate a match for any
+ /// input.
+ ///
+ /// Specifically, q should be an ordered set of NFA states and is_match
+ /// should be true if and only if the previous NFA states contained a match
+ /// state.
+ fn cached_state_key(
+ &mut self,
+ q: &SparseSet,
+ state_flags: &mut StateFlags,
+ ) -> Option<State> {
+ use prog::Inst::*;
+
+ // We need to build up enough information to recognize pre-built states
+ // in the DFA. Generally speaking, this includes every instruction
+ // except for those which are purely epsilon transitions, e.g., the
+ // Save and Split instructions.
+ //
+ // Empty width assertions are also epsilon transitions, but since they
+ // are conditional, we need to make them part of a state's key in the
+ // cache.
+
+ let mut insts =
+ mem::replace(&mut self.cache.insts_scratch_space, vec![]);
+ insts.clear();
+ // Reserve 1 byte for flags.
+ insts.push(0);
+
+ let mut prev = 0;
+ for &ip in q {
+ let ip = usize_to_u32(ip);
+ match self.prog[ip as usize] {
+ Char(_) | Ranges(_) => unreachable!(),
+ Save(_) | Split(_) => {}
+ Bytes(_) => push_inst_ptr(&mut insts, &mut prev, ip),
+ EmptyLook(_) => {
+ state_flags.set_empty();
+ push_inst_ptr(&mut insts, &mut prev, ip)
+ }
+ Match(_) => {
+ push_inst_ptr(&mut insts, &mut prev, ip);
+ if !self.continue_past_first_match() {
+ break;
+ }
+ }
+ }
+ }
+ // If we couldn't transition to any other instructions and we didn't
+ // see a match when expanding NFA states previously, then this is a
+ // dead state and no amount of additional input can transition out
+ // of this state.
+ let opt_state = if insts.len() == 1 && !state_flags.is_match() {
+ None
+ } else {
+ let StateFlags(f) = *state_flags;
+ insts[0] = f;
+ Some(State { data: Arc::from(&*insts) })
+ };
+ self.cache.insts_scratch_space = insts;
+ opt_state
+ }
+
+ /// Clears the cache, but saves and restores current_state if it is not
+ /// none.
+ ///
+ /// The current state must be provided here in case its location in the
+ /// cache changes.
+ ///
+ /// This returns false if the cache is not cleared and the DFA should
+ /// give up.
+ fn clear_cache_and_save(
+ &mut self,
+ current_state: Option<&mut StatePtr>,
+ ) -> bool {
+ if self.cache.compiled.is_empty() {
+ // Nothing to clear...
+ return true;
+ }
+ match current_state {
+ None => self.clear_cache(),
+ Some(si) => {
+ let cur = self.state(*si).clone();
+ if !self.clear_cache() {
+ return false;
+ }
+ // The unwrap is OK because we just cleared the cache and
+ // therefore know that the next state pointer won't exceed
+ // STATE_MAX.
+ *si = self.restore_state(cur).unwrap();
+ true
+ }
+ }
+ }
+
+ /// Wipes the state cache, but saves and restores the current start state.
+ ///
+ /// This returns false if the cache is not cleared and the DFA should
+ /// give up.
+ fn clear_cache(&mut self) -> bool {
+ // Bail out of the DFA if we're moving too "slowly."
+ // A heuristic from RE2: assume the DFA is too slow if it is processing
+ // 10 or fewer bytes per state.
+ // Additionally, we permit the cache to be flushed a few times before
+ // caling it quits.
+ let nstates = self.cache.compiled.len();
+ if self.cache.flush_count >= 3
+ && self.at >= self.last_cache_flush
+ && (self.at - self.last_cache_flush) <= 10 * nstates
+ {
+ return false;
+ }
+ // Update statistics tracking cache flushes.
+ self.last_cache_flush = self.at;
+ self.cache.flush_count += 1;
+
+ // OK, actually flush the cache.
+ let start = self.state(self.start & !STATE_START).clone();
+ let last_match = if self.last_match_si <= STATE_MAX {
+ Some(self.state(self.last_match_si).clone())
+ } else {
+ None
+ };
+ self.cache.reset_size();
+ self.cache.trans.clear();
+ self.cache.compiled.clear();
+ for s in &mut self.cache.start_states {
+ *s = STATE_UNKNOWN;
+ }
+ // The unwraps are OK because we just cleared the cache and therefore
+ // know that the next state pointer won't exceed STATE_MAX.
+ let start_ptr = self.restore_state(start).unwrap();
+ self.start = self.start_ptr(start_ptr);
+ if let Some(last_match) = last_match {
+ self.last_match_si = self.restore_state(last_match).unwrap();
+ }
+ true
+ }
+
+ /// Restores the given state back into the cache, and returns a pointer
+ /// to it.
+ fn restore_state(&mut self, state: State) -> Option<StatePtr> {
+ // If we've already stored this state, just return a pointer to it.
+ // None will be the wiser.
+ if let Some(si) = self.cache.compiled.get_ptr(&state) {
+ return Some(si);
+ }
+ self.add_state(state)
+ }
+
+ /// Returns the next state given the current state si and current byte
+ /// b. {qcur,qnext} are used as scratch space for storing ordered NFA
+ /// states.
+ ///
+ /// This tries to fetch the next state from the cache, but if that fails,
+ /// it computes the next state, caches it and returns a pointer to it.
+ ///
+ /// The pointer can be to a real state, or it can be STATE_DEAD.
+ /// STATE_UNKNOWN cannot be returned.
+ ///
+ /// None is returned if a new state could not be allocated (i.e., the DFA
+ /// ran out of space and thinks it's running too slowly).
+ fn next_state(
+ &mut self,
+ qcur: &mut SparseSet,
+ qnext: &mut SparseSet,
+ si: StatePtr,
+ b: Byte,
+ ) -> Option<StatePtr> {
+ if si == STATE_DEAD {
+ return Some(STATE_DEAD);
+ }
+ match self.cache.trans.next(si, self.byte_class(b)) {
+ STATE_UNKNOWN => self.exec_byte(qcur, qnext, si, b),
+ STATE_QUIT => None,
+ STATE_DEAD => Some(STATE_DEAD),
+ nsi => Some(nsi),
+ }
+ }
+
+ /// Computes and returns the start state, where searching begins at
+ /// position `at` in `text`. If the state has already been computed,
+ /// then it is pulled from the cache. If the state hasn't been cached,
+ /// then it is computed, cached and a pointer to it is returned.
+ ///
+ /// This may return STATE_DEAD but never STATE_UNKNOWN.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn start_state(
+ &mut self,
+ q: &mut SparseSet,
+ empty_flags: EmptyFlags,
+ state_flags: StateFlags,
+ ) -> Option<StatePtr> {
+ // Compute an index into our cache of start states based on the set
+ // of empty/state flags set at the current position in the input. We
+ // don't use every flag since not all flags matter. For example, since
+ // matches are delayed by one byte, start states can never be match
+ // states.
+ let flagi = {
+ (((empty_flags.start as u8) << 0)
+ | ((empty_flags.end as u8) << 1)
+ | ((empty_flags.start_line as u8) << 2)
+ | ((empty_flags.end_line as u8) << 3)
+ | ((empty_flags.word_boundary as u8) << 4)
+ | ((empty_flags.not_word_boundary as u8) << 5)
+ | ((state_flags.is_word() as u8) << 6)) as usize
+ };
+ match self.cache.start_states[flagi] {
+ STATE_UNKNOWN => {}
+ STATE_DEAD => return Some(STATE_DEAD),
+ si => return Some(si),
+ }
+ q.clear();
+ let start = usize_to_u32(self.prog.start);
+ self.follow_epsilons(start, q, empty_flags);
+ // Start states can never be match states because we delay every match
+ // by one byte. Given an empty string and an empty match, the match
+ // won't actually occur until the DFA processes the special EOF
+ // sentinel byte.
+ let sp = match self.cached_state(q, state_flags, None) {
+ None => return None,
+ Some(sp) => self.start_ptr(sp),
+ };
+ self.cache.start_states[flagi] = sp;
+ Some(sp)
+ }
+
+ /// Computes the set of starting flags for the given position in text.
+ ///
+ /// This should only be used when executing the DFA forwards over the
+ /// input.
+ fn start_flags(&self, text: &[u8], at: usize) -> (EmptyFlags, StateFlags) {
+ let mut empty_flags = EmptyFlags::default();
+ let mut state_flags = StateFlags::default();
+ empty_flags.start = at == 0;
+ empty_flags.end = text.is_empty();
+ empty_flags.start_line = at == 0 || text[at - 1] == b'\n';
+ empty_flags.end_line = text.is_empty();
+
+ let is_word_last = at > 0 && Byte::byte(text[at - 1]).is_ascii_word();
+ let is_word = at < text.len() && Byte::byte(text[at]).is_ascii_word();
+ if is_word_last {
+ state_flags.set_word();
+ }
+ if is_word == is_word_last {
+ empty_flags.not_word_boundary = true;
+ } else {
+ empty_flags.word_boundary = true;
+ }
+ (empty_flags, state_flags)
+ }
+
+ /// Computes the set of starting flags for the given position in text.
+ ///
+ /// This should only be used when executing the DFA in reverse over the
+ /// input.
+ fn start_flags_reverse(
+ &self,
+ text: &[u8],
+ at: usize,
+ ) -> (EmptyFlags, StateFlags) {
+ let mut empty_flags = EmptyFlags::default();
+ let mut state_flags = StateFlags::default();
+ empty_flags.start = at == text.len();
+ empty_flags.end = text.is_empty();
+ empty_flags.start_line = at == text.len() || text[at] == b'\n';
+ empty_flags.end_line = text.is_empty();
+
+ let is_word_last =
+ at < text.len() && Byte::byte(text[at]).is_ascii_word();
+ let is_word = at > 0 && Byte::byte(text[at - 1]).is_ascii_word();
+ if is_word_last {
+ state_flags.set_word();
+ }
+ if is_word == is_word_last {
+ empty_flags.not_word_boundary = true;
+ } else {
+ empty_flags.word_boundary = true;
+ }
+ (empty_flags, state_flags)
+ }
+
+ /// Returns a reference to a State given a pointer to it.
+ fn state(&self, si: StatePtr) -> &State {
+ self.cache.compiled.get_state(si).unwrap()
+ }
+
+ /// Adds the given state to the DFA.
+ ///
+ /// This allocates room for transitions out of this state in
+ /// self.cache.trans. The transitions can be set with the returned
+ /// StatePtr.
+ ///
+ /// If None is returned, then the state limit was reached and the DFA
+ /// should quit.
+ fn add_state(&mut self, state: State) -> Option<StatePtr> {
+ // This will fail if the next state pointer exceeds STATE_PTR. In
+ // practice, the cache limit will prevent us from ever getting here,
+ // but maybe callers will set the cache size to something ridiculous...
+ let si = match self.cache.trans.add() {
+ None => return None,
+ Some(si) => si,
+ };
+ // If the program has a Unicode word boundary, then set any transitions
+ // for non-ASCII bytes to STATE_QUIT. If the DFA stumbles over such a
+ // transition, then it will quit and an alternative matching engine
+ // will take over.
+ if self.prog.has_unicode_word_boundary {
+ for b in 128..256 {
+ let cls = self.byte_class(Byte::byte(b as u8));
+ self.cache.trans.set_next(si, cls, STATE_QUIT);
+ }
+ }
+ // Finally, put our actual state on to our heap of states and index it
+ // so we can find it later.
+ self.cache.size += self.cache.trans.state_heap_size()
+ + state.data.len()
+ + (2 * mem::size_of::<State>())
+ + mem::size_of::<StatePtr>();
+ self.cache.compiled.insert(state, si);
+ // Transition table and set of states and map should all be in sync.
+ debug_assert!(
+ self.cache.compiled.len() == self.cache.trans.num_states()
+ );
+ Some(si)
+ }
+
+ /// Quickly finds the next occurrence of any literal prefixes in the regex.
+ /// If there are no literal prefixes, then the current position is
+ /// returned. If there are literal prefixes and one could not be found,
+ /// then None is returned.
+ ///
+ /// This should only be called when the DFA is in a start state.
+ fn prefix_at(&self, text: &[u8], at: usize) -> Option<usize> {
+ self.prog.prefixes.find(&text[at..]).map(|(s, _)| at + s)
+ }
+
+ /// Returns the number of byte classes required to discriminate transitions
+ /// in each state.
+ ///
+ /// invariant: num_byte_classes() == len(State.next)
+ fn num_byte_classes(&self) -> usize {
+ // We add 1 to account for the special EOF byte.
+ (self.prog.byte_classes[255] as usize + 1) + 1
+ }
+
+ /// Given an input byte or the special EOF sentinel, return its
+ /// corresponding byte class.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn byte_class(&self, b: Byte) -> usize {
+ match b.as_byte() {
+ None => self.num_byte_classes() - 1,
+ Some(b) => self.u8_class(b),
+ }
+ }
+
+ /// Like byte_class, but explicitly for u8s.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn u8_class(&self, b: u8) -> usize {
+ self.prog.byte_classes[b as usize] as usize
+ }
+
+ /// Returns true if the DFA should continue searching past the first match.
+ ///
+ /// Leftmost first semantics in the DFA are preserved by not following NFA
+ /// transitions after the first match is seen.
+ ///
+ /// On occasion, we want to avoid leftmost first semantics to find either
+ /// the longest match (for reverse search) or all possible matches (for
+ /// regex sets).
+ fn continue_past_first_match(&self) -> bool {
+ self.prog.is_reverse || self.prog.matches.len() > 1
+ }
+
+ /// Returns true if there is a prefix we can quickly search for.
+ fn has_prefix(&self) -> bool {
+ !self.prog.is_reverse
+ && !self.prog.prefixes.is_empty()
+ && !self.prog.is_anchored_start
+ }
+
+ /// Sets the STATE_START bit in the given state pointer if and only if
+ /// we have a prefix to scan for.
+ ///
+ /// If there's no prefix, then it's a waste to treat the start state
+ /// specially.
+ fn start_ptr(&self, si: StatePtr) -> StatePtr {
+ if self.has_prefix() {
+ si | STATE_START
+ } else {
+ si
+ }
+ }
+
+ /// Approximate size returns the approximate heap space currently used by
+ /// the DFA. It is used to determine whether the DFA's state cache needs to
+ /// be wiped. Namely, it is possible that for certain regexes on certain
+ /// inputs, a new state could be created for every byte of input. (This is
+ /// bad for memory use, so we bound it with a cache.)
+ fn approximate_size(&self) -> usize {
+ self.cache.size + self.prog.approximate_size()
+ }
+}
+
+/// An abstraction for representing a map of states. The map supports two
+/// different ways of state lookup. One is fast constant time access via a
+/// state pointer. The other is a hashmap lookup based on the DFA's
+/// constituent NFA states.
+///
+/// A DFA state internally uses an Arc such that we only need to store the
+/// set of NFA states on the heap once, even though we support looking up
+/// states by two different means. A more natural way to express this might
+/// use raw pointers, but an Arc is safe and effectively achieves the same
+/// thing.
+#[derive(Debug)]
+struct StateMap {
+ /// The keys are not actually static but rely on always pointing to a
+ /// buffer in `states` which will never be moved except when clearing
+ /// the map or on drop, in which case the keys of this map will be
+ /// removed before
+ map: HashMap<State, StatePtr>,
+ /// Our set of states. Note that `StatePtr / num_byte_classes` indexes
+ /// this Vec rather than just a `StatePtr`.
+ states: Vec<State>,
+ /// The number of byte classes in the DFA. Used to index `states`.
+ num_byte_classes: usize,
+}
+
+impl StateMap {
+ fn new(num_byte_classes: usize) -> StateMap {
+ StateMap {
+ map: HashMap::new(),
+ states: vec![],
+ num_byte_classes: num_byte_classes,
+ }
+ }
+
+ fn len(&self) -> usize {
+ self.states.len()
+ }
+
+ fn is_empty(&self) -> bool {
+ self.states.is_empty()
+ }
+
+ fn get_ptr(&self, state: &State) -> Option<StatePtr> {
+ self.map.get(state).cloned()
+ }
+
+ fn get_state(&self, si: StatePtr) -> Option<&State> {
+ self.states.get(si as usize / self.num_byte_classes)
+ }
+
+ fn insert(&mut self, state: State, si: StatePtr) {
+ self.map.insert(state.clone(), si);
+ self.states.push(state);
+ }
+
+ fn clear(&mut self) {
+ self.map.clear();
+ self.states.clear();
+ }
+}
+
+impl Transitions {
+ /// Create a new transition table.
+ ///
+ /// The number of byte classes corresponds to the stride. Every state will
+ /// have `num_byte_classes` slots for transitions.
+ fn new(num_byte_classes: usize) -> Transitions {
+ Transitions { table: vec![], num_byte_classes: num_byte_classes }
+ }
+
+ /// Returns the total number of states currently in this table.
+ fn num_states(&self) -> usize {
+ self.table.len() / self.num_byte_classes
+ }
+
+ /// Allocates room for one additional state and returns a pointer to it.
+ ///
+ /// If there's no more room, None is returned.
+ fn add(&mut self) -> Option<StatePtr> {
+ let si = self.table.len();
+ if si > STATE_MAX as usize {
+ return None;
+ }
+ self.table.extend(repeat(STATE_UNKNOWN).take(self.num_byte_classes));
+ Some(usize_to_u32(si))
+ }
+
+ /// Clears the table of all states.
+ fn clear(&mut self) {
+ self.table.clear();
+ }
+
+ /// Sets the transition from (si, cls) to next.
+ fn set_next(&mut self, si: StatePtr, cls: usize, next: StatePtr) {
+ self.table[si as usize + cls] = next;
+ }
+
+ /// Returns the transition corresponding to (si, cls).
+ fn next(&self, si: StatePtr, cls: usize) -> StatePtr {
+ self.table[si as usize + cls]
+ }
+
+ /// The heap size, in bytes, of a single state in the transition table.
+ fn state_heap_size(&self) -> usize {
+ self.num_byte_classes * mem::size_of::<StatePtr>()
+ }
+
+ /// Like `next`, but uses unchecked access and is therefore unsafe.
+ unsafe fn next_unchecked(&self, si: StatePtr, cls: usize) -> StatePtr {
+ debug_assert!((si as usize) < self.table.len());
+ debug_assert!(cls < self.num_byte_classes);
+ *self.table.get_unchecked(si as usize + cls)
+ }
+}
+
+impl StateFlags {
+ fn is_match(&self) -> bool {
+ self.0 & 0b0000000_1 > 0
+ }
+
+ fn set_match(&mut self) {
+ self.0 |= 0b0000000_1;
+ }
+
+ fn is_word(&self) -> bool {
+ self.0 & 0b000000_1_0 > 0
+ }
+
+ fn set_word(&mut self) {
+ self.0 |= 0b000000_1_0;
+ }
+
+ fn has_empty(&self) -> bool {
+ self.0 & 0b00000_1_00 > 0
+ }
+
+ fn set_empty(&mut self) {
+ self.0 |= 0b00000_1_00;
+ }
+}
+
+impl Byte {
+ fn byte(b: u8) -> Self {
+ Byte(b as u16)
+ }
+ fn eof() -> Self {
+ Byte(256)
+ }
+ fn is_eof(&self) -> bool {
+ self.0 == 256
+ }
+
+ fn is_ascii_word(&self) -> bool {
+ let b = match self.as_byte() {
+ None => return false,
+ Some(b) => b,
+ };
+ match b {
+ b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_' => true,
+ _ => false,
+ }
+ }
+
+ fn as_byte(&self) -> Option<u8> {
+ if self.is_eof() {
+ None
+ } else {
+ Some(self.0 as u8)
+ }
+ }
+}
+
+impl fmt::Debug for State {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ let ips: Vec<usize> = self.inst_ptrs().collect();
+ f.debug_struct("State")
+ .field("flags", &self.flags())
+ .field("insts", &ips)
+ .finish()
+ }
+}
+
+impl fmt::Debug for Transitions {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ let mut fmtd = f.debug_map();
+ for si in 0..self.num_states() {
+ let s = si * self.num_byte_classes;
+ let e = s + self.num_byte_classes;
+ fmtd.entry(&si.to_string(), &TransitionsRow(&self.table[s..e]));
+ }
+ fmtd.finish()
+ }
+}
+
+struct TransitionsRow<'a>(&'a [StatePtr]);
+
+impl<'a> fmt::Debug for TransitionsRow<'a> {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ let mut fmtd = f.debug_map();
+ for (b, si) in self.0.iter().enumerate() {
+ match *si {
+ STATE_UNKNOWN => {}
+ STATE_DEAD => {
+ fmtd.entry(&vb(b as usize), &"DEAD");
+ }
+ si => {
+ fmtd.entry(&vb(b as usize), &si.to_string());
+ }
+ }
+ }
+ fmtd.finish()
+ }
+}
+
+impl fmt::Debug for StateFlags {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ f.debug_struct("StateFlags")
+ .field("is_match", &self.is_match())
+ .field("is_word", &self.is_word())
+ .field("has_empty", &self.has_empty())
+ .finish()
+ }
+}
+
+/// Helper function for formatting a byte as a nice-to-read escaped string.
+fn vb(b: usize) -> String {
+ use std::ascii::escape_default;
+
+ if b > ::std::u8::MAX as usize {
+ "EOF".to_owned()
+ } else {
+ let escaped = escape_default(b as u8).collect::<Vec<u8>>();
+ String::from_utf8_lossy(&escaped).into_owned()
+ }
+}
+
+fn usize_to_u32(n: usize) -> u32 {
+ if (n as u64) > (::std::u32::MAX as u64) {
+ panic!("BUG: {} is too big to fit into u32", n)
+ }
+ n as u32
+}
+
+#[allow(dead_code)] // useful for debugging
+fn show_state_ptr(si: StatePtr) -> String {
+ let mut s = format!("{:?}", si & STATE_MAX);
+ if si == STATE_UNKNOWN {
+ s = format!("{} (unknown)", s);
+ }
+ if si == STATE_DEAD {
+ s = format!("{} (dead)", s);
+ }
+ if si == STATE_QUIT {
+ s = format!("{} (quit)", s);
+ }
+ if si & STATE_START > 0 {
+ s = format!("{} (start)", s);
+ }
+ if si & STATE_MATCH > 0 {
+ s = format!("{} (match)", s);
+ }
+ s
+}
+
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn write_vari32(data: &mut Vec<u8>, n: i32) {
+ let mut un = (n as u32) << 1;
+ if n < 0 {
+ un = !un;
+ }
+ write_varu32(data, un)
+}
+
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn read_vari32(data: &[u8]) -> (i32, usize) {
+ let (un, i) = read_varu32(data);
+ let mut n = (un >> 1) as i32;
+ if un & 1 != 0 {
+ n = !n;
+ }
+ (n, i)
+}
+
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn write_varu32(data: &mut Vec<u8>, mut n: u32) {
+ while n >= 0b1000_0000 {
+ data.push((n as u8) | 0b1000_0000);
+ n >>= 7;
+ }
+ data.push(n as u8);
+}
+
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn read_varu32(data: &[u8]) -> (u32, usize) {
+ let mut n: u32 = 0;
+ let mut shift: u32 = 0;
+ for (i, &b) in data.iter().enumerate() {
+ if b < 0b1000_0000 {
+ return (n | ((b as u32) << shift), i + 1);
+ }
+ n |= ((b as u32) & 0b0111_1111) << shift;
+ shift += 7;
+ }
+ (0, 0)
+}
+
+#[cfg(test)]
+mod tests {
+ extern crate rand;
+
+ use super::{
+ push_inst_ptr, read_vari32, read_varu32, write_vari32, write_varu32,
+ State, StateFlags,
+ };
+ use quickcheck::{quickcheck, QuickCheck, StdGen};
+ use std::sync::Arc;
+
+ #[test]
+ fn prop_state_encode_decode() {
+ fn p(ips: Vec<u32>, flags: u8) -> bool {
+ let mut data = vec![flags];
+ let mut prev = 0;
+ for &ip in ips.iter() {
+ push_inst_ptr(&mut data, &mut prev, ip);
+ }
+ let state = State { data: Arc::from(&data[..]) };
+
+ let expected: Vec<usize> =
+ ips.into_iter().map(|ip| ip as usize).collect();
+ let got: Vec<usize> = state.inst_ptrs().collect();
+ expected == got && state.flags() == StateFlags(flags)
+ }
+ QuickCheck::new()
+ .gen(StdGen::new(self::rand::thread_rng(), 10_000))
+ .quickcheck(p as fn(Vec<u32>, u8) -> bool);
+ }
+
+ #[test]
+ fn prop_read_write_u32() {
+ fn p(n: u32) -> bool {
+ let mut buf = vec![];
+ write_varu32(&mut buf, n);
+ let (got, nread) = read_varu32(&buf);
+ nread == buf.len() && got == n
+ }
+ quickcheck(p as fn(u32) -> bool);
+ }
+
+ #[test]
+ fn prop_read_write_i32() {
+ fn p(n: i32) -> bool {
+ let mut buf = vec![];
+ write_vari32(&mut buf, n);
+ let (got, nread) = read_vari32(&buf);
+ nread == buf.len() && got == n
+ }
+ quickcheck(p as fn(i32) -> bool);
+ }
+}