// This module implements the Pike VM. That is, it guarantees linear time // search of a regex on any text with memory use proportional to the size of // the regex. // // It is equal in power to the backtracking engine in this crate, except the // backtracking engine is typically faster on small regexes/texts at the // expense of a bigger memory footprint. // // It can do more than the DFA can (specifically, record capture locations // and execute Unicode word boundary assertions), but at a slower speed. // Specifically, the Pike VM exectues a DFA implicitly by repeatedly expanding // epsilon transitions. That is, the Pike VM engine can be in multiple states // at once where as the DFA is only ever in one state at a time. // // Therefore, the Pike VM is generally treated as the fallback when the other // matching engines either aren't feasible to run or are insufficient. use std::mem; use exec::ProgramCache; use input::{Input, InputAt}; use prog::{InstPtr, Program}; use re_trait::Slot; use sparse::SparseSet; /// An NFA simulation matching engine. #[derive(Debug)] pub struct Fsm<'r, I> { /// The sequence of opcodes (among other things) that is actually executed. /// /// The program may be byte oriented or Unicode codepoint oriented. prog: &'r Program, /// An explicit stack used for following epsilon transitions. (This is /// borrowed from the cache.) stack: &'r mut Vec, /// The input to search. input: I, } /// A cached allocation that can be reused on each execution. #[derive(Clone, Debug)] pub struct Cache { /// A pair of ordered sets for tracking NFA states. clist: Threads, nlist: Threads, /// An explicit stack used for following epsilon transitions. stack: Vec, } /// An ordered set of NFA states and their captures. #[derive(Clone, Debug)] struct Threads { /// An ordered set of opcodes (each opcode is an NFA state). set: SparseSet, /// Captures for every NFA state. /// /// It is stored in row-major order, where the columns are the capture /// slots and the rows are the states. caps: Vec, /// The number of capture slots stored per thread. (Every capture has /// two slots.) slots_per_thread: usize, } /// A representation of an explicit stack frame when following epsilon /// transitions. This is used to avoid recursion. #[derive(Clone, Debug)] enum FollowEpsilon { /// Follow transitions at the given instruction pointer. IP(InstPtr), /// Restore the capture slot with the given position in the input. Capture { slot: usize, pos: Slot }, } impl Cache { /// Create a new allocation used by the NFA machine to record execution /// and captures. pub fn new(_prog: &Program) -> Self { Cache { clist: Threads::new(), nlist: Threads::new(), stack: vec![] } } } impl<'r, I: Input> Fsm<'r, I> { /// Execute the NFA matching engine. /// /// If there's a match, `exec` returns `true` and populates the given /// captures accordingly. pub fn exec( prog: &'r Program, cache: &ProgramCache, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, input: I, start: usize, end: usize, ) -> bool { let mut cache = cache.borrow_mut(); let cache = &mut cache.pikevm; cache.clist.resize(prog.len(), prog.captures.len()); cache.nlist.resize(prog.len(), prog.captures.len()); let at = input.at(start); Fsm { prog: prog, stack: &mut cache.stack, input: input }.exec_( &mut cache.clist, &mut cache.nlist, matches, slots, quit_after_match, at, end, ) } fn exec_( &mut self, mut clist: &mut Threads, mut nlist: &mut Threads, matches: &mut [bool], slots: &mut [Slot], quit_after_match: bool, mut at: InputAt, end: usize, ) -> bool { let mut matched = false; let mut all_matched = false; clist.set.clear(); nlist.set.clear(); 'LOOP: loop { if clist.set.is_empty() { // Three ways to bail out when our current set of threads is // empty. // // 1. We have a match---so we're done exploring any possible // alternatives. Time to quit. (We can't do this if we're // looking for matches for multiple regexes, unless we know // they all matched.) // // 2. If the expression starts with a '^' we can terminate as // soon as the last thread dies. if (matched && matches.len() <= 1) || all_matched || (!at.is_start() && self.prog.is_anchored_start) { break; } // 3. If there's a literal prefix for the program, try to // jump ahead quickly. If it can't be found, then we can // bail out early. if !self.prog.prefixes.is_empty() { at = match self.input.prefix_at(&self.prog.prefixes, at) { None => break, Some(at) => at, }; } } // This simulates a preceding '.*?' for every regex by adding // a state starting at the current position in the input for the // beginning of the program only if we don't already have a match. if clist.set.is_empty() || (!self.prog.is_anchored_start && !all_matched) { self.add(&mut clist, slots, 0, at); } // The previous call to "add" actually inspects the position just // before the current character. For stepping through the machine, // we can to look at the current character, so we advance the // input. let at_next = self.input.at(at.next_pos()); for i in 0..clist.set.len() { let ip = clist.set[i]; if self.step( &mut nlist, matches, slots, clist.caps(ip), ip, at, at_next, ) { matched = true; all_matched = all_matched || matches.iter().all(|&b| b); if quit_after_match { // If we only care if a match occurs (not its // position), then we can quit right now. break 'LOOP; } if self.prog.matches.len() == 1 { // We don't need to check the rest of the threads // in this set because we've matched something // ("leftmost-first"). However, we still need to check // threads in the next set to support things like // greedy matching. // // This is only true on normal regexes. For regex sets, // we need to mush on to observe other matches. break; } } } if at.pos() >= end { break; } at = at_next; mem::swap(clist, nlist); nlist.set.clear(); } matched } /// Step through the input, one token (byte or codepoint) at a time. /// /// nlist is the set of states that will be processed on the next token /// in the input. /// /// caps is the set of captures passed by the caller of the NFA. They are /// written to only when a match state is visited. /// /// thread_caps is the set of captures set for the current NFA state, ip. /// /// at and at_next are the current and next positions in the input. at or /// at_next may be EOF. fn step( &mut self, nlist: &mut Threads, matches: &mut [bool], slots: &mut [Slot], thread_caps: &mut [Option], ip: usize, at: InputAt, at_next: InputAt, ) -> bool { use prog::Inst::*; match self.prog[ip] { Match(match_slot) => { if match_slot < matches.len() { matches[match_slot] = true; } for (slot, val) in slots.iter_mut().zip(thread_caps.iter()) { *slot = *val; } true } Char(ref inst) => { if inst.c == at.char() { self.add(nlist, thread_caps, inst.goto, at_next); } false } Ranges(ref inst) => { if inst.matches(at.char()) { self.add(nlist, thread_caps, inst.goto, at_next); } false } Bytes(ref inst) => { if let Some(b) = at.byte() { if inst.matches(b) { self.add(nlist, thread_caps, inst.goto, at_next); } } false } EmptyLook(_) | Save(_) | Split(_) => false, } } /// Follows epsilon transitions and adds them for processing to nlist, /// starting at and including ip. fn add( &mut self, nlist: &mut Threads, thread_caps: &mut [Option], ip: usize, at: InputAt, ) { self.stack.push(FollowEpsilon::IP(ip)); while let Some(frame) = self.stack.pop() { match frame { FollowEpsilon::IP(ip) => { self.add_step(nlist, thread_caps, ip, at); } FollowEpsilon::Capture { slot, pos } => { thread_caps[slot] = pos; } } } } /// A helper function for add that avoids excessive pushing to the stack. fn add_step( &mut self, nlist: &mut Threads, thread_caps: &mut [Option], mut ip: usize, at: InputAt, ) { // Instead of pushing and popping to the stack, we mutate ip as we // traverse the set of states. We only push to the stack when we // absolutely need recursion (restoring captures or following a // branch). use prog::Inst::*; loop { // Don't visit states we've already added. if nlist.set.contains(ip) { return; } nlist.set.insert(ip); match self.prog[ip] { EmptyLook(ref inst) => { if self.input.is_empty_match(at, inst) { ip = inst.goto; } } Save(ref inst) => { if inst.slot < thread_caps.len() { self.stack.push(FollowEpsilon::Capture { slot: inst.slot, pos: thread_caps[inst.slot], }); thread_caps[inst.slot] = Some(at.pos()); } ip = inst.goto; } Split(ref inst) => { self.stack.push(FollowEpsilon::IP(inst.goto2)); ip = inst.goto1; } Match(_) | Char(_) | Ranges(_) | Bytes(_) => { let t = &mut nlist.caps(ip); for (slot, val) in t.iter_mut().zip(thread_caps.iter()) { *slot = *val; } return; } } } } } impl Threads { fn new() -> Self { Threads { set: SparseSet::new(0), caps: vec![], slots_per_thread: 0 } } fn resize(&mut self, num_insts: usize, ncaps: usize) { if num_insts == self.set.capacity() { return; } self.slots_per_thread = ncaps * 2; self.set = SparseSet::new(num_insts); self.caps = vec![None; self.slots_per_thread * num_insts]; } fn caps(&mut self, pc: usize) -> &mut [Option] { let i = pc * self.slots_per_thread; &mut self.caps[i..i + self.slots_per_thread] } }