// This is the backtracking matching engine. It has the same exact capability
// as the full NFA simulation, except it is artificially restricted to small
// regexes on small inputs because of its memory requirements.
//
// In particular, this is a *bounded* backtracking engine. It retains worst
// case linear time by keeping track of the states that it has visited (using
// a bitmap). Namely, once a state is visited, it is never visited again.
// Since a state is keyed by `(instruction index, input index)`, we have that
// its time complexity is `O(mn)` (i.e., linear in the size of the search
// text).
//
// The backtracking engine can beat out the NFA simulation on small
// regexes/inputs because it doesn't have to keep track of multiple copies of
// the capture groups. In benchmarks, the backtracking engine is roughly twice
// as fast as the full NFA simulation. Note though that its performance
// doesn't scale, even if you're willing to live with the memory requirements.
// Namely, the bitset has to be zeroed on each execution, which becomes quite
// expensive on large bitsets.

use crate::exec::ProgramCache;
use crate::input::{Input, InputAt};
use crate::prog::{InstPtr, Program};
use crate::re_trait::Slot;

type Bits = u32;

const BIT_SIZE: usize = 32;
const MAX_SIZE_BYTES: usize = 256 * (1 << 10); // 256 KB

/// Returns true iff the given regex and input should be executed by this
/// engine with reasonable memory usage.
pub fn should_exec(num_insts: usize, text_len: usize) -> bool {
    // Total memory usage in bytes is determined by:
    //
    //   ((len(insts) * (len(input) + 1) + bits - 1) / bits) * (size_of(u32))
    //
    // The actual limit picked is pretty much a heuristic.
    // See: https://github.com/rust-lang/regex/issues/215
    let size = ((num_insts * (text_len + 1) + BIT_SIZE - 1) / BIT_SIZE) * 4;
    size <= MAX_SIZE_BYTES
}

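// As a worked example of the bound above (illustrative numbers, not from the
// original source): a 100-instruction program searching a 1,000-byte input
// needs ((100 * 1001 + 31) / 32) * 4 = 12,516 bytes of bitmap, comfortably
// under the 256 KB cap. By contrast, a 5,000-instruction program over a
// 100,000-byte input would need roughly 62 MB, so `should_exec` rejects it
// and the caller must fall back to another engine.
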
/// A backtracking matching engine.
#[derive(Debug)]
pub struct Bounded<'a, 'm, 'r, 's, I> {
    prog: &'r Program,
    input: I,
    matches: &'m mut [bool],
    slots: &'s mut [Slot],
    m: &'a mut Cache,
}

/// Shared cached state between multiple invocations of a backtracking engine
/// in the same thread.
#[derive(Clone, Debug)]
pub struct Cache {
    jobs: Vec<Job>,
    visited: Vec<Bits>,
}

impl Cache {
    /// Create new empty cache for the backtracking engine.
    pub fn new(_prog: &Program) -> Self {
        Cache { jobs: vec![], visited: vec![] }
    }
}

/// A job is an explicit unit of stack space in the backtracking engine.
///
/// The "normal" representation is a single state transition, which
/// corresponds to an NFA state and a character in the input. However, the
/// backtracking engine must keep track of old capture group values. We use
/// the explicit stack to do it.
#[derive(Clone, Copy, Debug)]
enum Job {
    Inst { ip: InstPtr, at: InputAt },
    SaveRestore { slot: usize, old_pos: Option<usize> },
}

impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
    /// Execute the backtracking matching engine.
    ///
    /// If there's a match, `exec` returns `true` and populates the given
    /// captures accordingly.
    pub fn exec(
        prog: &'r Program,
        cache: &ProgramCache,
        matches: &'m mut [bool],
        slots: &'s mut [Slot],
        input: I,
        start: usize,
        end: usize,
    ) -> bool {
        let mut cache = cache.borrow_mut();
        let cache = &mut cache.backtrack;
        let start = input.at(start);
        let mut b = Bounded {
            prog: prog,
            input: input,
            matches: matches,
            slots: slots,
            m: cache,
        };
        b.exec_(start, end)
    }

    /// Clears the cache such that the backtracking engine can be executed
    /// on some input of fixed length.
    fn clear(&mut self) {
        // Reset the job memory so that we start fresh.
        self.m.jobs.clear();

        // Now we need to clear the bit state set. We do this by figuring out
        // how much space we need to keep track of the states we've visited.
        // Then we reset all existing allocated space to 0. Finally, we
        // request more space if we need it.
        //
        // This is all a little circuitous, but doing this using unchecked
        // operations doesn't seem to have a measurable impact on
        // performance. (Probably because backtracking is limited to such
        // small inputs/regexes in the first place.)
        let visited_len =
            (self.prog.len() * (self.input.len() + 1) + BIT_SIZE - 1)
                / BIT_SIZE;
        self.m.visited.truncate(visited_len);
        for v in &mut self.m.visited {
            *v = 0;
        }
        if visited_len > self.m.visited.len() {
            let len = self.m.visited.len();
            self.m.visited.reserve_exact(visited_len - len);
            for _ in 0..(visited_len - len) {
                self.m.visited.push(0);
            }
        }
    }

    /// Start backtracking at the given position in the input, but also look
    /// for literal prefixes.
    fn exec_(&mut self, mut at: InputAt, end: usize) -> bool {
        self.clear();
        // If this is an anchored regex at the beginning of the input, then
        // we're either already done or we only need to try backtracking
        // once.
        if self.prog.is_anchored_start {
            return if !at.is_start() { false } else { self.backtrack(at) };
        }
        let mut matched = false;
        loop {
            if !self.prog.prefixes.is_empty() {
                at = match self.input.prefix_at(&self.prog.prefixes, at) {
                    None => break,
                    Some(at) => at,
                };
            }
            matched = self.backtrack(at) || matched;
            if matched && self.prog.matches.len() == 1 {
                return true;
            }
            if at.pos() >= end {
                break;
            }
            at = self.input.at(at.next_pos());
        }
        matched
    }

    /// The main backtracking loop starting at the given input position.
    fn backtrack(&mut self, start: InputAt) -> bool {
        // N.B. We use an explicit stack to avoid recursion. To avoid
        // excessive pushing and popping, most transitions are handled in the
        // `step` helper function, which only pushes to the stack when
        // there's a capture or a branch.
        let mut matched = false;
        self.m.jobs.push(Job::Inst { ip: 0, at: start });
        while let Some(job) = self.m.jobs.pop() {
            match job {
                Job::Inst { ip, at } => {
                    if self.step(ip, at) {
                        // Only quit if we're matching one regex. If we're
                        // matching a regex set, then push on and try to find
                        // other matches (if we want them).
                        if self.prog.matches.len() == 1 {
                            return true;
                        }
                        matched = true;
                    }
                }
                Job::SaveRestore { slot, old_pos } => {
                    if slot < self.slots.len() {
                        self.slots[slot] = old_pos;
                    }
                }
            }
        }
        matched
    }

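    // An illustrative trace (hypothetical program layout, not from the
    // original source): matching `a(b|c)` against "ac". `step` consumes
    // `a`, reaches the `Save` for group 1 and pushes a `SaveRestore` job
    // recording the slot's old value, then hits the `Split` for `b|c` and
    // pushes an alternate `Inst` job for the `c` branch before trying `b`
    // in place. The `b` branch fails on input `c`, so the loop in
    // `backtrack` pops the alternate branch, which matches `c`. The
    // `SaveRestore` job would only be popped if both branches failed,
    // restoring the clobbered capture position for the abandoned path.
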
    fn step(&mut self, mut ip: InstPtr, mut at: InputAt) -> bool {
        use crate::prog::Inst::*;
        loop {
            // This loop is an optimization to avoid constantly
            // pushing/popping from the stack. Namely, if we're pushing a
            // job only to run it next, avoid the push and just mutate `ip`
            // (and possibly `at`) in place.
            if self.has_visited(ip, at) {
                return false;
            }
            match self.prog[ip] {
                Match(slot) => {
                    if slot < self.matches.len() {
                        self.matches[slot] = true;
                    }
                    return true;
                }
                Save(ref inst) => {
                    if let Some(&old_pos) = self.slots.get(inst.slot) {
                        // If this path doesn't work out, then we save the
                        // old capture index (if one exists) in an alternate
                        // job. If the next path fails, then the alternate
                        // job is popped and the old capture index is
                        // restored.
                        self.m.jobs.push(Job::SaveRestore {
                            slot: inst.slot,
                            old_pos: old_pos,
                        });
                        self.slots[inst.slot] = Some(at.pos());
                    }
                    ip = inst.goto;
                }
                Split(ref inst) => {
                    self.m.jobs.push(Job::Inst { ip: inst.goto2, at: at });
                    ip = inst.goto1;
                }
                EmptyLook(ref inst) => {
                    if self.input.is_empty_match(at, inst) {
                        ip = inst.goto;
                    } else {
                        return false;
                    }
                }
                Char(ref inst) => {
                    if inst.c == at.char() {
                        ip = inst.goto;
                        at = self.input.at(at.next_pos());
                    } else {
                        return false;
                    }
                }
                Ranges(ref inst) => {
                    if inst.matches(at.char()) {
                        ip = inst.goto;
                        at = self.input.at(at.next_pos());
                    } else {
                        return false;
                    }
                }
                Bytes(ref inst) => {
                    if let Some(b) = at.byte() {
                        if inst.matches(b) {
                            ip = inst.goto;
                            at = self.input.at(at.next_pos());
                            continue;
                        }
                    }
                    return false;
                }
            }
        }
    }

    fn has_visited(&mut self, ip: InstPtr, at: InputAt) -> bool {
        let k = ip * (self.input.len() + 1) + at.pos();
        let k1 = k / BIT_SIZE;
        let k2 = usize_to_u32(1 << (k & (BIT_SIZE - 1)));
        if self.m.visited[k1] & k2 == 0 {
            self.m.visited[k1] |= k2;
            false
        } else {
            true
        }
    }
}

fn usize_to_u32(n: usize) -> u32 {
    if (n as u64) > (::std::u32::MAX as u64) {
        panic!("BUG: {} is too big to fit into u32", n)
    }
    n as u32
}
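
// A minimal sketch, assuming only std, of the `(ip, at)` -> bitmap index
// scheme used by `has_visited` above. The module and helper names here are
// hypothetical and exist purely for illustration; they are not part of the
// engine itself.
#[cfg(test)]
mod visited_bitmap_sketch {
    // Mirror of the engine's word size, kept local so the sketch is
    // self-contained.
    const BIT_SIZE: usize = 32;

    // Mirrors `has_visited`'s key computation: states are laid out
    // row-major, with one row of `input_len + 1` positions per instruction.
    fn visited_key(ip: usize, pos: usize, input_len: usize) -> (usize, u32) {
        let k = ip * (input_len + 1) + pos;
        (k / BIT_SIZE, 1 << (k & (BIT_SIZE - 1)))
    }

    #[test]
    fn each_state_is_marked_once() {
        // 3 instructions over a 4-byte input -> 3 * (4 + 1) = 15 states,
        // which fit in a single 32-bit word.
        let mut visited = vec![0u32; 1];
        let (word, bit) = visited_key(2, 4, 4); // last state: k = 14
        assert_eq!(word, 0);
        assert_eq!(visited[word] & bit, 0); // not visited yet
        visited[word] |= bit; // first visit marks the state...
        assert_ne!(visited[word] & bit, 0); // ...so a revisit is detected
    }
}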