diff options
Diffstat (limited to 'third_party/rust/regex/src/backtrack.rs')
-rw-r--r-- | third_party/rust/regex/src/backtrack.rs | 288 |
1 files changed, 288 insertions, 0 deletions
diff --git a/third_party/rust/regex/src/backtrack.rs b/third_party/rust/regex/src/backtrack.rs new file mode 100644 index 0000000000..2eaeb72e55 --- /dev/null +++ b/third_party/rust/regex/src/backtrack.rs @@ -0,0 +1,288 @@ +// This is the backtracking matching engine. It has the same exact capability +// as the full NFA simulation, except it is artificially restricted to small +// regexes on small inputs because of its memory requirements. +// +// In particular, this is a *bounded* backtracking engine. It retains worst +// case linear time by keeping track of the states that it has visited (using a +// bitmap). Namely, once a state is visited, it is never visited again. Since a +// state is keyed by `(instruction index, input index)`, we have that its time +// complexity is `O(mn)` (i.e., linear in the size of the search text). +// +// The backtracking engine can beat out the NFA simulation on small +// regexes/inputs because it doesn't have to keep track of multiple copies of +// the capture groups. In benchmarks, the backtracking engine is roughly twice +// as fast as the full NFA simulation. Note though that its performance doesn't +// scale, even if you're willing to live with the memory requirements. Namely, +// the bitset has to be zeroed on each execution, which becomes quite expensive +// on large bitsets. + +use exec::ProgramCache; +use input::{Input, InputAt}; +use prog::{InstPtr, Program}; +use re_trait::Slot; + +type Bits = u32; + +const BIT_SIZE: usize = 32; +const MAX_SIZE_BYTES: usize = 256 * (1 << 10); // 256 KB + +/// Returns true iff the given regex and input should be executed by this +/// engine with reasonable memory usage. +pub fn should_exec(num_insts: usize, text_len: usize) -> bool { + // Total memory usage in bytes is determined by: + // + // ((len(insts) * (len(input) + 1) + bits - 1) / bits) * (size_of(u32)) + // + // The actual limit picked is pretty much a heuristic. + // See: https://github.com/rust-lang/regex/issues/215 + let size = ((num_insts * (text_len + 1) + BIT_SIZE - 1) / BIT_SIZE) * 4; + size <= MAX_SIZE_BYTES +} + +/// A backtracking matching engine. +#[derive(Debug)] +pub struct Bounded<'a, 'm, 'r, 's, I> { + prog: &'r Program, + input: I, + matches: &'m mut [bool], + slots: &'s mut [Slot], + m: &'a mut Cache, +} + +/// Shared cached state between multiple invocations of a backtracking engine +/// in the same thread. +#[derive(Clone, Debug)] +pub struct Cache { + jobs: Vec<Job>, + visited: Vec<Bits>, +} + +impl Cache { + /// Create new empty cache for the backtracking engine. + pub fn new(_prog: &Program) -> Self { + Cache { jobs: vec![], visited: vec![] } + } +} + +/// A job is an explicit unit of stack space in the backtracking engine. +/// +/// The "normal" representation is a single state transition, which corresponds +/// to an NFA state and a character in the input. However, the backtracking +/// engine must keep track of old capture group values. We use the explicit +/// stack to do it. +#[derive(Clone, Copy, Debug)] +enum Job { + Inst { ip: InstPtr, at: InputAt }, + SaveRestore { slot: usize, old_pos: Option<usize> }, +} + +impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> { + /// Execute the backtracking matching engine. + /// + /// If there's a match, `exec` returns `true` and populates the given + /// captures accordingly. + pub fn exec( + prog: &'r Program, + cache: &ProgramCache, + matches: &'m mut [bool], + slots: &'s mut [Slot], + input: I, + start: usize, + end: usize, + ) -> bool { + let mut cache = cache.borrow_mut(); + let cache = &mut cache.backtrack; + let start = input.at(start); + let mut b = Bounded { + prog: prog, + input: input, + matches: matches, + slots: slots, + m: cache, + }; + b.exec_(start, end) + } + + /// Clears the cache such that the backtracking engine can be executed + /// on some input of fixed length. + fn clear(&mut self) { + // Reset the job memory so that we start fresh. + self.m.jobs.clear(); + + // Now we need to clear the bit state set. + // We do this by figuring out how much space we need to keep track + // of the states we've visited. + // Then we reset all existing allocated space to 0. + // Finally, we request more space if we need it. + // + // This is all a little circuitous, but doing this unsafely + // doesn't seem to have a measurable impact on performance. + // (Probably because backtracking is limited to such small + // inputs/regexes in the first place.) + let visited_len = + (self.prog.len() * (self.input.len() + 1) + BIT_SIZE - 1) + / BIT_SIZE; + self.m.visited.truncate(visited_len); + for v in &mut self.m.visited { + *v = 0; + } + if visited_len > self.m.visited.len() { + let len = self.m.visited.len(); + self.m.visited.reserve_exact(visited_len - len); + for _ in 0..(visited_len - len) { + self.m.visited.push(0); + } + } + } + + /// Start backtracking at the given position in the input, but also look + /// for literal prefixes. + fn exec_(&mut self, mut at: InputAt, end: usize) -> bool { + self.clear(); + // If this is an anchored regex at the beginning of the input, then + // we're either already done or we only need to try backtracking once. + if self.prog.is_anchored_start { + return if !at.is_start() { false } else { self.backtrack(at) }; + } + let mut matched = false; + loop { + if !self.prog.prefixes.is_empty() { + at = match self.input.prefix_at(&self.prog.prefixes, at) { + None => break, + Some(at) => at, + }; + } + matched = self.backtrack(at) || matched; + if matched && self.prog.matches.len() == 1 { + return true; + } + if at.pos() >= end { + break; + } + at = self.input.at(at.next_pos()); + } + matched + } + + /// The main backtracking loop starting at the given input position. + fn backtrack(&mut self, start: InputAt) -> bool { + // N.B. We use an explicit stack to avoid recursion. + // To avoid excessive pushing and popping, most transitions are handled + // in the `step` helper function, which only pushes to the stack when + // there's a capture or a branch. + let mut matched = false; + self.m.jobs.push(Job::Inst { ip: 0, at: start }); + while let Some(job) = self.m.jobs.pop() { + match job { + Job::Inst { ip, at } => { + if self.step(ip, at) { + // Only quit if we're matching one regex. + // If we're matching a regex set, then mush on and + // try to find other matches (if we want them). + if self.prog.matches.len() == 1 { + return true; + } + matched = true; + } + } + Job::SaveRestore { slot, old_pos } => { + if slot < self.slots.len() { + self.slots[slot] = old_pos; + } + } + } + } + matched + } + + fn step(&mut self, mut ip: InstPtr, mut at: InputAt) -> bool { + use prog::Inst::*; + loop { + // This loop is an optimization to avoid constantly pushing/popping + // from the stack. Namely, if we're pushing a job only to run it + // next, avoid the push and just mutate `ip` (and possibly `at`) + // in place. + if self.has_visited(ip, at) { + return false; + } + match self.prog[ip] { + Match(slot) => { + if slot < self.matches.len() { + self.matches[slot] = true; + } + return true; + } + Save(ref inst) => { + if let Some(&old_pos) = self.slots.get(inst.slot) { + // If this path doesn't work out, then we save the old + // capture index (if one exists) in an alternate + // job. If the next path fails, then the alternate + // job is popped and the old capture index is restored. + self.m.jobs.push(Job::SaveRestore { + slot: inst.slot, + old_pos: old_pos, + }); + self.slots[inst.slot] = Some(at.pos()); + } + ip = inst.goto; + } + Split(ref inst) => { + self.m.jobs.push(Job::Inst { ip: inst.goto2, at: at }); + ip = inst.goto1; + } + EmptyLook(ref inst) => { + if self.input.is_empty_match(at, inst) { + ip = inst.goto; + } else { + return false; + } + } + Char(ref inst) => { + if inst.c == at.char() { + ip = inst.goto; + at = self.input.at(at.next_pos()); + } else { + return false; + } + } + Ranges(ref inst) => { + if inst.matches(at.char()) { + ip = inst.goto; + at = self.input.at(at.next_pos()); + } else { + return false; + } + } + Bytes(ref inst) => { + if let Some(b) = at.byte() { + if inst.matches(b) { + ip = inst.goto; + at = self.input.at(at.next_pos()); + continue; + } + } + return false; + } + } + } + } + + fn has_visited(&mut self, ip: InstPtr, at: InputAt) -> bool { + let k = ip * (self.input.len() + 1) + at.pos(); + let k1 = k / BIT_SIZE; + let k2 = usize_to_u32(1 << (k & (BIT_SIZE - 1))); + if self.m.visited[k1] & k2 == 0 { + self.m.visited[k1] |= k2; + false + } else { + true + } + } +} + +fn usize_to_u32(n: usize) -> u32 { + if (n as u64) > (::std::u32::MAX as u64) { + panic!("BUG: {} is too big to fit into u32", n) + } + n as u32 +} |