summaryrefslogtreecommitdiffstats
path: root/third_party/rust/regex/src/backtrack.rs
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/rust/regex/src/backtrack.rs')
-rw-r--r--third_party/rust/regex/src/backtrack.rs282
1 files changed, 282 insertions, 0 deletions
diff --git a/third_party/rust/regex/src/backtrack.rs b/third_party/rust/regex/src/backtrack.rs
new file mode 100644
index 0000000000..4d83856ca0
--- /dev/null
+++ b/third_party/rust/regex/src/backtrack.rs
@@ -0,0 +1,282 @@
+// This is the backtracking matching engine. It has the same exact capability
+// as the full NFA simulation, except it is artificially restricted to small
+// regexes on small inputs because of its memory requirements.
+//
+// In particular, this is a *bounded* backtracking engine. It retains worst
+// case linear time by keeping track of the states that it has visited (using a
+// bitmap). Namely, once a state is visited, it is never visited again. Since a
+// state is keyed by `(instruction index, input index)`, we have that its time
+// complexity is `O(mn)` (i.e., linear in the size of the search text).
+//
+// The backtracking engine can beat out the NFA simulation on small
+// regexes/inputs because it doesn't have to keep track of multiple copies of
+// the capture groups. In benchmarks, the backtracking engine is roughly twice
+// as fast as the full NFA simulation. Note though that its performance doesn't
+// scale, even if you're willing to live with the memory requirements. Namely,
+// the bitset has to be zeroed on each execution, which becomes quite expensive
+// on large bitsets.
+
+use crate::exec::ProgramCache;
+use crate::input::{Input, InputAt};
+use crate::prog::{InstPtr, Program};
+use crate::re_trait::Slot;
+
+type Bits = u32;
+
+const BIT_SIZE: usize = 32;
+const MAX_SIZE_BYTES: usize = 256 * (1 << 10); // 256 KB
+
+/// Returns true iff the given regex and input should be executed by this
+/// engine with reasonable memory usage.
+pub fn should_exec(num_insts: usize, text_len: usize) -> bool {
+ // Total memory usage in bytes is determined by:
+ //
+ // ((len(insts) * (len(input) + 1) + bits - 1) / bits) * (size_of(u32))
+ //
+ // The actual limit picked is pretty much a heuristic.
+ // See: https://github.com/rust-lang/regex/issues/215
+ let size = ((num_insts * (text_len + 1) + BIT_SIZE - 1) / BIT_SIZE) * 4;
+ size <= MAX_SIZE_BYTES
+}
+
+/// A backtracking matching engine.
+#[derive(Debug)]
+pub struct Bounded<'a, 'm, 'r, 's, I> {
+ prog: &'r Program,
+ input: I,
+ matches: &'m mut [bool],
+ slots: &'s mut [Slot],
+ m: &'a mut Cache,
+}
+
+/// Shared cached state between multiple invocations of a backtracking engine
+/// in the same thread.
+#[derive(Clone, Debug)]
+pub struct Cache {
+ jobs: Vec<Job>,
+ visited: Vec<Bits>,
+}
+
+impl Cache {
+ /// Create new empty cache for the backtracking engine.
+ pub fn new(_prog: &Program) -> Self {
+ Cache { jobs: vec![], visited: vec![] }
+ }
+}
+
+/// A job is an explicit unit of stack space in the backtracking engine.
+///
+/// The "normal" representation is a single state transition, which corresponds
+/// to an NFA state and a character in the input. However, the backtracking
+/// engine must keep track of old capture group values. We use the explicit
+/// stack to do it.
+#[derive(Clone, Copy, Debug)]
+enum Job {
+ Inst { ip: InstPtr, at: InputAt },
+ SaveRestore { slot: usize, old_pos: Option<usize> },
+}
+
+impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
+ /// Execute the backtracking matching engine.
+ ///
+ /// If there's a match, `exec` returns `true` and populates the given
+ /// captures accordingly.
+ pub fn exec(
+ prog: &'r Program,
+ cache: &ProgramCache,
+ matches: &'m mut [bool],
+ slots: &'s mut [Slot],
+ input: I,
+ start: usize,
+ end: usize,
+ ) -> bool {
+ let mut cache = cache.borrow_mut();
+ let cache = &mut cache.backtrack;
+ let start = input.at(start);
+ let mut b = Bounded { prog, input, matches, slots, m: cache };
+ b.exec_(start, end)
+ }
+
+ /// Clears the cache such that the backtracking engine can be executed
+ /// on some input of fixed length.
+ fn clear(&mut self) {
+ // Reset the job memory so that we start fresh.
+ self.m.jobs.clear();
+
+ // Now we need to clear the bit state set.
+ // We do this by figuring out how much space we need to keep track
+ // of the states we've visited.
+ // Then we reset all existing allocated space to 0.
+ // Finally, we request more space if we need it.
+ //
+ // This is all a little circuitous, but doing this using unchecked
+ // operations doesn't seem to have a measurable impact on performance.
+ // (Probably because backtracking is limited to such small
+ // inputs/regexes in the first place.)
+ let visited_len =
+ (self.prog.len() * (self.input.len() + 1) + BIT_SIZE - 1)
+ / BIT_SIZE;
+ self.m.visited.truncate(visited_len);
+ for v in &mut self.m.visited {
+ *v = 0;
+ }
+ if visited_len > self.m.visited.len() {
+ let len = self.m.visited.len();
+ self.m.visited.reserve_exact(visited_len - len);
+ for _ in 0..(visited_len - len) {
+ self.m.visited.push(0);
+ }
+ }
+ }
+
+ /// Start backtracking at the given position in the input, but also look
+ /// for literal prefixes.
+ fn exec_(&mut self, mut at: InputAt, end: usize) -> bool {
+ self.clear();
+ // If this is an anchored regex at the beginning of the input, then
+ // we're either already done or we only need to try backtracking once.
+ if self.prog.is_anchored_start {
+ return if !at.is_start() { false } else { self.backtrack(at) };
+ }
+ let mut matched = false;
+ loop {
+ if !self.prog.prefixes.is_empty() {
+ at = match self.input.prefix_at(&self.prog.prefixes, at) {
+ None => break,
+ Some(at) => at,
+ };
+ }
+ matched = self.backtrack(at) || matched;
+ if matched && self.prog.matches.len() == 1 {
+ return true;
+ }
+ if at.pos() >= end {
+ break;
+ }
+ at = self.input.at(at.next_pos());
+ }
+ matched
+ }
+
+ /// The main backtracking loop starting at the given input position.
+ fn backtrack(&mut self, start: InputAt) -> bool {
+ // N.B. We use an explicit stack to avoid recursion.
+ // To avoid excessive pushing and popping, most transitions are handled
+ // in the `step` helper function, which only pushes to the stack when
+ // there's a capture or a branch.
+ let mut matched = false;
+ self.m.jobs.push(Job::Inst { ip: 0, at: start });
+ while let Some(job) = self.m.jobs.pop() {
+ match job {
+ Job::Inst { ip, at } => {
+ if self.step(ip, at) {
+ // Only quit if we're matching one regex.
+ // If we're matching a regex set, then mush on and
+ // try to find other matches (if we want them).
+ if self.prog.matches.len() == 1 {
+ return true;
+ }
+ matched = true;
+ }
+ }
+ Job::SaveRestore { slot, old_pos } => {
+ if slot < self.slots.len() {
+ self.slots[slot] = old_pos;
+ }
+ }
+ }
+ }
+ matched
+ }
+
+ fn step(&mut self, mut ip: InstPtr, mut at: InputAt) -> bool {
+ use crate::prog::Inst::*;
+ loop {
+ // This loop is an optimization to avoid constantly pushing/popping
+ // from the stack. Namely, if we're pushing a job only to run it
+ // next, avoid the push and just mutate `ip` (and possibly `at`)
+ // in place.
+ if self.has_visited(ip, at) {
+ return false;
+ }
+ match self.prog[ip] {
+ Match(slot) => {
+ if slot < self.matches.len() {
+ self.matches[slot] = true;
+ }
+ return true;
+ }
+ Save(ref inst) => {
+ if let Some(&old_pos) = self.slots.get(inst.slot) {
+ // If this path doesn't work out, then we save the old
+ // capture index (if one exists) in an alternate
+ // job. If the next path fails, then the alternate
+ // job is popped and the old capture index is restored.
+ self.m.jobs.push(Job::SaveRestore {
+ slot: inst.slot,
+ old_pos,
+ });
+ self.slots[inst.slot] = Some(at.pos());
+ }
+ ip = inst.goto;
+ }
+ Split(ref inst) => {
+ self.m.jobs.push(Job::Inst { ip: inst.goto2, at });
+ ip = inst.goto1;
+ }
+ EmptyLook(ref inst) => {
+ if self.input.is_empty_match(at, inst) {
+ ip = inst.goto;
+ } else {
+ return false;
+ }
+ }
+ Char(ref inst) => {
+ if inst.c == at.char() {
+ ip = inst.goto;
+ at = self.input.at(at.next_pos());
+ } else {
+ return false;
+ }
+ }
+ Ranges(ref inst) => {
+ if inst.matches(at.char()) {
+ ip = inst.goto;
+ at = self.input.at(at.next_pos());
+ } else {
+ return false;
+ }
+ }
+ Bytes(ref inst) => {
+ if let Some(b) = at.byte() {
+ if inst.matches(b) {
+ ip = inst.goto;
+ at = self.input.at(at.next_pos());
+ continue;
+ }
+ }
+ return false;
+ }
+ }
+ }
+ }
+
+ fn has_visited(&mut self, ip: InstPtr, at: InputAt) -> bool {
+ let k = ip * (self.input.len() + 1) + at.pos();
+ let k1 = k / BIT_SIZE;
+ let k2 = usize_to_u32(1 << (k & (BIT_SIZE - 1)));
+ if self.m.visited[k1] & k2 == 0 {
+ self.m.visited[k1] |= k2;
+ false
+ } else {
+ true
+ }
+ }
+}
+
+fn usize_to_u32(n: usize) -> u32 {
+ if (n as u64) > (::std::u32::MAX as u64) {
+ panic!("BUG: {} is too big to fit into u32", n)
+ }
+ n as u32
+}