From 698f8c2f01ea549d77d7dc3338a12e04c11057b9 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 17 Apr 2024 14:02:58 +0200 Subject: Adding upstream version 1.64.0+dfsg1. Signed-off-by: Daniel Baumann --- vendor/regex-automata/src/determinize.rs | 286 +++++++++++++++++++++++++++++++ 1 file changed, 286 insertions(+) create mode 100644 vendor/regex-automata/src/determinize.rs (limited to 'vendor/regex-automata/src/determinize.rs') diff --git a/vendor/regex-automata/src/determinize.rs b/vendor/regex-automata/src/determinize.rs new file mode 100644 index 000000000..cf0c28585 --- /dev/null +++ b/vendor/regex-automata/src/determinize.rs @@ -0,0 +1,286 @@ +use std::collections::HashMap; +use std::mem; +use std::rc::Rc; + +use dense; +use error::Result; +use nfa::{self, NFA}; +use sparse_set::SparseSet; +use state_id::{dead_id, StateID}; + +type DFARepr = dense::Repr, S>; + +/// A determinizer converts an NFA to a DFA. +/// +/// This determinizer follows the typical powerset construction, where each +/// DFA state is comprised of one or more NFA states. In the worst case, there +/// is one DFA state for every possible combination of NFA states. In practice, +/// this only happens in certain conditions, typically when there are bounded +/// repetitions. +/// +/// The type variable `S` refers to the chosen state identifier representation +/// used for the DFA. +/// +/// The lifetime variable `'a` refers to the lifetime of the NFA being +/// converted to a DFA. +#[derive(Debug)] +pub(crate) struct Determinizer<'a, S: StateID> { + /// The NFA we're converting into a DFA. + nfa: &'a NFA, + /// The DFA we're building. + dfa: DFARepr, + /// Each DFA state being built is defined as an *ordered* set of NFA + /// states, along with a flag indicating whether the state is a match + /// state or not. + /// + /// This is never empty. The first state is always a dummy state such that + /// a state id == 0 corresponds to a dead state. + builder_states: Vec>, + /// A cache of DFA states that already exist and can be easily looked up + /// via ordered sets of NFA states. + cache: HashMap, S>, + /// Scratch space for a stack of NFA states to visit, for depth first + /// visiting without recursion. + stack: Vec, + /// Scratch space for storing an ordered sequence of NFA states, for + /// amortizing allocation. + scratch_nfa_states: Vec, + /// Whether to build a DFA that finds the longest possible match. + longest_match: bool, +} + +/// An intermediate representation for a DFA state during determinization. +#[derive(Debug, Eq, Hash, PartialEq)] +struct State { + /// Whether this state is a match state or not. + is_match: bool, + /// An ordered sequence of NFA states that make up this DFA state. + nfa_states: Vec, +} + +impl<'a, S: StateID> Determinizer<'a, S> { + /// Create a new determinizer for converting the given NFA to a DFA. + pub fn new(nfa: &'a NFA) -> Determinizer<'a, S> { + let dead = Rc::new(State::dead()); + let mut cache = HashMap::default(); + cache.insert(dead.clone(), dead_id()); + + Determinizer { + nfa, + dfa: DFARepr::empty().anchored(nfa.is_anchored()), + builder_states: vec![dead], + cache, + stack: vec![], + scratch_nfa_states: vec![], + longest_match: false, + } + } + + /// Instruct the determinizer to use equivalence classes as the transition + /// alphabet instead of all possible byte values. + pub fn with_byte_classes(mut self) -> Determinizer<'a, S> { + let byte_classes = self.nfa.byte_classes().clone(); + self.dfa = DFARepr::empty_with_byte_classes(byte_classes) + .anchored(self.nfa.is_anchored()); + self + } + + /// Instruct the determinizer to build a DFA that recognizes the longest + /// possible match instead of the leftmost first match. This is useful when + /// constructing reverse DFAs for finding the start of a match. + pub fn longest_match(mut self, yes: bool) -> Determinizer<'a, S> { + self.longest_match = yes; + self + } + + /// Build the DFA. If there was a problem constructing the DFA (e.g., if + /// the chosen state identifier representation is too small), then an error + /// is returned. + pub fn build(mut self) -> Result> { + let representative_bytes: Vec = + self.dfa.byte_classes().representatives().collect(); + let mut sparse = self.new_sparse_set(); + let mut uncompiled = vec![self.add_start(&mut sparse)?]; + while let Some(dfa_id) = uncompiled.pop() { + for &b in &representative_bytes { + let (next_dfa_id, is_new) = + self.cached_state(dfa_id, b, &mut sparse)?; + self.dfa.add_transition(dfa_id, b, next_dfa_id); + if is_new { + uncompiled.push(next_dfa_id); + } + } + } + + // At this point, we shuffle the matching states in the final DFA to + // the beginning. This permits a DFA's match loop to detect a match + // condition by merely inspecting the current state's identifier, and + // avoids the need for any additional auxiliary storage. + let is_match: Vec = + self.builder_states.iter().map(|s| s.is_match).collect(); + self.dfa.shuffle_match_states(&is_match); + Ok(self.dfa) + } + + /// Return the identifier for the next DFA state given an existing DFA + /// state and an input byte. If the next DFA state already exists, then + /// return its identifier from the cache. Otherwise, build the state, cache + /// it and return its identifier. + /// + /// The given sparse set is used for scratch space. It must have a capacity + /// equivalent to the total number of NFA states, but its contents are + /// otherwise unspecified. + /// + /// This routine returns a boolean indicating whether a new state was + /// built. If a new state is built, then the caller needs to add it to its + /// frontier of uncompiled DFA states to compute transitions for. + fn cached_state( + &mut self, + dfa_id: S, + b: u8, + sparse: &mut SparseSet, + ) -> Result<(S, bool)> { + sparse.clear(); + // Compute the set of all reachable NFA states, including epsilons. + self.next(dfa_id, b, sparse); + // Build a candidate state and check if it has already been built. + let state = self.new_state(sparse); + if let Some(&cached_id) = self.cache.get(&state) { + // Since we have a cached state, put the constructed state's + // memory back into our scratch space, so that it can be reused. + let _ = + mem::replace(&mut self.scratch_nfa_states, state.nfa_states); + return Ok((cached_id, false)); + } + // Nothing was in the cache, so add this state to the cache. + self.add_state(state).map(|s| (s, true)) + } + + /// Compute the set of all eachable NFA states, including the full epsilon + /// closure, from a DFA state for a single byte of input. + fn next(&mut self, dfa_id: S, b: u8, next_nfa_states: &mut SparseSet) { + next_nfa_states.clear(); + for i in 0..self.builder_states[dfa_id.to_usize()].nfa_states.len() { + let nfa_id = self.builder_states[dfa_id.to_usize()].nfa_states[i]; + match *self.nfa.state(nfa_id) { + nfa::State::Union { .. } + | nfa::State::Fail + | nfa::State::Match => {} + nfa::State::Range { range: ref r } => { + if r.start <= b && b <= r.end { + self.epsilon_closure(r.next, next_nfa_states); + } + } + nfa::State::Sparse { ref ranges } => { + for r in ranges.iter() { + if r.start > b { + break; + } else if r.start <= b && b <= r.end { + self.epsilon_closure(r.next, next_nfa_states); + break; + } + } + } + } + } + } + + /// Compute the epsilon closure for the given NFA state. + fn epsilon_closure(&mut self, start: nfa::StateID, set: &mut SparseSet) { + if !self.nfa.state(start).is_epsilon() { + set.insert(start); + return; + } + + self.stack.push(start); + while let Some(mut id) = self.stack.pop() { + loop { + if set.contains(id) { + break; + } + set.insert(id); + match *self.nfa.state(id) { + nfa::State::Range { .. } + | nfa::State::Sparse { .. } + | nfa::State::Fail + | nfa::State::Match => break, + nfa::State::Union { ref alternates } => { + id = match alternates.get(0) { + None => break, + Some(&id) => id, + }; + self.stack.extend(alternates[1..].iter().rev()); + } + } + } + } + } + + /// Compute the initial DFA state and return its identifier. + /// + /// The sparse set given is used for scratch space, and must have capacity + /// equal to the total number of NFA states. Its contents are unspecified. + fn add_start(&mut self, sparse: &mut SparseSet) -> Result { + sparse.clear(); + self.epsilon_closure(self.nfa.start(), sparse); + let state = self.new_state(&sparse); + let id = self.add_state(state)?; + self.dfa.set_start_state(id); + Ok(id) + } + + /// Add the given state to the DFA and make it available in the cache. + /// + /// The state initially has no transitions. That is, it transitions to the + /// dead state for all possible inputs. + fn add_state(&mut self, state: State) -> Result { + let id = self.dfa.add_empty_state()?; + let rstate = Rc::new(state); + self.builder_states.push(rstate.clone()); + self.cache.insert(rstate, id); + Ok(id) + } + + /// Convert the given set of ordered NFA states to a DFA state. + fn new_state(&mut self, set: &SparseSet) -> State { + let mut state = State { + is_match: false, + nfa_states: mem::replace(&mut self.scratch_nfa_states, vec![]), + }; + state.nfa_states.clear(); + + for &id in set { + match *self.nfa.state(id) { + nfa::State::Range { .. } => { + state.nfa_states.push(id); + } + nfa::State::Sparse { .. } => { + state.nfa_states.push(id); + } + nfa::State::Fail => { + break; + } + nfa::State::Match => { + state.is_match = true; + if !self.longest_match { + break; + } + } + nfa::State::Union { .. } => {} + } + } + state + } + + /// Create a new sparse set with enough capacity to hold all NFA states. + fn new_sparse_set(&self) -> SparseSet { + SparseSet::new(self.nfa.len()) + } +} + +impl State { + /// Create a new empty dead state. + fn dead() -> State { + State { nfa_states: vec![], is_match: false } + } +} -- cgit v1.2.3