summaryrefslogtreecommitdiffstats
path: root/vendor/regex-automata/src/determinize.rs
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-17 12:02:58 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-17 12:02:58 +0000
commit698f8c2f01ea549d77d7dc3338a12e04c11057b9 (patch)
tree173a775858bd501c378080a10dca74132f05bc50 /vendor/regex-automata/src/determinize.rs
parentInitial commit. (diff)
downloadrustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.tar.xz
rustc-698f8c2f01ea549d77d7dc3338a12e04c11057b9.zip
Adding upstream version 1.64.0+dfsg1.upstream/1.64.0+dfsg1
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/regex-automata/src/determinize.rs')
-rw-r--r--vendor/regex-automata/src/determinize.rs286
1 files changed, 286 insertions, 0 deletions
diff --git a/vendor/regex-automata/src/determinize.rs b/vendor/regex-automata/src/determinize.rs
new file mode 100644
index 000000000..cf0c28585
--- /dev/null
+++ b/vendor/regex-automata/src/determinize.rs
@@ -0,0 +1,286 @@
+use std::collections::HashMap;
+use std::mem;
+use std::rc::Rc;
+
+use dense;
+use error::Result;
+use nfa::{self, NFA};
+use sparse_set::SparseSet;
+use state_id::{dead_id, StateID};
+
+type DFARepr<S> = dense::Repr<Vec<S>, S>;
+
+/// A determinizer converts an NFA to a DFA.
+///
+/// This determinizer follows the typical powerset construction, where each
+/// DFA state is comprised of one or more NFA states. In the worst case, there
+/// is one DFA state for every possible combination of NFA states. In practice,
+/// this only happens in certain conditions, typically when there are bounded
+/// repetitions.
+///
+/// The type variable `S` refers to the chosen state identifier representation
+/// used for the DFA.
+///
+/// The lifetime variable `'a` refers to the lifetime of the NFA being
+/// converted to a DFA.
+#[derive(Debug)]
+pub(crate) struct Determinizer<'a, S: StateID> {
+ /// The NFA we're converting into a DFA.
+ nfa: &'a NFA,
+ /// The DFA we're building.
+ dfa: DFARepr<S>,
+ /// Each DFA state being built is defined as an *ordered* set of NFA
+ /// states, along with a flag indicating whether the state is a match
+ /// state or not.
+ ///
+ /// This is never empty. The first state is always a dummy state such that
+ /// a state id == 0 corresponds to a dead state.
+ builder_states: Vec<Rc<State>>,
+ /// A cache of DFA states that already exist and can be easily looked up
+ /// via ordered sets of NFA states.
+ cache: HashMap<Rc<State>, S>,
+ /// Scratch space for a stack of NFA states to visit, for depth first
+ /// visiting without recursion.
+ stack: Vec<nfa::StateID>,
+ /// Scratch space for storing an ordered sequence of NFA states, for
+ /// amortizing allocation.
+ scratch_nfa_states: Vec<nfa::StateID>,
+ /// Whether to build a DFA that finds the longest possible match.
+ longest_match: bool,
+}
+
+/// An intermediate representation for a DFA state during determinization.
+#[derive(Debug, Eq, Hash, PartialEq)]
+struct State {
+ /// Whether this state is a match state or not.
+ is_match: bool,
+ /// An ordered sequence of NFA states that make up this DFA state.
+ nfa_states: Vec<nfa::StateID>,
+}
+
+impl<'a, S: StateID> Determinizer<'a, S> {
+ /// Create a new determinizer for converting the given NFA to a DFA.
+ pub fn new(nfa: &'a NFA) -> Determinizer<'a, S> {
+ let dead = Rc::new(State::dead());
+ let mut cache = HashMap::default();
+ cache.insert(dead.clone(), dead_id());
+
+ Determinizer {
+ nfa,
+ dfa: DFARepr::empty().anchored(nfa.is_anchored()),
+ builder_states: vec![dead],
+ cache,
+ stack: vec![],
+ scratch_nfa_states: vec![],
+ longest_match: false,
+ }
+ }
+
+ /// Instruct the determinizer to use equivalence classes as the transition
+ /// alphabet instead of all possible byte values.
+ pub fn with_byte_classes(mut self) -> Determinizer<'a, S> {
+ let byte_classes = self.nfa.byte_classes().clone();
+ self.dfa = DFARepr::empty_with_byte_classes(byte_classes)
+ .anchored(self.nfa.is_anchored());
+ self
+ }
+
+ /// Instruct the determinizer to build a DFA that recognizes the longest
+ /// possible match instead of the leftmost first match. This is useful when
+ /// constructing reverse DFAs for finding the start of a match.
+ pub fn longest_match(mut self, yes: bool) -> Determinizer<'a, S> {
+ self.longest_match = yes;
+ self
+ }
+
+ /// Build the DFA. If there was a problem constructing the DFA (e.g., if
+ /// the chosen state identifier representation is too small), then an error
+ /// is returned.
+ pub fn build(mut self) -> Result<DFARepr<S>> {
+ let representative_bytes: Vec<u8> =
+ self.dfa.byte_classes().representatives().collect();
+ let mut sparse = self.new_sparse_set();
+ let mut uncompiled = vec![self.add_start(&mut sparse)?];
+ while let Some(dfa_id) = uncompiled.pop() {
+ for &b in &representative_bytes {
+ let (next_dfa_id, is_new) =
+ self.cached_state(dfa_id, b, &mut sparse)?;
+ self.dfa.add_transition(dfa_id, b, next_dfa_id);
+ if is_new {
+ uncompiled.push(next_dfa_id);
+ }
+ }
+ }
+
+ // At this point, we shuffle the matching states in the final DFA to
+ // the beginning. This permits a DFA's match loop to detect a match
+ // condition by merely inspecting the current state's identifier, and
+ // avoids the need for any additional auxiliary storage.
+ let is_match: Vec<bool> =
+ self.builder_states.iter().map(|s| s.is_match).collect();
+ self.dfa.shuffle_match_states(&is_match);
+ Ok(self.dfa)
+ }
+
+ /// Return the identifier for the next DFA state given an existing DFA
+ /// state and an input byte. If the next DFA state already exists, then
+ /// return its identifier from the cache. Otherwise, build the state, cache
+ /// it and return its identifier.
+ ///
+ /// The given sparse set is used for scratch space. It must have a capacity
+ /// equivalent to the total number of NFA states, but its contents are
+ /// otherwise unspecified.
+ ///
+ /// This routine returns a boolean indicating whether a new state was
+ /// built. If a new state is built, then the caller needs to add it to its
+ /// frontier of uncompiled DFA states to compute transitions for.
+ fn cached_state(
+ &mut self,
+ dfa_id: S,
+ b: u8,
+ sparse: &mut SparseSet,
+ ) -> Result<(S, bool)> {
+ sparse.clear();
+ // Compute the set of all reachable NFA states, including epsilons.
+ self.next(dfa_id, b, sparse);
+ // Build a candidate state and check if it has already been built.
+ let state = self.new_state(sparse);
+ if let Some(&cached_id) = self.cache.get(&state) {
+ // Since we have a cached state, put the constructed state's
+ // memory back into our scratch space, so that it can be reused.
+ let _ =
+ mem::replace(&mut self.scratch_nfa_states, state.nfa_states);
+ return Ok((cached_id, false));
+ }
+ // Nothing was in the cache, so add this state to the cache.
+ self.add_state(state).map(|s| (s, true))
+ }
+
+ /// Compute the set of all eachable NFA states, including the full epsilon
+ /// closure, from a DFA state for a single byte of input.
+ fn next(&mut self, dfa_id: S, b: u8, next_nfa_states: &mut SparseSet) {
+ next_nfa_states.clear();
+ for i in 0..self.builder_states[dfa_id.to_usize()].nfa_states.len() {
+ let nfa_id = self.builder_states[dfa_id.to_usize()].nfa_states[i];
+ match *self.nfa.state(nfa_id) {
+ nfa::State::Union { .. }
+ | nfa::State::Fail
+ | nfa::State::Match => {}
+ nfa::State::Range { range: ref r } => {
+ if r.start <= b && b <= r.end {
+ self.epsilon_closure(r.next, next_nfa_states);
+ }
+ }
+ nfa::State::Sparse { ref ranges } => {
+ for r in ranges.iter() {
+ if r.start > b {
+ break;
+ } else if r.start <= b && b <= r.end {
+ self.epsilon_closure(r.next, next_nfa_states);
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ /// Compute the epsilon closure for the given NFA state.
+ fn epsilon_closure(&mut self, start: nfa::StateID, set: &mut SparseSet) {
+ if !self.nfa.state(start).is_epsilon() {
+ set.insert(start);
+ return;
+ }
+
+ self.stack.push(start);
+ while let Some(mut id) = self.stack.pop() {
+ loop {
+ if set.contains(id) {
+ break;
+ }
+ set.insert(id);
+ match *self.nfa.state(id) {
+ nfa::State::Range { .. }
+ | nfa::State::Sparse { .. }
+ | nfa::State::Fail
+ | nfa::State::Match => break,
+ nfa::State::Union { ref alternates } => {
+ id = match alternates.get(0) {
+ None => break,
+ Some(&id) => id,
+ };
+ self.stack.extend(alternates[1..].iter().rev());
+ }
+ }
+ }
+ }
+ }
+
+ /// Compute the initial DFA state and return its identifier.
+ ///
+ /// The sparse set given is used for scratch space, and must have capacity
+ /// equal to the total number of NFA states. Its contents are unspecified.
+ fn add_start(&mut self, sparse: &mut SparseSet) -> Result<S> {
+ sparse.clear();
+ self.epsilon_closure(self.nfa.start(), sparse);
+ let state = self.new_state(&sparse);
+ let id = self.add_state(state)?;
+ self.dfa.set_start_state(id);
+ Ok(id)
+ }
+
+ /// Add the given state to the DFA and make it available in the cache.
+ ///
+ /// The state initially has no transitions. That is, it transitions to the
+ /// dead state for all possible inputs.
+ fn add_state(&mut self, state: State) -> Result<S> {
+ let id = self.dfa.add_empty_state()?;
+ let rstate = Rc::new(state);
+ self.builder_states.push(rstate.clone());
+ self.cache.insert(rstate, id);
+ Ok(id)
+ }
+
+ /// Convert the given set of ordered NFA states to a DFA state.
+ fn new_state(&mut self, set: &SparseSet) -> State {
+ let mut state = State {
+ is_match: false,
+ nfa_states: mem::replace(&mut self.scratch_nfa_states, vec![]),
+ };
+ state.nfa_states.clear();
+
+ for &id in set {
+ match *self.nfa.state(id) {
+ nfa::State::Range { .. } => {
+ state.nfa_states.push(id);
+ }
+ nfa::State::Sparse { .. } => {
+ state.nfa_states.push(id);
+ }
+ nfa::State::Fail => {
+ break;
+ }
+ nfa::State::Match => {
+ state.is_match = true;
+ if !self.longest_match {
+ break;
+ }
+ }
+ nfa::State::Union { .. } => {}
+ }
+ }
+ state
+ }
+
+ /// Create a new sparse set with enough capacity to hold all NFA states.
+ fn new_sparse_set(&self) -> SparseSet {
+ SparseSet::new(self.nfa.len())
+ }
+}
+
+impl State {
+ /// Create a new empty dead state.
+ fn dead() -> State {
+ State { nfa_states: vec![], is_match: false }
+ }
+}