summaryrefslogtreecommitdiffstats
path: root/vendor/regex-automata/src/dfa/determinize.rs
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/regex-automata/src/dfa/determinize.rs')
-rw-r--r--vendor/regex-automata/src/dfa/determinize.rs182
1 files changed, 117 insertions, 65 deletions
diff --git a/vendor/regex-automata/src/dfa/determinize.rs b/vendor/regex-automata/src/dfa/determinize.rs
index 61603481b..19f99f5d6 100644
--- a/vendor/regex-automata/src/dfa/determinize.rs
+++ b/vendor/regex-automata/src/dfa/determinize.rs
@@ -1,18 +1,18 @@
-use alloc::{
- collections::BTreeMap,
- vec::{self, Vec},
-};
+use alloc::{collections::BTreeMap, vec::Vec};
use crate::{
- dfa::{dense, Error, DEAD},
+ dfa::{
+ dense::{self, BuildError},
+ DEAD,
+ },
nfa::thompson,
util::{
self,
alphabet::{self, ByteSet},
determinize::{State, StateBuilderEmpty, StateBuilderNFA},
- id::{PatternID, StateID},
- matchtypes::MatchKind,
- sparse_set::{SparseSet, SparseSets},
+ primitives::{PatternID, StateID},
+ search::{Anchored, MatchKind},
+ sparse_set::SparseSets,
start::Start,
},
};
@@ -20,7 +20,6 @@ use crate::{
/// A builder for configuring and running a DFA determinizer.
#[derive(Clone, Debug)]
pub(crate) struct Config {
- anchored: bool,
match_kind: MatchKind,
quit: ByteSet,
dfa_size_limit: Option<usize>,
@@ -32,7 +31,6 @@ impl Config {
/// configured before calling `run`.
pub fn new() -> Config {
Config {
- anchored: false,
match_kind: MatchKind::LeftmostFirst,
quit: ByteSet::empty(),
dfa_size_limit: None,
@@ -48,7 +46,7 @@ impl Config {
&self,
nfa: &thompson::NFA,
dfa: &mut dense::OwnedDFA,
- ) -> Result<(), Error> {
+ ) -> Result<(), BuildError> {
let dead = State::dead();
let quit = State::dead();
let mut cache = StateMap::default();
@@ -71,21 +69,13 @@ impl Config {
builder_states: alloc::vec![dead, quit],
cache,
memory_usage_state: 0,
- sparses: SparseSets::new(nfa.len()),
+ sparses: SparseSets::new(nfa.states().len()),
stack: alloc::vec![],
scratch_state_builder: StateBuilderEmpty::new(),
};
runner.run()
}
- /// Whether to build an anchored DFA or not. When disabled (the default),
- /// the unanchored prefix from the NFA is used to start the DFA. Otherwise,
- /// the anchored start state of the NFA is used to start the DFA.
- pub fn anchored(&mut self, yes: bool) -> &mut Config {
- self.anchored = yes;
- self
- }
-
/// The match semantics to use for determinization.
///
/// MatchKind::All corresponds to the standard textbook construction.
@@ -222,20 +212,21 @@ impl<'a> Runner<'a> {
/// Build the DFA. If there was a problem constructing the DFA (e.g., if
/// the chosen state identifier representation is too small), then an error
/// is returned.
- fn run(mut self) -> Result<(), Error> {
- if self.nfa.has_word_boundary_unicode()
+ fn run(mut self) -> Result<(), BuildError> {
+ if self.nfa.look_set_any().contains_word_unicode()
&& !self.config.quit.contains_range(0x80, 0xFF)
{
- return Err(Error::unsupported_dfa_word_boundary_unicode());
+ return Err(BuildError::unsupported_dfa_word_boundary_unicode());
}
// A sequence of "representative" bytes drawn from each equivalence
// class. These representative bytes are fed to the NFA to compute
// state transitions. This allows us to avoid re-computing state
// transitions for bytes that are guaranteed to produce identical
- // results.
+ // results. Since computing the representatives needs to do a little
+ // work, we do it once here because we'll be iterating over them a lot.
let representatives: Vec<alphabet::Unit> =
- self.dfa.byte_classes().representatives().collect();
+ self.dfa.byte_classes().representatives(..).collect();
// The set of all DFA state IDs that still need to have their
// transitions set. We start by seeding this with all starting states.
let mut uncompiled = alloc::vec![];
@@ -259,10 +250,13 @@ impl<'a> Runner<'a> {
}
}
}
- trace!(
- "determinization complete, memory usage: {}, dense DFA size: {}",
+ debug!(
+ "determinization complete, memory usage: {}, \
+ dense DFA size: {}, \
+ is reverse? {}",
self.memory_usage(),
self.dfa.memory_usage(),
+ self.nfa.is_reverse(),
);
// A map from DFA state ID to one or more NFA match IDs. Each NFA match
@@ -270,21 +264,23 @@ impl<'a> Runner<'a> {
// corresponding to the key.
let mut matches: BTreeMap<StateID, Vec<PatternID>> = BTreeMap::new();
self.cache.clear();
- #[allow(unused_variables)]
- let mut total_pat_count = 0;
+ #[cfg(feature = "logging")]
+ let mut total_pat_len = 0;
for (i, state) in self.builder_states.into_iter().enumerate() {
if let Some(pat_ids) = state.match_pattern_ids() {
- let id = self.dfa.from_index(i);
- total_pat_count += pat_ids.len();
+ let id = self.dfa.to_state_id(i);
+ log! {
+ total_pat_len += pat_ids.len();
+ }
matches.insert(id, pat_ids);
}
}
log! {
use core::mem::size_of;
let per_elem = size_of::<StateID>() + size_of::<Vec<PatternID>>();
- let pats = total_pat_count * size_of::<PatternID>();
+ let pats = total_pat_len * size_of::<PatternID>();
let mem = (matches.len() * per_elem) + pats;
- log::trace!("matches map built, memory usage: {}", mem);
+ log::debug!("matches map built, memory usage: {}", mem);
}
// At this point, we shuffle the "special" states in the final DFA.
// This permits a DFA's match loop to detect a match condition (among
@@ -306,7 +302,7 @@ impl<'a> Runner<'a> {
&mut self,
dfa_id: StateID,
unit: alphabet::Unit,
- ) -> Result<(StateID, bool), Error> {
+ ) -> Result<(StateID, bool), BuildError> {
// Compute the set of all reachable NFA states, including epsilons.
let empty_builder = self.get_state_builder();
let builder = util::determinize::next(
@@ -326,15 +322,32 @@ impl<'a> Runner<'a> {
fn add_all_starts(
&mut self,
dfa_state_ids: &mut Vec<StateID>,
- ) -> Result<(), Error> {
- // Always add the (possibly unanchored) start states for matching any
- // of the patterns in this DFA.
- self.add_start_group(None, dfa_state_ids)?;
+ ) -> Result<(), BuildError> {
+ // These should be the first states added.
+ assert!(dfa_state_ids.is_empty());
+ // We only want to add (un)anchored starting states that is consistent
+ // with our DFA's configuration. Unconditionally adding both (although
+ // it is the default) can make DFAs quite a bit bigger.
+ if self.dfa.start_kind().has_unanchored() {
+ self.add_start_group(Anchored::No, dfa_state_ids)?;
+ }
+ if self.dfa.start_kind().has_anchored() {
+ self.add_start_group(Anchored::Yes, dfa_state_ids)?;
+ }
+ // I previously has an 'assert' here checking that either
+ // 'dfa_state_ids' was non-empty, or the NFA had zero patterns. But it
+ // turns out this isn't always true. For example, the NFA might have
+ // one or more patterns but where all such patterns are just 'fail'
+ // states. These will ultimately just compile down to DFA dead states,
+ // and since the dead state was added earlier, no new DFA states are
+ // added. And thus, it is valid and okay for 'dfa_state_ids' to be
+ // empty even if there are a non-zero number of patterns in the NFA.
+
// We only need to compute anchored start states for each pattern if it
// was requested to do so.
- if self.dfa.has_starts_for_each_pattern() {
- for pid in PatternID::iter(self.dfa.pattern_count()) {
- self.add_start_group(Some(pid), dfa_state_ids)?;
+ if self.dfa.starts_for_each_pattern() {
+ for pid in self.nfa.patterns() {
+ self.add_start_group(Anchored::Pattern(pid), dfa_state_ids)?;
}
}
Ok(())
@@ -348,15 +361,19 @@ impl<'a> Runner<'a> {
/// start states (if the DFA is unanchored). When the pattern_id is
/// present, then this will compile a group of anchored start states that
/// only match the given pattern.
+ ///
+ /// This panics if `anchored` corresponds to an invalid pattern ID.
fn add_start_group(
&mut self,
- pattern_id: Option<PatternID>,
+ anchored: Anchored,
dfa_state_ids: &mut Vec<StateID>,
- ) -> Result<(), Error> {
- let nfa_start = match pattern_id {
- Some(pid) => self.nfa.start_pattern(pid),
- None if self.config.anchored => self.nfa.start_anchored(),
- None => self.nfa.start_unanchored(),
+ ) -> Result<(), BuildError> {
+ let nfa_start = match anchored {
+ Anchored::No => self.nfa.start_unanchored(),
+ Anchored::Yes => self.nfa.start_anchored(),
+ Anchored::Pattern(pid) => {
+ self.nfa.start_pattern(pid).expect("valid pattern ID")
+ }
};
// When compiling start states, we're careful not to build additional
@@ -365,36 +382,68 @@ impl<'a> Runner<'a> {
// states for 'NonWordByte' and 'WordByte' starting configurations.
// Instead, the 'WordByte' starting configuration can just point
// directly to the start state for the 'NonWordByte' config.
+ //
+ // Note though that we only need to care about assertions in the prefix
+ // of an NFA since this only concerns the starting states. (Actually,
+ // the most precisely thing we could do it is look at the prefix
+ // assertions of each pattern when 'anchored == Anchored::Pattern',
+ // and then only compile extra states if the prefix is non-empty.) But
+ // we settle for simplicity here instead of absolute minimalism. It is
+ // somewhat rare, after all, for multiple patterns in the same regex to
+ // have different prefix look-arounds.
let (id, is_new) =
self.add_one_start(nfa_start, Start::NonWordByte)?;
- self.dfa.set_start_state(Start::NonWordByte, pattern_id, id);
+ self.dfa.set_start_state(anchored, Start::NonWordByte, id);
if is_new {
dfa_state_ids.push(id);
}
- if !self.nfa.has_word_boundary() {
- self.dfa.set_start_state(Start::WordByte, pattern_id, id);
+ if !self.nfa.look_set_prefix_any().contains_word() {
+ self.dfa.set_start_state(anchored, Start::WordByte, id);
} else {
let (id, is_new) =
self.add_one_start(nfa_start, Start::WordByte)?;
- self.dfa.set_start_state(Start::WordByte, pattern_id, id);
+ self.dfa.set_start_state(anchored, Start::WordByte, id);
if is_new {
dfa_state_ids.push(id);
}
}
- if !self.nfa.has_any_anchor() {
- self.dfa.set_start_state(Start::Text, pattern_id, id);
- self.dfa.set_start_state(Start::Line, pattern_id, id);
+ if !self.nfa.look_set_prefix_any().contains_anchor() {
+ self.dfa.set_start_state(anchored, Start::Text, id);
+ self.dfa.set_start_state(anchored, Start::LineLF, id);
+ self.dfa.set_start_state(anchored, Start::LineCR, id);
+ self.dfa.set_start_state(
+ anchored,
+ Start::CustomLineTerminator,
+ id,
+ );
} else {
let (id, is_new) = self.add_one_start(nfa_start, Start::Text)?;
- self.dfa.set_start_state(Start::Text, pattern_id, id);
+ self.dfa.set_start_state(anchored, Start::Text, id);
+ if is_new {
+ dfa_state_ids.push(id);
+ }
+
+ let (id, is_new) = self.add_one_start(nfa_start, Start::LineLF)?;
+ self.dfa.set_start_state(anchored, Start::LineLF, id);
if is_new {
dfa_state_ids.push(id);
}
- let (id, is_new) = self.add_one_start(nfa_start, Start::Line)?;
- self.dfa.set_start_state(Start::Line, pattern_id, id);
+ let (id, is_new) = self.add_one_start(nfa_start, Start::LineCR)?;
+ self.dfa.set_start_state(anchored, Start::LineCR, id);
+ if is_new {
+ dfa_state_ids.push(id);
+ }
+
+ let (id, is_new) =
+ self.add_one_start(nfa_start, Start::CustomLineTerminator)?;
+ self.dfa.set_start_state(
+ anchored,
+ Start::CustomLineTerminator,
+ id,
+ );
if is_new {
dfa_state_ids.push(id);
}
@@ -414,13 +463,14 @@ impl<'a> Runner<'a> {
&mut self,
nfa_start: StateID,
start: Start,
- ) -> Result<(StateID, bool), Error> {
+ ) -> Result<(StateID, bool), BuildError> {
// Compute the look-behind assertions that are true in this starting
// configuration, and the determine the epsilon closure. While
// computing the epsilon closure, we only follow condiional epsilon
- // transitions that satisfy the look-behind assertions in 'facts'.
+ // transitions that satisfy the look-behind assertions in 'look_have'.
let mut builder_matches = self.get_state_builder().into_matches();
util::determinize::set_lookbehind_from_start(
+ self.nfa,
&start,
&mut builder_matches,
);
@@ -428,7 +478,7 @@ impl<'a> Runner<'a> {
util::determinize::epsilon_closure(
self.nfa,
nfa_start,
- *builder_matches.look_have(),
+ builder_matches.look_have(),
&mut self.stack,
&mut self.sparses.set1,
);
@@ -455,7 +505,7 @@ impl<'a> Runner<'a> {
fn maybe_add_state(
&mut self,
builder: StateBuilderNFA,
- ) -> Result<(StateID, bool), Error> {
+ ) -> Result<(StateID, bool), BuildError> {
if let Some(&cached_id) = self.cache.get(builder.as_bytes()) {
// Since we have a cached state, put the constructed state's
// memory back into our scratch space, so that it can be reused.
@@ -476,7 +526,7 @@ impl<'a> Runner<'a> {
fn add_state(
&mut self,
builder: StateBuilderNFA,
- ) -> Result<StateID, Error> {
+ ) -> Result<StateID, BuildError> {
let id = self.dfa.add_empty_state()?;
if !self.config.quit.is_empty() {
for b in self.config.quit.iter() {
@@ -489,19 +539,21 @@ impl<'a> Runner<'a> {
}
let state = builder.to_state();
// States use reference counting internally, so we only need to count
- // their memroy usage once.
+ // their memory usage once.
self.memory_usage_state += state.memory_usage();
self.builder_states.push(state.clone());
self.cache.insert(state, id);
self.put_state_builder(builder);
if let Some(limit) = self.config.dfa_size_limit {
if self.dfa.memory_usage() > limit {
- return Err(Error::dfa_exceeded_size_limit(limit));
+ return Err(BuildError::dfa_exceeded_size_limit(limit));
}
}
if let Some(limit) = self.config.determinize_size_limit {
if self.memory_usage() > limit {
- return Err(Error::determinize_exceeded_size_limit(limit));
+ return Err(BuildError::determinize_exceeded_size_limit(
+ limit,
+ ));
}
}
Ok(id)