summaryrefslogtreecommitdiffstats
path: root/vendor/regex-automata-0.2.0/src/util/determinize/mod.rs
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-30 18:31:36 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-30 18:31:36 +0000
commite02c5b5930c2c9ba3e5423fe12e2ef0155017297 (patch)
treefd60ebbbb5299e16e5fca8c773ddb74f764760db /vendor/regex-automata-0.2.0/src/util/determinize/mod.rs
parentAdding debian version 1.73.0+dfsg1-1. (diff)
downloadrustc-e02c5b5930c2c9ba3e5423fe12e2ef0155017297.tar.xz
rustc-e02c5b5930c2c9ba3e5423fe12e2ef0155017297.zip
Merging upstream version 1.74.1+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/regex-automata-0.2.0/src/util/determinize/mod.rs')
-rw-r--r--vendor/regex-automata-0.2.0/src/util/determinize/mod.rs493
1 files changed, 493 insertions, 0 deletions
diff --git a/vendor/regex-automata-0.2.0/src/util/determinize/mod.rs b/vendor/regex-automata-0.2.0/src/util/determinize/mod.rs
new file mode 100644
index 000000000..b384de8e1
--- /dev/null
+++ b/vendor/regex-automata-0.2.0/src/util/determinize/mod.rs
@@ -0,0 +1,493 @@
+/*!
+This module contains types and routines for implementing determinization.
+
+In this crate, there are at least two places where we implement
+determinization: fully ahead-of-time compiled DFAs in the `dfa` module and
+lazily compiled DFAs in the `hybrid` module. The stuff in this module
+corresponds to the things that are in common between these implementations.
+
+There are three broad things that our implementations of determinization have
+in common, as defined by this module:
+
+* The classification of start states. That is, whether we're dealing with
+word boundaries, line boundaries, etc., is all the same. This also includes
+the look-behind assertions that are satisfied by each starting state
+classification.
+
+* The representation of DFA states as sets of NFA states, including
+convenience types for building these DFA states that are amenable to reusing
+allocations.
+
+* Routines for the "classical" parts of determinization: computing the
+epsilon closure, tracking match states (with corresponding pattern IDs, since
+we support multi-pattern finite automata) and, of course, computing the
+transition function between states for units of input.
+
+I did consider a couple of alternatives to this particular form of code reuse:
+
+1. Don't do any code reuse. The problem here is that we *really* want both
+forms of determinization to do exactly identical things when it comes to
+their handling of NFA states. While our tests generally ensure this, the code
+is tricky and large enough where not reusing code is a pretty big bummer.
+
+2. Implement all of determinization once and make it generic over fully
+compiled DFAs and lazily compiled DFAs. While I didn't actually try this
+approach, my instinct is that it would be more complex than is needed here.
+And the interface required would be pretty hairy. Instead, I think splitting
+it into logical sub-components works better.
+*/
+
+use alloc::vec::Vec;
+
+pub(crate) use self::state::{
+ State, StateBuilderEmpty, StateBuilderMatches, StateBuilderNFA,
+};
+
+use crate::{
+ nfa::thompson::{self, Look, LookSet},
+ util::{
+ alphabet,
+ id::StateID,
+ matchtypes::MatchKind,
+ sparse_set::{SparseSet, SparseSets},
+ start::Start,
+ },
+};
+
+mod state;
+
+/// Compute the set of all eachable NFA states, including the full epsilon
+/// closure, from a DFA state for a single unit of input. The set of reachable
+/// states is returned as a `StateBuilderNFA`. The `StateBuilderNFA` returned
+/// also includes any look-behind assertions satisfied by `unit`, in addition
+/// to whether it is a match state. For multi-pattern DFAs, the builder will
+/// also include the pattern IDs that match (in the order seen).
+///
+/// `nfa` must be able to resolve any NFA state in `state` and any NFA state
+/// reachable via the epsilon closure of any NFA state in `state`. `sparses`
+/// must have capacity equivalent to `nfa.len()`.
+///
+/// `match_kind` should correspond to the match semantics implemented by the
+/// DFA being built. Generally speaking, for leftmost-first match semantics,
+/// states that appear after the first NFA match state will not be included in
+/// the `StateBuilderNFA` returned since they are impossible to visit.
+///
+/// `sparses` is used as scratch space for NFA traversal. Other than their
+/// capacity requirements (detailed above), there are no requirements on what's
+/// contained within them (if anything). Similarly, what's inside of them once
+/// this routine returns is unspecified.
+///
+/// `stack` must have length 0. It is used as scratch space for depth first
+/// traversal. After returning, it is guaranteed that `stack` will have length
+/// 0.
+///
+/// `state` corresponds to the current DFA state on which one wants to compute
+/// the transition for the input `unit`.
+///
+/// `empty_builder` corresponds to the builder allocation to use to produce a
+/// complete `StateBuilderNFA` state. If the state is not needed (or is already
+/// cached), then it can be cleared and reused without needing to create a new
+/// `State`. The `StateBuilderNFA` state returned is final and ready to be
+/// turned into a `State` if necessary.
+pub(crate) fn next(
+ nfa: &thompson::NFA,
+ match_kind: MatchKind,
+ sparses: &mut SparseSets,
+ stack: &mut Vec<StateID>,
+ state: &State,
+ unit: alphabet::Unit,
+ empty_builder: StateBuilderEmpty,
+) -> StateBuilderNFA {
+ sparses.clear();
+
+ // Put the NFA state IDs into a sparse set in case we need to
+ // re-compute their epsilon closure.
+ //
+ // Doing this state shuffling is technically not necessary unless some
+ // kind of look-around is used in the DFA. Some ad hoc experiments
+ // suggested that avoiding this didn't lead to much of an improvement,
+ // but perhaps more rigorous experimentation should be done. And in
+ // particular, avoiding this check requires some light refactoring of
+ // the code below.
+ state.iter_nfa_state_ids(|nfa_id| {
+ sparses.set1.insert(nfa_id);
+ });
+
+ // Compute look-ahead assertions originating from the current state.
+ // Based on the input unit we're transitioning over, some additional
+ // set of assertions may be true. Thus, we re-compute this state's
+ // epsilon closure (but only if necessary).
+ if !state.look_need().is_empty() {
+ // Add look-ahead assertions that are now true based on the current
+ // input unit.
+ let mut look_have = state.look_have().clone();
+ match unit.as_u8() {
+ Some(b'\n') => {
+ look_have.insert(Look::EndLine);
+ }
+ Some(_) => {}
+ None => {
+ look_have.insert(Look::EndText);
+ look_have.insert(Look::EndLine);
+ }
+ }
+ if state.is_from_word() == unit.is_word_byte() {
+ look_have.insert(Look::WordBoundaryUnicodeNegate);
+ look_have.insert(Look::WordBoundaryAsciiNegate);
+ } else {
+ look_have.insert(Look::WordBoundaryUnicode);
+ look_have.insert(Look::WordBoundaryAscii);
+ }
+ // If we have new assertions satisfied that are among the set of
+ // assertions that exist in this state (that is, just because
+ // we added an EndLine assertion above doesn't mean there is an
+ // EndLine conditional epsilon transition in this state), then we
+ // re-compute this state's epsilon closure using the updated set of
+ // assertions.
+ if !look_have
+ .subtract(state.look_have())
+ .intersect(state.look_need())
+ .is_empty()
+ {
+ for nfa_id in &sparses.set1 {
+ epsilon_closure(
+ nfa,
+ nfa_id,
+ look_have,
+ stack,
+ &mut sparses.set2,
+ );
+ }
+ sparses.swap();
+ sparses.set2.clear();
+ }
+ }
+
+ // Convert our empty builder into one that can record assertions and match
+ // pattern IDs.
+ let mut builder = empty_builder.into_matches();
+ // Set whether the StartLine look-behind assertion is true for this
+ // transition or not. The look-behind assertion for ASCII word boundaries
+ // is handled below.
+ if nfa.has_any_anchor() {
+ if unit.as_u8().map_or(false, |b| b == b'\n') {
+ // Why only handle StartLine here and not StartText? That's
+ // because StartText can only impact the starting state, which
+ // is speical cased in start state handling.
+ builder.look_have().insert(Look::StartLine);
+ }
+ }
+ for nfa_id in &sparses.set1 {
+ match *nfa.state(nfa_id) {
+ thompson::State::Union { .. }
+ | thompson::State::Fail
+ | thompson::State::Look { .. }
+ | thompson::State::Capture { .. } => {}
+ thompson::State::Match { id } => {
+ // Notice here that we are calling the NEW state a match
+ // state if the OLD state we are transitioning from
+ // contains an NFA match state. This is precisely how we
+ // delay all matches by one byte and also what therefore
+ // guarantees that starting states cannot be match states.
+ //
+ // If we didn't delay matches by one byte, then whether
+ // a DFA is a matching state or not would be determined
+ // by whether one of its own constituent NFA states
+ // was a match state. (And that would be done in
+ // 'add_nfa_states'.)
+ //
+ // Also, 'add_match_pattern_id' requires that callers never
+ // pass duplicative pattern IDs. We do in fact uphold that
+ // guarantee here, but it's subtle. In particular, a Thompson
+ // NFA guarantees that each pattern has exactly one match
+ // state. Moreover, since we're iterating over the NFA state
+ // IDs in a set, we are guarateed not to have any duplicative
+ // match states. Thus, it is impossible to add the same pattern
+ // ID more than once.
+ builder.add_match_pattern_id(id);
+ if !match_kind.continue_past_first_match() {
+ break;
+ }
+ }
+ thompson::State::Range { range: ref r } => {
+ if r.matches_unit(unit) {
+ epsilon_closure(
+ nfa,
+ r.next,
+ *builder.look_have(),
+ stack,
+ &mut sparses.set2,
+ );
+ }
+ }
+ thompson::State::Sparse(ref sparse) => {
+ if let Some(next) = sparse.matches_unit(unit) {
+ epsilon_closure(
+ nfa,
+ next,
+ *builder.look_have(),
+ stack,
+ &mut sparses.set2,
+ );
+ }
+ }
+ }
+ }
+ // We only set the word byte if there's a word boundary look-around
+ // anywhere in this regex. Otherwise, there's no point in bloating the
+ // number of states if we don't have one.
+ //
+ // We also only set it when the state has a non-zero number of NFA states.
+ // Otherwise, we could wind up with states that *should* be DEAD states
+ // but are otherwise distinct from DEAD states because of this look-behind
+ // assertion being set. While this can't technically impact correctness *in
+ // theory*, it can create pathological DFAs that consume input until EOI or
+ // a quit byte is seen. Consuming until EOI isn't a correctness problem,
+ // but a (serious) perf problem. Hitting a quit byte, however, could be a
+ // correctness problem since it could cause search routines to report an
+ // error instead of a detected match once the quit state is entered. (The
+ // search routine could be made to be a bit smarter by reporting a match
+ // if one was detected once it enters a quit state (and indeed, the search
+ // routines in this crate do just that), but it seems better to prevent
+ // these things by construction if possible.)
+ if nfa.has_word_boundary()
+ && unit.is_word_byte()
+ && !sparses.set2.is_empty()
+ {
+ builder.set_is_from_word();
+ }
+ let mut builder_nfa = builder.into_nfa();
+ add_nfa_states(nfa, &sparses.set2, &mut builder_nfa);
+ builder_nfa
+}
+
+/// Compute the epsilon closure for the given NFA state. The epsilon closure
+/// consists of all NFA state IDs, including `start_nfa_id`, that can be
+/// reached from `start_nfa_id` without consuming any input. These state IDs
+/// are written to `set` in the order they are visited, but only if they are
+/// not already in `set`. `start_nfa_id` must be a valid state ID for the NFA
+/// given.
+///
+/// `look_have` consists of the satisfied assertions at the current
+/// position. For conditional look-around epsilon transitions, these are
+/// only followed if they are satisfied by `look_have`.
+///
+/// `stack` must have length 0. It is used as scratch space for depth first
+/// traversal. After returning, it is guaranteed that `stack` will have length
+/// 0.
+pub(crate) fn epsilon_closure(
+ nfa: &thompson::NFA,
+ start_nfa_id: StateID,
+ look_have: LookSet,
+ stack: &mut Vec<StateID>,
+ set: &mut SparseSet,
+) {
+ assert!(stack.is_empty());
+ // If this isn't an epsilon state, then the epsilon closure is always just
+ // itself, so there's no need to spin up the machinery below to handle it.
+ if !nfa.state(start_nfa_id).is_epsilon() {
+ set.insert(start_nfa_id);
+ return;
+ }
+
+ stack.push(start_nfa_id);
+ while let Some(mut id) = stack.pop() {
+ // In many cases, we can avoid stack operations when an NFA state only
+ // adds one new state to visit. In that case, we just set our ID to
+ // that state and mush on. We only use the stack when an NFA state
+ // introduces multiple new states to visit.
+ loop {
+ // Insert this NFA state, and if it's already in the set and thus
+ // already visited, then we can move on to the next one.
+ if !set.insert(id) {
+ break;
+ }
+ match *nfa.state(id) {
+ thompson::State::Range { .. }
+ | thompson::State::Sparse { .. }
+ | thompson::State::Fail
+ | thompson::State::Match { .. } => break,
+ thompson::State::Look { look, next } => {
+ if !look_have.contains(look) {
+ break;
+ }
+ id = next;
+ }
+ thompson::State::Union { ref alternates } => {
+ id = match alternates.get(0) {
+ None => break,
+ Some(&id) => id,
+ };
+ // We need to process our alternates in order to preserve
+ // match preferences, so put the earliest alternates closer
+ // to the top of the stack.
+ stack.extend(alternates[1..].iter().rev());
+ }
+ thompson::State::Capture { next, .. } => {
+ id = next;
+ }
+ }
+ }
+ }
+}
+
+/// Add the NFA state IDs in the given `set` to the given DFA builder state.
+/// The order in which states are added corresponds to the order in which they
+/// were added to `set`.
+///
+/// The DFA builder state given should already have its complete set of match
+/// pattern IDs added (if any) and any look-behind assertions (StartLine,
+/// StartText and whether this state is being generated for a transition over a
+/// word byte when applicable) that are true immediately prior to transitioning
+/// into this state (via `builder.look_have()`). The match pattern IDs should
+/// correspond to matches that occured on the previous transition, since all
+/// matches are delayed by one byte. The things that should _not_ be set are
+/// look-ahead assertions (EndLine, EndText and whether the next byte is a
+/// word byte or not). The builder state should also not have anything in
+/// `look_need` set, as this routine will compute that for you.
+///
+/// The given NFA should be able to resolve all identifiers in `set` to a
+/// particular NFA state. Additionally, `set` must have capacity equivalent
+/// to `nfa.len()`.
+pub(crate) fn add_nfa_states(
+ nfa: &thompson::NFA,
+ set: &SparseSet,
+ builder: &mut StateBuilderNFA,
+) {
+ for nfa_id in set {
+ match *nfa.state(nfa_id) {
+ thompson::State::Range { .. } => {
+ builder.add_nfa_state_id(nfa_id);
+ }
+ thompson::State::Sparse { .. } => {
+ builder.add_nfa_state_id(nfa_id);
+ }
+ thompson::State::Look { look, .. } => {
+ builder.add_nfa_state_id(nfa_id);
+ builder.look_need().insert(look);
+ }
+ thompson::State::Union { .. }
+ | thompson::State::Capture { .. } => {
+ // Pure epsilon transitions don't need to be tracked
+ // as part of the DFA state. Tracking them is actually
+ // superfluous; they won't cause any harm other than making
+ // determinization slower.
+ //
+ // Why aren't these needed? Well, in an NFA, epsilon
+ // transitions are really just jumping points to other
+ // states. So once you hit an epsilon transition, the same
+ // set of resulting states always appears. Therefore,
+ // putting them in a DFA's set of ordered NFA states is
+ // strictly redundant.
+ //
+ // Look-around states are also epsilon transitions, but
+ // they are *conditional*. So their presence could be
+ // discriminatory, and thus, they are tracked above.
+ //
+ // But wait... why are epsilon states in our `set` in the
+ // first place? Why not just leave them out? They're in
+ // our `set` because it was generated by computing an
+ // epsilon closure, and we want to keep track of all states
+ // we visited to avoid re-visiting them. In exchange, we
+ // have to do this second iteration over our collected
+ // states to finalize our DFA state.
+ //
+ // Note that this optimization requires that we re-compute
+ // the epsilon closure to account for look-ahead in 'next'
+ // *only when necessary*. Namely, only when the set of
+ // look-around assertions changes and only when those
+ // changes are within the set of assertions that are
+ // needed in order to step through the closure correctly.
+ // Otherwise, if we re-do the epsilon closure needlessly,
+ // it could change based on the fact that we are omitting
+ // epsilon states here.
+ }
+ thompson::State::Fail => {
+ break;
+ }
+ thompson::State::Match { .. } => {
+ // Normally, the NFA match state doesn't actually need to
+ // be inside the DFA state. But since we delay matches by
+ // one byte, the matching DFA state corresponds to states
+ // that transition from the one we're building here. And
+ // the way we detect those cases is by looking for an NFA
+ // match state. See 'next' for how this is handled.
+ builder.add_nfa_state_id(nfa_id);
+ }
+ }
+ }
+ // If we know this state contains no look-around assertions, then
+ // there's no reason to track which look-around assertions were
+ // satisfied when this state was created.
+ if builder.look_need().is_empty() {
+ builder.look_have().clear();
+ }
+}
+
+/// Sets the appropriate look-behind assertions on the given state based on
+/// this starting configuration.
+pub(crate) fn set_lookbehind_from_start(
+ start: &Start,
+ builder: &mut StateBuilderMatches,
+) {
+ match *start {
+ Start::NonWordByte => {}
+ Start::WordByte => {
+ builder.set_is_from_word();
+ }
+ Start::Text => {
+ builder.look_have().insert(Look::StartText);
+ builder.look_have().insert(Look::StartLine);
+ }
+ Start::Line => {
+ builder.look_have().insert(Look::StartLine);
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::Start;
+
+ #[test]
+ #[should_panic]
+ fn start_fwd_bad_range() {
+ Start::from_position_fwd(&[], 0, 1);
+ }
+
+ #[test]
+ #[should_panic]
+ fn start_rev_bad_range() {
+ Start::from_position_rev(&[], 0, 1);
+ }
+
+ #[test]
+ fn start_fwd() {
+ let f = Start::from_position_fwd;
+
+ assert_eq!(Start::Text, f(&[], 0, 0));
+ assert_eq!(Start::Text, f(b"abc", 0, 3));
+ assert_eq!(Start::Text, f(b"\nabc", 0, 3));
+
+ assert_eq!(Start::Line, f(b"\nabc", 1, 3));
+
+ assert_eq!(Start::WordByte, f(b"abc", 1, 3));
+
+ assert_eq!(Start::NonWordByte, f(b" abc", 1, 3));
+ }
+
+ #[test]
+ fn start_rev() {
+ let f = Start::from_position_rev;
+
+ assert_eq!(Start::Text, f(&[], 0, 0));
+ assert_eq!(Start::Text, f(b"abc", 0, 3));
+ assert_eq!(Start::Text, f(b"abc\n", 0, 4));
+
+ assert_eq!(Start::Line, f(b"abc\nz", 0, 3));
+
+ assert_eq!(Start::WordByte, f(b"abc", 0, 2));
+
+ assert_eq!(Start::NonWordByte, f(b"abc ", 0, 3));
+ }
+}