diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-30 18:31:36 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-30 18:31:36 +0000 |
commit | e02c5b5930c2c9ba3e5423fe12e2ef0155017297 (patch) | |
tree | fd60ebbbb5299e16e5fca8c773ddb74f764760db /vendor/regex-automata-0.2.0/src/util/determinize/mod.rs | |
parent | Adding debian version 1.73.0+dfsg1-1. (diff) | |
download | rustc-e02c5b5930c2c9ba3e5423fe12e2ef0155017297.tar.xz rustc-e02c5b5930c2c9ba3e5423fe12e2ef0155017297.zip |
Merging upstream version 1.74.1+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/regex-automata-0.2.0/src/util/determinize/mod.rs')
-rw-r--r-- | vendor/regex-automata-0.2.0/src/util/determinize/mod.rs | 493 |
1 files changed, 493 insertions, 0 deletions
diff --git a/vendor/regex-automata-0.2.0/src/util/determinize/mod.rs b/vendor/regex-automata-0.2.0/src/util/determinize/mod.rs new file mode 100644 index 000000000..b384de8e1 --- /dev/null +++ b/vendor/regex-automata-0.2.0/src/util/determinize/mod.rs @@ -0,0 +1,493 @@ +/*! +This module contains types and routines for implementing determinization. + +In this crate, there are at least two places where we implement +determinization: fully ahead-of-time compiled DFAs in the `dfa` module and +lazily compiled DFAs in the `hybrid` module. The stuff in this module +corresponds to the things that are in common between these implementations. + +There are three broad things that our implementations of determinization have +in common, as defined by this module: + +* The classification of start states. That is, whether we're dealing with +word boundaries, line boundaries, etc., is all the same. This also includes +the look-behind assertions that are satisfied by each starting state +classification. + +* The representation of DFA states as sets of NFA states, including +convenience types for building these DFA states that are amenable to reusing +allocations. + +* Routines for the "classical" parts of determinization: computing the +epsilon closure, tracking match states (with corresponding pattern IDs, since +we support multi-pattern finite automata) and, of course, computing the +transition function between states for units of input. + +I did consider a couple of alternatives to this particular form of code reuse: + +1. Don't do any code reuse. The problem here is that we *really* want both +forms of determinization to do exactly identical things when it comes to +their handling of NFA states. While our tests generally ensure this, the code +is tricky and large enough where not reusing code is a pretty big bummer. + +2. Implement all of determinization once and make it generic over fully +compiled DFAs and lazily compiled DFAs. While I didn't actually try this +approach, my instinct is that it would be more complex than is needed here. +And the interface required would be pretty hairy. Instead, I think splitting +it into logical sub-components works better. +*/ + +use alloc::vec::Vec; + +pub(crate) use self::state::{ + State, StateBuilderEmpty, StateBuilderMatches, StateBuilderNFA, +}; + +use crate::{ + nfa::thompson::{self, Look, LookSet}, + util::{ + alphabet, + id::StateID, + matchtypes::MatchKind, + sparse_set::{SparseSet, SparseSets}, + start::Start, + }, +}; + +mod state; + +/// Compute the set of all eachable NFA states, including the full epsilon +/// closure, from a DFA state for a single unit of input. The set of reachable +/// states is returned as a `StateBuilderNFA`. The `StateBuilderNFA` returned +/// also includes any look-behind assertions satisfied by `unit`, in addition +/// to whether it is a match state. For multi-pattern DFAs, the builder will +/// also include the pattern IDs that match (in the order seen). +/// +/// `nfa` must be able to resolve any NFA state in `state` and any NFA state +/// reachable via the epsilon closure of any NFA state in `state`. `sparses` +/// must have capacity equivalent to `nfa.len()`. +/// +/// `match_kind` should correspond to the match semantics implemented by the +/// DFA being built. Generally speaking, for leftmost-first match semantics, +/// states that appear after the first NFA match state will not be included in +/// the `StateBuilderNFA` returned since they are impossible to visit. +/// +/// `sparses` is used as scratch space for NFA traversal. Other than their +/// capacity requirements (detailed above), there are no requirements on what's +/// contained within them (if anything). Similarly, what's inside of them once +/// this routine returns is unspecified. +/// +/// `stack` must have length 0. It is used as scratch space for depth first +/// traversal. After returning, it is guaranteed that `stack` will have length +/// 0. +/// +/// `state` corresponds to the current DFA state on which one wants to compute +/// the transition for the input `unit`. +/// +/// `empty_builder` corresponds to the builder allocation to use to produce a +/// complete `StateBuilderNFA` state. If the state is not needed (or is already +/// cached), then it can be cleared and reused without needing to create a new +/// `State`. The `StateBuilderNFA` state returned is final and ready to be +/// turned into a `State` if necessary. +pub(crate) fn next( + nfa: &thompson::NFA, + match_kind: MatchKind, + sparses: &mut SparseSets, + stack: &mut Vec<StateID>, + state: &State, + unit: alphabet::Unit, + empty_builder: StateBuilderEmpty, +) -> StateBuilderNFA { + sparses.clear(); + + // Put the NFA state IDs into a sparse set in case we need to + // re-compute their epsilon closure. + // + // Doing this state shuffling is technically not necessary unless some + // kind of look-around is used in the DFA. Some ad hoc experiments + // suggested that avoiding this didn't lead to much of an improvement, + // but perhaps more rigorous experimentation should be done. And in + // particular, avoiding this check requires some light refactoring of + // the code below. + state.iter_nfa_state_ids(|nfa_id| { + sparses.set1.insert(nfa_id); + }); + + // Compute look-ahead assertions originating from the current state. + // Based on the input unit we're transitioning over, some additional + // set of assertions may be true. Thus, we re-compute this state's + // epsilon closure (but only if necessary). + if !state.look_need().is_empty() { + // Add look-ahead assertions that are now true based on the current + // input unit. + let mut look_have = state.look_have().clone(); + match unit.as_u8() { + Some(b'\n') => { + look_have.insert(Look::EndLine); + } + Some(_) => {} + None => { + look_have.insert(Look::EndText); + look_have.insert(Look::EndLine); + } + } + if state.is_from_word() == unit.is_word_byte() { + look_have.insert(Look::WordBoundaryUnicodeNegate); + look_have.insert(Look::WordBoundaryAsciiNegate); + } else { + look_have.insert(Look::WordBoundaryUnicode); + look_have.insert(Look::WordBoundaryAscii); + } + // If we have new assertions satisfied that are among the set of + // assertions that exist in this state (that is, just because + // we added an EndLine assertion above doesn't mean there is an + // EndLine conditional epsilon transition in this state), then we + // re-compute this state's epsilon closure using the updated set of + // assertions. + if !look_have + .subtract(state.look_have()) + .intersect(state.look_need()) + .is_empty() + { + for nfa_id in &sparses.set1 { + epsilon_closure( + nfa, + nfa_id, + look_have, + stack, + &mut sparses.set2, + ); + } + sparses.swap(); + sparses.set2.clear(); + } + } + + // Convert our empty builder into one that can record assertions and match + // pattern IDs. + let mut builder = empty_builder.into_matches(); + // Set whether the StartLine look-behind assertion is true for this + // transition or not. The look-behind assertion for ASCII word boundaries + // is handled below. + if nfa.has_any_anchor() { + if unit.as_u8().map_or(false, |b| b == b'\n') { + // Why only handle StartLine here and not StartText? That's + // because StartText can only impact the starting state, which + // is speical cased in start state handling. + builder.look_have().insert(Look::StartLine); + } + } + for nfa_id in &sparses.set1 { + match *nfa.state(nfa_id) { + thompson::State::Union { .. } + | thompson::State::Fail + | thompson::State::Look { .. } + | thompson::State::Capture { .. } => {} + thompson::State::Match { id } => { + // Notice here that we are calling the NEW state a match + // state if the OLD state we are transitioning from + // contains an NFA match state. This is precisely how we + // delay all matches by one byte and also what therefore + // guarantees that starting states cannot be match states. + // + // If we didn't delay matches by one byte, then whether + // a DFA is a matching state or not would be determined + // by whether one of its own constituent NFA states + // was a match state. (And that would be done in + // 'add_nfa_states'.) + // + // Also, 'add_match_pattern_id' requires that callers never + // pass duplicative pattern IDs. We do in fact uphold that + // guarantee here, but it's subtle. In particular, a Thompson + // NFA guarantees that each pattern has exactly one match + // state. Moreover, since we're iterating over the NFA state + // IDs in a set, we are guarateed not to have any duplicative + // match states. Thus, it is impossible to add the same pattern + // ID more than once. + builder.add_match_pattern_id(id); + if !match_kind.continue_past_first_match() { + break; + } + } + thompson::State::Range { range: ref r } => { + if r.matches_unit(unit) { + epsilon_closure( + nfa, + r.next, + *builder.look_have(), + stack, + &mut sparses.set2, + ); + } + } + thompson::State::Sparse(ref sparse) => { + if let Some(next) = sparse.matches_unit(unit) { + epsilon_closure( + nfa, + next, + *builder.look_have(), + stack, + &mut sparses.set2, + ); + } + } + } + } + // We only set the word byte if there's a word boundary look-around + // anywhere in this regex. Otherwise, there's no point in bloating the + // number of states if we don't have one. + // + // We also only set it when the state has a non-zero number of NFA states. + // Otherwise, we could wind up with states that *should* be DEAD states + // but are otherwise distinct from DEAD states because of this look-behind + // assertion being set. While this can't technically impact correctness *in + // theory*, it can create pathological DFAs that consume input until EOI or + // a quit byte is seen. Consuming until EOI isn't a correctness problem, + // but a (serious) perf problem. Hitting a quit byte, however, could be a + // correctness problem since it could cause search routines to report an + // error instead of a detected match once the quit state is entered. (The + // search routine could be made to be a bit smarter by reporting a match + // if one was detected once it enters a quit state (and indeed, the search + // routines in this crate do just that), but it seems better to prevent + // these things by construction if possible.) + if nfa.has_word_boundary() + && unit.is_word_byte() + && !sparses.set2.is_empty() + { + builder.set_is_from_word(); + } + let mut builder_nfa = builder.into_nfa(); + add_nfa_states(nfa, &sparses.set2, &mut builder_nfa); + builder_nfa +} + +/// Compute the epsilon closure for the given NFA state. The epsilon closure +/// consists of all NFA state IDs, including `start_nfa_id`, that can be +/// reached from `start_nfa_id` without consuming any input. These state IDs +/// are written to `set` in the order they are visited, but only if they are +/// not already in `set`. `start_nfa_id` must be a valid state ID for the NFA +/// given. +/// +/// `look_have` consists of the satisfied assertions at the current +/// position. For conditional look-around epsilon transitions, these are +/// only followed if they are satisfied by `look_have`. +/// +/// `stack` must have length 0. It is used as scratch space for depth first +/// traversal. After returning, it is guaranteed that `stack` will have length +/// 0. +pub(crate) fn epsilon_closure( + nfa: &thompson::NFA, + start_nfa_id: StateID, + look_have: LookSet, + stack: &mut Vec<StateID>, + set: &mut SparseSet, +) { + assert!(stack.is_empty()); + // If this isn't an epsilon state, then the epsilon closure is always just + // itself, so there's no need to spin up the machinery below to handle it. + if !nfa.state(start_nfa_id).is_epsilon() { + set.insert(start_nfa_id); + return; + } + + stack.push(start_nfa_id); + while let Some(mut id) = stack.pop() { + // In many cases, we can avoid stack operations when an NFA state only + // adds one new state to visit. In that case, we just set our ID to + // that state and mush on. We only use the stack when an NFA state + // introduces multiple new states to visit. + loop { + // Insert this NFA state, and if it's already in the set and thus + // already visited, then we can move on to the next one. + if !set.insert(id) { + break; + } + match *nfa.state(id) { + thompson::State::Range { .. } + | thompson::State::Sparse { .. } + | thompson::State::Fail + | thompson::State::Match { .. } => break, + thompson::State::Look { look, next } => { + if !look_have.contains(look) { + break; + } + id = next; + } + thompson::State::Union { ref alternates } => { + id = match alternates.get(0) { + None => break, + Some(&id) => id, + }; + // We need to process our alternates in order to preserve + // match preferences, so put the earliest alternates closer + // to the top of the stack. + stack.extend(alternates[1..].iter().rev()); + } + thompson::State::Capture { next, .. } => { + id = next; + } + } + } + } +} + +/// Add the NFA state IDs in the given `set` to the given DFA builder state. +/// The order in which states are added corresponds to the order in which they +/// were added to `set`. +/// +/// The DFA builder state given should already have its complete set of match +/// pattern IDs added (if any) and any look-behind assertions (StartLine, +/// StartText and whether this state is being generated for a transition over a +/// word byte when applicable) that are true immediately prior to transitioning +/// into this state (via `builder.look_have()`). The match pattern IDs should +/// correspond to matches that occured on the previous transition, since all +/// matches are delayed by one byte. The things that should _not_ be set are +/// look-ahead assertions (EndLine, EndText and whether the next byte is a +/// word byte or not). The builder state should also not have anything in +/// `look_need` set, as this routine will compute that for you. +/// +/// The given NFA should be able to resolve all identifiers in `set` to a +/// particular NFA state. Additionally, `set` must have capacity equivalent +/// to `nfa.len()`. +pub(crate) fn add_nfa_states( + nfa: &thompson::NFA, + set: &SparseSet, + builder: &mut StateBuilderNFA, +) { + for nfa_id in set { + match *nfa.state(nfa_id) { + thompson::State::Range { .. } => { + builder.add_nfa_state_id(nfa_id); + } + thompson::State::Sparse { .. } => { + builder.add_nfa_state_id(nfa_id); + } + thompson::State::Look { look, .. } => { + builder.add_nfa_state_id(nfa_id); + builder.look_need().insert(look); + } + thompson::State::Union { .. } + | thompson::State::Capture { .. } => { + // Pure epsilon transitions don't need to be tracked + // as part of the DFA state. Tracking them is actually + // superfluous; they won't cause any harm other than making + // determinization slower. + // + // Why aren't these needed? Well, in an NFA, epsilon + // transitions are really just jumping points to other + // states. So once you hit an epsilon transition, the same + // set of resulting states always appears. Therefore, + // putting them in a DFA's set of ordered NFA states is + // strictly redundant. + // + // Look-around states are also epsilon transitions, but + // they are *conditional*. So their presence could be + // discriminatory, and thus, they are tracked above. + // + // But wait... why are epsilon states in our `set` in the + // first place? Why not just leave them out? They're in + // our `set` because it was generated by computing an + // epsilon closure, and we want to keep track of all states + // we visited to avoid re-visiting them. In exchange, we + // have to do this second iteration over our collected + // states to finalize our DFA state. + // + // Note that this optimization requires that we re-compute + // the epsilon closure to account for look-ahead in 'next' + // *only when necessary*. Namely, only when the set of + // look-around assertions changes and only when those + // changes are within the set of assertions that are + // needed in order to step through the closure correctly. + // Otherwise, if we re-do the epsilon closure needlessly, + // it could change based on the fact that we are omitting + // epsilon states here. + } + thompson::State::Fail => { + break; + } + thompson::State::Match { .. } => { + // Normally, the NFA match state doesn't actually need to + // be inside the DFA state. But since we delay matches by + // one byte, the matching DFA state corresponds to states + // that transition from the one we're building here. And + // the way we detect those cases is by looking for an NFA + // match state. See 'next' for how this is handled. + builder.add_nfa_state_id(nfa_id); + } + } + } + // If we know this state contains no look-around assertions, then + // there's no reason to track which look-around assertions were + // satisfied when this state was created. + if builder.look_need().is_empty() { + builder.look_have().clear(); + } +} + +/// Sets the appropriate look-behind assertions on the given state based on +/// this starting configuration. +pub(crate) fn set_lookbehind_from_start( + start: &Start, + builder: &mut StateBuilderMatches, +) { + match *start { + Start::NonWordByte => {} + Start::WordByte => { + builder.set_is_from_word(); + } + Start::Text => { + builder.look_have().insert(Look::StartText); + builder.look_have().insert(Look::StartLine); + } + Start::Line => { + builder.look_have().insert(Look::StartLine); + } + } +} + +#[cfg(test)] +mod tests { + use super::Start; + + #[test] + #[should_panic] + fn start_fwd_bad_range() { + Start::from_position_fwd(&[], 0, 1); + } + + #[test] + #[should_panic] + fn start_rev_bad_range() { + Start::from_position_rev(&[], 0, 1); + } + + #[test] + fn start_fwd() { + let f = Start::from_position_fwd; + + assert_eq!(Start::Text, f(&[], 0, 0)); + assert_eq!(Start::Text, f(b"abc", 0, 3)); + assert_eq!(Start::Text, f(b"\nabc", 0, 3)); + + assert_eq!(Start::Line, f(b"\nabc", 1, 3)); + + assert_eq!(Start::WordByte, f(b"abc", 1, 3)); + + assert_eq!(Start::NonWordByte, f(b" abc", 1, 3)); + } + + #[test] + fn start_rev() { + let f = Start::from_position_rev; + + assert_eq!(Start::Text, f(&[], 0, 0)); + assert_eq!(Start::Text, f(b"abc", 0, 3)); + assert_eq!(Start::Text, f(b"abc\n", 0, 4)); + + assert_eq!(Start::Line, f(b"abc\nz", 0, 3)); + + assert_eq!(Start::WordByte, f(b"abc", 0, 2)); + + assert_eq!(Start::NonWordByte, f(b"abc ", 0, 3)); + } +} |