diff options
Diffstat (limited to 'vendor/regex-automata/src/util/determinize/mod.rs')
-rw-r--r-- | vendor/regex-automata/src/util/determinize/mod.rs | 389 |
1 files changed, 253 insertions, 136 deletions
diff --git a/vendor/regex-automata/src/util/determinize/mod.rs b/vendor/regex-automata/src/util/determinize/mod.rs index b384de8e1..30a82afb8 100644 --- a/vendor/regex-automata/src/util/determinize/mod.rs +++ b/vendor/regex-automata/src/util/determinize/mod.rs @@ -13,11 +13,9 @@ in common, as defined by this module: word boundaries, line boundaries, etc., is all the same. This also includes the look-behind assertions that are satisfied by each starting state classification. - * The representation of DFA states as sets of NFA states, including convenience types for building these DFA states that are amenable to reusing allocations. - * Routines for the "classical" parts of determinization: computing the epsilon closure, tracking match states (with corresponding pattern IDs, since we support multi-pattern finite automata) and, of course, computing the @@ -44,19 +42,21 @@ pub(crate) use self::state::{ }; use crate::{ - nfa::thompson::{self, Look, LookSet}, + nfa::thompson, util::{ alphabet, - id::StateID, - matchtypes::MatchKind, + look::{Look, LookSet}, + primitives::StateID, + search::MatchKind, sparse_set::{SparseSet, SparseSets}, start::Start, + utf8, }, }; mod state; -/// Compute the set of all eachable NFA states, including the full epsilon +/// Compute the set of all reachable NFA states, including the full epsilon /// closure, from a DFA state for a single unit of input. The set of reachable /// states is returned as a `StateBuilderNFA`. The `StateBuilderNFA` returned /// also includes any look-behind assertions satisfied by `unit`, in addition @@ -100,6 +100,15 @@ pub(crate) fn next( ) -> StateBuilderNFA { sparses.clear(); + // Whether the NFA is matched in reverse or not. We use this in some + // conditional logic for dealing with the exceptionally annoying CRLF-aware + // line anchors. + let rev = nfa.is_reverse(); + // The look-around matcher that our NFA is configured with. We don't + // actually use it to match look-around assertions, but we do need its + // configuration for constructing states consistent with how it matches. + let lookm = nfa.look_matcher(); + // Put the NFA state IDs into a sparse set in case we need to // re-compute their epsilon closure. // @@ -113,43 +122,66 @@ pub(crate) fn next( sparses.set1.insert(nfa_id); }); - // Compute look-ahead assertions originating from the current state. - // Based on the input unit we're transitioning over, some additional - // set of assertions may be true. Thus, we re-compute this state's - // epsilon closure (but only if necessary). + // Compute look-ahead assertions originating from the current state. Based + // on the input unit we're transitioning over, some additional set of + // assertions may be true. Thus, we re-compute this state's epsilon closure + // (but only if necessary). Notably, when we build a DFA state initially, + // we don't enable any look-ahead assertions because we don't know whether + // they're true or not at that point. if !state.look_need().is_empty() { // Add look-ahead assertions that are now true based on the current // input unit. let mut look_have = state.look_have().clone(); match unit.as_u8() { + Some(b'\r') => { + if !rev || !state.is_half_crlf() { + look_have = look_have.insert(Look::EndCRLF); + } + } Some(b'\n') => { - look_have.insert(Look::EndLine); + if rev || !state.is_half_crlf() { + look_have = look_have.insert(Look::EndCRLF); + } } Some(_) => {} None => { - look_have.insert(Look::EndText); - look_have.insert(Look::EndLine); + look_have = look_have.insert(Look::End); + look_have = look_have.insert(Look::EndLF); + look_have = look_have.insert(Look::EndCRLF); } } + if unit.is_byte(lookm.get_line_terminator()) { + look_have = look_have.insert(Look::EndLF); + } + if state.is_half_crlf() + && ((rev && !unit.is_byte(b'\r')) + || (!rev && !unit.is_byte(b'\n'))) + { + look_have = look_have.insert(Look::StartCRLF); + } if state.is_from_word() == unit.is_word_byte() { - look_have.insert(Look::WordBoundaryUnicodeNegate); - look_have.insert(Look::WordBoundaryAsciiNegate); + look_have = look_have.insert(Look::WordUnicodeNegate); + look_have = look_have.insert(Look::WordAsciiNegate); } else { - look_have.insert(Look::WordBoundaryUnicode); - look_have.insert(Look::WordBoundaryAscii); + look_have = look_have.insert(Look::WordUnicode); + look_have = look_have.insert(Look::WordAscii); } // If we have new assertions satisfied that are among the set of - // assertions that exist in this state (that is, just because - // we added an EndLine assertion above doesn't mean there is an - // EndLine conditional epsilon transition in this state), then we - // re-compute this state's epsilon closure using the updated set of - // assertions. + // assertions that exist in this state (that is, just because we added + // an EndLF assertion above doesn't mean there is an EndLF conditional + // epsilon transition in this state), then we re-compute this state's + // epsilon closure using the updated set of assertions. + // + // Note that since our DFA states omit unconditional epsilon + // transitions, this check is necessary for correctness. If we re-did + // the epsilon closure below needlessly, it could change based on the + // fact that we omitted epsilon states originally. if !look_have .subtract(state.look_have()) .intersect(state.look_need()) .is_empty() { - for nfa_id in &sparses.set1 { + for nfa_id in sparses.set1.iter() { epsilon_closure( nfa, nfa_id, @@ -166,24 +198,36 @@ pub(crate) fn next( // Convert our empty builder into one that can record assertions and match // pattern IDs. let mut builder = empty_builder.into_matches(); - // Set whether the StartLine look-behind assertion is true for this + // Set whether the StartLF look-behind assertion is true for this // transition or not. The look-behind assertion for ASCII word boundaries // is handled below. - if nfa.has_any_anchor() { - if unit.as_u8().map_or(false, |b| b == b'\n') { - // Why only handle StartLine here and not StartText? That's - // because StartText can only impact the starting state, which - // is speical cased in start state handling. - builder.look_have().insert(Look::StartLine); - } + if nfa.look_set_any().contains_anchor_line() + && unit.is_byte(lookm.get_line_terminator()) + { + // Why only handle StartLF here and not Start? That's because Start + // can only impact the starting state, which is special cased in + // start state handling. + builder.set_look_have(|have| have.insert(Look::StartLF)); + } + // We also need to add StartCRLF to our assertions too, if we can. This + // is unfortunately a bit more complicated, because it depends on the + // direction of the search. In the forward direction, ^ matches after a + // \n, but in the reverse direction, ^ only matches after a \r. (This is + // further complicated by the fact that reverse a regex means changing a ^ + // to a $ and vice versa.) + if nfa.look_set_any().contains_anchor_crlf() + && ((rev && unit.is_byte(b'\r')) || (!rev && unit.is_byte(b'\n'))) + { + builder.set_look_have(|have| have.insert(Look::StartCRLF)); } - for nfa_id in &sparses.set1 { + for nfa_id in sparses.set1.iter() { match *nfa.state(nfa_id) { thompson::State::Union { .. } + | thompson::State::BinaryUnion { .. } | thompson::State::Fail | thompson::State::Look { .. } | thompson::State::Capture { .. } => {} - thompson::State::Match { id } => { + thompson::State::Match { pattern_id } => { // Notice here that we are calling the NEW state a match // state if the OLD state we are transitioning from // contains an NFA match state. This is precisely how we @@ -204,17 +248,25 @@ pub(crate) fn next( // IDs in a set, we are guarateed not to have any duplicative // match states. Thus, it is impossible to add the same pattern // ID more than once. - builder.add_match_pattern_id(id); + // + // N.B. We delay matches by 1 byte as a way to hack 1-byte + // look-around into DFA searches. This lets us support ^, $ + // and ASCII-only \b. The delay is also why we need a special + // "end-of-input" (EOI) sentinel and why we need to follow the + // EOI sentinel at the end of every search. This final EOI + // transition is necessary to report matches found at the end + // of a haystack. + builder.add_match_pattern_id(pattern_id); if !match_kind.continue_past_first_match() { break; } } - thompson::State::Range { range: ref r } => { - if r.matches_unit(unit) { + thompson::State::ByteRange { ref trans } => { + if trans.matches_unit(unit) { epsilon_closure( nfa, - r.next, - *builder.look_have(), + trans.next, + builder.look_have(), stack, &mut sparses.set2, ); @@ -225,7 +277,18 @@ pub(crate) fn next( epsilon_closure( nfa, next, - *builder.look_have(), + builder.look_have(), + stack, + &mut sparses.set2, + ); + } + } + thompson::State::Dense(ref dense) => { + if let Some(next) = dense.matches_unit(unit) { + epsilon_closure( + nfa, + next, + builder.look_have(), stack, &mut sparses.set2, ); @@ -250,11 +313,15 @@ pub(crate) fn next( // if one was detected once it enters a quit state (and indeed, the search // routines in this crate do just that), but it seems better to prevent // these things by construction if possible.) - if nfa.has_word_boundary() - && unit.is_word_byte() - && !sparses.set2.is_empty() - { - builder.set_is_from_word(); + if !sparses.set2.is_empty() { + if nfa.look_set_any().contains_word() && unit.is_word_byte() { + builder.set_is_from_word(); + } + if nfa.look_set_any().contains_anchor_crlf() + && ((rev && unit.is_byte(b'\n')) || (!rev && unit.is_byte(b'\r'))) + { + builder.set_is_half_crlf(); + } } let mut builder_nfa = builder.into_nfa(); add_nfa_states(nfa, &sparses.set2, &mut builder_nfa); @@ -303,8 +370,9 @@ pub(crate) fn epsilon_closure( break; } match *nfa.state(id) { - thompson::State::Range { .. } + thompson::State::ByteRange { .. } | thompson::State::Sparse { .. } + | thompson::State::Dense { .. } | thompson::State::Fail | thompson::State::Match { .. } => break, thompson::State::Look { look, next } => { @@ -323,6 +391,10 @@ pub(crate) fn epsilon_closure( // to the top of the stack. stack.extend(alternates[1..].iter().rev()); } + thompson::State::BinaryUnion { alt1, alt2 } => { + id = alt1; + stack.push(alt2); + } thompson::State::Capture { next, .. } => { id = next; } @@ -336,15 +408,15 @@ pub(crate) fn epsilon_closure( /// were added to `set`. /// /// The DFA builder state given should already have its complete set of match -/// pattern IDs added (if any) and any look-behind assertions (StartLine, -/// StartText and whether this state is being generated for a transition over a -/// word byte when applicable) that are true immediately prior to transitioning -/// into this state (via `builder.look_have()`). The match pattern IDs should -/// correspond to matches that occured on the previous transition, since all -/// matches are delayed by one byte. The things that should _not_ be set are -/// look-ahead assertions (EndLine, EndText and whether the next byte is a -/// word byte or not). The builder state should also not have anything in -/// `look_need` set, as this routine will compute that for you. +/// pattern IDs added (if any) and any look-behind assertions (StartLF, Start +/// and whether this state is being generated for a transition over a word byte +/// when applicable) that are true immediately prior to transitioning into this +/// state (via `builder.look_have()`). The match pattern IDs should correspond +/// to matches that occurred on the previous transition, since all matches are +/// delayed by one byte. The things that should _not_ be set are look-ahead +/// assertions (EndLF, End and whether the next byte is a word byte or not). +/// The builder state should also not have anything in `look_need` set, as this +/// routine will compute that for you. /// /// The given NFA should be able to resolve all identifiers in `set` to a /// particular NFA state. Additionally, `set` must have capacity equivalent @@ -354,56 +426,114 @@ pub(crate) fn add_nfa_states( set: &SparseSet, builder: &mut StateBuilderNFA, ) { - for nfa_id in set { + for nfa_id in set.iter() { match *nfa.state(nfa_id) { - thompson::State::Range { .. } => { + thompson::State::ByteRange { .. } => { builder.add_nfa_state_id(nfa_id); } thompson::State::Sparse { .. } => { builder.add_nfa_state_id(nfa_id); } + thompson::State::Dense { .. } => { + builder.add_nfa_state_id(nfa_id); + } thompson::State::Look { look, .. } => { builder.add_nfa_state_id(nfa_id); - builder.look_need().insert(look); + builder.set_look_need(|need| need.insert(look)); } thompson::State::Union { .. } - | thompson::State::Capture { .. } => { - // Pure epsilon transitions don't need to be tracked - // as part of the DFA state. Tracking them is actually - // superfluous; they won't cause any harm other than making - // determinization slower. + | thompson::State::BinaryUnion { .. } => { + // Pure epsilon transitions don't need to be tracked as part + // of the DFA state. Tracking them is actually superfluous; + // they won't cause any harm other than making determinization + // slower. // // Why aren't these needed? Well, in an NFA, epsilon - // transitions are really just jumping points to other - // states. So once you hit an epsilon transition, the same - // set of resulting states always appears. Therefore, - // putting them in a DFA's set of ordered NFA states is - // strictly redundant. + // transitions are really just jumping points to other states. + // So once you hit an epsilon transition, the same set of + // resulting states always appears. Therefore, putting them in + // a DFA's set of ordered NFA states is strictly redundant. // // Look-around states are also epsilon transitions, but // they are *conditional*. So their presence could be // discriminatory, and thus, they are tracked above. // - // But wait... why are epsilon states in our `set` in the - // first place? Why not just leave them out? They're in - // our `set` because it was generated by computing an - // epsilon closure, and we want to keep track of all states - // we visited to avoid re-visiting them. In exchange, we - // have to do this second iteration over our collected - // states to finalize our DFA state. + // But wait... why are epsilon states in our `set` in the first + // place? Why not just leave them out? They're in our `set` + // because it was generated by computing an epsilon closure, + // and we want to keep track of all states we visited to avoid + // re-visiting them. In exchange, we have to do this second + // iteration over our collected states to finalize our DFA + // state. In theory, we could avoid this second iteration if + // we maintained two sets during epsilon closure: the set of + // visited states (to avoid cycles) and the set of states that + // will actually be used to construct the next DFA state. + // + // Note that this optimization requires that we re-compute the + // epsilon closure to account for look-ahead in 'next' *only + // when necessary*. Namely, only when the set of look-around + // assertions changes and only when those changes are within + // the set of assertions that are needed in order to step + // through the closure correctly. Otherwise, if we re-do the + // epsilon closure needlessly, it could change based on the + // fact that we are omitting epsilon states here. + // + // ----- + // + // Welp, scratch the above. It turns out that recording these + // is in fact necessary to seemingly handle one particularly + // annoying case: when a conditional epsilon transition is + // put inside of a repetition operator. One specific case I + // ran into was the regex `(?:\b|%)+` on the haystack `z%`. + // The correct leftmost first matches are: [0, 0] and [1, 1]. + // But the DFA was reporting [0, 0] and [1, 2]. To understand + // why this happens, consider the NFA for the aforementioned + // regex: // - // Note that this optimization requires that we re-compute - // the epsilon closure to account for look-ahead in 'next' - // *only when necessary*. Namely, only when the set of - // look-around assertions changes and only when those - // changes are within the set of assertions that are - // needed in order to step through the closure correctly. - // Otherwise, if we re-do the epsilon closure needlessly, - // it could change based on the fact that we are omitting - // epsilon states here. + // >000000: binary-union(4, 1) + // 000001: \x00-\xFF => 0 + // 000002: WordAscii => 5 + // 000003: % => 5 + // ^000004: binary-union(2, 3) + // 000005: binary-union(4, 6) + // 000006: MATCH(0) + // + // The problem here is that one of the DFA start states is + // going to consist of the NFA states [2, 3] by computing the + // epsilon closure of state 4. State 4 isn't included because + // we previously were not keeping track of union states. But + // only a subset of transitions out of this state will be able + // to follow WordAscii, and in those cases, the epsilon closure + // is redone. The only problem is that computing the epsilon + // closure from [2, 3] is different than computing the epsilon + // closure from [4]. In the former case, assuming the WordAscii + // assertion is satisfied, you get: [2, 3, 6]. In the latter + // case, you get: [2, 6, 3]. Notice that '6' is the match state + // and appears AFTER '3' in the former case. This leads to a + // preferential but incorrect match of '%' before returning + // a match. In the latter case, the match is preferred over + // continuing to accept the '%'. + // + // It almost feels like we might be able to fix the NFA states + // to avoid this, or to at least only keep track of union + // states where this actually matters, since in the vast + // majority of cases, this doesn't matter. + // + // Another alternative would be to define a new HIR property + // called "assertion is repeated anywhere" and compute it + // inductively over the entire pattern. If it happens anywhere, + // which is probably pretty rare, then we record union states. + // Otherwise we don't. + builder.add_nfa_state_id(nfa_id); } + // Capture states we definitely do not need to record, since they + // are unconditional epsilon transitions with no branching. + thompson::State::Capture { .. } => {} + // It's not totally clear whether we need to record fail states or + // not, but we do so out of an abundance of caution. Since they are + // quite rare in practice, there isn't much cost to recording them. thompson::State::Fail => { - break; + builder.add_nfa_state_id(nfa_id); } thompson::State::Match { .. } => { // Normally, the NFA match state doesn't actually need to @@ -420,74 +550,61 @@ pub(crate) fn add_nfa_states( // there's no reason to track which look-around assertions were // satisfied when this state was created. if builder.look_need().is_empty() { - builder.look_have().clear(); + builder.set_look_have(|_| LookSet::empty()); } } /// Sets the appropriate look-behind assertions on the given state based on /// this starting configuration. pub(crate) fn set_lookbehind_from_start( + nfa: &thompson::NFA, start: &Start, builder: &mut StateBuilderMatches, ) { + let rev = nfa.is_reverse(); + let lineterm = nfa.look_matcher().get_line_terminator(); match *start { Start::NonWordByte => {} Start::WordByte => { builder.set_is_from_word(); } Start::Text => { - builder.look_have().insert(Look::StartText); - builder.look_have().insert(Look::StartLine); + builder.set_look_have(|have| { + have.insert(Look::Start) + .insert(Look::StartLF) + .insert(Look::StartCRLF) + }); } - Start::Line => { - builder.look_have().insert(Look::StartLine); + Start::LineLF => { + if rev { + builder.set_is_half_crlf(); + builder.set_look_have(|have| have.insert(Look::StartLF)); + } else { + builder.set_look_have(|have| have.insert(Look::StartCRLF)); + } + if lineterm == b'\n' { + builder.set_look_have(|have| have.insert(Look::StartLF)); + } + } + Start::LineCR => { + if rev { + builder.set_look_have(|have| have.insert(Look::StartCRLF)); + } else { + builder.set_is_half_crlf(); + } + if lineterm == b'\r' { + builder.set_look_have(|have| have.insert(Look::StartLF)); + } + } + Start::CustomLineTerminator => { + builder.set_look_have(|have| have.insert(Look::StartLF)); + // This is a bit of a tricky case, but if the line terminator was + // set to a word byte, then we also need to behave as if the start + // configuration is Start::WordByte. That is, we need to mark our + // state as having come from a word byte. + if utf8::is_word_byte(lineterm) { + builder.set_is_from_word(); + } } - } -} - -#[cfg(test)] -mod tests { - use super::Start; - - #[test] - #[should_panic] - fn start_fwd_bad_range() { - Start::from_position_fwd(&[], 0, 1); - } - - #[test] - #[should_panic] - fn start_rev_bad_range() { - Start::from_position_rev(&[], 0, 1); - } - - #[test] - fn start_fwd() { - let f = Start::from_position_fwd; - - assert_eq!(Start::Text, f(&[], 0, 0)); - assert_eq!(Start::Text, f(b"abc", 0, 3)); - assert_eq!(Start::Text, f(b"\nabc", 0, 3)); - - assert_eq!(Start::Line, f(b"\nabc", 1, 3)); - - assert_eq!(Start::WordByte, f(b"abc", 1, 3)); - - assert_eq!(Start::NonWordByte, f(b" abc", 1, 3)); - } - - #[test] - fn start_rev() { - let f = Start::from_position_rev; - - assert_eq!(Start::Text, f(&[], 0, 0)); - assert_eq!(Start::Text, f(b"abc", 0, 3)); - assert_eq!(Start::Text, f(b"abc\n", 0, 4)); - - assert_eq!(Start::Line, f(b"abc\nz", 0, 3)); - - assert_eq!(Start::WordByte, f(b"abc", 0, 2)); - - assert_eq!(Start::NonWordByte, f(b"abc ", 0, 3)); } } |