1 files changed, 253 insertions, 136 deletions
diff --git a/vendor/regex-automata/src/util/determinize/mod.rs b/vendor/regex-automata/src/util/determinize/mod.rs
index b384de8e1..30a82afb8 100644
--- a/vendor/regex-automata/src/util/determinize/mod.rs
+++ b/vendor/regex-automata/src/util/determinize/mod.rs
@@ -13,11 +13,9 @@ in common, as defined by this module:
 word boundaries, line boundaries, etc., is all the same. This also includes
 the look-behind assertions that are satisfied by each starting state
 classification.
-
 * The representation of DFA states as sets of NFA states, including
 convenience types for building these DFA states that are amenable to reusing
 allocations.
-
 * Routines for the "classical" parts of determinization: computing the
 epsilon closure, tracking match states (with corresponding pattern IDs, since
 we support multi-pattern finite automata) and, of course, computing the
@@ -44,19 +42,21 @@ pub(crate) use self::state::{
 };
 
 use crate::{
-    nfa::thompson::{self, Look, LookSet},
+    nfa::thompson,
     util::{
         alphabet,
-        id::StateID,
-        matchtypes::MatchKind,
+        look::{Look, LookSet},
+        primitives::StateID,
+        search::MatchKind,
         sparse_set::{SparseSet, SparseSets},
         start::Start,
+        utf8,
     },
 };
 
 mod state;
 
-/// Compute the set of all eachable NFA states, including the full epsilon
+/// Compute the set of all reachable NFA states, including the full epsilon
 /// closure, from a DFA state for a single unit of input. The set of reachable
 /// states is returned as a `StateBuilderNFA`. The `StateBuilderNFA` returned
 /// also includes any look-behind assertions satisfied by `unit`, in addition
@@ -100,6 +100,15 @@ pub(crate) fn next(
 ) -> StateBuilderNFA {
     sparses.clear();
 
+    // Whether the NFA is matched in reverse or not. We use this in some
+    // conditional logic for dealing with the exceptionally annoying CRLF-aware
+    // line anchors.
+    let rev = nfa.is_reverse();
+    // The look-around matcher that our NFA is configured with. We don't
+    // actually use it to match look-around assertions, but we do need its
+    // configuration for constructing states consistent with how it matches.
+    let lookm = nfa.look_matcher();
+
     // Put the NFA state IDs into a sparse set in case we need to
     // re-compute their epsilon closure.
     //
@@ -113,43 +122,66 @@ pub(crate) fn next(
         sparses.set1.insert(nfa_id);
     });
 
-    // Compute look-ahead assertions originating from the current state.
-    // Based on the input unit we're transitioning over, some additional
-    // set of assertions may be true. Thus, we re-compute this state's
-    // epsilon closure (but only if necessary).
+    // Compute look-ahead assertions originating from the current state. Based
+    // on the input unit we're transitioning over, some additional set of
+    // assertions may be true. Thus, we re-compute this state's epsilon closure
+    // (but only if necessary). Notably, when we build a DFA state initially,
+    // we don't enable any look-ahead assertions because we don't know whether
+    // they're true or not at that point.
     if !state.look_need().is_empty() {
         // Add look-ahead assertions that are now true based on the current
         // input unit.
         let mut look_have = state.look_have().clone();
         match unit.as_u8() {
+            Some(b'\r') => {
+                if !rev || !state.is_half_crlf() {
+                    look_have = look_have.insert(Look::EndCRLF);
+                }
+            }
             Some(b'\n') => {
-                look_have.insert(Look::EndLine);
+                if rev || !state.is_half_crlf() {
+                    look_have = look_have.insert(Look::EndCRLF);
+                }
             }
             Some(_) => {}
             None => {
-                look_have.insert(Look::EndText);
-                look_have.insert(Look::EndLine);
+                look_have = look_have.insert(Look::End);
+                look_have = look_have.insert(Look::EndLF);
+                look_have = look_have.insert(Look::EndCRLF);
             }
         }
+        if unit.is_byte(lookm.get_line_terminator()) {
+            look_have = look_have.insert(Look::EndLF);
+        }
+        if state.is_half_crlf()
+            && ((rev && !unit.is_byte(b'\r'))
+                || (!rev && !unit.is_byte(b'\n')))
+        {
+            look_have = look_have.insert(Look::StartCRLF);
+        }
         if state.is_from_word() == unit.is_word_byte() {
-            look_have.insert(Look::WordBoundaryUnicodeNegate);
-            look_have.insert(Look::WordBoundaryAsciiNegate);
+            look_have = look_have.insert(Look::WordUnicodeNegate);
+            look_have = look_have.insert(Look::WordAsciiNegate);
         } else {
-            look_have.insert(Look::WordBoundaryUnicode);
-            look_have.insert(Look::WordBoundaryAscii);
+            look_have = look_have.insert(Look::WordUnicode);
+            look_have = look_have.insert(Look::WordAscii);
         }
         // If we have new assertions satisfied that are among the set of
-        // assertions that exist in this state (that is, just because
-        // we added an EndLine assertion above doesn't mean there is an
-        // EndLine conditional epsilon transition in this state), then we
-        // re-compute this state's epsilon closure using the updated set of
-        // assertions.
+        // assertions that exist in this state (that is, just because we added
+        // an EndLF assertion above doesn't mean there is an EndLF conditional
+        // epsilon transition in this state), then we re-compute this state's
+        // epsilon closure using the updated set of assertions.
+        //
+        // Note that since our DFA states omit unconditional epsilon
+        // transitions, this check is necessary for correctness. If we re-did
+        // the epsilon closure below needlessly, it could change based on the
+        // fact that we omitted epsilon states originally.
         if !look_have
             .subtract(state.look_have())
             .intersect(state.look_need())
             .is_empty()
         {
-            for nfa_id in &sparses.set1 {
+            for nfa_id in sparses.set1.iter() {
                 epsilon_closure(
                     nfa,
                     nfa_id,
@@ -166,24 +198,36 @@ pub(crate) fn next(
     // Convert our empty builder into one that can record assertions and match
     // pattern IDs.
     let mut builder = empty_builder.into_matches();
-    // Set whether the StartLine look-behind assertion is true for this
+    // Set whether the StartLF look-behind assertion is true for this
     // transition or not. The look-behind assertion for ASCII word boundaries
     // is handled below.
-    if nfa.has_any_anchor() {
-        if unit.as_u8().map_or(false, |b| b == b'\n') {
-            // Why only handle StartLine here and not StartText? That's
-            // because StartText can only impact the starting state, which
-            // is speical cased in start state handling.
-            builder.look_have().insert(Look::StartLine);
-        }
+    if nfa.look_set_any().contains_anchor_line()
+        && unit.is_byte(lookm.get_line_terminator())
+    {
+        // Why only handle StartLF here and not Start? That's because Start
+        // can only impact the starting state, which is special cased in
+        // start state handling.
+        builder.set_look_have(|have| have.insert(Look::StartLF));
+    }
+    // We also need to add StartCRLF to our assertions too, if we can. This
+    // is unfortunately a bit more complicated, because it depends on the
+    // direction of the search. In the forward direction, ^ matches after a
+    // \n, but in the reverse direction, ^ only matches after a \r. (This is
+    // further complicated by the fact that reverse a regex means changing a ^
+    // to a $ and vice versa.)
+    if nfa.look_set_any().contains_anchor_crlf()
+        && ((rev && unit.is_byte(b'\r')) || (!rev && unit.is_byte(b'\n')))
+    {
+        builder.set_look_have(|have| have.insert(Look::StartCRLF));
     }
-    for nfa_id in &sparses.set1 {
+    for nfa_id in sparses.set1.iter() {
         match *nfa.state(nfa_id) {
             thompson::State::Union { .. }
+            | thompson::State::BinaryUnion { .. }
             | thompson::State::Fail
             | thompson::State::Look { .. }
             | thompson::State::Capture { .. } => {}
-            thompson::State::Match { id } => {
+            thompson::State::Match { pattern_id } => {
                 // Notice here that we are calling the NEW state a match
                 // state if the OLD state we are transitioning from
                 // contains an NFA match state. This is precisely how we
@@ -204,17 +248,25 @@ pub(crate) fn next(
                 // IDs in a set, we are guarateed not to have any duplicative
                 // match states. Thus, it is impossible to add the same pattern
                 // ID more than once.
-                builder.add_match_pattern_id(id);
+                //
+                // N.B. We delay matches by 1 byte as a way to hack 1-byte
+                // look-around into DFA searches. This lets us support ^, $
+                // and ASCII-only \b. The delay is also why we need a special
+                // "end-of-input" (EOI) sentinel and why we need to follow the
+                // EOI sentinel at the end of every search. This final EOI
+                // transition is necessary to report matches found at the end
+                // of a haystack.
+                builder.add_match_pattern_id(pattern_id);
                 if !match_kind.continue_past_first_match() {
                     break;
                 }
             }
-            thompson::State::Range { range: ref r } => {
-                if r.matches_unit(unit) {
+            thompson::State::ByteRange { ref trans } => {
+                if trans.matches_unit(unit) {
                     epsilon_closure(
                         nfa,
-                        r.next,
-                        *builder.look_have(),
+                        trans.next,
+                        builder.look_have(),
                         stack,
                         &mut sparses.set2,
                     );
@@ -225,7 +277,18 @@ pub(crate) fn next(
                     epsilon_closure(
                         nfa,
                         next,
-                        *builder.look_have(),
+                        builder.look_have(),
+                        stack,
+                        &mut sparses.set2,
+                    );
+                }
+            }
+            thompson::State::Dense(ref dense) => {
+                if let Some(next) = dense.matches_unit(unit) {
+                    epsilon_closure(
+                        nfa,
+                        next,
+                        builder.look_have(),
                         stack,
                         &mut sparses.set2,
                     );
@@ -250,11 +313,15 @@ pub(crate) fn next(
     // if one was detected once it enters a quit state (and indeed, the search
     // routines in this crate do just that), but it seems better to prevent
     // these things by construction if possible.)
-    if nfa.has_word_boundary()
-        && unit.is_word_byte()
-        && !sparses.set2.is_empty()
-    {
-        builder.set_is_from_word();
+    if !sparses.set2.is_empty() {
+        if nfa.look_set_any().contains_word() && unit.is_word_byte() {
+            builder.set_is_from_word();
+        }
+        if nfa.look_set_any().contains_anchor_crlf()
+            && ((rev && unit.is_byte(b'\n')) || (!rev && unit.is_byte(b'\r')))
+        {
+            builder.set_is_half_crlf();
+        }
     }
     let mut builder_nfa = builder.into_nfa();
     add_nfa_states(nfa, &sparses.set2, &mut builder_nfa);
@@ -303,8 +370,9 @@ pub(crate) fn epsilon_closure(
                 break;
             }
             match *nfa.state(id) {
-                thompson::State::Range { .. }
+                thompson::State::ByteRange { .. }
                 | thompson::State::Sparse { .. }
+                | thompson::State::Dense { .. }
                 | thompson::State::Fail
                 | thompson::State::Match { .. } => break,
                 thompson::State::Look { look, next } => {
@@ -323,6 +391,10 @@ pub(crate) fn epsilon_closure(
                     // to the top of the stack.
                     stack.extend(alternates[1..].iter().rev());
                 }
+                thompson::State::BinaryUnion { alt1, alt2 } => {
+                    id = alt1;
+                    stack.push(alt2);
+                }
                 thompson::State::Capture { next, .. } => {
                     id = next;
                 }
@@ -336,15 +408,15 @@ pub(crate) fn epsilon_closure(
 /// were added to `set`.
 ///
 /// The DFA builder state given should already have its complete set of match
-/// pattern IDs added (if any) and any look-behind assertions (StartLine,
-/// StartText and whether this state is being generated for a transition over a
-/// word byte when applicable) that are true immediately prior to transitioning
-/// into this state (via `builder.look_have()`). The match pattern IDs should
-/// correspond to matches that occured on the previous transition, since all
-/// matches are delayed by one byte. The things that should _not_ be set are
-/// look-ahead assertions (EndLine, EndText and whether the next byte is a
-/// word byte or not). The builder state should also not have anything in
-/// `look_need` set, as this routine will compute that for you.
+/// pattern IDs added (if any) and any look-behind assertions (StartLF, Start
+/// and whether this state is being generated for a transition over a word byte
+/// when applicable) that are true immediately prior to transitioning into this
+/// state (via `builder.look_have()`). The match pattern IDs should correspond
+/// to matches that occurred on the previous transition, since all matches are
+/// delayed by one byte. The things that should _not_ be set are look-ahead
+/// assertions (EndLF, End and whether the next byte is a word byte or not).
+/// The builder state should also not have anything in `look_need` set, as this
+/// routine will compute that for you.
 ///
 /// The given NFA should be able to resolve all identifiers in `set` to a
 /// particular NFA state. Additionally, `set` must have capacity equivalent
@@ -354,56 +426,114 @@ pub(crate) fn add_nfa_states(
     set: &SparseSet,
     builder: &mut StateBuilderNFA,
 ) {
-    for nfa_id in set {
+    for nfa_id in set.iter() {
         match *nfa.state(nfa_id) {
-            thompson::State::Range { .. } => {
+            thompson::State::ByteRange { .. } => {
                 builder.add_nfa_state_id(nfa_id);
             }
             thompson::State::Sparse { .. } => {
                 builder.add_nfa_state_id(nfa_id);
             }
+            thompson::State::Dense { .. } => {
+                builder.add_nfa_state_id(nfa_id);
+            }
             thompson::State::Look { look, .. } => {
                 builder.add_nfa_state_id(nfa_id);
-                builder.look_need().insert(look);
+                builder.set_look_need(|need| need.insert(look));
             }
             thompson::State::Union { .. }
-            | thompson::State::Capture { .. } => {
-                // Pure epsilon transitions don't need to be tracked
-                // as part of the DFA state. Tracking them is actually
-                // superfluous; they won't cause any harm other than making
-                // determinization slower.
+            | thompson::State::BinaryUnion { .. } => {
+                // Pure epsilon transitions don't need to be tracked as part
+                // of the DFA state. Tracking them is actually superfluous;
+                // they won't cause any harm other than making determinization
+                // slower.
                 //
                 // Why aren't these needed? Well, in an NFA, epsilon
-                // transitions are really just jumping points to other
-                // states. So once you hit an epsilon transition, the same
-                // set of resulting states always appears. Therefore,
-                // putting them in a DFA's set of ordered NFA states is
-                // strictly redundant.
+                // transitions are really just jumping points to other states.
+                // So once you hit an epsilon transition, the same set of
+                // resulting states always appears. Therefore, putting them in
+                // a DFA's set of ordered NFA states is strictly redundant.
                 //
                 // Look-around states are also epsilon transitions, but
                 // they are *conditional*. So their presence could be
                 // discriminatory, and thus, they are tracked above.
                 //
-                // But wait... why are epsilon states in our `set` in the
-                // first place? Why not just leave them out? They're in
-                // our `set` because it was generated by computing an
-                // epsilon closure, and we want to keep track of all states
-                // we visited to avoid re-visiting them. In exchange, we
-                // have to do this second iteration over our collected
-                // states to finalize our DFA state.
+                // But wait... why are epsilon states in our `set` in the first
+                // place? Why not just leave them out? They're in our `set`
+                // because it was generated by computing an epsilon closure,
+                // and we want to keep track of all states we visited to avoid
+                // re-visiting them. In exchange, we have to do this second
+                // iteration over our collected states to finalize our DFA
+                // state. In theory, we could avoid this second iteration if
+                // we maintained two sets during epsilon closure: the set of
+                // visited states (to avoid cycles) and the set of states that
+                // will actually be used to construct the next DFA state.
+                //
+                // Note that this optimization requires that we re-compute the
+                // epsilon closure to account for look-ahead in 'next' *only
+                // when necessary*. Namely, only when the set of look-around
+                // assertions changes and only when those changes are within
+                // the set of assertions that are needed in order to step
+                // through the closure correctly. Otherwise, if we re-do the
+                // epsilon closure needlessly, it could change based on the
+                // fact that we are omitting epsilon states here.
+                //
+                // -----
+                //
+                // Welp, scratch the above. It turns out that recording these
+                // is in fact necessary to seemingly handle one particularly
+                // annoying case: when a conditional epsilon transition is
+                // put inside of a repetition operator. One specific case I
+                // ran into was the regex `(?:\b|%)+` on the haystack `z%`.
+                // The correct leftmost first matches are: [0, 0] and [1, 1].
+                // But the DFA was reporting [0, 0] and [1, 2]. To understand
+                // why this happens, consider the NFA for the aforementioned
+                // regex:
                 //
-                // Note that this optimization requires that we re-compute
-                // the epsilon closure to account for look-ahead in 'next'
-                // *only when necessary*. Namely, only when the set of
-                // look-around assertions changes and only when those
-                // changes are within the set of assertions that are
-                // needed in order to step through the closure correctly.
-                // Otherwise, if we re-do the epsilon closure needlessly,
-                // it could change based on the fact that we are omitting
-                // epsilon states here.
+                //     >000000: binary-union(4, 1)
+                //      000001: \x00-\xFF => 0
+                //      000002: WordAscii => 5
+                //      000003: % => 5
+                //     ^000004: binary-union(2, 3)
+                //      000005: binary-union(4, 6)
+                //      000006: MATCH(0)
+                //
+                // The problem here is that one of the DFA start states is
+                // going to consist of the NFA states [2, 3] by computing the
+                // epsilon closure of state 4. State 4 isn't included because
+                // we previously were not keeping track of union states. But
+                // only a subset of transitions out of this state will be able
+                // to follow WordAscii, and in those cases, the epsilon closure
+                // is redone. The only problem is that computing the epsilon
+                // closure from [2, 3] is different than computing the epsilon
+                // closure from [4]. In the former case, assuming the WordAscii
+                // assertion is satisfied, you get: [2, 3, 6]. In the latter
+                // case, you get: [2, 6, 3]. Notice that '6' is the match state
+                // and appears AFTER '3' in the former case. This leads to a
+                // preferential but incorrect match of '%' before returning
+                // a match. In the latter case, the match is preferred over
+                // continuing to accept the '%'.
+                //
+                // It almost feels like we might be able to fix the NFA states
+                // to avoid this, or to at least only keep track of union
+                // states where this actually matters, since in the vast
+                // majority of cases, this doesn't matter.
+                //
+                // Another alternative would be to define a new HIR property
+                // called "assertion is repeated anywhere" and compute it
+                // inductively over the entire pattern. If it happens anywhere,
+                // which is probably pretty rare, then we record union states.
+                // Otherwise we don't.
+                builder.add_nfa_state_id(nfa_id);
             }
+            // Capture states we definitely do not need to record, since they
+            // are unconditional epsilon transitions with no branching.
+            thompson::State::Capture { .. } => {}
+            // It's not totally clear whether we need to record fail states or
+            // not, but we do so out of an abundance of caution. Since they are
+            // quite rare in practice, there isn't much cost to recording them.
             thompson::State::Fail => {
-                break;
+                builder.add_nfa_state_id(nfa_id);
             }
             thompson::State::Match { .. } => {
                 // Normally, the NFA match state doesn't actually need to
@@ -420,74 +550,61 @@ pub(crate) fn add_nfa_states(
     // there's no reason to track which look-around assertions were
     // satisfied when this state was created.
     if builder.look_need().is_empty() {
-        builder.look_have().clear();
+        builder.set_look_have(|_| LookSet::empty());
     }
 }
 
 /// Sets the appropriate look-behind assertions on the given state based on
 /// this starting configuration.
 pub(crate) fn set_lookbehind_from_start(
+    nfa: &thompson::NFA,
     start: &Start,
     builder: &mut StateBuilderMatches,
 ) {
+    let rev = nfa.is_reverse();
+    let lineterm = nfa.look_matcher().get_line_terminator();
     match *start {
         Start::NonWordByte => {}
         Start::WordByte => {
             builder.set_is_from_word();
         }
         Start::Text => {
-            builder.look_have().insert(Look::StartText);
-            builder.look_have().insert(Look::StartLine);
+            builder.set_look_have(|have| {
+                have.insert(Look::Start)
+                    .insert(Look::StartLF)
+                    .insert(Look::StartCRLF)
+            });
         }
-        Start::Line => {
-            builder.look_have().insert(Look::StartLine);
+        Start::LineLF => {
+            if rev {
+                builder.set_is_half_crlf();
+                builder.set_look_have(|have| have.insert(Look::StartLF));
+            } else {
+                builder.set_look_have(|have| have.insert(Look::StartCRLF));
+            }
+            if lineterm == b'\n' {
+                builder.set_look_have(|have| have.insert(Look::StartLF));
+            }
+        }
+        Start::LineCR => {
+            if rev {
+                builder.set_look_have(|have| have.insert(Look::StartCRLF));
+            } else {
+                builder.set_is_half_crlf();
+            }
+            if lineterm == b'\r' {
+                builder.set_look_have(|have| have.insert(Look::StartLF));
+            }
+        }
+        Start::CustomLineTerminator => {
+            builder.set_look_have(|have| have.insert(Look::StartLF));
+            // This is a bit of a tricky case, but if the line terminator was
+            // set to a word byte, then we also need to behave as if the start
+            // configuration is Start::WordByte. That is, we need to mark our
+            // state as having come from a word byte.
+            if utf8::is_word_byte(lineterm) {
+                builder.set_is_from_word();
+            }
         }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::Start;
-
-    #[test]
-    #[should_panic]
-    fn start_fwd_bad_range() {
-        Start::from_position_fwd(&[], 0, 1);
-    }
-
-    #[test]
-    #[should_panic]
-    fn start_rev_bad_range() {
-        Start::from_position_rev(&[], 0, 1);
-    }
-
-    #[test]
-    fn start_fwd() {
-        let f = Start::from_position_fwd;
-
-        assert_eq!(Start::Text, f(&[], 0, 0));
-        assert_eq!(Start::Text, f(b"abc", 0, 3));
-        assert_eq!(Start::Text, f(b"\nabc", 0, 3));
-
-        assert_eq!(Start::Line, f(b"\nabc", 1, 3));
-
-        assert_eq!(Start::WordByte, f(b"abc", 1, 3));
-
-        assert_eq!(Start::NonWordByte, f(b" abc", 1, 3));
-    }
-
-    #[test]
-    fn start_rev() {
-        let f = Start::from_position_rev;
-
-        assert_eq!(Start::Text, f(&[], 0, 0));
-        assert_eq!(Start::Text, f(b"abc", 0, 3));
-        assert_eq!(Start::Text, f(b"abc\n", 0, 4));
-
-        assert_eq!(Start::Line, f(b"abc\nz", 0, 3));
-
-        assert_eq!(Start::WordByte, f(b"abc", 0, 2));
-
-        assert_eq!(Start::NonWordByte, f(b"abc ", 0, 3));
     }
 }