Adding upstream version 124.0.1.upstream/124.0.1

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-19 00:47:55 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-19 00:47:55 +0000
commit: 26a029d407be480d791972afb5975cf62c9360a6 (patch)
tree: f435a8308119effd964b339f76abb83a57c29483 /third_party/rust/regex-automata/src/util/determinize
parent: Initial commit. (diff)
download: firefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz
firefox-26a029d407be480d791972afb5975cf62c9360a6.zip
2 files changed, 1516 insertions, 0 deletions
diff --git a/third_party/rust/regex-automata/src/util/determinize/mod.rs b/third_party/rust/regex-automata/src/util/determinize/mod.rs
new file mode 100644
index 0000000000..30a82afb81
--- /dev/null
+++ b/third_party/rust/regex-automata/src/util/determinize/mod.rs
@@ -0,0 +1,610 @@
+/*!
+This module contains types and routines for implementing determinization.
+
+In this crate, there are at least two places where we implement
+determinization: fully ahead-of-time compiled DFAs in the `dfa` module and
+lazily compiled DFAs in the `hybrid` module. The stuff in this module
+corresponds to the things that are in common between these implementations.
+
+There are three broad things that our implementations of determinization have
+in common, as defined by this module:
+
+* The classification of start states. That is, whether we're dealing with
+word boundaries, line boundaries, etc., is all the same. This also includes
+the look-behind assertions that are satisfied by each starting state
+classification.
+* The representation of DFA states as sets of NFA states, including
+convenience types for building these DFA states that are amenable to reusing
+allocations.
+* Routines for the "classical" parts of determinization: computing the
+epsilon closure, tracking match states (with corresponding pattern IDs, since
+we support multi-pattern finite automata) and, of course, computing the
+transition function between states for units of input.
+
+I did consider a couple of alternatives to this particular form of code reuse:
+
+1. Don't do any code reuse. The problem here is that we *really* want both
+forms of determinization to do exactly identical things when it comes to
+their handling of NFA states. While our tests generally ensure this, the code
+is tricky and large enough where not reusing code is a pretty big bummer.
+
+2. Implement all of determinization once and make it generic over fully
+compiled DFAs and lazily compiled DFAs. While I didn't actually try this
+approach, my instinct is that it would be more complex than is needed here.
+And the interface required would be pretty hairy. Instead, I think splitting
+it into logical sub-components works better.
+*/
+
+use alloc::vec::Vec;
+
+pub(crate) use self::state::{
+    State, StateBuilderEmpty, StateBuilderMatches, StateBuilderNFA,
+};
+
+use crate::{
+    nfa::thompson,
+    util::{
+        alphabet,
+        look::{Look, LookSet},
+        primitives::StateID,
+        search::MatchKind,
+        sparse_set::{SparseSet, SparseSets},
+        start::Start,
+        utf8,
+    },
+};
+
+mod state;
+
+/// Compute the set of all reachable NFA states, including the full epsilon
+/// closure, from a DFA state for a single unit of input. The set of reachable
+/// states is returned as a `StateBuilderNFA`. The `StateBuilderNFA` returned
+/// also includes any look-behind assertions satisfied by `unit`, in addition
+/// to whether it is a match state. For multi-pattern DFAs, the builder will
+/// also include the pattern IDs that match (in the order seen).
+///
+/// `nfa` must be able to resolve any NFA state in `state` and any NFA state
+/// reachable via the epsilon closure of any NFA state in `state`. `sparses`
+/// must have capacity equivalent to `nfa.len()`.
+///
+/// `match_kind` should correspond to the match semantics implemented by the
+/// DFA being built. Generally speaking, for leftmost-first match semantics,
+/// states that appear after the first NFA match state will not be included in
+/// the `StateBuilderNFA` returned since they are impossible to visit.
+///
+/// `sparses` is used as scratch space for NFA traversal. Other than their
+/// capacity requirements (detailed above), there are no requirements on what's
+/// contained within them (if anything). Similarly, what's inside of them once
+/// this routine returns is unspecified.
+///
+/// `stack` must have length 0. It is used as scratch space for depth first
+/// traversal. After returning, it is guaranteed that `stack` will have length
+/// 0.
+///
+/// `state` corresponds to the current DFA state on which one wants to compute
+/// the transition for the input `unit`.
+///
+/// `empty_builder` corresponds to the builder allocation to use to produce a
+/// complete `StateBuilderNFA` state. If the state is not needed (or is already
+/// cached), then it can be cleared and reused without needing to create a new
+/// `State`. The `StateBuilderNFA` state returned is final and ready to be
+/// turned into a `State` if necessary.
+pub(crate) fn next(
+    nfa: &thompson::NFA,
+    match_kind: MatchKind,
+    sparses: &mut SparseSets,
+    stack: &mut Vec<StateID>,
+    state: &State,
+    unit: alphabet::Unit,
+    empty_builder: StateBuilderEmpty,
+) -> StateBuilderNFA {
+    sparses.clear();
+
+    // Whether the NFA is matched in reverse or not. We use this in some
+    // conditional logic for dealing with the exceptionally annoying CRLF-aware
+    // line anchors.
+    let rev = nfa.is_reverse();
+    // The look-around matcher that our NFA is configured with. We don't
+    // actually use it to match look-around assertions, but we do need its
+    // configuration for constructing states consistent with how it matches.
+    let lookm = nfa.look_matcher();
+
+    // Put the NFA state IDs into a sparse set in case we need to
+    // re-compute their epsilon closure.
+    //
+    // Doing this state shuffling is technically not necessary unless some
+    // kind of look-around is used in the DFA. Some ad hoc experiments
+    // suggested that avoiding this didn't lead to much of an improvement,
+    // but perhaps more rigorous experimentation should be done. And in
+    // particular, avoiding this check requires some light refactoring of
+    // the code below.
+    state.iter_nfa_state_ids(|nfa_id| {
+        sparses.set1.insert(nfa_id);
+    });
+
+    // Compute look-ahead assertions originating from the current state. Based
+    // on the input unit we're transitioning over, some additional set of
+    // assertions may be true. Thus, we re-compute this state's epsilon closure
+    // (but only if necessary). Notably, when we build a DFA state initially,
+    // we don't enable any look-ahead assertions because we don't know whether
+    // they're true or not at that point.
+    if !state.look_need().is_empty() {
+        // Add look-ahead assertions that are now true based on the current
+        // input unit.
+        let mut look_have = state.look_have().clone();
+        match unit.as_u8() {
+            Some(b'\r') => {
+                if !rev || !state.is_half_crlf() {
+                    look_have = look_have.insert(Look::EndCRLF);
+                }
+            }
+            Some(b'\n') => {
+                if rev || !state.is_half_crlf() {
+                    look_have = look_have.insert(Look::EndCRLF);
+                }
+            }
+            Some(_) => {}
+            None => {
+                look_have = look_have.insert(Look::End);
+                look_have = look_have.insert(Look::EndLF);
+                look_have = look_have.insert(Look::EndCRLF);
+            }
+        }
+        if unit.is_byte(lookm.get_line_terminator()) {
+            look_have = look_have.insert(Look::EndLF);
+        }
+        if state.is_half_crlf()
+            && ((rev && !unit.is_byte(b'\r'))
+                || (!rev && !unit.is_byte(b'\n')))
+        {
+            look_have = look_have.insert(Look::StartCRLF);
+        }
+        if state.is_from_word() == unit.is_word_byte() {
+            look_have = look_have.insert(Look::WordUnicodeNegate);
+            look_have = look_have.insert(Look::WordAsciiNegate);
+        } else {
+            look_have = look_have.insert(Look::WordUnicode);
+            look_have = look_have.insert(Look::WordAscii);
+        }
+        // If we have new assertions satisfied that are among the set of
+        // assertions that exist in this state (that is, just because we added
+        // an EndLF assertion above doesn't mean there is an EndLF conditional
+        // epsilon transition in this state), then we re-compute this state's
+        // epsilon closure using the updated set of assertions.
+        //
+        // Note that since our DFA states omit unconditional epsilon
+        // transitions, this check is necessary for correctness. If we re-did
+        // the epsilon closure below needlessly, it could change based on the
+        // fact that we omitted epsilon states originally.
+        if !look_have
+            .subtract(state.look_have())
+            .intersect(state.look_need())
+            .is_empty()
+        {
+            for nfa_id in sparses.set1.iter() {
+                epsilon_closure(
+                    nfa,
+                    nfa_id,
+                    look_have,
+                    stack,
+                    &mut sparses.set2,
+                );
+            }
+            sparses.swap();
+            sparses.set2.clear();
+        }
+    }
+
+    // Convert our empty builder into one that can record assertions and match
+    // pattern IDs.
+    let mut builder = empty_builder.into_matches();
+    // Set whether the StartLF look-behind assertion is true for this
+    // transition or not. The look-behind assertion for ASCII word boundaries
+    // is handled below.
+    if nfa.look_set_any().contains_anchor_line()
+        && unit.is_byte(lookm.get_line_terminator())
+    {
+        // Why only handle StartLF here and not Start? That's because Start
+        // can only impact the starting state, which is special cased in
+        // start state handling.
+        builder.set_look_have(|have| have.insert(Look::StartLF));
+    }
+    // We also need to add StartCRLF to our assertions too, if we can. This
+    // is unfortunately a bit more complicated, because it depends on the
+    // direction of the search. In the forward direction, ^ matches after a
+    // \n, but in the reverse direction, ^ only matches after a \r. (This is
+    // further complicated by the fact that reverse a regex means changing a ^
+    // to a $ and vice versa.)
+    if nfa.look_set_any().contains_anchor_crlf()
+        && ((rev && unit.is_byte(b'\r')) || (!rev && unit.is_byte(b'\n')))
+    {
+        builder.set_look_have(|have| have.insert(Look::StartCRLF));
+    }
+    for nfa_id in sparses.set1.iter() {
+        match *nfa.state(nfa_id) {
+            thompson::State::Union { .. }
+            | thompson::State::BinaryUnion { .. }
+            | thompson::State::Fail
+            | thompson::State::Look { .. }
+            | thompson::State::Capture { .. } => {}
+            thompson::State::Match { pattern_id } => {
+                // Notice here that we are calling the NEW state a match
+                // state if the OLD state we are transitioning from
+                // contains an NFA match state. This is precisely how we
+                // delay all matches by one byte and also what therefore
+                // guarantees that starting states cannot be match states.
+                //
+                // If we didn't delay matches by one byte, then whether
+                // a DFA is a matching state or not would be determined
+                // by whether one of its own constituent NFA states
+                // was a match state. (And that would be done in
+                // 'add_nfa_states'.)
+                //
+                // Also, 'add_match_pattern_id' requires that callers never
+                // pass duplicative pattern IDs. We do in fact uphold that
+                // guarantee here, but it's subtle. In particular, a Thompson
+                // NFA guarantees that each pattern has exactly one match
+                // state. Moreover, since we're iterating over the NFA state
+                // IDs in a set, we are guarateed not to have any duplicative
+                // match states. Thus, it is impossible to add the same pattern
+                // ID more than once.
+                //
+                // N.B. We delay matches by 1 byte as a way to hack 1-byte
+                // look-around into DFA searches. This lets us support ^, $
+                // and ASCII-only \b. The delay is also why we need a special
+                // "end-of-input" (EOI) sentinel and why we need to follow the
+                // EOI sentinel at the end of every search. This final EOI
+                // transition is necessary to report matches found at the end
+                // of a haystack.
+                builder.add_match_pattern_id(pattern_id);
+                if !match_kind.continue_past_first_match() {
+                    break;
+                }
+            }
+            thompson::State::ByteRange { ref trans } => {
+                if trans.matches_unit(unit) {
+                    epsilon_closure(
+                        nfa,
+                        trans.next,
+                        builder.look_have(),
+                        stack,
+                        &mut sparses.set2,
+                    );
+                }
+            }
+            thompson::State::Sparse(ref sparse) => {
+                if let Some(next) = sparse.matches_unit(unit) {
+                    epsilon_closure(
+                        nfa,
+                        next,
+                        builder.look_have(),
+                        stack,
+                        &mut sparses.set2,
+                    );
+                }
+            }
+            thompson::State::Dense(ref dense) => {
+                if let Some(next) = dense.matches_unit(unit) {
+                    epsilon_closure(
+                        nfa,
+                        next,
+                        builder.look_have(),
+                        stack,
+                        &mut sparses.set2,
+                    );
+                }
+            }
+        }
+    }
+    // We only set the word byte if there's a word boundary look-around
+    // anywhere in this regex. Otherwise, there's no point in bloating the
+    // number of states if we don't have one.
+    //
+    // We also only set it when the state has a non-zero number of NFA states.
+    // Otherwise, we could wind up with states that *should* be DEAD states
+    // but are otherwise distinct from DEAD states because of this look-behind
+    // assertion being set. While this can't technically impact correctness *in
+    // theory*, it can create pathological DFAs that consume input until EOI or
+    // a quit byte is seen. Consuming until EOI isn't a correctness problem,
+    // but a (serious) perf problem. Hitting a quit byte, however, could be a
+    // correctness problem since it could cause search routines to report an
+    // error instead of a detected match once the quit state is entered. (The
+    // search routine could be made to be a bit smarter by reporting a match
+    // if one was detected once it enters a quit state (and indeed, the search
+    // routines in this crate do just that), but it seems better to prevent
+    // these things by construction if possible.)
+    if !sparses.set2.is_empty() {
+        if nfa.look_set_any().contains_word() && unit.is_word_byte() {
+            builder.set_is_from_word();
+        }
+        if nfa.look_set_any().contains_anchor_crlf()
+            && ((rev && unit.is_byte(b'\n')) || (!rev && unit.is_byte(b'\r')))
+        {
+            builder.set_is_half_crlf();
+        }
+    }
+    let mut builder_nfa = builder.into_nfa();
+    add_nfa_states(nfa, &sparses.set2, &mut builder_nfa);
+    builder_nfa
+}
+
+/// Compute the epsilon closure for the given NFA state. The epsilon closure
+/// consists of all NFA state IDs, including `start_nfa_id`, that can be
+/// reached from `start_nfa_id` without consuming any input. These state IDs
+/// are written to `set` in the order they are visited, but only if they are
+/// not already in `set`. `start_nfa_id` must be a valid state ID for the NFA
+/// given.
+///
+/// `look_have` consists of the satisfied assertions at the current
+/// position. For conditional look-around epsilon transitions, these are
+/// only followed if they are satisfied by `look_have`.
+///
+/// `stack` must have length 0. It is used as scratch space for depth first
+/// traversal. After returning, it is guaranteed that `stack` will have length
+/// 0.
+pub(crate) fn epsilon_closure(
+    nfa: &thompson::NFA,
+    start_nfa_id: StateID,
+    look_have: LookSet,
+    stack: &mut Vec<StateID>,
+    set: &mut SparseSet,
+) {
+    assert!(stack.is_empty());
+    // If this isn't an epsilon state, then the epsilon closure is always just
+    // itself, so there's no need to spin up the machinery below to handle it.
+    if !nfa.state(start_nfa_id).is_epsilon() {
+        set.insert(start_nfa_id);
+        return;
+    }
+
+    stack.push(start_nfa_id);
+    while let Some(mut id) = stack.pop() {
+        // In many cases, we can avoid stack operations when an NFA state only
+        // adds one new state to visit. In that case, we just set our ID to
+        // that state and mush on. We only use the stack when an NFA state
+        // introduces multiple new states to visit.
+        loop {
+            // Insert this NFA state, and if it's already in the set and thus
+            // already visited, then we can move on to the next one.
+            if !set.insert(id) {
+                break;
+            }
+            match *nfa.state(id) {
+                thompson::State::ByteRange { .. }
+                | thompson::State::Sparse { .. }
+                | thompson::State::Dense { .. }
+                | thompson::State::Fail
+                | thompson::State::Match { .. } => break,
+                thompson::State::Look { look, next } => {
+                    if !look_have.contains(look) {
+                        break;
+                    }
+                    id = next;
+                }
+                thompson::State::Union { ref alternates } => {
+                    id = match alternates.get(0) {
+                        None => break,
+                        Some(&id) => id,
+                    };
+                    // We need to process our alternates in order to preserve
+                    // match preferences, so put the earliest alternates closer
+                    // to the top of the stack.
+                    stack.extend(alternates[1..].iter().rev());
+                }
+                thompson::State::BinaryUnion { alt1, alt2 } => {
+                    id = alt1;
+                    stack.push(alt2);
+                }
+                thompson::State::Capture { next, .. } => {
+                    id = next;
+                }
+            }
+        }
+    }
+}
+
+/// Add the NFA state IDs in the given `set` to the given DFA builder state.
+/// The order in which states are added corresponds to the order in which they
+/// were added to `set`.
+///
+/// The DFA builder state given should already have its complete set of match
+/// pattern IDs added (if any) and any look-behind assertions (StartLF, Start
+/// and whether this state is being generated for a transition over a word byte
+/// when applicable) that are true immediately prior to transitioning into this
+/// state (via `builder.look_have()`). The match pattern IDs should correspond
+/// to matches that occurred on the previous transition, since all matches are
+/// delayed by one byte. The things that should _not_ be set are look-ahead
+/// assertions (EndLF, End and whether the next byte is a word byte or not).
+/// The builder state should also not have anything in `look_need` set, as this
+/// routine will compute that for you.
+///
+/// The given NFA should be able to resolve all identifiers in `set` to a
+/// particular NFA state. Additionally, `set` must have capacity equivalent
+/// to `nfa.len()`.
+pub(crate) fn add_nfa_states(
+    nfa: &thompson::NFA,
+    set: &SparseSet,
+    builder: &mut StateBuilderNFA,
+) {
+    for nfa_id in set.iter() {
+        match *nfa.state(nfa_id) {
+            thompson::State::ByteRange { .. } => {
+                builder.add_nfa_state_id(nfa_id);
+            }
+            thompson::State::Sparse { .. } => {
+                builder.add_nfa_state_id(nfa_id);
+            }
+            thompson::State::Dense { .. } => {
+                builder.add_nfa_state_id(nfa_id);
+            }
+            thompson::State::Look { look, .. } => {
+                builder.add_nfa_state_id(nfa_id);
+                builder.set_look_need(|need| need.insert(look));
+            }
+            thompson::State::Union { .. }
+            | thompson::State::BinaryUnion { .. } => {
+                // Pure epsilon transitions don't need to be tracked as part
+                // of the DFA state. Tracking them is actually superfluous;
+                // they won't cause any harm other than making determinization
+                // slower.
+                //
+                // Why aren't these needed? Well, in an NFA, epsilon
+                // transitions are really just jumping points to other states.
+                // So once you hit an epsilon transition, the same set of
+                // resulting states always appears. Therefore, putting them in
+                // a DFA's set of ordered NFA states is strictly redundant.
+                //
+                // Look-around states are also epsilon transitions, but
+                // they are *conditional*. So their presence could be
+                // discriminatory, and thus, they are tracked above.
+                //
+                // But wait... why are epsilon states in our `set` in the first
+                // place? Why not just leave them out? They're in our `set`
+                // because it was generated by computing an epsilon closure,
+                // and we want to keep track of all states we visited to avoid
+                // re-visiting them. In exchange, we have to do this second
+                // iteration over our collected states to finalize our DFA
+                // state. In theory, we could avoid this second iteration if
+                // we maintained two sets during epsilon closure: the set of
+                // visited states (to avoid cycles) and the set of states that
+                // will actually be used to construct the next DFA state.
+                //
+                // Note that this optimization requires that we re-compute the
+                // epsilon closure to account for look-ahead in 'next' *only
+                // when necessary*. Namely, only when the set of look-around
+                // assertions changes and only when those changes are within
+                // the set of assertions that are needed in order to step
+                // through the closure correctly. Otherwise, if we re-do the
+                // epsilon closure needlessly, it could change based on the
+                // fact that we are omitting epsilon states here.
+                //
+                // -----
+                //
+                // Welp, scratch the above. It turns out that recording these
+                // is in fact necessary to seemingly handle one particularly
+                // annoying case: when a conditional epsilon transition is
+                // put inside of a repetition operator. One specific case I
+                // ran into was the regex `(?:\b|%)+` on the haystack `z%`.
+                // The correct leftmost first matches are: [0, 0] and [1, 1].
+                // But the DFA was reporting [0, 0] and [1, 2]. To understand
+                // why this happens, consider the NFA for the aforementioned
+                // regex:
+                //
+                //     >000000: binary-union(4, 1)
+                //      000001: \x00-\xFF => 0
+                //      000002: WordAscii => 5
+                //      000003: % => 5
+                //     ^000004: binary-union(2, 3)
+                //      000005: binary-union(4, 6)
+                //      000006: MATCH(0)
+                //
+                // The problem here is that one of the DFA start states is
+                // going to consist of the NFA states [2, 3] by computing the
+                // epsilon closure of state 4. State 4 isn't included because
+                // we previously were not keeping track of union states. But
+                // only a subset of transitions out of this state will be able
+                // to follow WordAscii, and in those cases, the epsilon closure
+                // is redone. The only problem is that computing the epsilon
+                // closure from [2, 3] is different than computing the epsilon
+                // closure from [4]. In the former case, assuming the WordAscii
+                // assertion is satisfied, you get: [2, 3, 6]. In the latter
+                // case, you get: [2, 6, 3]. Notice that '6' is the match state
+                // and appears AFTER '3' in the former case. This leads to a
+                // preferential but incorrect match of '%' before returning
+                // a match. In the latter case, the match is preferred over
+                // continuing to accept the '%'.
+                //
+                // It almost feels like we might be able to fix the NFA states
+                // to avoid this, or to at least only keep track of union
+                // states where this actually matters, since in the vast
+                // majority of cases, this doesn't matter.
+                //
+                // Another alternative would be to define a new HIR property
+                // called "assertion is repeated anywhere" and compute it
+                // inductively over the entire pattern. If it happens anywhere,
+                // which is probably pretty rare, then we record union states.
+                // Otherwise we don't.
+                builder.add_nfa_state_id(nfa_id);
+            }
+            // Capture states we definitely do not need to record, since they
+            // are unconditional epsilon transitions with no branching.
+            thompson::State::Capture { .. } => {}
+            // It's not totally clear whether we need to record fail states or
+            // not, but we do so out of an abundance of caution. Since they are
+            // quite rare in practice, there isn't much cost to recording them.
+            thompson::State::Fail => {
+                builder.add_nfa_state_id(nfa_id);
+            }
+            thompson::State::Match { .. } => {
+                // Normally, the NFA match state doesn't actually need to
+                // be inside the DFA state. But since we delay matches by
+                // one byte, the matching DFA state corresponds to states
+                // that transition from the one we're building here. And
+                // the way we detect those cases is by looking for an NFA
+                // match state. See 'next' for how this is handled.
+                builder.add_nfa_state_id(nfa_id);
+            }
+        }
+    }
+    // If we know this state contains no look-around assertions, then
+    // there's no reason to track which look-around assertions were
+    // satisfied when this state was created.
+    if builder.look_need().is_empty() {
+        builder.set_look_have(|_| LookSet::empty());
+    }
+}
+
+/// Sets the appropriate look-behind assertions on the given state based on
+/// this starting configuration.
+pub(crate) fn set_lookbehind_from_start(
+    nfa: &thompson::NFA,
+    start: &Start,
+    builder: &mut StateBuilderMatches,
+) {
+    let rev = nfa.is_reverse();
+    let lineterm = nfa.look_matcher().get_line_terminator();
+    match *start {
+        Start::NonWordByte => {}
+        Start::WordByte => {
+            builder.set_is_from_word();
+        }
+        Start::Text => {
+            builder.set_look_have(|have| {
+                have.insert(Look::Start)
+                    .insert(Look::StartLF)
+                    .insert(Look::StartCRLF)
+            });
+        }
+        Start::LineLF => {
+            if rev {
+                builder.set_is_half_crlf();
+                builder.set_look_have(|have| have.insert(Look::StartLF));
+            } else {
+                builder.set_look_have(|have| have.insert(Look::StartCRLF));
+            }
+            if lineterm == b'\n' {
+                builder.set_look_have(|have| have.insert(Look::StartLF));
+            }
+        }
+        Start::LineCR => {
+            if rev {
+                builder.set_look_have(|have| have.insert(Look::StartCRLF));
+            } else {
+                builder.set_is_half_crlf();
+            }
+            if lineterm == b'\r' {
+                builder.set_look_have(|have| have.insert(Look::StartLF));
+            }
+        }
+        Start::CustomLineTerminator => {
+            builder.set_look_have(|have| have.insert(Look::StartLF));
+            // This is a bit of a tricky case, but if the line terminator was
+            // set to a word byte, then we also need to behave as if the start
+            // configuration is Start::WordByte. That is, we need to mark our
+            // state as having come from a word byte.
+            if utf8::is_word_byte(lineterm) {
+                builder.set_is_from_word();
+            }
+        }
+    }
+}
diff --git a/third_party/rust/regex-automata/src/util/determinize/state.rs b/third_party/rust/regex-automata/src/util/determinize/state.rs
new file mode 100644
index 0000000000..e641235874
--- /dev/null
+++ b/third_party/rust/regex-automata/src/util/determinize/state.rs
@@ -0,0 +1,906 @@
+/*!
+This module defines a DFA state representation and builders for constructing
+DFA states.
+
+This representation is specifically for use in implementations of NFA-to-DFA
+conversion via powerset construction. (Also called "determinization" in this
+crate.)
+
+The term "DFA state" is somewhat overloaded in this crate. In some cases, it
+refers to the set of transitions over an alphabet for a particular state. In
+other cases, it refers to a set of NFA states. The former is really about the
+final representation of a state in a DFA's transition table, where as the
+latter---what this module is focused on---is closer to an intermediate form
+that is used to help eventually build the transition table.
+
+This module exports four types. All four types represent the same idea: an
+ordered set of NFA states. This ordered set represents the epsilon closure of a
+particular NFA state, where the "epsilon closure" is the set of NFA states that
+can be transitioned to without consuming any input. i.e., Follow all of the NFA
+state's epsilon transitions. In addition, this implementation of DFA states
+cares about two other things: the ordered set of pattern IDs corresponding
+to the patterns that match if the state is a match state, and the set of
+look-behind assertions that were true when the state was created.
+
+The first, `State`, is a frozen representation of a state that cannot be
+modified. It may be cheaply cloned without copying the state itself and can be
+accessed safely from multiple threads simultaneously. This type is useful for
+when one knows that the DFA state being constructed is distinct from any other
+previously constructed states. Namely, powerset construction, in practice,
+requires one to keep a cache of previously created DFA states. Otherwise,
+the number of DFA states created in memory balloons to an impractically
+large number. For this reason, equivalent states should endeavor to have an
+equivalent byte-level representation. (In general, "equivalency" here means,
+"equivalent assertions, pattern IDs and NFA state IDs." We do not require that
+full DFA minimization be implemented here. This form of equivalency is only
+surface deep and is more-or-less a practical necessity.)
+
+The other three types represent different phases in the construction of a
+DFA state. Internally, these three types (and `State`) all use the same
+byte-oriented representation. That means one can use any of the builder types
+to check whether the state it represents already exists or not. If it does,
+then there is no need to freeze it into a `State` (which requires an alloc and
+a copy). Here are the three types described succinctly:
+
+* `StateBuilderEmpty` represents a state with no pattern IDs, no assertions
+and no NFA states. Creating a `StateBuilderEmpty` performs no allocs. A
+`StateBuilderEmpty` can only be used to query its underlying memory capacity,
+or to convert into a builder for recording pattern IDs and/or assertions.
+
+* `StateBuilderMatches` represents a state with zero or more pattern IDs, zero
+or more satisfied assertions and zero NFA state IDs. A `StateBuilderMatches`
+can only be used for adding pattern IDs and recording assertions.
+
+* `StateBuilderNFA` represents a state with zero or more pattern IDs, zero or
+more satisfied assertions and zero or more NFA state IDs. A `StateBuilderNFA`
+can only be used for adding NFA state IDs and recording some assertions.
+
+The expected flow here is to use the above builders to construct a candidate
+DFA state to check if it already exists. If it does, then there's no need to
+freeze it into a `State`. It it doesn't exist, then `StateBuilderNFA::to_state`
+can be called to freeze the builder into an immutable `State`. In either
+case, `clear` should be called on the builder to turn it back into a
+`StateBuilderEmpty` that reuses the underlying memory.
+
+The main purpose for splitting the builder into these distinct types is to
+make it impossible to do things like adding a pattern ID after adding an NFA
+state ID. Namely, this makes it simpler to use a space-and-time efficient
+binary representation for the state. (The format is documented on the `Repr`
+type below.) If we just used one type for everything, it would be possible for
+callers to use an incorrect interleaving of calls and thus result in a corrupt
+representation. I chose to use more type machinery to make this impossible to
+do because 1) determinization is itself pretty complex and it wouldn't be too
+hard to foul this up and 2) there isn't too much machinery involved and it's
+well contained.
+
+As an optimization, sometimes states won't have certain things set. For
+example, if the underlying NFA has no word boundary assertions, then there is
+no reason to set a state's look-behind assertion as to whether it was generated
+from a word byte or not. Similarly, if a state has no NFA states corresponding
+to look-around assertions, then there is no reason to set `look_have` to a
+non-empty set. Finally, callers usually omit unconditional epsilon transitions
+when adding NFA state IDs since they aren't discriminatory.
+
+Finally, the binary representation used by these states is, thankfully, not
+serialized anywhere. So any kind of change can be made with reckless abandon,
+as long as everything in this module agrees.
+*/
+
+use core::{convert::TryFrom, mem};
+
+use alloc::{sync::Arc, vec::Vec};
+
+use crate::util::{
+    int::{I32, U32},
+    look::LookSet,
+    primitives::{PatternID, StateID},
+    wire::{self, Endian},
+};
+
+/// A DFA state that, at its core, is represented by an ordered set of NFA
+/// states.
+///
+/// This type is intended to be used only in NFA-to-DFA conversion via powerset
+/// construction.
+///
+/// It may be cheaply cloned and accessed safely from multiple threads
+/// simultaneously.
+#[derive(Clone, Eq, Hash, PartialEq, PartialOrd, Ord)]
+pub(crate) struct State(Arc<[u8]>);
+
+/// This Borrow impl permits us to lookup any state in a map by its byte
+/// representation. This is particularly convenient when one has a StateBuilder
+/// and we want to see if a correspondingly equivalent state already exists. If
+/// one does exist, then we can reuse the allocation required by StateBuilder
+/// without having to convert it into a State first.
+impl core::borrow::Borrow<[u8]> for State {
+    fn borrow(&self) -> &[u8] {
+        &*self.0
+    }
+}
+
+impl core::fmt::Debug for State {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        f.debug_tuple("State").field(&self.repr()).finish()
+    }
+}
+
+/// For docs on these routines, see the internal Repr and ReprVec types below.
+impl State {
+    pub(crate) fn dead() -> State {
+        StateBuilderEmpty::new().into_matches().into_nfa().to_state()
+    }
+
+    pub(crate) fn is_match(&self) -> bool {
+        self.repr().is_match()
+    }
+
+    pub(crate) fn is_from_word(&self) -> bool {
+        self.repr().is_from_word()
+    }
+
+    pub(crate) fn is_half_crlf(&self) -> bool {
+        self.repr().is_half_crlf()
+    }
+
+    pub(crate) fn look_have(&self) -> LookSet {
+        self.repr().look_have()
+    }
+
+    pub(crate) fn look_need(&self) -> LookSet {
+        self.repr().look_need()
+    }
+
+    pub(crate) fn match_len(&self) -> usize {
+        self.repr().match_len()
+    }
+
+    pub(crate) fn match_pattern(&self, index: usize) -> PatternID {
+        self.repr().match_pattern(index)
+    }
+
+    pub(crate) fn match_pattern_ids(&self) -> Option<Vec<PatternID>> {
+        self.repr().match_pattern_ids()
+    }
+
+    #[cfg(all(test, not(miri)))]
+    pub(crate) fn iter_match_pattern_ids<F: FnMut(PatternID)>(&self, f: F) {
+        self.repr().iter_match_pattern_ids(f)
+    }
+
+    pub(crate) fn iter_nfa_state_ids<F: FnMut(StateID)>(&self, f: F) {
+        self.repr().iter_nfa_state_ids(f)
+    }
+
+    pub(crate) fn memory_usage(&self) -> usize {
+        self.0.len()
+    }
+
+    fn repr(&self) -> Repr<'_> {
+        Repr(&*self.0)
+    }
+}
+
+/// A state builder that represents an empty state.
+///
+/// This is a useful "initial condition" for state construction. It has no
+/// NFA state IDs, no assertions set and no pattern IDs. No allocations are
+/// made when new() is called. Its main use is for being converted into a
+/// builder that can capture assertions and pattern IDs.
+#[derive(Clone, Debug)]
+pub(crate) struct StateBuilderEmpty(Vec<u8>);
+
+/// For docs on these routines, see the internal Repr and ReprVec types below.
+impl StateBuilderEmpty {
+    pub(crate) fn new() -> StateBuilderEmpty {
+        StateBuilderEmpty(alloc::vec![])
+    }
+
+    pub(crate) fn into_matches(mut self) -> StateBuilderMatches {
+        self.0.extend_from_slice(&[0, 0, 0, 0, 0]);
+        StateBuilderMatches(self.0)
+    }
+
+    fn clear(&mut self) {
+        self.0.clear();
+    }
+
+    pub(crate) fn capacity(&self) -> usize {
+        self.0.capacity()
+    }
+}
+
+/// A state builder that collects assertions and pattern IDs.
+///
+/// When collecting pattern IDs is finished, this can be converted into a
+/// builder that collects NFA state IDs.
+#[derive(Clone)]
+pub(crate) struct StateBuilderMatches(Vec<u8>);
+
+impl core::fmt::Debug for StateBuilderMatches {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        f.debug_tuple("StateBuilderMatches").field(&self.repr()).finish()
+    }
+}
+
+/// For docs on these routines, see the internal Repr and ReprVec types below.
+impl StateBuilderMatches {
+    pub(crate) fn into_nfa(mut self) -> StateBuilderNFA {
+        self.repr_vec().close_match_pattern_ids();
+        StateBuilderNFA { repr: self.0, prev_nfa_state_id: StateID::ZERO }
+    }
+
+    pub(crate) fn set_is_from_word(&mut self) {
+        self.repr_vec().set_is_from_word()
+    }
+
+    pub(crate) fn set_is_half_crlf(&mut self) {
+        self.repr_vec().set_is_half_crlf()
+    }
+
+    pub(crate) fn look_have(&self) -> LookSet {
+        LookSet::read_repr(&self.0[1..])
+    }
+
+    pub(crate) fn set_look_have(
+        &mut self,
+        set: impl FnMut(LookSet) -> LookSet,
+    ) {
+        self.repr_vec().set_look_have(set)
+    }
+
+    pub(crate) fn add_match_pattern_id(&mut self, pid: PatternID) {
+        self.repr_vec().add_match_pattern_id(pid)
+    }
+
+    fn repr(&self) -> Repr<'_> {
+        Repr(&self.0)
+    }
+
+    fn repr_vec(&mut self) -> ReprVec<'_> {
+        ReprVec(&mut self.0)
+    }
+}
+
+/// A state builder that collects some assertions and NFA state IDs.
+///
+/// When collecting NFA state IDs is finished, this can be used to build a
+/// `State` if necessary.
+///
+/// When dont with building a state (regardless of whether it got kept or not),
+/// it's usually a good idea to call `clear` to get an empty builder back so
+/// that it can be reused to build the next state.
+#[derive(Clone)]
+pub(crate) struct StateBuilderNFA {
+    repr: Vec<u8>,
+    prev_nfa_state_id: StateID,
+}
+
+impl core::fmt::Debug for StateBuilderNFA {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        f.debug_tuple("StateBuilderNFA").field(&self.repr()).finish()
+    }
+}
+
+/// For docs on these routines, see the internal Repr and ReprVec types below.
+impl StateBuilderNFA {
+    pub(crate) fn to_state(&self) -> State {
+        State(Arc::from(&*self.repr))
+    }
+
+    pub(crate) fn clear(self) -> StateBuilderEmpty {
+        let mut builder = StateBuilderEmpty(self.repr);
+        builder.clear();
+        builder
+    }
+
+    pub(crate) fn look_need(&self) -> LookSet {
+        self.repr().look_need()
+    }
+
+    pub(crate) fn set_look_have(
+        &mut self,
+        set: impl FnMut(LookSet) -> LookSet,
+    ) {
+        self.repr_vec().set_look_have(set)
+    }
+
+    pub(crate) fn set_look_need(
+        &mut self,
+        set: impl FnMut(LookSet) -> LookSet,
+    ) {
+        self.repr_vec().set_look_need(set)
+    }
+
+    pub(crate) fn add_nfa_state_id(&mut self, sid: StateID) {
+        ReprVec(&mut self.repr)
+            .add_nfa_state_id(&mut self.prev_nfa_state_id, sid)
+    }
+
+    pub(crate) fn as_bytes(&self) -> &[u8] {
+        &self.repr
+    }
+
+    fn repr(&self) -> Repr<'_> {
+        Repr(&self.repr)
+    }
+
+    fn repr_vec(&mut self) -> ReprVec<'_> {
+        ReprVec(&mut self.repr)
+    }
+}
+
+/// Repr is a read-only view into the representation of a DFA state.
+///
+/// Primarily, a Repr is how we achieve DRY: we implement decoding the format
+/// in one place, and then use a Repr to implement the various methods on the
+/// public state types.
+///
+/// The format is as follows:
+///
+/// The first three bytes correspond to bitsets.
+///
+/// Byte 0 is a bitset corresponding to miscellaneous flags associated with the
+/// state. Bit 0 is set to 1 if the state is a match state. Bit 1 is set to 1
+/// if the state has pattern IDs explicitly written to it. (This is a flag that
+/// is not meant to be set by determinization, but rather, is used as part of
+/// an internal space-saving optimization.) Bit 2 is set to 1 if the state was
+/// generated by a transition over a "word" byte. (Callers may not always set
+/// this. For example, if the NFA has no word boundary assertion, then needing
+/// to track whether a state came from a word byte or not is superfluous and
+/// wasteful.)
+///
+/// Byte 1 corresponds to the look-behind assertions that were satisfied by
+/// the transition that created this state. This generally only includes the
+/// StartLF and Start assertions. (Look-ahead assertions are not tracked as
+/// part of states. Instead, these are applied by re-computing the epsilon
+/// closure of a state when computing the transition function. See `next` in
+/// the parent module.)
+///
+/// Byte 2 corresponds to the set of look-around assertions (including both
+/// look-behind and look-ahead) that appear somewhere in this state's set of
+/// NFA state IDs. This is used to determine whether this state's epsilon
+/// closure should be re-computed when computing the transition function.
+/// Namely, look-around assertions are "just" conditional epsilon transitions,
+/// so if there are new assertions available when computing the transition
+/// function, we should only re-compute the epsilon closure if those new
+/// assertions are relevant to this particular state.
+///
+/// Bytes 3..7 correspond to a 32-bit native-endian encoded integer
+/// corresponding to the number of patterns encoded in this state. If the state
+/// is not a match state (byte 0 bit 0 is 0) or if it's only pattern ID is
+/// PatternID::ZERO, then no integer is encoded at this position. Instead, byte
+/// offset 3 is the position at which the first NFA state ID is encoded.
+///
+/// For a match state with at least one non-ZERO pattern ID, the next bytes
+/// correspond to a sequence of 32-bit native endian encoded integers that
+/// represent each pattern ID, in order, that this match state represents.
+///
+/// After the pattern IDs (if any), NFA state IDs are delta encoded as
+/// varints.[1] The first NFA state ID is encoded as itself, and each
+/// subsequent NFA state ID is encoded as the difference between itself and the
+/// previous NFA state ID.
+///
+/// [1] - https://developers.google.com/protocol-buffers/docs/encoding#varints
+struct Repr<'a>(&'a [u8]);
+
+impl<'a> Repr<'a> {
+    /// Returns true if and only if this is a match state.
+    ///
+    /// If callers have added pattern IDs to this state, then callers MUST set
+    /// this state as a match state explicitly. However, as a special case,
+    /// states that are marked as match states but with no pattern IDs, then
+    /// the state is treated as if it had a single pattern ID equivalent to
+    /// PatternID::ZERO.
+    fn is_match(&self) -> bool {
+        self.0[0] & (1 << 0) > 0
+    }
+
+    /// Returns true if and only if this state has had at least one pattern
+    /// ID added to it.
+    ///
+    /// This is an internal-only flag that permits the representation to save
+    /// space in the common case of an NFA with one pattern in it. In that
+    /// case, a match state can only ever have exactly one pattern ID:
+    /// PatternID::ZERO. So there's no need to represent it.
+    fn has_pattern_ids(&self) -> bool {
+        self.0[0] & (1 << 1) > 0
+    }
+
+    /// Returns true if and only if this state is marked as having been created
+    /// from a transition over a word byte. This is useful for checking whether
+    /// a word boundary assertion is true or not, which requires look-behind
+    /// (whether the current state came from a word byte or not) and look-ahead
+    /// (whether the transition byte is a word byte or not).
+    ///
+    /// Since states with this set are distinct from states that don't have
+    /// this set (even if they are otherwise equivalent), callers should not
+    /// set this assertion unless the underlying NFA has at least one word
+    /// boundary assertion somewhere. Otherwise, a superfluous number of states
+    /// may be created.
+    fn is_from_word(&self) -> bool {
+        self.0[0] & (1 << 2) > 0
+    }
+
+    /// Returns true if and only if this state is marked as being inside of a
+    /// CRLF terminator. In the forward direction, this means the state was
+    /// created after seeing a `\r`. In the reverse direction, this means the
+    /// state was created after seeing a `\n`.
+    fn is_half_crlf(&self) -> bool {
+        self.0[0] & (1 << 3) > 0
+    }
+
+    /// The set of look-behind assertions that were true in the transition that
+    /// created this state.
+    ///
+    /// Generally, this should be empty if 'look_need' is empty, since there is
+    /// no reason to track which look-behind assertions are true if the state
+    /// has no conditional epsilon transitions.
+    ///
+    /// Satisfied look-ahead assertions are not tracked in states. Instead,
+    /// these are re-computed on demand via epsilon closure when computing the
+    /// transition function.
+    fn look_have(&self) -> LookSet {
+        LookSet::read_repr(&self.0[1..])
+    }
+
+    /// The set of look-around (both behind and ahead) assertions that appear
+    /// at least once in this state's set of NFA states.
+    ///
+    /// This is used to determine whether the epsilon closure needs to be
+    /// re-computed when computing the transition function. Namely, if the
+    /// state has no conditional epsilon transitions, then there is no need
+    /// to re-compute the epsilon closure.
+    fn look_need(&self) -> LookSet {
+        LookSet::read_repr(&self.0[3..])
+    }
+
+    /// Returns the total number of match pattern IDs in this state.
+    ///
+    /// If this state is not a match state, then this always returns 0.
+    fn match_len(&self) -> usize {
+        if !self.is_match() {
+            return 0;
+        } else if !self.has_pattern_ids() {
+            1
+        } else {
+            self.encoded_pattern_len()
+        }
+    }
+
+    /// Returns the pattern ID for this match state at the given index.
+    ///
+    /// If the given index is greater than or equal to `match_len()` for this
+    /// state, then this could panic or return incorrect results.
+    fn match_pattern(&self, index: usize) -> PatternID {
+        if !self.has_pattern_ids() {
+            PatternID::ZERO
+        } else {
+            let offset = 9 + index * PatternID::SIZE;
+            // This is OK since we only ever serialize valid PatternIDs to
+            // states.
+            wire::read_pattern_id_unchecked(&self.0[offset..]).0
+        }
+    }
+
+    /// Returns a copy of all match pattern IDs in this state. If this state
+    /// is not a match state, then this returns None.
+    fn match_pattern_ids(&self) -> Option<Vec<PatternID>> {
+        if !self.is_match() {
+            return None;
+        }
+        let mut pids = alloc::vec![];
+        self.iter_match_pattern_ids(|pid| pids.push(pid));
+        Some(pids)
+    }
+
+    /// Calls the given function on every pattern ID in this state.
+    fn iter_match_pattern_ids<F: FnMut(PatternID)>(&self, mut f: F) {
+        if !self.is_match() {
+            return;
+        }
+        // As an optimization for a very common case, when this is a match
+        // state for an NFA with only one pattern, we don't actually write the
+        // pattern ID to the state representation. Instead, we know it must
+        // be there since it is the only possible choice.
+        if !self.has_pattern_ids() {
+            f(PatternID::ZERO);
+            return;
+        }
+        let mut pids = &self.0[9..self.pattern_offset_end()];
+        while !pids.is_empty() {
+            let pid = wire::read_u32(pids);
+            pids = &pids[PatternID::SIZE..];
+            // This is OK since we only ever serialize valid PatternIDs to
+            // states. And since pattern IDs can never exceed a usize, the
+            // unwrap is OK.
+            f(PatternID::new_unchecked(usize::try_from(pid).unwrap()));
+        }
+    }
+
+    /// Calls the given function on every NFA state ID in this state.
+    fn iter_nfa_state_ids<F: FnMut(StateID)>(&self, mut f: F) {
+        let mut sids = &self.0[self.pattern_offset_end()..];
+        let mut prev = 0i32;
+        while !sids.is_empty() {
+            let (delta, nr) = read_vari32(sids);
+            sids = &sids[nr..];
+            let sid = prev + delta;
+            prev = sid;
+            // This is OK since we only ever serialize valid StateIDs to
+            // states. And since state IDs can never exceed an isize, they must
+            // always be able to fit into a usize, and thus cast is OK.
+            f(StateID::new_unchecked(sid.as_usize()))
+        }
+    }
+
+    /// Returns the offset into this state's representation where the pattern
+    /// IDs end and the NFA state IDs begin.
+    fn pattern_offset_end(&self) -> usize {
+        let encoded = self.encoded_pattern_len();
+        if encoded == 0 {
+            return 5;
+        }
+        // This arithmetic is OK since we were able to address this many bytes
+        // when writing to the state, thus, it must fit into a usize.
+        encoded.checked_mul(4).unwrap().checked_add(9).unwrap()
+    }
+
+    /// Returns the total number of *encoded* pattern IDs in this state.
+    ///
+    /// This may return 0 even when this is a match state, since the pattern
+    /// ID `PatternID::ZERO` is not encoded when it's the only pattern ID in
+    /// the match state (the overwhelming common case).
+    fn encoded_pattern_len(&self) -> usize {
+        if !self.has_pattern_ids() {
+            return 0;
+        }
+        // This unwrap is OK since the total number of patterns is always
+        // guaranteed to fit into a usize.
+        usize::try_from(wire::read_u32(&self.0[5..9])).unwrap()
+    }
+}
+
+impl<'a> core::fmt::Debug for Repr<'a> {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        let mut nfa_ids = alloc::vec![];
+        self.iter_nfa_state_ids(|sid| nfa_ids.push(sid));
+        f.debug_struct("Repr")
+            .field("is_match", &self.is_match())
+            .field("is_from_word", &self.is_from_word())
+            .field("is_half_crlf", &self.is_half_crlf())
+            .field("look_have", &self.look_have())
+            .field("look_need", &self.look_need())
+            .field("match_pattern_ids", &self.match_pattern_ids())
+            .field("nfa_state_ids", &nfa_ids)
+            .finish()
+    }
+}
+
+/// ReprVec is a write-only view into the representation of a DFA state.
+///
+/// See Repr for more details on the purpose of this type and also the format.
+///
+/// Note that not all possible combinations of methods may be called. This is
+/// precisely what the various StateBuilder types encapsulate: they only
+/// permit valid combinations via Rust's linear typing.
+struct ReprVec<'a>(&'a mut Vec<u8>);
+
+impl<'a> ReprVec<'a> {
+    /// Set this state as a match state.
+    ///
+    /// This should not be exposed explicitly outside of this module. It is
+    /// set automatically when a pattern ID is added.
+    fn set_is_match(&mut self) {
+        self.0[0] |= 1 << 0;
+    }
+
+    /// Set that this state has pattern IDs explicitly written to it.
+    ///
+    /// This should not be exposed explicitly outside of this module. This is
+    /// used internally as a space saving optimization. Namely, if the state
+    /// is a match state but does not have any pattern IDs written to it,
+    /// then it is automatically inferred to have a pattern ID of ZERO.
+    fn set_has_pattern_ids(&mut self) {
+        self.0[0] |= 1 << 1;
+    }
+
+    /// Set this state as being built from a transition over a word byte.
+    ///
+    /// Setting this is only necessary when one needs to deal with word
+    /// boundary assertions. Therefore, if the underlying NFA has no word
+    /// boundary assertions, callers should not set this.
+    fn set_is_from_word(&mut self) {
+        self.0[0] |= 1 << 2;
+    }
+
+    /// Set this state as having seen half of a CRLF terminator.
+    ///
+    /// In the forward direction, this should be set when a `\r` has been seen.
+    /// In the reverse direction, this should be set when a `\n` has been seen.
+    fn set_is_half_crlf(&mut self) {
+        self.0[0] |= 1 << 3;
+    }
+
+    /// The set of look-behind assertions that were true in the transition that
+    /// created this state.
+    fn look_have(&self) -> LookSet {
+        self.repr().look_have()
+    }
+
+    /// The set of look-around (both behind and ahead) assertions that appear
+    /// at least once in this state's set of NFA states.
+    fn look_need(&self) -> LookSet {
+        self.repr().look_need()
+    }
+
+    /// Mutate the set of look-behind assertions that were true in the
+    /// transition that created this state.
+    fn set_look_have(&mut self, mut set: impl FnMut(LookSet) -> LookSet) {
+        set(self.look_have()).write_repr(&mut self.0[1..]);
+    }
+
+    /// Mutate the set of look-around (both behind and ahead) assertions that
+    /// appear at least once in this state's set of NFA states.
+    fn set_look_need(&mut self, mut set: impl FnMut(LookSet) -> LookSet) {
+        set(self.look_need()).write_repr(&mut self.0[3..]);
+    }
+
+    /// Add a pattern ID to this state. All match states must have at least
+    /// one pattern ID associated with it.
+    ///
+    /// Callers must never add duplicative pattern IDs.
+    ///
+    /// The order in which patterns are added must correspond to the order
+    /// in which patterns are reported as matches.
+    fn add_match_pattern_id(&mut self, pid: PatternID) {
+        // As a (somewhat small) space saving optimization, in the case where
+        // a matching state has exactly one pattern ID, PatternID::ZERO, we do
+        // not write either the pattern ID or the number of patterns encoded.
+        // Instead, all we do is set the 'is_match' bit on this state. Overall,
+        // this saves 8 bytes per match state for the overwhelming majority of
+        // match states.
+        //
+        // In order to know whether pattern IDs need to be explicitly read or
+        // not, we use another internal-only bit, 'has_pattern_ids', to
+        // indicate whether they have been explicitly written or not.
+        if !self.repr().has_pattern_ids() {
+            if pid == PatternID::ZERO {
+                self.set_is_match();
+                return;
+            }
+            // Make room for 'close_match_pattern_ids' to write the total
+            // number of pattern IDs written.
+            self.0.extend(core::iter::repeat(0).take(PatternID::SIZE));
+            self.set_has_pattern_ids();
+            // If this was already a match state, then the only way that's
+            // possible when the state doesn't have pattern IDs is if
+            // PatternID::ZERO was added by the caller previously. In this
+            // case, we are now adding a non-ZERO pattern ID after it, in
+            // which case, we want to make sure to represent ZERO explicitly
+            // now.
+            if self.repr().is_match() {
+                write_u32(self.0, 0)
+            } else {
+                // Otherwise, just make sure the 'is_match' bit is set.
+                self.set_is_match();
+            }
+        }
+        write_u32(self.0, pid.as_u32());
+    }
+
+    /// Indicate that no more pattern IDs will be added to this state.
+    ///
+    /// Once this is called, callers must not call it or 'add_match_pattern_id'
+    /// again.
+    ///
+    /// This should not be exposed explicitly outside of this module. It
+    /// should be called only when converting a StateBuilderMatches into a
+    /// StateBuilderNFA.
+    fn close_match_pattern_ids(&mut self) {
+        // If we never wrote any pattern IDs, then there's nothing to do here.
+        if !self.repr().has_pattern_ids() {
+            return;
+        }
+        let patsize = PatternID::SIZE;
+        let pattern_bytes = self.0.len() - 9;
+        // Every pattern ID uses 4 bytes, so number of bytes should be
+        // divisible by 4.
+        assert_eq!(pattern_bytes % patsize, 0);
+        // This unwrap is OK since we are guaranteed that the maximum number
+        // of possible patterns fits into a u32.
+        let count32 = u32::try_from(pattern_bytes / patsize).unwrap();
+        wire::NE::write_u32(count32, &mut self.0[5..9]);
+    }
+
+    /// Add an NFA state ID to this state. The order in which NFA states are
+    /// added matters. It is the caller's responsibility to ensure that
+    /// duplicate NFA state IDs are not added.
+    fn add_nfa_state_id(&mut self, prev: &mut StateID, sid: StateID) {
+        let delta = sid.as_i32() - prev.as_i32();
+        write_vari32(self.0, delta);
+        *prev = sid;
+    }
+
+    /// Return a read-only view of this state's representation.
+    fn repr(&self) -> Repr<'_> {
+        Repr(self.0.as_slice())
+    }
+}
+
+/// Write a signed 32-bit integer using zig-zag encoding.
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn write_vari32(data: &mut Vec<u8>, n: i32) {
+    let mut un = n.to_bits() << 1;
+    if n < 0 {
+        un = !un;
+    }
+    write_varu32(data, un)
+}
+
+/// Read a signed 32-bit integer using zig-zag encoding. Also, return the
+/// number of bytes read.
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn read_vari32(data: &[u8]) -> (i32, usize) {
+    let (un, i) = read_varu32(data);
+    let mut n = i32::from_bits(un >> 1);
+    if un & 1 != 0 {
+        n = !n;
+    }
+    (n, i)
+}
+
+/// Write an unsigned 32-bit integer as a varint. In essence, `n` is written
+/// as a sequence of bytes where all bytes except for the last one have the
+/// most significant bit set. The least significant 7 bits correspond to the
+/// actual bits of `n`. So in the worst case, a varint uses 5 bytes, but in
+/// very common cases, it uses fewer than 4.
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn write_varu32(data: &mut Vec<u8>, mut n: u32) {
+    while n >= 0b1000_0000 {
+        data.push(n.low_u8() | 0b1000_0000);
+        n >>= 7;
+    }
+    data.push(n.low_u8());
+}
+
+/// Read an unsigned 32-bit varint. Also, return the number of bytes read.
+///
+/// https://developers.google.com/protocol-buffers/docs/encoding#varints
+fn read_varu32(data: &[u8]) -> (u32, usize) {
+    // N.B. We can assume correctness here since we know that all varuints are
+    // written with write_varu32. Hence, the 'as' uses and unchecked arithmetic
+    // is all okay.
+    let mut n: u32 = 0;
+    let mut shift: u32 = 0;
+    for (i, &b) in data.iter().enumerate() {
+        if b < 0b1000_0000 {
+            return (n | (u32::from(b) << shift), i + 1);
+        }
+        n |= (u32::from(b) & 0b0111_1111) << shift;
+        shift += 7;
+    }
+    (0, 0)
+}
+
+/// Push a native-endian encoded `n` on to `dst`.
+fn write_u32(dst: &mut Vec<u8>, n: u32) {
+    use crate::util::wire::NE;
+
+    let start = dst.len();
+    dst.extend(core::iter::repeat(0).take(mem::size_of::<u32>()));
+    NE::write_u32(n, &mut dst[start..]);
+}
+
+#[cfg(test)]
+mod tests {
+    use alloc::vec;
+
+    use quickcheck::quickcheck;
+
+    use super::*;
+
+    #[cfg(not(miri))]
+    quickcheck! {
+        fn prop_state_read_write_nfa_state_ids(sids: Vec<StateID>) -> bool {
+            // Builders states do not permit duplicate IDs.
+            let sids = dedup_state_ids(sids);
+
+            let mut b = StateBuilderEmpty::new().into_matches().into_nfa();
+            for &sid in &sids {
+                b.add_nfa_state_id(sid);
+            }
+            let s = b.to_state();
+            let mut got = vec![];
+            s.iter_nfa_state_ids(|sid| got.push(sid));
+            got == sids
+        }
+
+        fn prop_state_read_write_pattern_ids(pids: Vec<PatternID>) -> bool {
+            // Builders states do not permit duplicate IDs.
+            let pids = dedup_pattern_ids(pids);
+
+            let mut b = StateBuilderEmpty::new().into_matches();
+            for &pid in &pids {
+                b.add_match_pattern_id(pid);
+            }
+            let s = b.into_nfa().to_state();
+            let mut got = vec![];
+            s.iter_match_pattern_ids(|pid| got.push(pid));
+            got == pids
+        }
+
+        fn prop_state_read_write_nfa_state_and_pattern_ids(
+            sids: Vec<StateID>,
+            pids: Vec<PatternID>
+        ) -> bool {
+            // Builders states do not permit duplicate IDs.
+            let sids = dedup_state_ids(sids);
+            let pids = dedup_pattern_ids(pids);
+
+            let mut b = StateBuilderEmpty::new().into_matches();
+            for &pid in &pids {
+                b.add_match_pattern_id(pid);
+            }
+
+            let mut b = b.into_nfa();
+            for &sid in &sids {
+                b.add_nfa_state_id(sid);
+            }
+
+            let s = b.to_state();
+            let mut got_pids = vec![];
+            s.iter_match_pattern_ids(|pid| got_pids.push(pid));
+            let mut got_sids = vec![];
+            s.iter_nfa_state_ids(|sid| got_sids.push(sid));
+            got_pids == pids && got_sids == sids
+        }
+    }
+
+    quickcheck! {
+        fn prop_read_write_varu32(n: u32) -> bool {
+            let mut buf = vec![];
+            write_varu32(&mut buf, n);
+            let (got, nread) = read_varu32(&buf);
+            nread == buf.len() && got == n
+        }
+
+        fn prop_read_write_vari32(n: i32) -> bool {
+            let mut buf = vec![];
+            write_vari32(&mut buf, n);
+            let (got, nread) = read_vari32(&buf);
+            nread == buf.len() && got == n
+        }
+    }
+
+    #[cfg(not(miri))]
+    fn dedup_state_ids(sids: Vec<StateID>) -> Vec<StateID> {
+        let mut set = alloc::collections::BTreeSet::new();
+        let mut deduped = vec![];
+        for sid in sids {
+            if set.contains(&sid) {
+                continue;
+            }
+            set.insert(sid);
+            deduped.push(sid);
+        }
+        deduped
+    }
+
+    #[cfg(not(miri))]
+    fn dedup_pattern_ids(pids: Vec<PatternID>) -> Vec<PatternID> {
+        let mut set = alloc::collections::BTreeSet::new();
+        let mut deduped = vec![];
+        for pid in pids {
+            if set.contains(&pid) {
+                continue;
+            }
+            set.insert(pid);
+            deduped.push(pid);
+        }
+        deduped
+    }
+}
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-19 00:47:55 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-19 00:47:55 +0000
commit	26a029d407be480d791972afb5975cf62c9360a6 (patch)
tree	f435a8308119effd964b339f76abb83a57c29483 /third_party/rust/regex-automata/src/util/determinize
parent	Initial commit. (diff)
download	firefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz firefox-26a029d407be480d791972afb5975cf62c9360a6.zip