summaryrefslogtreecommitdiffstats
path: root/third_party/rust/regex-automata/src/dfa/remapper.rs
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/rust/regex-automata/src/dfa/remapper.rs')
-rw-r--r--third_party/rust/regex-automata/src/dfa/remapper.rs242
1 files changed, 242 insertions, 0 deletions
diff --git a/third_party/rust/regex-automata/src/dfa/remapper.rs b/third_party/rust/regex-automata/src/dfa/remapper.rs
new file mode 100644
index 0000000000..6e49646721
--- /dev/null
+++ b/third_party/rust/regex-automata/src/dfa/remapper.rs
@@ -0,0 +1,242 @@
+use alloc::vec::Vec;
+
+use crate::util::primitives::StateID;
+
+/// Remappable is a tightly coupled abstraction that facilitates remapping
+/// state identifiers in DFAs.
+///
+/// The main idea behind remapping state IDs is that DFAs often need to check
+/// if a certain state is a "special" state of some kind (like a match state)
+/// during a search. Since this is extremely perf critical code, we want this
+/// check to be as fast as possible. Partitioning state IDs into, for example,
+/// into "non-match" and "match" states means one can tell if a state is a
+/// match state via a simple comparison of the state ID.
+///
+/// The issue is that during the DFA construction process, it's not
+/// particularly easy to partition the states. Instead, the simplest thing is
+/// to often just do a pass over all of the states and shuffle them into their
+/// desired partitionings. To do that, we need a mechanism for swapping states.
+/// Hence, this abstraction.
+///
+/// Normally, for such little code, I would just duplicate it. But this is a
+/// key optimization and the implementation is a bit subtle. So the abstraction
+/// is basically a ham-fisted attempt at DRY. The only place we use this is in
+/// the dense and one-pass DFAs.
+///
+/// See also src/dfa/special.rs for a more detailed explanation of how dense
+/// DFAs are partitioned.
+pub(super) trait Remappable: core::fmt::Debug {
+ /// Return the total number of states.
+ fn state_len(&self) -> usize;
+ /// Return the power-of-2 exponent that yields the stride. The pertinent
+ /// laws here are, where N=stride2: 2^N=stride and len(alphabet) <= stride.
+ fn stride2(&self) -> usize;
+ /// Swap the states pointed to by the given IDs. The underlying finite
+ /// state machine should be mutated such that all of the transitions in
+ /// `id1` are now in the memory region where the transitions for `id2`
+ /// were, and all of the transitions in `id2` are now in the memory region
+ /// where the transitions for `id1` were.
+ ///
+ /// Essentially, this "moves" `id1` to `id2` and `id2` to `id1`.
+ ///
+ /// It is expected that, after calling this, the underlying value will be
+ /// left in an inconsistent state, since any other transitions pointing to,
+ /// e.g., `id1` need to be updated to point to `id2`, since that's where
+ /// `id1` moved to.
+ ///
+ /// In order to "fix" the underlying inconsistent state, a `Remapper`
+ /// should be used to guarantee that `remap` is called at the appropriate
+ /// time.
+ fn swap_states(&mut self, id1: StateID, id2: StateID);
+ /// This must remap every single state ID in the underlying value according
+ /// to the function given. For example, in a DFA, this should remap every
+ /// transition and every starting state ID.
+ fn remap(&mut self, map: impl Fn(StateID) -> StateID);
+}
+
+/// Remapper is an abstraction the manages the remapping of state IDs in a
+/// finite state machine. This is useful when one wants to shuffle states into
+/// different positions in the machine.
+///
+/// One of the key complexities this manages is the ability to correctly move
+/// one state multiple times.
+///
+/// Once shuffling is complete, `remap` must be called, which will rewrite
+/// all pertinent transitions to updated state IDs. Neglecting to call `remap`
+/// will almost certainly result in a corrupt machine.
+#[derive(Debug)]
+pub(super) struct Remapper {
+ /// A map from the index of a state to its pre-multiplied identifier.
+ ///
+ /// When a state is swapped with another, then their corresponding
+ /// locations in this map are also swapped. Thus, its new position will
+ /// still point to its old pre-multiplied StateID.
+ ///
+ /// While there is a bit more to it, this then allows us to rewrite the
+ /// state IDs in a DFA's transition table in a single pass. This is done
+ /// by iterating over every ID in this map, then iterating over each
+ /// transition for the state at that ID and re-mapping the transition from
+ /// `old_id` to `map[dfa.to_index(old_id)]`. That is, we find the position
+ /// in this map where `old_id` *started*, and set it to where it ended up
+ /// after all swaps have been completed.
+ map: Vec<StateID>,
+ /// A mapper from state index to state ID (and back).
+ idxmap: IndexMapper,
+}
+
+impl Remapper {
+ /// Create a new remapper from the given remappable implementation. The
+ /// remapper can then be used to swap states. The remappable value given
+ /// here must the same one given to `swap` and `remap`.
+ pub(super) fn new(r: &impl Remappable) -> Remapper {
+ let idxmap = IndexMapper { stride2: r.stride2() };
+ let map = (0..r.state_len()).map(|i| idxmap.to_state_id(i)).collect();
+ Remapper { map, idxmap }
+ }
+
+ /// Swap two states. Once this is called, callers must follow through to
+ /// call `remap`, or else it's possible for the underlying remappable
+ /// value to be in a corrupt state.
+ pub(super) fn swap(
+ &mut self,
+ r: &mut impl Remappable,
+ id1: StateID,
+ id2: StateID,
+ ) {
+ if id1 == id2 {
+ return;
+ }
+ r.swap_states(id1, id2);
+ self.map.swap(self.idxmap.to_index(id1), self.idxmap.to_index(id2));
+ }
+
+ /// Complete the remapping process by rewriting all state IDs in the
+ /// remappable value according to the swaps performed.
+ pub(super) fn remap(mut self, r: &mut impl Remappable) {
+ // Update the map to account for states that have been swapped
+ // multiple times. For example, if (A, C) and (C, G) are swapped, then
+ // transitions previously pointing to A should now point to G. But if
+ // we don't update our map, they will erroneously be set to C. All we
+ // do is follow the swaps in our map until we see our original state
+ // ID.
+ //
+ // The intuition here is to think about how changes are made to the
+ // map: only through pairwise swaps. That means that starting at any
+ // given state, it is always possible to find the loop back to that
+ // state by following the swaps represented in the map (which might be
+ // 0 swaps).
+ //
+ // We are also careful to clone the map before starting in order to
+ // freeze it. We use the frozen map to find our loops, since we need to
+ // update our map as well. Without freezing it, our updates could break
+ // the loops referenced above and produce incorrect results.
+ let oldmap = self.map.clone();
+ for i in 0..r.state_len() {
+ let cur_id = self.idxmap.to_state_id(i);
+ let mut new_id = oldmap[i];
+ if cur_id == new_id {
+ continue;
+ }
+ loop {
+ let id = oldmap[self.idxmap.to_index(new_id)];
+ if cur_id == id {
+ self.map[i] = new_id;
+ break;
+ }
+ new_id = id;
+ }
+ }
+ r.remap(|next| self.map[self.idxmap.to_index(next)]);
+ }
+}
+
+/// A simple type for mapping between state indices and state IDs.
+///
+/// The reason why this exists is because state IDs are "premultiplied." That
+/// is, in order to get to the transitions for a particular state, one need
+/// only use the state ID as-is, instead of having to multiple it by transition
+/// table's stride.
+///
+/// The downside of this is that it's inconvenient to map between state IDs
+/// using a dense map, e.g., Vec<StateID>. That's because state IDs look like
+/// `0`, `0+stride`, `0+2*stride`, `0+3*stride`, etc., instead of `0`, `1`,
+/// `2`, `3`, etc.
+///
+/// Since our state IDs are premultiplied, we can convert back-and-forth
+/// between IDs and indices by simply unmultiplying the IDs and multiplying the
+/// indices.
+#[derive(Debug)]
+struct IndexMapper {
+ /// The power of 2 corresponding to the stride of the corresponding
+ /// transition table. 'id >> stride2' de-multiplies an ID while 'index <<
+ /// stride2' pre-multiplies an index to an ID.
+ stride2: usize,
+}
+
+impl IndexMapper {
+ /// Convert a state ID to a state index.
+ fn to_index(&self, id: StateID) -> usize {
+ id.as_usize() >> self.stride2
+ }
+
+ /// Convert a state index to a state ID.
+ fn to_state_id(&self, index: usize) -> StateID {
+ // CORRECTNESS: If the given index is not valid, then it is not
+ // required for this to panic or return a valid state ID. We'll "just"
+ // wind up with panics or silent logic errors at some other point.
+ StateID::new_unchecked(index << self.stride2)
+ }
+}
+
+#[cfg(feature = "dfa-build")]
+mod dense {
+ use crate::{dfa::dense::OwnedDFA, util::primitives::StateID};
+
+ use super::Remappable;
+
+ impl Remappable for OwnedDFA {
+ fn state_len(&self) -> usize {
+ OwnedDFA::state_len(self)
+ }
+
+ fn stride2(&self) -> usize {
+ OwnedDFA::stride2(self)
+ }
+
+ fn swap_states(&mut self, id1: StateID, id2: StateID) {
+ OwnedDFA::swap_states(self, id1, id2)
+ }
+
+ fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
+ OwnedDFA::remap(self, map)
+ }
+ }
+}
+
+#[cfg(feature = "dfa-onepass")]
+mod onepass {
+ use crate::{dfa::onepass::DFA, util::primitives::StateID};
+
+ use super::Remappable;
+
+ impl Remappable for DFA {
+ fn state_len(&self) -> usize {
+ DFA::state_len(self)
+ }
+
+ fn stride2(&self) -> usize {
+ // We don't do pre-multiplication for the one-pass DFA, so
+ // returning 0 has the effect of making state IDs and state indices
+ // equivalent.
+ 0
+ }
+
+ fn swap_states(&mut self, id1: StateID, id2: StateID) {
+ DFA::swap_states(self, id1, id2)
+ }
+
+ fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
+ DFA::remap(self, map)
+ }
+ }
+}