summaryrefslogtreecommitdiffstats
path: root/third_party/rust/regex-automata/src/nfa/thompson/literal_trie.rs
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/rust/regex-automata/src/nfa/thompson/literal_trie.rs')
-rw-r--r--third_party/rust/regex-automata/src/nfa/thompson/literal_trie.rs528
1 files changed, 528 insertions, 0 deletions
diff --git a/third_party/rust/regex-automata/src/nfa/thompson/literal_trie.rs b/third_party/rust/regex-automata/src/nfa/thompson/literal_trie.rs
new file mode 100644
index 0000000000..7ed129afd1
--- /dev/null
+++ b/third_party/rust/regex-automata/src/nfa/thompson/literal_trie.rs
@@ -0,0 +1,528 @@
+use core::mem;
+
+use alloc::{vec, vec::Vec};
+
+use crate::{
+ nfa::thompson::{self, compiler::ThompsonRef, BuildError, Builder},
+ util::primitives::{IteratorIndexExt, StateID},
+};
+
+/// A trie that preserves leftmost-first match semantics.
+///
+/// This is a purpose-built data structure for optimizing 'lit1|lit2|..|litN'
+/// patterns. It can *only* handle alternations of literals, which makes it
+/// somewhat restricted in its scope, but literal alternations are fairly
+/// common.
+///
+/// At a 5,000 foot level, the main idea of this trie is make an alternation of
+/// literals look more like a DFA than an NFA via epsilon removal.
+///
+/// More precisely, the main issue is in how alternations are compiled into
+/// a Thompson NFA. Namely, each alternation gets a single NFA "union" state
+/// with an epsilon transition for every branch of the alternation pointing to
+/// an NFA state corresponding to the start of that branch. The main problem
+/// with this representation is the cost of computing an epsilon closure. Once
+/// you hit the alternation's start state, it acts as a sort of "clog" that
+/// requires you to traverse all of the epsilon transitions to compute the full
+/// closure.
+///
+/// While fixing such clogs in the general case is pretty tricky without going
+/// to a DFA (or perhaps a Glushkov NFA, but that comes with other problems).
+/// But at least in the case of an alternation of literals, we can convert
+/// that to a prefix trie without too much cost. In theory, that's all you
+/// really need to do: build the trie and then compile it to a Thompson NFA.
+/// For example, if you have the pattern 'bar|baz|foo', then using a trie, it
+/// is transformed to something like 'b(a(r|z))|f'. This reduces the clog by
+/// reducing the number of epsilon transitions out of the alternation's start
+/// state from 3 to 2 (it actually gets down to 1 when you use a sparse state,
+/// which we do below). It's a small effect here, but when your alternation is
+/// huge, the savings is also huge.
+///
+/// And that is... essentially what a LiteralTrie does. But there is one
+/// hiccup. Consider a regex like 'sam|samwise'. How does a prefix trie compile
+/// that when leftmost-first semantics are used? If 'sam|samwise' was the
+/// entire regex, then you could just drop the 'samwise' branch entirely since
+/// it is impossible to match ('sam' will always take priority, and since it
+/// is a prefix of 'samwise', 'samwise' will never match). But what about the
+/// regex '\b(sam|samwise)\b'? In that case, you can't remove 'samwise' because
+/// it might match when 'sam' doesn't fall on a word boundary.
+///
+/// The main idea is that 'sam|samwise' can be translated to 'sam(?:|wise)',
+/// which is a precisely equivalent regex that also gets rid of the clog.
+///
+/// Another example is 'zapper|z|zap'. That gets translated to
+/// 'z(?:apper||ap)'.
+///
+/// We accomplish this by giving each state in the trie multiple "chunks" of
+/// transitions. Each chunk barrier represents a match. The idea is that once
+/// you know a match occurs, none of the transitions after the match can be
+/// re-ordered and mixed in with the transitions before the match. Otherwise,
+/// the match semantics could be changed.
+///
+/// See the 'State' data type for a bit more detail.
+///
+/// Future work:
+///
+/// * In theory, it would be nice to generalize the idea of removing clogs and
+/// apply it to the NFA graph itself. Then this could in theory work for
+/// case insensitive alternations of literals, or even just alternations where
+/// each branch starts with a non-epsilon transition.
+/// * Could we instead use the Aho-Corasick algorithm here? The aho-corasick
+/// crate deals with leftmost-first matches correctly, but I think this implies
+/// encoding failure transitions into a Thompson NFA somehow. Which seems fine,
+/// because failure transitions are just unconditional epsilon transitions?
+/// * Or perhaps even better, could we use an aho_corasick::AhoCorasick
+/// directly? At time of writing, 0.7 is the current version of the
+/// aho-corasick crate, and that definitely cannot be used as-is. But if we
+/// expose the underlying finite state machine API, then could we use it? That
+/// would be super. If we could figure that out, it might also lend itself to
+/// more general composition of finite state machines.
+#[derive(Clone)]
+pub(crate) struct LiteralTrie {
+ /// The set of trie states. Each state contains one or more chunks, where
+ /// each chunk is a sparse set of transitions to other states. A leaf state
+ /// is always a match state that contains only empty chunks (i.e., no
+ /// transitions).
+ states: Vec<State>,
+ /// Whether to add literals in reverse to the trie. Useful when building
+ /// a reverse NFA automaton.
+ rev: bool,
+}
+
+impl LiteralTrie {
+ /// Create a new literal trie that adds literals in the forward direction.
+ pub(crate) fn forward() -> LiteralTrie {
+ let root = State::default();
+ LiteralTrie { states: vec![root], rev: false }
+ }
+
+ /// Create a new literal trie that adds literals in reverse.
+ pub(crate) fn reverse() -> LiteralTrie {
+ let root = State::default();
+ LiteralTrie { states: vec![root], rev: true }
+ }
+
+ /// Add the given literal to this trie.
+ ///
+ /// If the literal could not be added because the `StateID` space was
+ /// exhausted, then an error is returned. If an error returns, the trie
+ /// is in an unspecified state.
+ pub(crate) fn add(&mut self, bytes: &[u8]) -> Result<(), BuildError> {
+ let mut prev = StateID::ZERO;
+ let mut it = bytes.iter().copied();
+ while let Some(b) = if self.rev { it.next_back() } else { it.next() } {
+ prev = self.get_or_add_state(prev, b)?;
+ }
+ self.states[prev].add_match();
+ Ok(())
+ }
+
+ /// If the given transition is defined, then return the next state ID.
+ /// Otherwise, add the transition to `from` and point it to a new state.
+ ///
+ /// If a new state ID could not be allocated, then an error is returned.
+ fn get_or_add_state(
+ &mut self,
+ from: StateID,
+ byte: u8,
+ ) -> Result<StateID, BuildError> {
+ let active = self.states[from].active_chunk();
+ match active.binary_search_by_key(&byte, |t| t.byte) {
+ Ok(i) => Ok(active[i].next),
+ Err(i) => {
+ // Add a new state and get its ID.
+ let next = StateID::new(self.states.len()).map_err(|_| {
+ BuildError::too_many_states(self.states.len())
+ })?;
+ self.states.push(State::default());
+ // Offset our position to account for all transitions and not
+ // just the ones in the active chunk.
+ let i = self.states[from].active_chunk_start() + i;
+ let t = Transition { byte, next };
+ self.states[from].transitions.insert(i, t);
+ Ok(next)
+ }
+ }
+ }
+
+ /// Compile this literal trie to the NFA builder given.
+ ///
+ /// This forwards any errors that may occur while using the given builder.
+ pub(crate) fn compile(
+ &self,
+ builder: &mut Builder,
+ ) -> Result<ThompsonRef, BuildError> {
+ // Compilation proceeds via depth-first traversal of the trie.
+ //
+ // This is overall pretty brutal. The recursive version of this is
+ // deliciously simple. (See 'compile_to_hir' below for what it might
+ // look like.) But recursion on a trie means your call stack grows
+ // in accordance with the longest literal, which just does not seem
+ // appropriate. So we push the call stack to the heap. But as a result,
+ // the trie traversal becomes pretty brutal because we essentially
+ // have to encode the state of a double for-loop into an explicit call
+ // frame. If someone can simplify this without using recursion, that'd
+ // be great.
+
+ // 'end' is our match state for this trie, but represented in the the
+ // NFA. Any time we see a match in the trie, we insert a transition
+ // from the current state we're in to 'end'.
+ let end = builder.add_empty()?;
+ let mut stack = vec![];
+ let mut f = Frame::new(&self.states[StateID::ZERO]);
+ loop {
+ if let Some(t) = f.transitions.next() {
+ if self.states[t.next].is_leaf() {
+ f.sparse.push(thompson::Transition {
+ start: t.byte,
+ end: t.byte,
+ next: end,
+ });
+ } else {
+ f.sparse.push(thompson::Transition {
+ start: t.byte,
+ end: t.byte,
+ // This is a little funny, but when the frame we create
+ // below completes, it will pop this parent frame off
+ // and modify this transition to point to the correct
+ // state.
+ next: StateID::ZERO,
+ });
+ stack.push(f);
+ f = Frame::new(&self.states[t.next]);
+ }
+ continue;
+ }
+ // At this point, we have visited all transitions in f.chunk, so
+ // add it as a sparse NFA state. Unless the chunk was empty, in
+ // which case, we don't do anything.
+ if !f.sparse.is_empty() {
+ let chunk_id = if f.sparse.len() == 1 {
+ builder.add_range(f.sparse.pop().unwrap())?
+ } else {
+ let sparse = mem::replace(&mut f.sparse, vec![]);
+ builder.add_sparse(sparse)?
+ };
+ f.union.push(chunk_id);
+ }
+ // Now we need to look to see if there are other chunks to visit.
+ if let Some(chunk) = f.chunks.next() {
+ // If we're here, it means we're on the second (or greater)
+ // chunk, which implies there is a match at this point. So
+ // connect this state to the final end state.
+ f.union.push(end);
+ // Advance to the next chunk.
+ f.transitions = chunk.iter();
+ continue;
+ }
+ // Now that we are out of chunks, we have completely visited
+ // this state. So turn our union of chunks into an NFA union
+ // state, and add that union state to the parent state's current
+ // sparse state. (If there is no parent, we're done.)
+ let start = builder.add_union(f.union)?;
+ match stack.pop() {
+ None => {
+ return Ok(ThompsonRef { start, end });
+ }
+ Some(mut parent) => {
+ // OK because the only way a frame gets pushed on to the
+ // stack (aside from the root) is when a transition has
+ // been added to 'sparse'.
+ parent.sparse.last_mut().unwrap().next = start;
+ f = parent;
+ }
+ }
+ }
+ }
+
+ /// Converts this trie to an equivalent HIR expression.
+ ///
+ /// We don't actually use this, but it's useful for tests. In particular,
+ /// it provides a (somewhat) human readable representation of the trie
+ /// itself.
+ #[cfg(test)]
+ fn compile_to_hir(&self) -> regex_syntax::hir::Hir {
+ self.compile_state_to_hir(StateID::ZERO)
+ }
+
+ /// The recursive implementation of 'to_hir'.
+ ///
+ /// Notice how simple this is compared to 'compile' above. 'compile' could
+ /// be similarly simple, but we opt to not use recursion in order to avoid
+ /// overflowing the stack in the case of a longer literal.
+ #[cfg(test)]
+ fn compile_state_to_hir(&self, sid: StateID) -> regex_syntax::hir::Hir {
+ use regex_syntax::hir::Hir;
+
+ let mut alt = vec![];
+ for (i, chunk) in self.states[sid].chunks().enumerate() {
+ if i > 0 {
+ alt.push(Hir::empty());
+ }
+ if chunk.is_empty() {
+ continue;
+ }
+ let mut chunk_alt = vec![];
+ for t in chunk.iter() {
+ chunk_alt.push(Hir::concat(vec![
+ Hir::literal(vec![t.byte]),
+ self.compile_state_to_hir(t.next),
+ ]));
+ }
+ alt.push(Hir::alternation(chunk_alt));
+ }
+ Hir::alternation(alt)
+ }
+}
+
+impl core::fmt::Debug for LiteralTrie {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ writeln!(f, "LiteralTrie(")?;
+ for (sid, state) in self.states.iter().with_state_ids() {
+ writeln!(f, "{:06?}: {:?}", sid.as_usize(), state)?;
+ }
+ writeln!(f, ")")?;
+ Ok(())
+ }
+}
+
+/// An explicit stack frame used for traversing the trie without using
+/// recursion.
+///
+/// Each frame is tied to the traversal of a single trie state. The frame is
+/// dropped once the entire state (and all of its children) have been visited.
+/// The "output" of compiling a state is the 'union' vector, which is turn
+/// converted to a NFA union state. Each branch of the union corresponds to a
+/// chunk in the trie state.
+///
+/// 'sparse' corresponds to the set of transitions for a particular chunk in a
+/// trie state. It is ultimately converted to an NFA sparse state. The 'sparse'
+/// field, after being converted to a sparse NFA state, is reused for any
+/// subsequent chunks in the trie state, if any exist.
+#[derive(Debug)]
+struct Frame<'a> {
+ /// The remaining chunks to visit for a trie state.
+ chunks: StateChunksIter<'a>,
+ /// The transitions of the current chunk that we're iterating over. Since
+ /// every trie state has at least one chunk, every frame is initialized
+ /// with the first chunk's transitions ready to be consumed.
+ transitions: core::slice::Iter<'a, Transition>,
+ /// The NFA state IDs pointing to the start of each chunk compiled by
+ /// this trie state. This ultimately gets converted to an NFA union once
+ /// the entire trie state (and all of its children) have been compiled.
+ /// The order of these matters for leftmost-first match semantics, since
+ /// earlier matches in the union are preferred over later ones.
+ union: Vec<StateID>,
+ /// The actual NFA transitions for a single chunk in a trie state. This
+ /// gets converted to an NFA sparse state, and its corresponding NFA state
+ /// ID should get added to 'union'.
+ sparse: Vec<thompson::Transition>,
+}
+
+impl<'a> Frame<'a> {
+ /// Create a new stack frame for trie traversal. This initializes the
+ /// 'transitions' iterator to the transitions for the first chunk, with the
+ /// 'chunks' iterator being every chunk after the first one.
+ fn new(state: &'a State) -> Frame<'a> {
+ let mut chunks = state.chunks();
+ // every state has at least 1 chunk
+ let chunk = chunks.next().unwrap();
+ let transitions = chunk.iter();
+ Frame { chunks, transitions, union: vec![], sparse: vec![] }
+ }
+}
+
+/// A state in a trie.
+///
+/// This uses a sparse representation. Since we don't use literal tries
+/// for searching, and ultimately (and compilation requires visiting every
+/// transition anyway), we use a sparse representation for transitions. This
+/// means we save on memory, at the expense of 'LiteralTrie::add' being perhaps
+/// a bit slower.
+///
+/// While 'transitions' is pretty standard as far as tries goes, the 'chunks'
+/// piece here is more unusual. In effect, 'chunks' defines a partitioning
+/// of 'transitions', where each chunk corresponds to a distinct set of
+/// transitions. The key invariant is that a transition in one chunk cannot
+/// be moved to another chunk. This is the secret sauce that preserve
+/// leftmost-first match semantics.
+///
+/// A new chunk is added whenever we mark a state as a match state. Once a
+/// new chunk is added, the old active chunk is frozen and is never mutated
+/// again. The new chunk becomes the active chunk, which is defined as
+/// '&transitions[chunks.last().map_or(0, |c| c.1)..]'. Thus, a state where
+/// 'chunks' is empty actually contains one chunk. Thus, every state contains
+/// at least one (possibly empty) chunk.
+///
+/// A "leaf" state is a state that has no outgoing transitions (so
+/// 'transitions' is empty). Note that there is no way for a leaf state to be a
+/// non-matching state. (Although while building the trie, within 'add', a leaf
+/// state may exist while not containing any matches. But this invariant is
+/// only broken within 'add'. Once 'add' returns, the invariant is upheld.)
+#[derive(Clone, Default)]
+struct State {
+ transitions: Vec<Transition>,
+ chunks: Vec<(usize, usize)>,
+}
+
+impl State {
+ /// Mark this state as a match state and freeze the active chunk such that
+ /// it can not be further mutated.
+ fn add_match(&mut self) {
+ // This is not strictly necessary, but there's no point in recording
+ // another match by adding another chunk if the state has no
+ // transitions. Note though that we only skip this if we already know
+ // this is a match state, which is only true if 'chunks' is not empty.
+ // Basically, if we didn't do this, nothing semantically would change,
+ // but we'd end up pushing another chunk and potentially triggering an
+ // alloc.
+ if self.transitions.is_empty() && !self.chunks.is_empty() {
+ return;
+ }
+ let chunk_start = self.active_chunk_start();
+ let chunk_end = self.transitions.len();
+ self.chunks.push((chunk_start, chunk_end));
+ }
+
+ /// Returns true if and only if this state is a leaf state. That is, a
+ /// state that has no outgoing transitions.
+ fn is_leaf(&self) -> bool {
+ self.transitions.is_empty()
+ }
+
+ /// Returns an iterator over all of the chunks (including the currently
+ /// active chunk) in this state. Since the active chunk is included, the
+ /// iterator is guaranteed to always yield at least one chunk (although the
+ /// chunk may be empty).
+ fn chunks(&self) -> StateChunksIter<'_> {
+ StateChunksIter {
+ transitions: &*self.transitions,
+ chunks: self.chunks.iter(),
+ active: Some(self.active_chunk()),
+ }
+ }
+
+ /// Returns the active chunk as a slice of transitions.
+ fn active_chunk(&self) -> &[Transition] {
+ let start = self.active_chunk_start();
+ &self.transitions[start..]
+ }
+
+ /// Returns the index into 'transitions' where the active chunk starts.
+ fn active_chunk_start(&self) -> usize {
+ self.chunks.last().map_or(0, |&(_, end)| end)
+ }
+}
+
+impl core::fmt::Debug for State {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ let mut spacing = " ";
+ for (i, chunk) in self.chunks().enumerate() {
+ if i > 0 {
+ write!(f, "{}MATCH", spacing)?;
+ }
+ spacing = "";
+ for (j, t) in chunk.iter().enumerate() {
+ spacing = " ";
+ if j == 0 && i > 0 {
+ write!(f, " ")?;
+ } else if j > 0 {
+ write!(f, ", ")?;
+ }
+ write!(f, "{:?}", t)?;
+ }
+ }
+ Ok(())
+ }
+}
+
+/// An iterator over all of the chunks in a state, including the active chunk.
+///
+/// This iterator is created by `State::chunks`. We name this iterator so that
+/// we can include it in the `Frame` type for non-recursive trie traversal.
+#[derive(Debug)]
+struct StateChunksIter<'a> {
+ transitions: &'a [Transition],
+ chunks: core::slice::Iter<'a, (usize, usize)>,
+ active: Option<&'a [Transition]>,
+}
+
+impl<'a> Iterator for StateChunksIter<'a> {
+ type Item = &'a [Transition];
+
+ fn next(&mut self) -> Option<&'a [Transition]> {
+ if let Some(&(start, end)) = self.chunks.next() {
+ return Some(&self.transitions[start..end]);
+ }
+ if let Some(chunk) = self.active.take() {
+ return Some(chunk);
+ }
+ None
+ }
+}
+
+/// A single transition in a trie to another state.
+#[derive(Clone, Copy)]
+struct Transition {
+ byte: u8,
+ next: StateID,
+}
+
+impl core::fmt::Debug for Transition {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(
+ f,
+ "{:?} => {}",
+ crate::util::escape::DebugByte(self.byte),
+ self.next.as_usize()
+ )
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use bstr::B;
+ use regex_syntax::hir::Hir;
+
+ use super::*;
+
+ #[test]
+ fn zap() {
+ let mut trie = LiteralTrie::forward();
+ trie.add(b"zapper").unwrap();
+ trie.add(b"z").unwrap();
+ trie.add(b"zap").unwrap();
+
+ let got = trie.compile_to_hir();
+ let expected = Hir::concat(vec![
+ Hir::literal(B("z")),
+ Hir::alternation(vec![
+ Hir::literal(B("apper")),
+ Hir::empty(),
+ Hir::literal(B("ap")),
+ ]),
+ ]);
+ assert_eq!(expected, got);
+ }
+
+ #[test]
+ fn maker() {
+ let mut trie = LiteralTrie::forward();
+ trie.add(b"make").unwrap();
+ trie.add(b"maple").unwrap();
+ trie.add(b"maker").unwrap();
+
+ let got = trie.compile_to_hir();
+ let expected = Hir::concat(vec![
+ Hir::literal(B("ma")),
+ Hir::alternation(vec![
+ Hir::concat(vec![
+ Hir::literal(B("ke")),
+ Hir::alternation(vec![Hir::empty(), Hir::literal(B("r"))]),
+ ]),
+ Hir::literal(B("ple")),
+ ]),
+ ]);
+ assert_eq!(expected, got);
+ }
+}