path: root/vendor/regex-automata/src/nfa/compiler.rs
author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-17 12:18:25 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-17 12:18:25 +0000
commit     5363f350887b1e5b5dd21a86f88c8af9d7fea6da (patch)
tree       35ca005eb6e0e9a1ba3bb5dbc033209ad445dc17 /vendor/regex-automata/src/nfa/compiler.rs
parent     Adding debian version 1.66.0+dfsg1-1. (diff)
Merging upstream version 1.67.1+dfsg1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'vendor/regex-automata/src/nfa/compiler.rs')
-rw-r--r--   vendor/regex-automata/src/nfa/compiler.rs   1193
1 file changed, 0 insertions, 1193 deletions
diff --git a/vendor/regex-automata/src/nfa/compiler.rs b/vendor/regex-automata/src/nfa/compiler.rs
deleted file mode 100644
index d9b3945b3..000000000
--- a/vendor/regex-automata/src/nfa/compiler.rs
+++ /dev/null
@@ -1,1193 +0,0 @@
-// This module provides an NFA compiler using Thompson's construction
-// algorithm. The compiler takes a regex-syntax::Hir as input and emits an NFA
-// graph as output. The NFA graph is structured in a way that permits it to be
-// executed by a virtual machine and also used to efficiently build a DFA.
-//
-// The compiler deals with a slightly expanded set of NFA states that notably
-// includes an empty node that has exactly one epsilon transition to the next
-// state. In other words, it's a "goto" instruction if one views Thompson's NFA
-// as a set of bytecode instructions. These goto instructions are removed in
-// a subsequent phase before returning the NFA to the caller. The purpose of
-// these empty nodes is that they make the construction algorithm substantially
-// simpler to implement. We remove them before returning to the caller because
-// they can represent substantial overhead when traversing the NFA graph
-// (either while searching using the NFA directly or while building a DFA).
-//
-// In the future, it would be nice to provide a Glushkov compiler as well,
-// as it would work well as a bit-parallel NFA for smaller regexes. But
-// the Thompson construction is one I'm more familiar with and seems more
-// straight-forward to deal with when it comes to large Unicode character
-// classes.
-//
-// Internally, the compiler uses interior mutability to improve composition
-// in the face of the borrow checker. In particular, we'd really like to be
-// able to write things like this:
-//
-// self.c_concat(exprs.iter().map(|e| self.c(e)))
-//
-// Which elegantly uses iterators to build up a sequence of compiled regex
-// sub-expressions and then hands it off to the concatenating compiler
-// routine. Without interior mutability, the borrow checker won't let us
-// borrow `self` mutably both inside and outside the closure at the same
-// time.
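-//
-// As a rough, self-contained sketch of that pattern (the names below are
-// purely illustrative and not part of this module), keeping the state list
-// behind a RefCell lets a `&self` method append new states, which is what
-// makes the closure above acceptable to the borrow checker:
-//
-//     use std::cell::RefCell;
-//
-//     struct Sketch { states: RefCell<Vec<u32>> }
-//
-//     impl Sketch {
-//         fn add_state(&self, v: u32) -> usize {
-//             let id = self.states.borrow().len();
-//             self.states.borrow_mut().push(v);
-//             id
-//         }
-//     }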
-
-use std::cell::RefCell;
-use std::mem;
-
-use regex_syntax::hir::{self, Hir, HirKind};
-use regex_syntax::utf8::{Utf8Range, Utf8Sequences};
-
-use classes::ByteClassSet;
-use error::{Error, Result};
-use nfa::map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap};
-use nfa::range_trie::RangeTrie;
-use nfa::{State, StateID, Transition, NFA};
-
-/// Config knobs for the NFA compiler. See the builder's methods for more
-/// docs on each one.
-#[derive(Clone, Copy, Debug)]
-struct Config {
- anchored: bool,
- allow_invalid_utf8: bool,
- reverse: bool,
- shrink: bool,
-}
-
-impl Default for Config {
- fn default() -> Config {
- Config {
- anchored: false,
- allow_invalid_utf8: false,
- reverse: false,
- shrink: true,
- }
- }
-}
-
-/// A builder for compiling an NFA.
-#[derive(Clone, Debug)]
-pub struct Builder {
- config: Config,
-}
-
-impl Builder {
- /// Create a new NFA builder with its default configuration.
- pub fn new() -> Builder {
- Builder { config: Config::default() }
- }
-
- /// Compile the given high level intermediate representation of a regular
- /// expression into an NFA.
- ///
- /// If there was a problem building the NFA, then an error is returned.
- /// For example, if the regex uses unsupported features (such as zero-width
- /// assertions), then an error is returned.
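- ///
- /// A minimal usage sketch, assuming the pattern has already been parsed
- /// into an `Hir` with `regex-syntax` (the pattern here is illustrative):
- ///
- /// ```ignore
- /// use regex_syntax::ParserBuilder;
- ///
- /// let hir = ParserBuilder::new().build().parse(r"a+").unwrap();
- /// let nfa = Builder::new().anchored(true).build(&hir).unwrap();
- /// ```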
- pub fn build(&self, expr: &Hir) -> Result<NFA> {
- let mut nfa = NFA::always_match();
- self.build_with(&mut Compiler::new(), &mut nfa, expr)?;
- Ok(nfa)
- }
-
- /// Compile the given high level intermediate representation of a regular
- /// expression into the given NFA using the given compiler. Callers may
- /// prefer this over `build` if they would like to reuse allocations while
- /// compiling many regular expressions.
- ///
- /// On success, the given NFA is completely overwritten with the NFA
- /// produced by the compiler.
- ///
- /// If there was a problem building the NFA, then an error is returned. For
- /// example, if the regex uses unsupported features (such as zero-width
- /// assertions), then an error is returned. When an error is returned,
- /// the contents of `nfa` are unspecified and should not be relied upon.
- /// However, it can still be reused in subsequent calls to this method.
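- ///
- /// A rough sketch of the reuse pattern described above (the patterns and
- /// the parse step are illustrative):
- ///
- /// ```ignore
- /// let builder = Builder::new();
- /// let mut compiler = Compiler::new();
- /// let mut nfa = NFA::always_match();
- /// for pattern in &[r"abc", r"a|b|c"] {
- ///     let hir = ParserBuilder::new().build().parse(pattern).unwrap();
- ///     builder.build_with(&mut compiler, &mut nfa, &hir).unwrap();
- ///     // ... use `nfa` before moving on to the next pattern ...
- /// }
- /// ```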
- pub fn build_with(
- &self,
- compiler: &mut Compiler,
- nfa: &mut NFA,
- expr: &Hir,
- ) -> Result<()> {
- compiler.clear();
- compiler.configure(self.config);
- compiler.compile(nfa, expr)
- }
-
- /// Set whether matching must be anchored at the beginning of the input.
- ///
- /// When enabled, a match must begin at the start of the input. When
- /// disabled, the NFA will act as if the pattern started with a `.*?`,
- /// which enables a match to appear anywhere.
- ///
- /// By default this is disabled.
- pub fn anchored(&mut self, yes: bool) -> &mut Builder {
- self.config.anchored = yes;
- self
- }
-
- /// When enabled, the builder will permit the construction of an NFA that
- /// may match invalid UTF-8.
- ///
- /// When disabled (the default), the builder is guaranteed to produce a
- /// regex that will only ever match valid UTF-8 (otherwise, the builder
- /// will return an error).
- pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut Builder {
- self.config.allow_invalid_utf8 = yes;
- self
- }
-
- /// Reverse the NFA.
- ///
- /// An NFA reversal is performed by reversing all of the concatenated
- /// sub-expressions in the original pattern, recursively. The resulting
- /// NFA can be used to match the pattern starting from the end of a string
- /// instead of the beginning of a string.
- ///
- /// Reversing the NFA is useful for building a reverse DFA, which is most
- /// useful for finding the start of a match.
- pub fn reverse(&mut self, yes: bool) -> &mut Builder {
- self.config.reverse = yes;
- self
- }
-
- /// Apply best effort heuristics to shrink the NFA at the expense of more
- /// time/memory.
- ///
- /// This is enabled by default. Generally speaking, if one is using an NFA
- /// to compile a DFA, then the extra time used to shrink the NFA will be
- /// more than made up for during DFA construction (potentially by a lot).
- /// In other words, enabling this can substantially decrease the overall
- /// amount of time it takes to build a DFA.
- ///
- /// The only reason to disable this is if you want to compile an NFA and
- /// start
- /// using it as quickly as possible without needing to build a DFA.
- pub fn shrink(&mut self, yes: bool) -> &mut Builder {
- self.config.shrink = yes;
- self
- }
-}
-
-/// A compiler that converts a regex abstract syntax to an NFA via Thompson's
-/// construction. Namely, this compiler permits epsilon transitions between
-/// states.
-///
-/// Users of this crate cannot use a compiler directly. Instead, all one can
-/// do is create one and use it via the
-/// [`Builder::build_with`](struct.Builder.html#method.build_with)
-/// method. This permits callers to reuse compilers in order to amortize
-/// allocations.
-#[derive(Clone, Debug)]
-pub struct Compiler {
- /// The set of compiled NFA states. Once a state is compiled, it is
- /// assigned a state ID equivalent to its index in this list. Subsequent
- /// compilation can modify previous states by adding new transitions.
- states: RefCell<Vec<CState>>,
- /// The configuration from the builder.
- config: Config,
- /// State used for compiling character classes to UTF-8 byte automata.
- /// State is not retained between character class compilations. This just
- /// serves to amortize allocation to the extent possible.
- utf8_state: RefCell<Utf8State>,
- /// State used for arranging character classes in reverse into a trie.
- trie_state: RefCell<RangeTrie>,
- /// State used for caching common suffixes when compiling reverse UTF-8
- /// automata (for Unicode character classes).
- utf8_suffix: RefCell<Utf8SuffixMap>,
- /// A map used to re-map state IDs when translating the compiler's internal
- /// NFA state representation to the external NFA representation.
- remap: RefCell<Vec<StateID>>,
- /// A set of compiler internal state IDs that correspond to states that are
- /// exclusively epsilon transitions, i.e., goto instructions, combined with
- /// the state that they point to. This is used to record said states while
- /// transforming the compiler's internal NFA representation to the external
- /// form.
- empties: RefCell<Vec<(StateID, StateID)>>,
-}
-
-/// A compiler intermediate state representation for an NFA that is only used
-/// during compilation. Once compilation is done, `CState`s are converted to
-/// `State`s, which have a much simpler representation.
-#[derive(Clone, Debug, Eq, PartialEq)]
-enum CState {
- /// An empty state whose only purpose is to forward the automaton to
- /// another state via an epsilon transition. These are useful during
- /// compilation but are otherwise removed at the end.
- Empty { next: StateID },
- /// A state that only transitions to `next` if the current input byte is
- /// in the range `[start, end]` (inclusive on both ends).
- Range { range: Transition },
- /// A state with possibly many transitions, represented in a sparse
- /// fashion. Transitions are ordered lexicographically by input range.
- /// As such, this may only be used when every transition has equal
- /// priority. (In practice, this is only used for encoding large UTF-8
- /// automata.)
- Sparse { ranges: Vec<Transition> },
- /// An alternation such that there exists an epsilon transition to all
- /// states in `alternates`, where matches found via earlier transitions
- /// are preferred over later transitions.
- Union { alternates: Vec<StateID> },
- /// An alternation such that there exists an epsilon transition to all
- /// states in `alternates`, where matches found via later transitions
- /// are preferred over earlier transitions.
- ///
- /// This "reverse" state exists for convenience during compilation that
- /// permits easy construction of non-greedy combinations of NFA states.
- /// At the end of compilation, Union and UnionReverse states are merged
- /// into one Union type of state, where the latter has its epsilon
- /// transitions reversed to reflect the priority inversion.
- UnionReverse { alternates: Vec<StateID> },
- /// A match state. There is exactly one such occurrence of this state in
- /// an NFA.
- Match,
-}
-
-/// A value that represents the result of compiling a sub-expression of a
-/// regex's HIR. Specifically, this represents a sub-graph of the NFA that
-/// has an initial state at `start` and a final state at `end`.
-#[derive(Clone, Copy, Debug)]
-pub struct ThompsonRef {
- start: StateID,
- end: StateID,
-}
-
-impl Compiler {
- /// Create a new compiler.
- pub fn new() -> Compiler {
- Compiler {
- states: RefCell::new(vec![]),
- config: Config::default(),
- utf8_state: RefCell::new(Utf8State::new()),
- trie_state: RefCell::new(RangeTrie::new()),
- utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
- remap: RefCell::new(vec![]),
- empties: RefCell::new(vec![]),
- }
- }
-
- /// Clear any memory used by this compiler such that it is ready to compile
- /// a new regex.
- ///
- /// It is preferable to reuse a compiler if possible in order to reuse
- /// allocations.
- fn clear(&self) {
- self.states.borrow_mut().clear();
- // We don't need to clear anything else since they are cleared on
- // their own and only when they are used.
- }
-
- /// Configure this compiler from the builder's knobs.
- ///
- /// The compiler is always reconfigured by the builder before using it to
- /// build an NFA.
- fn configure(&mut self, config: Config) {
- self.config = config;
- }
-
- /// Convert the current intermediate NFA to its final compiled form.
- fn compile(&self, nfa: &mut NFA, expr: &Hir) -> Result<()> {
- nfa.anchored = self.config.anchored;
-
- let mut start = self.add_empty();
- if !nfa.anchored {
- let compiled = if self.config.allow_invalid_utf8 {
- self.c_unanchored_prefix_invalid_utf8()?
- } else {
- self.c_unanchored_prefix_valid_utf8()?
- };
- self.patch(start, compiled.start);
- start = compiled.end;
- }
- let compiled = self.c(&expr)?;
- let match_id = self.add_match();
- self.patch(start, compiled.start);
- self.patch(compiled.end, match_id);
- self.finish(nfa);
- Ok(())
- }
-
- /// Finishes the compilation process and populates the provided NFA with
- /// the final graph.
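- ///
- /// As a small worked example, compiling `a?` with anchoring enabled
- /// produces roughly the following intermediate states:
- ///
- ///     0: Empty => 1
- ///     1: Union(2, 3)
- ///     2: a => 3
- ///     3: Empty => 4
- ///     4: Match
- ///
- /// After removing the Empty states and remapping IDs, the final NFA is:
- ///
- ///     0: Union(1, 2)
- ///     1: a => 2
- ///     2: Match
- ///
- /// which matches the `a?` case in the tests below.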
- fn finish(&self, nfa: &mut NFA) {
- let mut bstates = self.states.borrow_mut();
- let mut remap = self.remap.borrow_mut();
- remap.resize(bstates.len(), 0);
- let mut empties = self.empties.borrow_mut();
- empties.clear();
-
- // We don't reuse allocations here because this is what we're
- // returning.
- nfa.states.clear();
- let mut byteset = ByteClassSet::new();
-
- // The idea here is to convert our intermediate states to their final
- // form. The only real complexity here is the process of converting
- // transitions, which are expressed in terms of state IDs. The new
- // set of states will be smaller because of partial epsilon removal,
- // so the state IDs will not be the same.
- for (id, bstate) in bstates.iter_mut().enumerate() {
- match *bstate {
- CState::Empty { next } => {
- // Since we're removing empty states, we need to handle
- // them later since we don't yet know which new state this
- // empty state will be mapped to.
- empties.push((id, next));
- }
- CState::Range { ref range } => {
- remap[id] = nfa.states.len();
- byteset.set_range(range.start, range.end);
- nfa.states.push(State::Range { range: range.clone() });
- }
- CState::Sparse { ref mut ranges } => {
- remap[id] = nfa.states.len();
-
- let ranges = mem::replace(ranges, vec![]);
- for r in &ranges {
- byteset.set_range(r.start, r.end);
- }
- nfa.states.push(State::Sparse {
- ranges: ranges.into_boxed_slice(),
- });
- }
- CState::Union { ref mut alternates } => {
- remap[id] = nfa.states.len();
-
- let alternates = mem::replace(alternates, vec![]);
- nfa.states.push(State::Union {
- alternates: alternates.into_boxed_slice(),
- });
- }
- CState::UnionReverse { ref mut alternates } => {
- remap[id] = nfa.states.len();
-
- let mut alternates = mem::replace(alternates, vec![]);
- alternates.reverse();
- nfa.states.push(State::Union {
- alternates: alternates.into_boxed_slice(),
- });
- }
- CState::Match => {
- remap[id] = nfa.states.len();
- nfa.states.push(State::Match);
- }
- }
- }
- for &(empty_id, mut empty_next) in empties.iter() {
- // empty states can point to other empty states, forming a chain.
- // So we must follow the chain until the end, which must end at
- // a non-empty state, and therefore, a state that is correctly
- // remapped. We are guaranteed to terminate because our compiler
- // never builds a loop among empty states.
- while let CState::Empty { next } = bstates[empty_next] {
- empty_next = next;
- }
- remap[empty_id] = remap[empty_next];
- }
- for state in &mut nfa.states {
- state.remap(&remap);
- }
- // The compiler always begins the NFA at the first state.
- nfa.start = remap[0];
- nfa.byte_classes = byteset.byte_classes();
- }
-
- fn c(&self, expr: &Hir) -> Result<ThompsonRef> {
- match *expr.kind() {
- HirKind::Empty => {
- let id = self.add_empty();
- Ok(ThompsonRef { start: id, end: id })
- }
- HirKind::Literal(hir::Literal::Unicode(ch)) => {
- let mut buf = [0; 4];
- let it = ch
- .encode_utf8(&mut buf)
- .as_bytes()
- .iter()
- .map(|&b| Ok(self.c_range(b, b)));
- self.c_concat(it)
- }
- HirKind::Literal(hir::Literal::Byte(b)) => Ok(self.c_range(b, b)),
- HirKind::Class(hir::Class::Bytes(ref cls)) => {
- self.c_byte_class(cls)
- }
- HirKind::Class(hir::Class::Unicode(ref cls)) => {
- self.c_unicode_class(cls)
- }
- HirKind::Repetition(ref rep) => self.c_repetition(rep),
- HirKind::Group(ref group) => self.c(&*group.hir),
- HirKind::Concat(ref exprs) => {
- self.c_concat(exprs.iter().map(|e| self.c(e)))
- }
- HirKind::Alternation(ref exprs) => {
- self.c_alternation(exprs.iter().map(|e| self.c(e)))
- }
- HirKind::Anchor(_) => Err(Error::unsupported_anchor()),
- HirKind::WordBoundary(_) => Err(Error::unsupported_word()),
- }
- }
-
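- // Concatenate a sequence of compiled sub-expressions. Note that when the
- // compiler is configured in reverse mode, the sub-expressions are
- // consumed back to front, so that, e.g., the concatenation `abc` is wired
- // up as `c`, then `b`, then `a`. Since each sub-expression is compiled
- // recursively, nested concatenations are reversed as well.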
- fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef>
- where
- I: DoubleEndedIterator<Item = Result<ThompsonRef>>,
- {
- let first =
- if self.config.reverse { it.next_back() } else { it.next() };
- let ThompsonRef { start, mut end } = match first {
- Some(result) => result?,
- None => return Ok(self.c_empty()),
- };
- loop {
- let next =
- if self.config.reverse { it.next_back() } else { it.next() };
- let compiled = match next {
- Some(result) => result?,
- None => break,
- };
- self.patch(end, compiled.start);
- end = compiled.end;
- }
- Ok(ThompsonRef { start, end })
- }
-
- fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef>
- where
- I: Iterator<Item = Result<ThompsonRef>>,
- {
- let first = it.next().expect("alternations must be non-empty")?;
- let second = match it.next() {
- None => return Ok(first),
- Some(result) => result?,
- };
-
- let union = self.add_union();
- let end = self.add_empty();
- self.patch(union, first.start);
- self.patch(first.end, end);
- self.patch(union, second.start);
- self.patch(second.end, end);
- for result in it {
- let compiled = result?;
- self.patch(union, compiled.start);
- self.patch(compiled.end, end);
- }
- Ok(ThompsonRef { start: union, end })
- }
-
- fn c_repetition(&self, rep: &hir::Repetition) -> Result<ThompsonRef> {
- match rep.kind {
- hir::RepetitionKind::ZeroOrOne => {
- self.c_zero_or_one(&rep.hir, rep.greedy)
- }
- hir::RepetitionKind::ZeroOrMore => {
- self.c_at_least(&rep.hir, rep.greedy, 0)
- }
- hir::RepetitionKind::OneOrMore => {
- self.c_at_least(&rep.hir, rep.greedy, 1)
- }
- hir::RepetitionKind::Range(ref rng) => match *rng {
- hir::RepetitionRange::Exactly(count) => {
- self.c_exactly(&rep.hir, count)
- }
- hir::RepetitionRange::AtLeast(m) => {
- self.c_at_least(&rep.hir, rep.greedy, m)
- }
- hir::RepetitionRange::Bounded(min, max) => {
- self.c_bounded(&rep.hir, rep.greedy, min, max)
- }
- },
- }
- }
-
- fn c_bounded(
- &self,
- expr: &Hir,
- greedy: bool,
- min: u32,
- max: u32,
- ) -> Result<ThompsonRef> {
- let prefix = self.c_exactly(expr, min)?;
- if min == max {
- return Ok(prefix);
- }
-
- // It is tempting to compile the rest here as a concatenation of
- // zero-or-one matches, i.e., for `a{2,5}`, compile it as if it
- // were `aaa?a?a?`. The problem here is that it leads to this program:
- //
- // >000000: 61 => 01
- // 000001: 61 => 02
- // 000002: alt(03, 04)
- // 000003: 61 => 04
- // 000004: alt(05, 06)
- // 000005: 61 => 06
- // 000006: alt(07, 08)
- // 000007: 61 => 08
- // 000008: MATCH
- //
- // And effectively, once you hit state 2, the epsilon closure will
- // include states 3, 4, 5, 6, 7 and 8, which is quite a bit. It is
- // better to instead compile it like so:
- //
- // >000000: 61 => 01
- // 000001: 61 => 02
- // 000002: alt(03, 08)
- // 000003: 61 => 04
- // 000004: alt(05, 08)
- // 000005: 61 => 06
- // 000006: alt(07, 08)
- // 000007: 61 => 08
- // 000008: MATCH
- //
- // So that the epsilon closure of state 2 is now just 3 and 8.
- let empty = self.add_empty();
- let mut prev_end = prefix.end;
- for _ in min..max {
- let union = if greedy {
- self.add_union()
- } else {
- self.add_reverse_union()
- };
- let compiled = self.c(expr)?;
- self.patch(prev_end, union);
- self.patch(union, compiled.start);
- self.patch(union, empty);
- prev_end = compiled.end;
- }
- self.patch(prev_end, empty);
- Ok(ThompsonRef { start: prefix.start, end: empty })
- }
-
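- // Compile `expr` repeated at least `n` times. As a concrete example, for
- // `n = 1` with greedy matching (i.e. `a+`), the sub-expression is
- // compiled once and followed by a union that either loops back to it or
- // falls through to whatever comes next. Anchored `ab+` therefore
- // compiles to (see the tests below):
- //
- //     0: a => 1
- //     1: b => 2
- //     2: Union(1, 3)
- //     3: Match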
- fn c_at_least(
- &self,
- expr: &Hir,
- greedy: bool,
- n: u32,
- ) -> Result<ThompsonRef> {
- if n == 0 {
- let union = if greedy {
- self.add_union()
- } else {
- self.add_reverse_union()
- };
- let compiled = self.c(expr)?;
- self.patch(union, compiled.start);
- self.patch(compiled.end, union);
- Ok(ThompsonRef { start: union, end: union })
- } else if n == 1 {
- let compiled = self.c(expr)?;
- let union = if greedy {
- self.add_union()
- } else {
- self.add_reverse_union()
- };
- self.patch(compiled.end, union);
- self.patch(union, compiled.start);
- Ok(ThompsonRef { start: compiled.start, end: union })
- } else {
- let prefix = self.c_exactly(expr, n - 1)?;
- let last = self.c(expr)?;
- let union = if greedy {
- self.add_union()
- } else {
- self.add_reverse_union()
- };
- self.patch(prefix.end, last.start);
- self.patch(last.end, union);
- self.patch(union, last.start);
- Ok(ThompsonRef { start: prefix.start, end: union })
- }
- }
-
- fn c_zero_or_one(&self, expr: &Hir, greedy: bool) -> Result<ThompsonRef> {
- let union =
- if greedy { self.add_union() } else { self.add_reverse_union() };
- let compiled = self.c(expr)?;
- let empty = self.add_empty();
- self.patch(union, compiled.start);
- self.patch(union, empty);
- self.patch(compiled.end, empty);
- Ok(ThompsonRef { start: union, end: empty })
- }
-
- fn c_exactly(&self, expr: &Hir, n: u32) -> Result<ThompsonRef> {
- let it = (0..n).map(|_| self.c(expr));
- self.c_concat(it)
- }
-
- fn c_byte_class(&self, cls: &hir::ClassBytes) -> Result<ThompsonRef> {
- let end = self.add_empty();
- let mut trans = Vec::with_capacity(cls.ranges().len());
- for r in cls.iter() {
- trans.push(Transition {
- start: r.start(),
- end: r.end(),
- next: end,
- });
- }
- Ok(ThompsonRef { start: self.add_sparse(trans), end })
- }
-
- fn c_unicode_class(&self, cls: &hir::ClassUnicode) -> Result<ThompsonRef> {
- // If all we have are ASCII ranges wrapped in a Unicode package, then
- // there is zero reason to bring out the big guns. We can fit all ASCII
- // ranges within a single sparse transition.
- if cls.is_all_ascii() {
- let end = self.add_empty();
- let mut trans = Vec::with_capacity(cls.ranges().len());
- for r in cls.iter() {
- assert!(r.start() <= '\x7F');
- assert!(r.end() <= '\x7F');
- trans.push(Transition {
- start: r.start() as u8,
- end: r.end() as u8,
- next: end,
- });
- }
- Ok(ThompsonRef { start: self.add_sparse(trans), end })
- } else if self.config.reverse {
- if !self.config.shrink {
- // When we don't want to spend the extra time shrinking, we
- // compile the UTF-8 automaton in reverse using something like
- // the "naive" approach, but will attempt to re-use common
- // suffixes.
- self.c_unicode_class_reverse_with_suffix(cls)
- } else {
- // When we want to shrink our NFA for reverse UTF-8 automata,
- // we cannot feed UTF-8 sequences directly to the UTF-8
- // compiler, since the UTF-8 compiler requires all sequences
- // to be lexicographically sorted. Instead, we organize our
- // sequences into a range trie, which can then output our
- // sequences in the correct order. Unfortunately, building the
- // range trie is fairly expensive (but not nearly as expensive
- // as building a DFA). Hence the reason why the 'shrink' option
- // exists, so that this path can be toggled off.
- let mut trie = self.trie_state.borrow_mut();
- trie.clear();
-
- for rng in cls.iter() {
- for mut seq in Utf8Sequences::new(rng.start(), rng.end()) {
- seq.reverse();
- trie.insert(seq.as_slice());
- }
- }
- let mut utf8_state = self.utf8_state.borrow_mut();
- let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state);
- trie.iter(|seq| {
- utf8c.add(&seq);
- });
- Ok(utf8c.finish())
- }
- } else {
- // In the forward direction, we always shrink our UTF-8 automata
- // because we can stream it right into the UTF-8 compiler. There
- // is almost no downside (in either memory or time) to using this
- // approach.
- let mut utf8_state = self.utf8_state.borrow_mut();
- let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state);
- for rng in cls.iter() {
- for seq in Utf8Sequences::new(rng.start(), rng.end()) {
- utf8c.add(seq.as_slice());
- }
- }
- Ok(utf8c.finish())
- }
-
- // For reference, the code below is the "naive" version of compiling a
- // UTF-8 automaton. It is deliciously simple (and works for both the
- // forward and reverse cases), but will unfortunately produce very
- // large NFAs. When compiling a forward automaton, the size difference
- // can sometimes be an order of magnitude. For example, the '\w' regex
- // will generate roughly 3000 NFA states using the naive approach below,
- // but only 283 states when using the approach above. This is because
- // the approach above actually compiles a *minimal* (or near minimal,
- // because of the bounded hashmap) UTF-8 automaton.
- //
- // The code below is kept as a reference point in order to make it
- // easier to understand the higher level goal here.
- /*
- let it = cls
- .iter()
- .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end()))
- .map(|seq| {
- let it = seq
- .as_slice()
- .iter()
- .map(|rng| Ok(self.c_range(rng.start, rng.end)));
- self.c_concat(it)
- });
- self.c_alternation(it);
- */
- }
-
- fn c_unicode_class_reverse_with_suffix(
- &self,
- cls: &hir::ClassUnicode,
- ) -> Result<ThompsonRef> {
- // N.B. It would likely be better to cache common *prefixes* in the
- // reverse direction, but it's not quite clear how to do that. The
- // advantage of caching suffixes is that it does give us a win, and
- // has a very small additional overhead.
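- //
- // Concretely, when two of the class's UTF-8 sequences share a leading
- // byte range, that shared leading byte becomes a shared trailing step of
- // the reversed paths built below, and the cache lets us build its state
- // only once. For example, U+2603 (E2 98 83) and U+26C4 (E2 9B 84) both
- // start with E2, so the E2 state leading into `alt_end` is shared
- // between their reversed paths.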
- let mut cache = self.utf8_suffix.borrow_mut();
- cache.clear();
-
- let union = self.add_union();
- let alt_end = self.add_empty();
- for urng in cls.iter() {
- for seq in Utf8Sequences::new(urng.start(), urng.end()) {
- let mut end = alt_end;
- for brng in seq.as_slice() {
- let key = Utf8SuffixKey {
- from: end,
- start: brng.start,
- end: brng.end,
- };
- let hash = cache.hash(&key);
- if let Some(id) = cache.get(&key, hash) {
- end = id;
- continue;
- }
-
- let compiled = self.c_range(brng.start, brng.end);
- self.patch(compiled.end, end);
- end = compiled.start;
- cache.set(key, hash, end);
- }
- self.patch(union, end);
- }
- }
- Ok(ThompsonRef { start: union, end: alt_end })
- }
-
- fn c_range(&self, start: u8, end: u8) -> ThompsonRef {
- let id = self.add_range(start, end);
- ThompsonRef { start: id, end: id }
- }
-
- fn c_empty(&self) -> ThompsonRef {
- let id = self.add_empty();
- ThompsonRef { start: id, end: id }
- }
-
- fn c_unanchored_prefix_valid_utf8(&self) -> Result<ThompsonRef> {
- self.c(&Hir::repetition(hir::Repetition {
- kind: hir::RepetitionKind::ZeroOrMore,
- greedy: false,
- hir: Box::new(Hir::any(false)),
- }))
- }
-
- fn c_unanchored_prefix_invalid_utf8(&self) -> Result<ThompsonRef> {
- self.c(&Hir::repetition(hir::Repetition {
- kind: hir::RepetitionKind::ZeroOrMore,
- greedy: false,
- hir: Box::new(Hir::any(true)),
- }))
- }
-
- fn patch(&self, from: StateID, to: StateID) {
- match self.states.borrow_mut()[from] {
- CState::Empty { ref mut next } => {
- *next = to;
- }
- CState::Range { ref mut range } => {
- range.next = to;
- }
- CState::Sparse { .. } => {
- panic!("cannot patch from a sparse NFA state")
- }
- CState::Union { ref mut alternates } => {
- alternates.push(to);
- }
- CState::UnionReverse { ref mut alternates } => {
- alternates.push(to);
- }
- CState::Match => {}
- }
- }
-
- fn add_empty(&self) -> StateID {
- let id = self.states.borrow().len();
- self.states.borrow_mut().push(CState::Empty { next: 0 });
- id
- }
-
- fn add_range(&self, start: u8, end: u8) -> StateID {
- let id = self.states.borrow().len();
- let trans = Transition { start, end, next: 0 };
- let state = CState::Range { range: trans };
- self.states.borrow_mut().push(state);
- id
- }
-
- fn add_sparse(&self, ranges: Vec<Transition>) -> StateID {
- if ranges.len() == 1 {
- let id = self.states.borrow().len();
- let state = CState::Range { range: ranges[0] };
- self.states.borrow_mut().push(state);
- return id;
- }
- let id = self.states.borrow().len();
- let state = CState::Sparse { ranges };
- self.states.borrow_mut().push(state);
- id
- }
-
- fn add_union(&self) -> StateID {
- let id = self.states.borrow().len();
- let state = CState::Union { alternates: vec![] };
- self.states.borrow_mut().push(state);
- id
- }
-
- fn add_reverse_union(&self) -> StateID {
- let id = self.states.borrow().len();
- let state = CState::UnionReverse { alternates: vec![] };
- self.states.borrow_mut().push(state);
- id
- }
-
- fn add_match(&self) -> StateID {
- let id = self.states.borrow().len();
- self.states.borrow_mut().push(CState::Match);
- id
- }
-}
-
-#[derive(Debug)]
-struct Utf8Compiler<'a> {
- nfac: &'a Compiler,
- state: &'a mut Utf8State,
- target: StateID,
-}
-
-#[derive(Clone, Debug)]
-struct Utf8State {
- compiled: Utf8BoundedMap,
- uncompiled: Vec<Utf8Node>,
-}
-
-#[derive(Clone, Debug)]
-struct Utf8Node {
- trans: Vec<Transition>,
- last: Option<Utf8LastTransition>,
-}
-
-#[derive(Clone, Debug)]
-struct Utf8LastTransition {
- start: u8,
- end: u8,
-}
-
-impl Utf8State {
- fn new() -> Utf8State {
- Utf8State { compiled: Utf8BoundedMap::new(5000), uncompiled: vec![] }
- }
-
- fn clear(&mut self) {
- self.compiled.clear();
- self.uncompiled.clear();
- }
-}
-
-impl<'a> Utf8Compiler<'a> {
- fn new(nfac: &'a Compiler, state: &'a mut Utf8State) -> Utf8Compiler<'a> {
- let target = nfac.add_empty();
- state.clear();
- let mut utf8c = Utf8Compiler { nfac, state, target };
- utf8c.add_empty();
- utf8c
- }
-
- fn finish(&mut self) -> ThompsonRef {
- self.compile_from(0);
- let node = self.pop_root();
- let start = self.compile(node);
- ThompsonRef { start, end: self.target }
- }
-
- fn add(&mut self, ranges: &[Utf8Range]) {
- let prefix_len = ranges
- .iter()
- .zip(&self.state.uncompiled)
- .take_while(|&(range, node)| {
- node.last.as_ref().map_or(false, |t| {
- (t.start, t.end) == (range.start, range.end)
- })
- })
- .count();
- assert!(prefix_len < ranges.len());
- self.compile_from(prefix_len);
- self.add_suffix(&ranges[prefix_len..]);
- }
-
- fn compile_from(&mut self, from: usize) {
- let mut next = self.target;
- while from + 1 < self.state.uncompiled.len() {
- let node = self.pop_freeze(next);
- next = self.compile(node);
- }
- self.top_last_freeze(next);
- }
-
- fn compile(&mut self, node: Vec<Transition>) -> StateID {
- let hash = self.state.compiled.hash(&node);
- if let Some(id) = self.state.compiled.get(&node, hash) {
- return id;
- }
- let id = self.nfac.add_sparse(node.clone());
- self.state.compiled.set(node, hash, id);
- id
- }
-
- fn add_suffix(&mut self, ranges: &[Utf8Range]) {
- assert!(!ranges.is_empty());
- let last = self
- .state
- .uncompiled
- .len()
- .checked_sub(1)
- .expect("non-empty nodes");
- assert!(self.state.uncompiled[last].last.is_none());
- self.state.uncompiled[last].last = Some(Utf8LastTransition {
- start: ranges[0].start,
- end: ranges[0].end,
- });
- for r in &ranges[1..] {
- self.state.uncompiled.push(Utf8Node {
- trans: vec![],
- last: Some(Utf8LastTransition { start: r.start, end: r.end }),
- });
- }
- }
-
- fn add_empty(&mut self) {
- self.state.uncompiled.push(Utf8Node { trans: vec![], last: None });
- }
-
- fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> {
- let mut uncompiled = self.state.uncompiled.pop().unwrap();
- uncompiled.set_last_transition(next);
- uncompiled.trans
- }
-
- fn pop_root(&mut self) -> Vec<Transition> {
- assert_eq!(self.state.uncompiled.len(), 1);
- assert!(self.state.uncompiled[0].last.is_none());
- self.state.uncompiled.pop().expect("non-empty nodes").trans
- }
-
- fn top_last_freeze(&mut self, next: StateID) {
- let last = self
- .state
- .uncompiled
- .len()
- .checked_sub(1)
- .expect("non-empty nodes");
- self.state.uncompiled[last].set_last_transition(next);
- }
-}
-
-impl Utf8Node {
- fn set_last_transition(&mut self, next: StateID) {
- if let Some(last) = self.last.take() {
- self.trans.push(Transition {
- start: last.start,
- end: last.end,
- next,
- });
- }
- }
-}
-
-#[cfg(test)]
-mod tests {
- use regex_syntax::hir::Hir;
- use regex_syntax::ParserBuilder;
-
- use super::{Builder, State, StateID, Transition, NFA};
-
- fn parse(pattern: &str) -> Hir {
- ParserBuilder::new().build().parse(pattern).unwrap()
- }
-
- fn build(pattern: &str) -> NFA {
- Builder::new().anchored(true).build(&parse(pattern)).unwrap()
- }
-
- fn s_byte(byte: u8, next: StateID) -> State {
- let trans = Transition { start: byte, end: byte, next };
- State::Range { range: trans }
- }
-
- fn s_range(start: u8, end: u8, next: StateID) -> State {
- let trans = Transition { start, end, next };
- State::Range { range: trans }
- }
-
- fn s_sparse(ranges: &[(u8, u8, StateID)]) -> State {
- let ranges = ranges
- .iter()
- .map(|&(start, end, next)| Transition { start, end, next })
- .collect();
- State::Sparse { ranges }
- }
-
- fn s_union(alts: &[StateID]) -> State {
- State::Union { alternates: alts.to_vec().into_boxed_slice() }
- }
-
- fn s_match() -> State {
- State::Match
- }
-
- #[test]
- fn errors() {
- // unsupported anchors
- assert!(Builder::new().build(&parse(r"^")).is_err());
- assert!(Builder::new().build(&parse(r"$")).is_err());
- assert!(Builder::new().build(&parse(r"\A")).is_err());
- assert!(Builder::new().build(&parse(r"\z")).is_err());
-
- // unsupported word boundaries
- assert!(Builder::new().build(&parse(r"\b")).is_err());
- assert!(Builder::new().build(&parse(r"\B")).is_err());
- assert!(Builder::new().build(&parse(r"(?-u)\b")).is_err());
- }
-
- // Test that building an unanchored NFA has an appropriate `.*?` prefix.
- #[test]
- fn compile_unanchored_prefix() {
- // When the machine can only match valid UTF-8.
- let nfa = Builder::new().anchored(false).build(&parse(r"a")).unwrap();
- // There should be many states since the `.` in `.*?` matches any
- // Unicode scalar value.
- assert_eq!(11, nfa.len());
- assert_eq!(nfa.states[10], s_match());
- assert_eq!(nfa.states[9], s_byte(b'a', 10));
-
- // When the machine can match invalid UTF-8.
- let nfa = Builder::new()
- .anchored(false)
- .allow_invalid_utf8(true)
- .build(&parse(r"a"))
- .unwrap();
- assert_eq!(
- nfa.states,
- &[
- s_union(&[2, 1]),
- s_range(0, 255, 0),
- s_byte(b'a', 3),
- s_match(),
- ]
- );
- }
-
- #[test]
- fn compile_empty() {
- assert_eq!(build("").states, &[s_match(),]);
- }
-
- #[test]
- fn compile_literal() {
- assert_eq!(build("a").states, &[s_byte(b'a', 1), s_match(),]);
- assert_eq!(
- build("ab").states,
- &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),]
- );
- assert_eq!(
- build("ā˜ƒ").states,
- &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(),]
- );
-
- // Check that non-UTF-8 literals work.
- let hir = ParserBuilder::new()
- .allow_invalid_utf8(true)
- .build()
- .parse(r"(?-u)\xFF")
- .unwrap();
- let nfa = Builder::new()
- .anchored(true)
- .allow_invalid_utf8(true)
- .build(&hir)
- .unwrap();
- assert_eq!(nfa.states, &[s_byte(b'\xFF', 1), s_match(),]);
- }
-
- #[test]
- fn compile_class() {
- assert_eq!(
- build(r"[a-z]").states,
- &[s_range(b'a', b'z', 1), s_match(),]
- );
- assert_eq!(
- build(r"[x-za-c]").states,
- &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match()]
- );
- assert_eq!(
- build(r"[\u03B1-\u03B4]").states,
- &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match()]
- );
- assert_eq!(
- build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states,
- &[
- s_range(0xB1, 0xB4, 5),
- s_range(0x99, 0x9E, 5),
- s_byte(0xA4, 1),
- s_byte(0x9F, 2),
- s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]),
- s_match(),
- ]
- );
- assert_eq!(
- build(r"[a-zā˜ƒ]").states,
- &[
- s_byte(0x83, 3),
- s_byte(0x98, 0),
- s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]),
- s_match(),
- ]
- );
- }
-
- #[test]
- fn compile_repetition() {
- assert_eq!(
- build(r"a?").states,
- &[s_union(&[1, 2]), s_byte(b'a', 2), s_match(),]
- );
- assert_eq!(
- build(r"a??").states,
- &[s_union(&[2, 1]), s_byte(b'a', 2), s_match(),]
- );
- }
-
- #[test]
- fn compile_group() {
- assert_eq!(
- build(r"ab+").states,
- &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[1, 3]), s_match(),]
- );
- assert_eq!(
- build(r"(ab)").states,
- &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),]
- );
- assert_eq!(
- build(r"(ab)+").states,
- &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[0, 3]), s_match(),]
- );
- }
-
- #[test]
- fn compile_alternation() {
- assert_eq!(
- build(r"a|b").states,
- &[s_byte(b'a', 3), s_byte(b'b', 3), s_union(&[0, 1]), s_match(),]
- );
- assert_eq!(
- build(r"|b").states,
- &[s_byte(b'b', 2), s_union(&[2, 0]), s_match(),]
- );
- assert_eq!(
- build(r"a|").states,
- &[s_byte(b'a', 2), s_union(&[0, 2]), s_match(),]
- );
- }
-}