summaryrefslogtreecommitdiffstats
path: root/vendor/regex-automata-0.2.0/src/nfa/thompson/compiler.rs
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/regex-automata-0.2.0/src/nfa/thompson/compiler.rs')
-rw-r--r--vendor/regex-automata-0.2.0/src/nfa/thompson/compiler.rs1713
1 files changed, 1713 insertions, 0 deletions
diff --git a/vendor/regex-automata-0.2.0/src/nfa/thompson/compiler.rs b/vendor/regex-automata-0.2.0/src/nfa/thompson/compiler.rs
new file mode 100644
index 000000000..301194005
--- /dev/null
+++ b/vendor/regex-automata-0.2.0/src/nfa/thompson/compiler.rs
@@ -0,0 +1,1713 @@
+/*
+This module provides an NFA compiler using Thompson's construction
+algorithm. The compiler takes a regex-syntax::Hir as input and emits an NFA
+graph as output. The NFA graph is structured in a way that permits it to be
+executed by a virtual machine and also used to efficiently build a DFA.
+
+The compiler deals with a slightly expanded set of NFA states that notably
+includes an empty node that has exactly one epsilon transition to the next
+state. In other words, it's a "goto" instruction if one views Thompson's NFA
+as a set of bytecode instructions. These goto instructions are removed in
+a subsequent phase before returning the NFA to the caller. The purpose of
+these empty nodes is that they make the construction algorithm substantially
+simpler to implement. We remove them before returning to the caller because
+they can represent substantial overhead when traversing the NFA graph
+(either while searching using the NFA directly or while building a DFA).
+
+In the future, it would be nice to provide a Glushkov compiler as well,
+as it would work well as a bit-parallel NFA for smaller regexes. But
+the Thompson construction is one I'm more familiar with and seems more
+straight-forward to deal with when it comes to large Unicode character
+classes.
+
+Internally, the compiler uses interior mutability to improve composition
+in the face of the borrow checker. In particular, we'd really like to be
+able to write things like this:
+
+ self.c_concat(exprs.iter().map(|e| self.c(e)))
+
+Which elegantly uses iterators to build up a sequence of compiled regex
+sub-expressions and then hands it off to the concatenating compiler
+routine. Without interior mutability, the borrow checker won't let us
+borrow `self` mutably both inside and outside the closure at the same
+time.
+*/
+
+use core::{
+ borrow::Borrow,
+ cell::{Cell, RefCell},
+ mem,
+};
+
+use alloc::{sync::Arc, vec, vec::Vec};
+
+use regex_syntax::{
+ hir::{self, Anchor, Class, Hir, HirKind, Literal, WordBoundary},
+ utf8::{Utf8Range, Utf8Sequences},
+ ParserBuilder,
+};
+
+use crate::{
+ nfa::thompson::{
+ error::Error,
+ map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap},
+ range_trie::RangeTrie,
+ Look, SparseTransitions, State, Transition, NFA,
+ },
+ util::{
+ alphabet::ByteClassSet,
+ id::{IteratorIDExt, PatternID, StateID},
+ },
+};
+
+/// The configuration used for compiling a Thompson NFA from a regex pattern.
+///
+/// Every knob is stored as an `Option` so that `overwrite` can tell an
+/// explicitly set value apart from "left at default"; the `get_*` accessors
+/// apply the actual defaults.
+#[derive(Clone, Copy, Debug, Default)]
+pub struct Config {
+    // Compile the NFA in reverse (see `Config::reverse`). Default: false.
+    reverse: Option<bool>,
+    // Restrict the unanchored prefix to valid UTF-8 (see `Config::utf8`).
+    // Default: true.
+    utf8: Option<bool>,
+    // Approximate heap limit for the compiled NFA. The outer `Option` records
+    // whether the knob was set; the inner one is the limit itself, where
+    // `None` means "no limit". Default: no limit.
+    nfa_size_limit: Option<Option<usize>>,
+    // Spend extra effort shrinking the NFA (see `Config::shrink`).
+    // Default: true.
+    shrink: Option<bool>,
+    // Include capture states in the NFA (see `Config::captures`).
+    // Default: true, but forced off whenever `reverse` is enabled.
+    captures: Option<bool>,
+    // Test-only: whether to compile an unanchored prefix. Default: true.
+    #[cfg(test)]
+    unanchored_prefix: Option<bool>,
+}
+
+impl Config {
+    /// Return a new default Thompson NFA compiler configuration.
+    pub fn new() -> Config {
+        Config::default()
+    }
+
+    /// Reverse the NFA.
+    ///
+    /// A NFA reversal is performed by reversing all of the concatenated
+    /// sub-expressions in the original pattern, recursively. The resulting
+    /// NFA can be used to match the pattern starting from the end of a string
+    /// instead of the beginning of a string.
+    ///
+    /// Reversing the NFA is useful for building a reverse DFA, which is most
+    /// useful for finding the start of a match after its ending position has
+    /// been found.
+    ///
+    /// This is disabled by default.
+    pub fn reverse(self, yes: bool) -> Config {
+        Config { reverse: Some(yes), ..self }
+    }
+
+    /// Whether to enable UTF-8 mode or not.
+    ///
+    /// When UTF-8 mode is enabled (which is the default), unanchored searches
+    /// will only match through valid UTF-8. If invalid UTF-8 is seen, then
+    /// an unanchored search will stop at that point. This is equivalent to
+    /// putting a `(?s:.)*?` at the start of the regex.
+    ///
+    /// When UTF-8 mode is disabled, then unanchored searches will match
+    /// through any arbitrary byte. This is equivalent to putting a
+    /// `(?s-u:.)*?` at the start of the regex.
+    ///
+    /// Generally speaking, UTF-8 mode should only be used when you know you
+    /// are searching valid UTF-8, such as a Rust `&str`. If UTF-8 mode is
+    /// used on input that is not valid UTF-8, then the regex is not likely to
+    /// work as expected.
+    ///
+    /// This is enabled by default.
+    pub fn utf8(self, yes: bool) -> Config {
+        Config { utf8: Some(yes), ..self }
+    }
+
+    /// Sets an approximate size limit on the total heap used by the NFA being
+    /// compiled.
+    ///
+    /// This permits imposing constraints on the size of a compiled NFA. This
+    /// may be useful in contexts where the regex pattern is untrusted and one
+    /// wants to avoid using too much memory.
+    ///
+    /// This size limit does not apply to auxiliary heap used during
+    /// compilation that is not part of the built NFA.
+    ///
+    /// Note that this size limit is applied during compilation in order for
+    /// the limit to prevent too much heap from being used. However, the
+    /// implementation may use an intermediate NFA representation that is
+    /// otherwise slightly bigger than the final public form. Since the size
+    /// limit may be applied to an intermediate representation, there is not
+    /// necessarily a precise correspondence between the configured size limit
+    /// and the heap usage of the final NFA.
+    ///
+    /// There is no size limit by default.
+    ///
+    /// # Example
+    ///
+    /// This example demonstrates how Unicode mode can greatly increase the
+    /// size of the NFA.
+    ///
+    /// ```
+    /// use regex_automata::nfa::thompson::NFA;
+    ///
+    /// // 300KB isn't enough!
+    /// NFA::builder()
+    ///     .configure(NFA::config().nfa_size_limit(Some(300_000)))
+    ///     .build(r"\w{20}")
+    ///     .unwrap_err();
+    ///
+    /// // ... but 400KB probably is.
+    /// let nfa = NFA::builder()
+    ///     .configure(NFA::config().nfa_size_limit(Some(400_000)))
+    ///     .build(r"\w{20}")?;
+    ///
+    /// assert_eq!(nfa.pattern_len(), 1);
+    ///
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
+    pub fn nfa_size_limit(self, bytes: Option<usize>) -> Config {
+        Config { nfa_size_limit: Some(bytes), ..self }
+    }
+
+    /// Apply best effort heuristics to shrink the NFA at the expense of more
+    /// time/memory.
+    ///
+    /// This is enabled by default. Generally speaking, if one is using an NFA
+    /// to compile a DFA, then the extra time used to shrink the NFA will be
+    /// more than made up for during DFA construction (potentially by a lot).
+    /// In other words, enabling this can substantially decrease the overall
+    /// amount of time it takes to build a DFA.
+    ///
+    /// The only reason to disable this is if you want to compile an NFA and
+    /// start using it as quickly as possible without needing to build a DFA.
+    /// e.g., for an NFA simulation or for a lazy DFA.
+    pub fn shrink(self, yes: bool) -> Config {
+        Config { shrink: Some(yes), ..self }
+    }
+
+    /// Whether to include 'Capture' states in the NFA.
+    ///
+    /// This can only be enabled when compiling a forward NFA. This is
+    /// always disabled---with no way to override it---when the `reverse`
+    /// configuration is enabled.
+    ///
+    /// This is enabled by default.
+    pub fn captures(self, yes: bool) -> Config {
+        Config { captures: Some(yes), ..self }
+    }
+
+    /// Whether to compile an unanchored prefix into this NFA.
+    ///
+    /// This is enabled by default. It is made available for tests only to
+    /// make it easier to unit test the output of the compiler.
+    #[cfg(test)]
+    fn unanchored_prefix(self, yes: bool) -> Config {
+        Config { unanchored_prefix: Some(yes), ..self }
+    }
+
+    /// Return whether reverse mode is enabled (defaults to `false`).
+    pub fn get_reverse(&self) -> bool {
+        self.reverse.unwrap_or_default()
+    }
+
+    /// Return whether UTF-8 mode is enabled (defaults to `true`).
+    pub fn get_utf8(&self) -> bool {
+        match self.utf8 {
+            Some(yes) => yes,
+            None => true,
+        }
+    }
+
+    /// Return the configured NFA size limit, if any (defaults to no limit).
+    pub fn get_nfa_size_limit(&self) -> Option<usize> {
+        self.nfa_size_limit.flatten()
+    }
+
+    /// Return whether NFA shrinking is enabled (defaults to `true`).
+    pub fn get_shrink(&self) -> bool {
+        match self.shrink {
+            Some(yes) => yes,
+            None => true,
+        }
+    }
+
+    /// Return whether capture states are enabled. Captures default to
+    /// `true`, but are unconditionally disabled in reverse mode.
+    pub fn get_captures(&self) -> bool {
+        if self.get_reverse() {
+            false
+        } else {
+            self.captures.unwrap_or(true)
+        }
+    }
+
+    /// Return whether an unanchored prefix should be compiled. Outside of
+    /// test builds this is always `true`.
+    fn get_unanchored_prefix(&self) -> bool {
+        #[cfg(test)]
+        {
+            match self.unanchored_prefix {
+                Some(yes) => yes,
+                None => true,
+            }
+        }
+        #[cfg(not(test))]
+        {
+            true
+        }
+    }
+
+    /// Merge two configurations, preferring options explicitly set in `o`
+    /// over those set in `self`.
+    pub(crate) fn overwrite(self, o: Config) -> Config {
+        Config {
+            reverse: o.reverse.or(self.reverse),
+            utf8: o.utf8.or(self.utf8),
+            nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit),
+            shrink: o.shrink.or(self.shrink),
+            captures: o.captures.or(self.captures),
+            #[cfg(test)]
+            unanchored_prefix: o.unanchored_prefix.or(self.unanchored_prefix),
+        }
+    }
+}
+
+/// A builder for compiling an NFA.
+///
+/// A builder pairs the NFA compiler configuration with a regex-syntax
+/// `ParserBuilder`, so patterns can be compiled either from strings (parsed
+/// here) or directly from pre-parsed HIR values.
+#[derive(Clone, Debug)]
+pub struct Builder {
+    // The NFA compiler configuration; merged via `Config::overwrite` each
+    // time `configure` is called.
+    config: Config,
+    // The regex-syntax parser builder used by `build`/`build_many`. Ignored
+    // when building from an HIR directly.
+    parser: ParserBuilder,
+}
+
+impl Builder {
+    /// Create a new NFA builder with its default configuration.
+    pub fn new() -> Builder {
+        Builder { config: Config::default(), parser: ParserBuilder::new() }
+    }
+
+    /// Compile the given regular expression into an NFA.
+    ///
+    /// If there was a problem parsing the regex, then that error is returned.
+    ///
+    /// Otherwise, if there was a problem building the NFA, then an error is
+    /// returned. The only error that can occur is if the compiled regex would
+    /// exceed the size limits configured on this builder.
+    pub fn build(&self, pattern: &str) -> Result<NFA, Error> {
+        self.build_many(&[pattern])
+    }
+
+    /// Compile the given regular expressions into a single multi-pattern NFA.
+    ///
+    /// Each pattern is parsed with this builder's syntax configuration;
+    /// parse failures are reported via `Error::syntax`. Build failures are
+    /// the same as for `build`.
+    pub fn build_many<P: AsRef<str>>(
+        &self,
+        patterns: &[P],
+    ) -> Result<NFA, Error> {
+        let mut hirs = vec![];
+        for p in patterns {
+            hirs.push(
+                self.parser
+                    .build()
+                    .parse(p.as_ref())
+                    .map_err(Error::syntax)?,
+            );
+            log!(log::trace!("parsed: {:?}", p.as_ref()));
+        }
+        self.build_many_from_hir(&hirs)
+    }
+
+    /// Compile the given high level intermediate representation of a regular
+    /// expression into an NFA.
+    ///
+    /// If there was a problem building the NFA, then an error is returned.
+    /// The only error that can occur is if the compiled regex would exceed
+    /// the size limits configured on this builder.
+    pub fn build_from_hir(&self, expr: &Hir) -> Result<NFA, Error> {
+        self.build_from_hir_with(&mut Compiler::new(), expr)
+    }
+
+    /// Like `build_from_hir`, but compiles many HIR expressions into one
+    /// multi-pattern NFA.
+    pub fn build_many_from_hir<H: Borrow<Hir>>(
+        &self,
+        exprs: &[H],
+    ) -> Result<NFA, Error> {
+        self.build_many_from_hir_with(&mut Compiler::new(), exprs)
+    }
+
+    /// Compile the given high level intermediate representation of a regular
+    /// expression into the NFA given using the given compiler. Callers may
+    /// prefer this over `build` if they would like to reuse allocations while
+    /// compiling many regular expressions.
+    ///
+    /// On success, the given NFA is completely overwritten with the NFA
+    /// produced by the compiler.
+    ///
+    /// If there was a problem building the NFA, then an error is returned.
+    /// The only error that can occur is if the compiled regex would exceed
+    /// the size limits configured on this builder. When an error is returned,
+    /// the contents of `nfa` are unspecified and should not be relied upon.
+    /// However, it can still be reused in subsequent calls to this method.
+    fn build_from_hir_with(
+        &self,
+        compiler: &mut Compiler,
+        expr: &Hir,
+    ) -> Result<NFA, Error> {
+        self.build_many_from_hir_with(compiler, &[expr])
+    }
+
+    /// Multi-pattern work-horse: applies this builder's configuration to the
+    /// given compiler (which also resets its internal state) and then runs
+    /// compilation over all expressions.
+    fn build_many_from_hir_with<H: Borrow<Hir>>(
+        &self,
+        compiler: &mut Compiler,
+        exprs: &[H],
+    ) -> Result<NFA, Error> {
+        compiler.configure(self.config);
+        compiler.compile(exprs)
+    }
+
+    /// Apply the given NFA configuration options to this builder.
+    pub fn configure(&mut self, config: Config) -> &mut Builder {
+        self.config = self.config.overwrite(config);
+        self
+    }
+
+    /// Set the syntax configuration for this builder using
+    /// [`SyntaxConfig`](../../struct.SyntaxConfig.html).
+    ///
+    /// This permits setting things like case insensitivity, Unicode and multi
+    /// line mode.
+    ///
+    /// This syntax configuration generally only applies when an NFA is built
+    /// directly from a pattern string. If an NFA is built from an HIR, then
+    /// all syntax settings are ignored.
+    pub fn syntax(
+        &mut self,
+        config: crate::util::syntax::SyntaxConfig,
+    ) -> &mut Builder {
+        config.apply(&mut self.parser);
+        self
+    }
+}
+
+/// A compiler that converts a regex abstract syntax to an NFA via Thompson's
+/// construction. Namely, this compiler permits epsilon transitions between
+/// states.
+///
+/// All fields use interior mutability (`RefCell`/`Cell`) so that compilation
+/// routines can compose on `&self`; see the module docs for why.
+#[derive(Clone, Debug)]
+pub struct Compiler {
+    /// The configuration from the builder.
+    config: Config,
+    /// The final NFA that is built.
+    ///
+    /// Parts of this NFA are constructed during compilation, but the actual
+    /// states aren't added until a final "finish" step. This is because the
+    /// states constructed during compilation have unconditional epsilon
+    /// transitions, which makes the logic of compilation much simpler. The
+    /// "finish" step removes these unconditional epsilon transitions and must
+    /// therefore remap all of the transition state IDs.
+    nfa: RefCell<NFA>,
+    /// The set of compiled NFA states. Once a state is compiled, it is
+    /// assigned a state ID equivalent to its index in this list. Subsequent
+    /// compilation can modify previous states by adding new transitions.
+    states: RefCell<Vec<CState>>,
+    /// State used for compiling character classes to UTF-8 byte automata.
+    /// State is not retained between character class compilations. This just
+    /// serves to amortize allocation to the extent possible.
+    utf8_state: RefCell<Utf8State>,
+    /// State used for arranging character classes in reverse into a trie.
+    trie_state: RefCell<RangeTrie>,
+    /// State used for caching common suffixes when compiling reverse UTF-8
+    /// automata (for Unicode character classes).
+    utf8_suffix: RefCell<Utf8SuffixMap>,
+    /// A map used to re-map state IDs when translating the compiler's
+    /// internal NFA state representation to the external NFA representation.
+    remap: RefCell<Vec<StateID>>,
+    /// A set of compiler internal state IDs that correspond to states that
+    /// are exclusively epsilon transitions, i.e., goto instructions, combined
+    /// with the state that they point to. This is used to record said states
+    /// while transforming the compiler's internal NFA representation to the
+    /// external form.
+    empties: RefCell<Vec<(StateID, StateID)>>,
+    /// The total memory used by each of the 'CState's in 'states'. This only
+    /// includes heap usage by each state, and not the size of the state
+    /// itself.
+    memory_cstates: Cell<usize>,
+}
+
+/// A compiler intermediate state representation for an NFA that is only used
+/// during compilation. Once compilation is done, `CState`s are converted
+/// to `State`s (defined in the parent module), which have a much simpler
+/// representation.
+#[derive(Clone, Debug, Eq, PartialEq)]
+enum CState {
+    /// An empty state whose only purpose is to forward the automaton to
+    /// another state via an epsilon transition. These are useful during
+    /// compilation but are otherwise removed at the end.
+    Empty {
+        next: StateID,
+    },
+    /// An empty state that records a capture location.
+    ///
+    /// From the perspective of finite automata, this is precisely equivalent
+    /// to 'Empty', but serves the purpose of instructing NFA simulations to
+    /// record additional state when the finite state machine passes through
+    /// this epsilon transition.
+    ///
+    /// These transitions are treated as epsilon transitions with no
+    /// additional effects in DFAs.
+    ///
+    /// 'slot' in this context refers to the specific capture group offset
+    /// that is being recorded. Each capturing group has two slots
+    /// corresponding to the start and end of the matching portion of that
+    /// group.
+    CaptureStart {
+        next: StateID,
+        capture_index: u32,
+        name: Option<Arc<str>>,
+    },
+    /// The closing counterpart to 'CaptureStart': an empty state that
+    /// records the ending offset for capture group `capture_index`.
+    CaptureEnd {
+        next: StateID,
+        capture_index: u32,
+    },
+    /// A state that only transitions to `next` if the current input byte is
+    /// in the range `[start, end]` (inclusive on both ends).
+    Range {
+        range: Transition,
+    },
+    /// A state with possibly many transitions, represented in a sparse
+    /// fashion. Transitions are ordered lexicographically by input range.
+    /// As such, this may only be used when every transition has equal
+    /// priority. (In practice, this is only used for encoding large UTF-8
+    /// automata.) In contrast, a `Union` state has each alternate in order
+    /// of priority. Priority is used to implement greedy matching and also
+    /// alternations themselves, e.g., `abc|a` where `abc` has priority over
+    /// `a`.
+    ///
+    /// To clarify, it is possible to remove `Sparse` and represent all things
+    /// that `Sparse` is used for via `Union`. But this creates a more bloated
+    /// NFA with more epsilon transitions than is necessary in the special
+    /// case of character classes.
+    Sparse {
+        ranges: Vec<Transition>,
+    },
+    /// A conditional epsilon transition satisfied via some sort of
+    /// look-around.
+    Look {
+        look: Look,
+        next: StateID,
+    },
+    /// An alternation such that there exists an epsilon transition to all
+    /// states in `alternates`, where matches found via earlier transitions
+    /// are preferred over later transitions.
+    Union {
+        alternates: Vec<StateID>,
+    },
+    /// An alternation such that there exists an epsilon transition to all
+    /// states in `alternates`, where matches found via later transitions are
+    /// preferred over earlier transitions.
+    ///
+    /// This "reverse" state exists for convenience during compilation that
+    /// permits easy construction of non-greedy combinations of NFA states. At
+    /// the end of compilation, Union and UnionReverse states are merged into
+    /// one Union type of state, where the latter has its epsilon transitions
+    /// reversed to reflect the priority inversion.
+    ///
+    /// The "convenience" here arises from the fact that as new states are
+    /// added to the list of `alternates`, we would like that add operation
+    /// to be amortized constant time. But if we used a `Union`, we'd need to
+    /// prepend the state, which takes O(n) time. There are other approaches
+    /// we could use to solve this, but this seems simple enough.
+    UnionReverse {
+        alternates: Vec<StateID>,
+    },
+    /// A match state. There is at most one such occurrence of this state in
+    /// an NFA for each pattern compiled into the NFA. At time of writing, a
+    /// match state is always produced for every pattern given, but in theory,
+    /// if a pattern can never lead to a match, then the match state could be
+    /// omitted.
+    ///
+    /// `pattern_id` refers to the ID of the pattern itself, which corresponds
+    /// to the pattern's index (starting at 0). `start_id` refers to the
+    /// anchored NFA starting state corresponding to this pattern.
+    Match {
+        pattern_id: PatternID,
+        start_id: StateID,
+    },
+}
+
+/// A value that represents the result of compiling a sub-expression of a
+/// regex's HIR. Specifically, this represents a sub-graph of the NFA that
+/// has an initial state at `start` and a final state at `end`.
+#[derive(Clone, Copy, Debug)]
+pub struct ThompsonRef {
+    // The sub-graph's entry state.
+    start: StateID,
+    // The sub-graph's exit state. Subsequent compilation patches this state
+    // to point at whatever follows the sub-expression.
+    end: StateID,
+}
+
+impl Compiler {
+    /// Create a new compiler.
+    ///
+    /// The compiler must be reset via `configure` before each compilation;
+    /// the scratch structures below exist only to amortize allocations
+    /// across compilations.
+    pub fn new() -> Compiler {
+        Compiler {
+            config: Config::default(),
+            nfa: RefCell::new(NFA::empty()),
+            states: RefCell::new(vec![]),
+            utf8_state: RefCell::new(Utf8State::new()),
+            trie_state: RefCell::new(RangeTrie::new()),
+            // The suffix cache capacity (1000) is a fixed heuristic;
+            // presumably tuned for typical Unicode classes — see the
+            // `utf8_suffix` field docs on `Compiler`.
+            utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
+            remap: RefCell::new(vec![]),
+            empties: RefCell::new(vec![]),
+            memory_cstates: Cell::new(0),
+        }
+    }
+
+    /// Configure and prepare this compiler from the builder's knobs.
+    ///
+    /// The compiler must always be reconfigured by the builder before using
+    /// it to build an NFA. Namely, this will also clear any latent state in
+    /// the compiler used during previous compilations.
+    fn configure(&mut self, config: Config) {
+        self.config = config;
+        self.nfa.borrow_mut().clear();
+        self.states.borrow_mut().clear();
+        self.memory_cstates.set(0);
+        // We don't need to clear anything else since they are cleared on
+        // their own and only when they are used.
+    }
+
+    /// Compile the given expressions into a single final NFA.
+    ///
+    /// Each pattern is wrapped in capture group 0 and given its own match
+    /// state. An unanchored prefix is prepended unless every pattern is
+    /// anchored at the start (or the test-only knob disables it). On
+    /// success, the finished NFA is moved out of the compiler and returned.
+    ///
+    /// Errors if there are no more pattern IDs available, or if a configured
+    /// size limit is exceeded during construction.
+    fn compile<H: Borrow<Hir>>(&self, exprs: &[H]) -> Result<NFA, Error> {
+        if exprs.is_empty() {
+            return Ok(NFA::never_match());
+        }
+        if exprs.len() > PatternID::LIMIT {
+            return Err(Error::too_many_patterns(exprs.len()));
+        }
+
+        // We always add an unanchored prefix unless we were specifically told
+        // not to (for tests only), or if we know that the regex is anchored
+        // for all matches. When an unanchored prefix is not added, then the
+        // NFA's anchored and unanchored start states are equivalent.
+        let all_anchored =
+            exprs.iter().all(|e| e.borrow().is_anchored_start());
+        let anchored = !self.config.get_unanchored_prefix() || all_anchored;
+        let unanchored_prefix = if anchored {
+            // A no-op prefix, so the anchored and unanchored start states
+            // wind up equivalent.
+            self.c_empty()?
+        } else {
+            if self.config.get_utf8() {
+                self.c_unanchored_prefix_valid_utf8()?
+            } else {
+                self.c_unanchored_prefix_invalid_utf8()?
+            }
+        };
+
+        let compiled = self.c_alternation(
+            exprs.iter().with_pattern_ids().map(|(pid, e)| {
+                // Group 0 represents the overall match of each pattern.
+                let group_kind = hir::GroupKind::CaptureIndex(0);
+                let one = self.c_group(&group_kind, e.borrow())?;
+                let match_state_id = self.add_match(pid, one.start)?;
+                self.patch(one.end, match_state_id)?;
+                Ok(ThompsonRef { start: one.start, end: match_state_id })
+            }),
+        )?;
+        self.patch(unanchored_prefix.end, compiled.start)?;
+        self.finish(compiled.start, unanchored_prefix.start)?;
+        // Hand the built NFA to the caller and leave an empty NFA behind so
+        // this compiler can be reused.
+        Ok(self.nfa.replace(NFA::empty()))
+    }
+
+    /// Finishes the compilation process and populates the NFA attached to
+    /// this compiler with the final graph.
+    ///
+    /// This converts every intermediate `CState` into its final `State`
+    /// form, eliding bare `Empty` (goto) states entirely, and then remaps
+    /// all transition state IDs to the new, denser numbering.
+    fn finish(
+        &self,
+        start_anchored: StateID,
+        start_unanchored: StateID,
+    ) -> Result<(), Error> {
+        trace!(
+            "intermediate NFA compilation complete, \
+             intermediate NFA size: {} states, {} bytes on heap",
+            self.states.borrow().len(),
+            self.nfa_memory_usage(),
+        );
+        let mut nfa = self.nfa.borrow_mut();
+        let mut bstates = self.states.borrow_mut();
+        let mut remap = self.remap.borrow_mut();
+        let mut empties = self.empties.borrow_mut();
+        remap.resize(bstates.len(), StateID::ZERO);
+        empties.clear();
+
+        // The idea here is to convert our intermediate states to their final
+        // form. The only real complexity here is the process of converting
+        // transitions, which are expressed in terms of state IDs. The new
+        // set of states will be smaller because of partial epsilon removal,
+        // so the state IDs will not be the same.
+        for (sid, bstate) in bstates.iter_mut().with_state_ids() {
+            match *bstate {
+                CState::Empty { next } => {
+                    // Since we're removing empty states, we need to handle
+                    // them later since we don't yet know which new state this
+                    // empty state will be mapped to.
+                    empties.push((sid, next));
+                }
+                CState::CaptureStart { next, capture_index, ref name } => {
+                    // We can't remove this empty state because of the side
+                    // effect of capturing an offset for this capture slot.
+                    remap[sid] = nfa.add_capture_start(
+                        next,
+                        capture_index,
+                        name.clone(),
+                    )?;
+                }
+                CState::CaptureEnd { next, capture_index } => {
+                    // We can't remove this empty state because of the side
+                    // effect of capturing an offset for this capture slot.
+                    remap[sid] = nfa.add_capture_end(next, capture_index)?;
+                }
+                CState::Range { range } => {
+                    remap[sid] = nfa.add_range(range)?;
+                }
+                CState::Sparse { ref mut ranges } => {
+                    // Move the ranges out of the intermediate state rather
+                    // than cloning them.
+                    let ranges =
+                        mem::replace(ranges, vec![]).into_boxed_slice();
+                    remap[sid] =
+                        nfa.add_sparse(SparseTransitions { ranges })?;
+                }
+                CState::Look { look, next } => {
+                    remap[sid] = nfa.add_look(next, look)?;
+                }
+                CState::Union { ref mut alternates } => {
+                    let alternates =
+                        mem::replace(alternates, vec![]).into_boxed_slice();
+                    remap[sid] = nfa.add_union(alternates)?;
+                }
+                CState::UnionReverse { ref mut alternates } => {
+                    // UnionReverse lists its alternates in inverted priority
+                    // order; reverse them so the final NFA only needs one
+                    // kind of union state.
+                    let mut alternates =
+                        mem::replace(alternates, vec![]).into_boxed_slice();
+                    alternates.reverse();
+                    remap[sid] = nfa.add_union(alternates)?;
+                }
+                CState::Match { start_id, .. } => {
+                    remap[sid] = nfa.add_match()?;
+                    nfa.finish_pattern(start_id)?;
+                }
+            }
+        }
+        for &(empty_id, mut empty_next) in empties.iter() {
+            // empty states can point to other empty states, forming a chain.
+            // So we must follow the chain until the end, which must end at
+            // a non-empty state, and therefore, a state that is correctly
+            // remapped. We are guaranteed to terminate because our compiler
+            // never builds a loop among only empty states.
+            while let CState::Empty { next } = bstates[empty_next] {
+                empty_next = next;
+            }
+            remap[empty_id] = remap[empty_next];
+        }
+        nfa.set_start_anchored(start_anchored);
+        nfa.set_start_unanchored(start_unanchored);
+        nfa.remap(&remap);
+        trace!(
+            "final NFA (reverse? {:?}) compilation complete, \
+             final NFA size: {} states, {} bytes on heap",
+            self.config.get_reverse(),
+            nfa.states().len(),
+            nfa.memory_usage(),
+        );
+        Ok(())
+    }
+
+    /// Compile a single HIR sub-expression by dispatching on its kind.
+    ///
+    /// Returns the start/end states of the compiled sub-graph.
+    fn c(&self, expr: &Hir) -> Result<ThompsonRef, Error> {
+        match *expr.kind() {
+            HirKind::Empty => self.c_empty(),
+            HirKind::Literal(Literal::Unicode(ch)) => self.c_char(ch),
+            HirKind::Literal(Literal::Byte(b)) => self.c_range(b, b),
+            HirKind::Class(Class::Bytes(ref c)) => self.c_byte_class(c),
+            HirKind::Class(Class::Unicode(ref c)) => self.c_unicode_class(c),
+            HirKind::Anchor(ref anchor) => self.c_anchor(anchor),
+            HirKind::WordBoundary(ref wb) => self.c_word_boundary(wb),
+            HirKind::Repetition(ref rep) => self.c_repetition(rep),
+            HirKind::Group(ref group) => self.c_group(&group.kind, &group.hir),
+            HirKind::Concat(ref es) => {
+                self.c_concat(es.iter().map(|e| self.c(e)))
+            }
+            HirKind::Alternation(ref es) => {
+                self.c_alternation(es.iter().map(|e| self.c(e)))
+            }
+        }
+    }
+
+    /// Compile a concatenation of lazily compiled sub-expressions by
+    /// patching each sub-graph's end state to the next sub-graph's start.
+    ///
+    /// When the compiler is in reverse mode, the iterator is consumed from
+    /// the back, which reverses the order of concatenation.
+    ///
+    /// An empty iterator compiles to a single empty state.
+    fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, Error>
+    where
+        I: DoubleEndedIterator<Item = Result<ThompsonRef, Error>>,
+    {
+        let first = if self.is_reverse() { it.next_back() } else { it.next() };
+        let ThompsonRef { start, mut end } = match first {
+            Some(result) => result?,
+            None => return self.c_empty(),
+        };
+        loop {
+            let next =
+                if self.is_reverse() { it.next_back() } else { it.next() };
+            let compiled = match next {
+                Some(result) => result?,
+                None => break,
+            };
+            self.patch(end, compiled.start)?;
+            end = compiled.end;
+        }
+        Ok(ThompsonRef { start, end })
+    }
+
+    /// Compile an alternation of sub-expressions into a single union state,
+    /// where earlier alternates are preferred over later ones.
+    ///
+    /// A one-element alternation compiles to the element itself, with no
+    /// union or shared end state added.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the iterator yields no elements; alternations must be
+    /// non-empty.
+    fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef, Error>
+    where
+        I: Iterator<Item = Result<ThompsonRef, Error>>,
+    {
+        let first = it.next().expect("alternations must be non-empty")?;
+        let second = match it.next() {
+            None => return Ok(first),
+            Some(result) => result?,
+        };
+
+        let union = self.add_union()?;
+        let end = self.add_empty()?;
+        self.patch(union, first.start)?;
+        self.patch(first.end, end)?;
+        self.patch(union, second.start)?;
+        self.patch(second.end, end)?;
+        for result in it {
+            let compiled = result?;
+            self.patch(union, compiled.start)?;
+            self.patch(compiled.end, end)?;
+        }
+        Ok(ThompsonRef { start: union, end })
+    }
+
+    /// Compile a group.
+    ///
+    /// When captures are disabled in the configuration, or when the group is
+    /// non-capturing, this compiles to the inner expression alone. Otherwise,
+    /// the inner sub-graph is bracketed by capture-start and capture-end
+    /// states for the group's index (and name, if any).
+    fn c_group(
+        &self,
+        kind: &hir::GroupKind,
+        expr: &Hir,
+    ) -> Result<ThompsonRef, Error> {
+        if !self.config.get_captures() {
+            return self.c(expr);
+        }
+        let (capi, name) = match *kind {
+            hir::GroupKind::NonCapturing => return self.c(expr),
+            hir::GroupKind::CaptureIndex(index) => (index, None),
+            hir::GroupKind::CaptureName { ref name, index } => {
+                (index, Some(Arc::from(&**name)))
+            }
+        };
+
+        let start = self.add_capture_start(capi, name)?;
+        let inner = self.c(expr)?;
+        let end = self.add_capture_end(capi)?;
+
+        self.patch(start, inner.start)?;
+        self.patch(inner.end, end)?;
+        Ok(ThompsonRef { start, end })
+    }
+
+    /// Compile a repetition by dispatching on its kind: the `?`, `*`, `+`,
+    /// `{n}`, `{n,}` and `{m,n}` forms each map to a specialized helper.
+    fn c_repetition(
+        &self,
+        rep: &hir::Repetition,
+    ) -> Result<ThompsonRef, Error> {
+        match rep.kind {
+            hir::RepetitionKind::ZeroOrOne => {
+                self.c_zero_or_one(&rep.hir, rep.greedy)
+            }
+            hir::RepetitionKind::ZeroOrMore => {
+                self.c_at_least(&rep.hir, rep.greedy, 0)
+            }
+            hir::RepetitionKind::OneOrMore => {
+                self.c_at_least(&rep.hir, rep.greedy, 1)
+            }
+            hir::RepetitionKind::Range(ref rng) => match *rng {
+                hir::RepetitionRange::Exactly(count) => {
+                    self.c_exactly(&rep.hir, count)
+                }
+                hir::RepetitionRange::AtLeast(m) => {
+                    self.c_at_least(&rep.hir, rep.greedy, m)
+                }
+                hir::RepetitionRange::Bounded(min, max) => {
+                    self.c_bounded(&rep.hir, rep.greedy, min, max)
+                }
+            },
+        }
+    }
+
+    /// Compile a bounded repetition `e{min,max}`.
+    ///
+    /// The first `min` copies are compiled as a plain concatenation. Each of
+    /// the remaining `max - min` copies is optional, and every union
+    /// short-circuits directly to a single shared trailing empty state (see
+    /// the comment below for why that shape is chosen). `greedy` selects the
+    /// union flavor at each choice point.
+    fn c_bounded(
+        &self,
+        expr: &Hir,
+        greedy: bool,
+        min: u32,
+        max: u32,
+    ) -> Result<ThompsonRef, Error> {
+        let prefix = self.c_exactly(expr, min)?;
+        if min == max {
+            return Ok(prefix);
+        }
+
+        // It is tempting here to compile the rest here as a concatenation
+        // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it
+        // were `aaa?a?a?`. The problem here is that it leads to this program:
+        //
+        //     >000000: 61 => 01
+        //      000001: 61 => 02
+        //      000002: union(03, 04)
+        //      000003: 61 => 04
+        //      000004: union(05, 06)
+        //      000005: 61 => 06
+        //      000006: union(07, 08)
+        //      000007: 61 => 08
+        //      000008: MATCH
+        //
+        // And effectively, once you hit state 2, the epsilon closure will
+        // include states 3, 5, 6, 7 and 8, which is quite a bit. It is better
+        // to instead compile it like so:
+        //
+        //     >000000: 61 => 01
+        //      000001: 61 => 02
+        //      000002: union(03, 08)
+        //      000003: 61 => 04
+        //      000004: union(05, 08)
+        //      000005: 61 => 06
+        //      000006: union(07, 08)
+        //      000007: 61 => 08
+        //      000008: MATCH
+        //
+        // So that the epsilon closure of state 2 is now just 3 and 8.
+        let empty = self.add_empty()?;
+        let mut prev_end = prefix.end;
+        for _ in min..max {
+            let union = if greedy {
+                self.add_union()
+            } else {
+                self.add_reverse_union()
+            }?;
+            let compiled = self.c(expr)?;
+            self.patch(prev_end, union)?;
+            self.patch(union, compiled.start)?;
+            self.patch(union, empty)?;
+            prev_end = compiled.end;
+        }
+        self.patch(prev_end, empty)?;
+        Ok(ThompsonRef { start: prefix.start, end: empty })
+    }
+
+    /// Compile an unbounded repetition `e{n,}`. This covers `e*` when
+    /// `n == 0` and `e+` when `n == 1`.
+    ///
+    /// `greedy` selects between `Union` and `UnionReverse` at every
+    /// loop/exit choice point, implementing greedy vs. lazy preference
+    /// ordering.
+    fn c_at_least(
+        &self,
+        expr: &Hir,
+        greedy: bool,
+        n: u32,
+    ) -> Result<ThompsonRef, Error> {
+        if n == 0 {
+            // When the expression cannot match the empty string, then we
+            // can get away with something much simpler: just one 'alt'
+            // instruction that optionally repeats itself. But if the expr
+            // can match the empty string... see below.
+            if !expr.is_match_empty() {
+                let union = if greedy {
+                    self.add_union()
+                } else {
+                    self.add_reverse_union()
+                }?;
+                let compiled = self.c(expr)?;
+                self.patch(union, compiled.start)?;
+                self.patch(compiled.end, union)?;
+                return Ok(ThompsonRef { start: union, end: union });
+            }
+
+            // What's going on here? Shouldn't x* be simpler than this? It
+            // turns out that when implementing leftmost-first (Perl-like)
+            // match semantics, x* results in an incorrect preference order
+            // when computing the transitive closure of states if and only if
+            // 'x' can match the empty string. So instead, we compile x* as
+            // (x+)?, which preserves the correct preference order.
+            //
+            // See: https://github.com/rust-lang/regex/issues/779
+            let compiled = self.c(expr)?;
+            let plus = if greedy {
+                self.add_union()
+            } else {
+                self.add_reverse_union()
+            }?;
+            self.patch(compiled.end, plus)?;
+            self.patch(plus, compiled.start)?;
+
+            let question = if greedy {
+                self.add_union()
+            } else {
+                self.add_reverse_union()
+            }?;
+            let empty = self.add_empty()?;
+            self.patch(question, compiled.start)?;
+            self.patch(question, empty)?;
+            self.patch(plus, empty)?;
+            Ok(ThompsonRef { start: question, end: empty })
+        } else if n == 1 {
+            // e+ is just e followed by a union that either loops back to e's
+            // start or falls through.
+            let compiled = self.c(expr)?;
+            let union = if greedy {
+                self.add_union()
+            } else {
+                self.add_reverse_union()
+            }?;
+            self.patch(compiled.end, union)?;
+            self.patch(union, compiled.start)?;
+            Ok(ThompsonRef { start: compiled.start, end: union })
+        } else {
+            // e{n,} is e{n-1} followed by e+, sharing the final copy of e
+            // as the loop body.
+            let prefix = self.c_exactly(expr, n - 1)?;
+            let last = self.c(expr)?;
+            let union = if greedy {
+                self.add_union()
+            } else {
+                self.add_reverse_union()
+            }?;
+            self.patch(prefix.end, last.start)?;
+            self.patch(last.end, union)?;
+            self.patch(union, last.start)?;
+            Ok(ThompsonRef { start: prefix.start, end: union })
+        }
+    }
+
+ fn c_zero_or_one(
+ &self,
+ expr: &Hir,
+ greedy: bool,
+ ) -> Result<ThompsonRef, Error> {
+ let union =
+ if greedy { self.add_union() } else { self.add_reverse_union() }?;
+ let compiled = self.c(expr)?;
+ let empty = self.add_empty()?;
+ self.patch(union, compiled.start)?;
+ self.patch(union, empty)?;
+ self.patch(compiled.end, empty)?;
+ Ok(ThompsonRef { start: union, end: empty })
+ }
+
+ fn c_exactly(&self, expr: &Hir, n: u32) -> Result<ThompsonRef, Error> {
+ let it = (0..n).map(|_| self.c(expr));
+ self.c_concat(it)
+ }
+
+ fn c_byte_class(
+ &self,
+ cls: &hir::ClassBytes,
+ ) -> Result<ThompsonRef, Error> {
+ let end = self.add_empty()?;
+ let mut trans = Vec::with_capacity(cls.ranges().len());
+ for r in cls.iter() {
+ trans.push(Transition {
+ start: r.start(),
+ end: r.end(),
+ next: end,
+ });
+ }
+ Ok(ThompsonRef { start: self.add_sparse(trans)?, end })
+ }
+
    /// Compile a Unicode character class into an automaton over UTF-8
    /// bytes. Depending on configuration, this uses one of three
    /// strategies: a single sparse state (all-ASCII classes), a
    /// suffix-sharing reverse compiler, or the minimizing UTF-8 compiler
    /// (optionally fed through a range trie when compiling in reverse).
    fn c_unicode_class(
        &self,
        cls: &hir::ClassUnicode,
    ) -> Result<ThompsonRef, Error> {
        // If all we have are ASCII ranges wrapped in a Unicode package, then
        // there is zero reason to bring out the big guns. We can fit all ASCII
        // ranges within a single sparse state.
        if cls.is_all_ascii() {
            let end = self.add_empty()?;
            let mut trans = Vec::with_capacity(cls.ranges().len());
            for r in cls.iter() {
                // is_all_ascii() guarantees each scalar value fits in one
                // byte, so the 'as u8' casts below are lossless.
                assert!(r.start() <= '\x7F');
                assert!(r.end() <= '\x7F');
                trans.push(Transition {
                    start: r.start() as u8,
                    end: r.end() as u8,
                    next: end,
                });
            }
            Ok(ThompsonRef { start: self.add_sparse(trans)?, end })
        } else if self.is_reverse() {
            if !self.config.get_shrink() {
                // When we don't want to spend the extra time shrinking, we
                // compile the UTF-8 automaton in reverse using something like
                // the "naive" approach, but will attempt to re-use common
                // suffixes.
                self.c_unicode_class_reverse_with_suffix(cls)
            } else {
                // When we want to shrink our NFA for reverse UTF-8 automata,
                // we cannot feed UTF-8 sequences directly to the UTF-8
                // compiler, since the UTF-8 compiler requires all sequences
                // to be lexicographically sorted. Instead, we organize our
                // sequences into a range trie, which can then output our
                // sequences in the correct order. Unfortunately, building the
                // range trie is fairly expensive (but not nearly as expensive
                // as building a DFA). Hence the reason why the 'shrink' option
                // exists, so that this path can be toggled off. For example,
                // we might want to turn this off if we know we won't be
                // compiling a DFA.
                let mut trie = self.trie_state.borrow_mut();
                trie.clear();

                for rng in cls.iter() {
                    for mut seq in Utf8Sequences::new(rng.start(), rng.end()) {
                        seq.reverse();
                        trie.insert(seq.as_slice());
                    }
                }
                let mut utf8_state = self.utf8_state.borrow_mut();
                let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state)?;
                trie.iter(|seq| {
                    utf8c.add(&seq)?;
                    Ok(())
                })?;
                utf8c.finish()
            }
        } else {
            // In the forward direction, we always shrink our UTF-8 automata
            // because we can stream it right into the UTF-8 compiler. There
            // is almost no downside (in either memory or time) to using this
            // approach.
            let mut utf8_state = self.utf8_state.borrow_mut();
            let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state)?;
            for rng in cls.iter() {
                for seq in Utf8Sequences::new(rng.start(), rng.end()) {
                    utf8c.add(seq.as_slice())?;
                }
            }
            utf8c.finish()
        }

        // For reference, the code below is the "naive" version of compiling a
        // UTF-8 automaton. It is deliciously simple (and works for both the
        // forward and reverse cases), but will unfortunately produce very
        // large NFAs. When compiling a forward automaton, the size difference
        // can sometimes be an order of magnitude. For example, the '\w' regex
        // will generate about ~3000 NFA states using the naive approach below,
        // but only 283 states when using the approach above. This is because
        // the approach above actually compiles a *minimal* (or near minimal,
        // because of the bounded hashmap for reusing equivalent states) UTF-8
        // automaton.
        //
        // The code below is kept as a reference point in order to make it
        // easier to understand the higher level goal here. Although, it will
        // almost certainly bit-rot, so keep that in mind.
        /*
        let it = cls
            .iter()
            .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end()))
            .map(|seq| {
                let it = seq
                    .as_slice()
                    .iter()
                    .map(|rng| self.c_range(rng.start, rng.end));
                self.c_concat(it)
            });
        self.c_alternation(it)
        */
    }
+
    /// Compile a Unicode class in the reverse direction without the range
    /// trie, re-using previously-compiled common suffixes via a bounded
    /// cache keyed on (target state, byte range).
    fn c_unicode_class_reverse_with_suffix(
        &self,
        cls: &hir::ClassUnicode,
    ) -> Result<ThompsonRef, Error> {
        // N.B. It would likely be better to cache common *prefixes* in the
        // reverse direction, but it's not quite clear how to do that. The
        // advantage of caching suffixes is that it does give us a win, and
        // has a very small additional overhead.
        let mut cache = self.utf8_suffix.borrow_mut();
        cache.clear();

        let union = self.add_union()?;
        let alt_end = self.add_empty()?;
        for urng in cls.iter() {
            for seq in Utf8Sequences::new(urng.start(), urng.end()) {
                // Walk the (forward-order) byte ranges of this sequence,
                // building states from the shared end state backwards so
                // that the resulting automaton matches in reverse.
                let mut end = alt_end;
                for brng in seq.as_slice() {
                    let key = Utf8SuffixKey {
                        from: end,
                        start: brng.start,
                        end: brng.end,
                    };
                    let hash = cache.hash(&key);
                    if let Some(id) = cache.get(&key, hash) {
                        // This exact (target, range) suffix was already
                        // compiled; re-use it.
                        end = id;
                        continue;
                    }

                    let compiled = self.c_range(brng.start, brng.end)?;
                    self.patch(compiled.end, end)?;
                    end = compiled.start;
                    cache.set(key, hash, end);
                }
                self.patch(union, end)?;
            }
        }
        Ok(ThompsonRef { start: union, end: alt_end })
    }
+
+ fn c_anchor(&self, anchor: &Anchor) -> Result<ThompsonRef, Error> {
+ let look = match *anchor {
+ Anchor::StartLine => Look::StartLine,
+ Anchor::EndLine => Look::EndLine,
+ Anchor::StartText => Look::StartText,
+ Anchor::EndText => Look::EndText,
+ };
+ let id = self.add_look(look)?;
+ Ok(ThompsonRef { start: id, end: id })
+ }
+
+ fn c_word_boundary(
+ &self,
+ wb: &WordBoundary,
+ ) -> Result<ThompsonRef, Error> {
+ let look = match *wb {
+ WordBoundary::Unicode => Look::WordBoundaryUnicode,
+ WordBoundary::UnicodeNegate => Look::WordBoundaryUnicodeNegate,
+ WordBoundary::Ascii => Look::WordBoundaryAscii,
+ WordBoundary::AsciiNegate => Look::WordBoundaryAsciiNegate,
+ };
+ let id = self.add_look(look)?;
+ Ok(ThompsonRef { start: id, end: id })
+ }
+
+ fn c_char(&self, ch: char) -> Result<ThompsonRef, Error> {
+ let mut buf = [0; 4];
+ let it = ch
+ .encode_utf8(&mut buf)
+ .as_bytes()
+ .iter()
+ .map(|&b| self.c_range(b, b));
+ self.c_concat(it)
+ }
+
+ fn c_range(&self, start: u8, end: u8) -> Result<ThompsonRef, Error> {
+ let id = self.add_range(start, end)?;
+ Ok(ThompsonRef { start: id, end: id })
+ }
+
+ fn c_empty(&self) -> Result<ThompsonRef, Error> {
+ let id = self.add_empty()?;
+ Ok(ThompsonRef { start: id, end: id })
+ }
+
    /// Compile the implicit unanchored prefix '(?s:.)*?' where '.' matches
    /// any Unicode scalar value (i.e., only valid UTF-8).
    fn c_unanchored_prefix_valid_utf8(&self) -> Result<ThompsonRef, Error> {
        self.c_at_least(&Hir::any(false), false, 0)
    }
+
    /// Compile the implicit unanchored prefix '(?s-u:.)*?' where '.'
    /// matches any single byte, permitting invalid UTF-8.
    fn c_unanchored_prefix_invalid_utf8(&self) -> Result<ThompsonRef, Error> {
        self.c_at_least(&Hir::any(true), false, 0)
    }
+
    /// Set the transition of state 'from' to point at state 'to'.
    ///
    /// For single-successor states this overwrites the 'next' field; for
    /// union states it appends 'to' as a new alternate (which grows the
    /// intermediate NFA's heap usage, so the size limit is re-checked).
    /// Patching a sparse state is a bug and panics; patching a match state
    /// is a no-op.
    fn patch(&self, from: StateID, to: StateID) -> Result<(), Error> {
        // Snapshot heap accounting so we only re-check the size limit when
        // this patch actually grew a union's alternate list.
        let old_memory_cstates = self.memory_cstates.get();
        match self.states.borrow_mut()[from] {
            CState::Empty { ref mut next } => {
                *next = to;
            }
            CState::Range { ref mut range } => {
                range.next = to;
            }
            CState::Sparse { .. } => {
                panic!("cannot patch from a sparse NFA state")
            }
            CState::Look { ref mut next, .. } => {
                *next = to;
            }
            CState::Union { ref mut alternates } => {
                alternates.push(to);
                self.memory_cstates
                    .set(old_memory_cstates + mem::size_of::<StateID>());
            }
            CState::UnionReverse { ref mut alternates } => {
                alternates.push(to);
                self.memory_cstates
                    .set(old_memory_cstates + mem::size_of::<StateID>());
            }
            CState::CaptureStart { ref mut next, .. } => {
                *next = to;
            }
            CState::CaptureEnd { ref mut next, .. } => {
                *next = to;
            }
            // A match state has no outgoing transition to patch.
            CState::Match { .. } => {}
        }
        if old_memory_cstates != self.memory_cstates.get() {
            self.check_nfa_size_limit()?;
        }
        Ok(())
    }
+
    /// Add an empty (unconditional epsilon) state whose successor is
    /// filled in later via 'patch'.
    fn add_empty(&self) -> Result<StateID, Error> {
        self.add_state(CState::Empty { next: StateID::ZERO })
    }
+
+ fn add_capture_start(
+ &self,
+ capture_index: u32,
+ name: Option<Arc<str>>,
+ ) -> Result<StateID, Error> {
+ self.add_state(CState::CaptureStart {
+ next: StateID::ZERO,
+ capture_index,
+ name,
+ })
+ }
+
+ fn add_capture_end(&self, capture_index: u32) -> Result<StateID, Error> {
+ self.add_state(CState::CaptureEnd {
+ next: StateID::ZERO,
+ capture_index,
+ })
+ }
+
+ fn add_range(&self, start: u8, end: u8) -> Result<StateID, Error> {
+ let trans = Transition { start, end, next: StateID::ZERO };
+ self.add_state(CState::Range { range: trans })
+ }
+
+ fn add_sparse(&self, ranges: Vec<Transition>) -> Result<StateID, Error> {
+ if ranges.len() == 1 {
+ self.add_state(CState::Range { range: ranges[0] })
+ } else {
+ self.add_state(CState::Sparse { ranges })
+ }
+ }
+
+ fn add_look(&self, mut look: Look) -> Result<StateID, Error> {
+ if self.is_reverse() {
+ look = look.reversed();
+ }
+ self.add_state(CState::Look { look, next: StateID::ZERO })
+ }
+
+ fn add_union(&self) -> Result<StateID, Error> {
+ self.add_state(CState::Union { alternates: vec![] })
+ }
+
+ fn add_reverse_union(&self) -> Result<StateID, Error> {
+ self.add_state(CState::UnionReverse { alternates: vec![] })
+ }
+
    /// Add a match state for the given pattern, recording the pattern's
    /// start state alongside it.
    fn add_match(
        &self,
        pattern_id: PatternID,
        start_id: StateID,
    ) -> Result<StateID, Error> {
        self.add_state(CState::Match { pattern_id, start_id })
    }
+
    /// Append 'state' to the intermediate NFA and return its ID.
    ///
    /// Returns an error if the state count exceeds what StateID can
    /// represent, or if adding the state pushes heap usage over the
    /// configured NFA size limit.
    fn add_state(&self, state: CState) -> Result<StateID, Error> {
        let mut states = self.states.borrow_mut();
        let id = StateID::new(states.len())
            .map_err(|_| Error::too_many_states(states.len()))?;
        self.memory_cstates
            .set(self.memory_cstates.get() + state.memory_usage());
        states.push(state);
        // If we don't explicitly drop this, then 'nfa_memory_usage' will also
        // try to borrow it when we check the size limit and hit an error.
        drop(states);
        self.check_nfa_size_limit()?;
        Ok(id)
    }
+
    /// Returns true when this compiler is configured to build a reverse
    /// NFA (matching the input right-to-left).
    fn is_reverse(&self) -> bool {
        self.config.get_reverse()
    }
+
+ /// If an NFA size limit was set, this checks that the NFA compiled so far
+ /// fits within that limit. If so, then nothing is returned. Otherwise, an
+ /// error is returned.
+ ///
+ /// This should be called after increasing the heap usage of the
+ /// intermediate NFA.
+ ///
+ /// Note that this borrows 'self.states', so callers should ensure there is
+ /// no mutable borrow of it outstanding.
+ fn check_nfa_size_limit(&self) -> Result<(), Error> {
+ if let Some(limit) = self.config.get_nfa_size_limit() {
+ if self.nfa_memory_usage() > limit {
+ return Err(Error::exceeded_size_limit(limit));
+ }
+ }
+ Ok(())
+ }
+
    /// Returns the heap memory usage, in bytes, of the NFA compiled so far.
    ///
    /// Note that this is an approximation of how big the final NFA will be.
    /// In practice, the final NFA will likely be a bit smaller since it uses
    /// things like `Box<[T]>` instead of `Vec<T>`.
    ///
    /// This is the flat size of the state list plus the per-state heap
    /// usage tracked incrementally in 'memory_cstates'.
    fn nfa_memory_usage(&self) -> usize {
        self.states.borrow().len() * mem::size_of::<CState>()
            + self.memory_cstates.get()
    }
+}
+
+impl CState {
+ fn memory_usage(&self) -> usize {
+ match *self {
+ CState::Empty { .. }
+ | CState::Range { .. }
+ | CState::Look { .. }
+ | CState::CaptureStart { .. }
+ | CState::CaptureEnd { .. }
+ | CState::Match { .. } => 0,
+ CState::Sparse { ref ranges } => {
+ ranges.len() * mem::size_of::<Transition>()
+ }
+ CState::Union { ref alternates } => {
+ alternates.len() * mem::size_of::<StateID>()
+ }
+ CState::UnionReverse { ref alternates } => {
+ alternates.len() * mem::size_of::<StateID>()
+ }
+ }
+ }
+}
+
/// A helper that compiles lexicographically sorted UTF-8 byte sequences
/// into a (near) minimal byte automaton, re-using equivalent states via a
/// bounded map.
#[derive(Debug)]
struct Utf8Compiler<'a> {
    // The NFA compiler that new states are added to.
    nfac: &'a Compiler,
    // Scratch state shared across compilations to amortize allocation.
    state: &'a mut Utf8State,
    // The shared final state that every compiled sequence ends at.
    target: StateID,
}
+
/// Reusable scratch space for Utf8Compiler.
#[derive(Clone, Debug)]
struct Utf8State {
    // A bounded cache mapping already-compiled transition sets to their
    // state IDs, enabling state re-use (and thus minimization).
    compiled: Utf8BoundedMap,
    // A stack of not-yet-finalized nodes, one per byte position of the
    // sequence currently being added.
    uncompiled: Vec<Utf8Node>,
}
+
/// One in-progress node on the Utf8Compiler's stack.
#[derive(Clone, Debug)]
struct Utf8Node {
    // Transitions whose targets have already been fixed.
    trans: Vec<Transition>,
    // The most recent transition, whose target is not yet known.
    last: Option<Utf8LastTransition>,
}
+
/// A byte range for a pending transition whose target state is assigned
/// later, once the rest of the suffix has been compiled.
#[derive(Clone, Debug)]
struct Utf8LastTransition {
    start: u8,
    end: u8,
}
+
+impl Utf8State {
+ fn new() -> Utf8State {
+ Utf8State { compiled: Utf8BoundedMap::new(10_000), uncompiled: vec![] }
+ }
+
+ fn clear(&mut self) {
+ self.compiled.clear();
+ self.uncompiled.clear();
+ }
+}
+
impl<'a> Utf8Compiler<'a> {
    /// Create a new UTF-8 compiler that writes states into 'nfac'. The
    /// provided scratch state is cleared and seeded with a single empty
    /// root node.
    fn new(
        nfac: &'a Compiler,
        state: &'a mut Utf8State,
    ) -> Result<Utf8Compiler<'a>, Error> {
        let target = nfac.add_empty()?;
        state.clear();
        let mut utf8c = Utf8Compiler { nfac, state, target };
        utf8c.add_empty();
        Ok(utf8c)
    }

    /// Flush any remaining uncompiled nodes and return the start/end of
    /// the compiled automaton.
    fn finish(&mut self) -> Result<ThompsonRef, Error> {
        self.compile_from(0)?;
        let node = self.pop_root();
        let start = self.compile(node)?;
        Ok(ThompsonRef { start, end: self.target })
    }

    /// Add one UTF-8 byte-range sequence. Sequences must be added in
    /// lexicographic order; the shared prefix with the previous sequence
    /// is kept on the stack and only the divergent suffix is (re)built.
    fn add(&mut self, ranges: &[Utf8Range]) -> Result<(), Error> {
        let prefix_len = ranges
            .iter()
            .zip(&self.state.uncompiled)
            .take_while(|&(range, node)| {
                node.last.as_ref().map_or(false, |t| {
                    (t.start, t.end) == (range.start, range.end)
                })
            })
            .count();
        assert!(prefix_len < ranges.len());
        self.compile_from(prefix_len)?;
        self.add_suffix(&ranges[prefix_len..]);
        Ok(())
    }

    /// Compile (bottom-up) every stacked node strictly below index
    /// 'from', then freeze the pending transition of the node now on top.
    fn compile_from(&mut self, from: usize) -> Result<(), Error> {
        let mut next = self.target;
        while from + 1 < self.state.uncompiled.len() {
            let node = self.pop_freeze(next);
            next = self.compile(node)?;
        }
        self.top_last_freeze(next);
        Ok(())
    }

    /// Turn a finished transition set into an NFA state, re-using an
    /// existing equivalent state from the cache when possible.
    fn compile(&mut self, node: Vec<Transition>) -> Result<StateID, Error> {
        let hash = self.state.compiled.hash(&node);
        if let Some(id) = self.state.compiled.get(&node, hash) {
            return Ok(id);
        }
        let id = self.nfac.add_sparse(node.clone())?;
        self.state.compiled.set(node, hash, id);
        Ok(id)
    }

    /// Push the divergent suffix 'ranges' onto the stack: the first range
    /// becomes the pending transition of the current top node, and each
    /// remaining range becomes a new node.
    fn add_suffix(&mut self, ranges: &[Utf8Range]) {
        assert!(!ranges.is_empty());
        let last = self
            .state
            .uncompiled
            .len()
            .checked_sub(1)
            .expect("non-empty nodes");
        assert!(self.state.uncompiled[last].last.is_none());
        self.state.uncompiled[last].last = Some(Utf8LastTransition {
            start: ranges[0].start,
            end: ranges[0].end,
        });
        for r in &ranges[1..] {
            self.state.uncompiled.push(Utf8Node {
                trans: vec![],
                last: Some(Utf8LastTransition { start: r.start, end: r.end }),
            });
        }
    }

    /// Push a fresh node with no transitions onto the stack.
    fn add_empty(&mut self) {
        self.state.uncompiled.push(Utf8Node { trans: vec![], last: None });
    }

    /// Pop the top node, pointing its pending transition at 'next', and
    /// return its finished transition set.
    fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> {
        let mut uncompiled = self.state.uncompiled.pop().unwrap();
        uncompiled.set_last_transition(next);
        uncompiled.trans
    }

    /// Pop the root node (the only one remaining) and return its
    /// transition set. The root never has a pending transition.
    fn pop_root(&mut self) -> Vec<Transition> {
        assert_eq!(self.state.uncompiled.len(), 1);
        assert!(self.state.uncompiled[0].last.is_none());
        self.state.uncompiled.pop().expect("non-empty nodes").trans
    }

    /// Point the top node's pending transition at 'next' without popping.
    fn top_last_freeze(&mut self, next: StateID) {
        let last = self
            .state
            .uncompiled
            .len()
            .checked_sub(1)
            .expect("non-empty nodes");
        self.state.uncompiled[last].set_last_transition(next);
    }
}
+
+impl Utf8Node {
+ fn set_last_transition(&mut self, next: StateID) {
+ if let Some(last) = self.last.take() {
+ self.trans.push(Transition {
+ start: last.start,
+ end: last.end,
+ next,
+ });
+ }
+ }
+}
+
#[cfg(test)]
mod tests {
    use alloc::vec::Vec;

    use super::{
        Builder, Config, PatternID, SparseTransitions, State, StateID,
        Transition, NFA,
    };

    // Builds an NFA from 'pattern' with captures and the unanchored
    // prefix disabled, so tests can assert on the exact state list.
    fn build(pattern: &str) -> NFA {
        Builder::new()
            .configure(Config::new().captures(false).unanchored_prefix(false))
            .build(pattern)
            .unwrap()
    }

    // Convenience constructor for a PatternID; panics on an invalid id.
    fn pid(id: usize) -> PatternID {
        PatternID::new(id).unwrap()
    }

    // Convenience constructor for a StateID; panics on an invalid id.
    fn sid(id: usize) -> StateID {
        StateID::new(id).unwrap()
    }

    // A state matching exactly one byte, transitioning to 'next'.
    fn s_byte(byte: u8, next: usize) -> State {
        let next = sid(next);
        let trans = Transition { start: byte, end: byte, next };
        State::Range { range: trans }
    }

    // A state matching any byte in [start, end], transitioning to 'next'.
    fn s_range(start: u8, end: u8, next: usize) -> State {
        let next = sid(next);
        let trans = Transition { start, end, next };
        State::Range { range: trans }
    }

    // A sparse state built from (start, end, next) triples.
    fn s_sparse(ranges: &[(u8, u8, usize)]) -> State {
        let ranges = ranges
            .iter()
            .map(|&(start, end, next)| Transition {
                start,
                end,
                next: sid(next),
            })
            .collect();
        State::Sparse(SparseTransitions { ranges })
    }

    // A union state whose alternates are the given state IDs, in order.
    fn s_union(alts: &[usize]) -> State {
        State::Union {
            alternates: alts
                .iter()
                .map(|&id| sid(id))
                .collect::<Vec<StateID>>()
                .into_boxed_slice(),
        }
    }

    // A match state for the given pattern ID.
    fn s_match(id: usize) -> State {
        State::Match { id: pid(id) }
    }

    // Test that building an unanchored NFA has an appropriate `(?s:.)*?`
    // prefix.
    #[test]
    fn compile_unanchored_prefix() {
        // When the machine can only match valid UTF-8.
        let nfa = Builder::new()
            .configure(Config::new().captures(false))
            .build(r"a")
            .unwrap();
        // There should be many states since the `.` in `(?s:.)*?` matches any
        // Unicode scalar value.
        assert_eq!(11, nfa.len());
        assert_eq!(nfa.states[10], s_match(0));
        assert_eq!(nfa.states[9], s_byte(b'a', 10));

        // When the machine can match through invalid UTF-8.
        let nfa = Builder::new()
            .configure(Config::new().captures(false).utf8(false))
            .build(r"a")
            .unwrap();
        assert_eq!(
            nfa.states,
            &[
                s_union(&[2, 1]),
                s_range(0, 255, 0),
                s_byte(b'a', 3),
                s_match(0),
            ]
        );
    }

    // The empty pattern compiles to a lone match state.
    #[test]
    fn compile_empty() {
        assert_eq!(build("").states, &[s_match(0),]);
    }

    // Literals compile to a chain of byte states (one per UTF-8 byte).
    #[test]
    fn compile_literal() {
        assert_eq!(build("a").states, &[s_byte(b'a', 1), s_match(0),]);
        assert_eq!(
            build("ab").states,
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0),]
        );
        assert_eq!(
            build("ā˜ƒ").states,
            &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(0)]
        );

        // Check that non-UTF-8 literals work.
        let nfa = Builder::new()
            .configure(
                Config::new()
                    .captures(false)
                    .utf8(false)
                    .unanchored_prefix(false),
            )
            .syntax(crate::SyntaxConfig::new().utf8(false))
            .build(r"(?-u)\xFF")
            .unwrap();
        assert_eq!(nfa.states, &[s_byte(b'\xFF', 1), s_match(0),]);
    }

    // Character classes compile to range/sparse states; non-ASCII classes
    // expand into per-byte UTF-8 automata.
    #[test]
    fn compile_class() {
        assert_eq!(
            build(r"[a-z]").states,
            &[s_range(b'a', b'z', 1), s_match(0),]
        );
        assert_eq!(
            build(r"[x-za-c]").states,
            &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match(0)]
        );
        assert_eq!(
            build(r"[\u03B1-\u03B4]").states,
            &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match(0)]
        );
        assert_eq!(
            build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states,
            &[
                s_range(0xB1, 0xB4, 5),
                s_range(0x99, 0x9E, 5),
                s_byte(0xA4, 1),
                s_byte(0x9F, 2),
                s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]),
                s_match(0),
            ]
        );
        assert_eq!(
            build(r"[a-zā˜ƒ]").states,
            &[
                s_byte(0x83, 3),
                s_byte(0x98, 0),
                s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]),
                s_match(0),
            ]
        );
    }

    // Greedy vs non-greedy '?' flips the union's alternate order.
    #[test]
    fn compile_repetition() {
        assert_eq!(
            build(r"a?").states,
            &[s_union(&[1, 2]), s_byte(b'a', 2), s_match(0),]
        );
        assert_eq!(
            build(r"a??").states,
            &[s_union(&[2, 1]), s_byte(b'a', 2), s_match(0),]
        );
    }

    // With captures disabled, groups add no extra states.
    #[test]
    fn compile_group() {
        assert_eq!(
            build(r"ab+").states,
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[1, 3]), s_match(0)]
        );
        assert_eq!(
            build(r"(ab)").states,
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0)]
        );
        assert_eq!(
            build(r"(ab)+").states,
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[0, 3]), s_match(0)]
        );
    }

    // Alternations compile to a union over the branches, including empty
    // branches.
    #[test]
    fn compile_alternation() {
        assert_eq!(
            build(r"a|b").states,
            &[s_byte(b'a', 3), s_byte(b'b', 3), s_union(&[0, 1]), s_match(0)]
        );
        assert_eq!(
            build(r"|b").states,
            &[s_byte(b'b', 2), s_union(&[2, 0]), s_match(0)]
        );
        assert_eq!(
            build(r"a|").states,
            &[s_byte(b'a', 2), s_union(&[0, 2]), s_match(0)]
        );
    }

    // Multi-pattern compilation shares one overall start state (a union
    // over each pattern's start), while per-pattern starts stay distinct.
    #[test]
    fn many_start_pattern() {
        let nfa = Builder::new()
            .configure(Config::new().captures(false).unanchored_prefix(false))
            .build_many(&["a", "b"])
            .unwrap();
        assert_eq!(
            nfa.states,
            &[
                s_byte(b'a', 1),
                s_match(0),
                s_byte(b'b', 3),
                s_match(1),
                s_union(&[0, 2]),
            ]
        );
        assert_eq!(nfa.start_anchored().as_usize(), 4);
        assert_eq!(nfa.start_unanchored().as_usize(), 4);
        // Test that the start states for each individual pattern are correct.
        assert_eq!(nfa.start_pattern(pid(0)), sid(0));
        assert_eq!(nfa.start_pattern(pid(1)), sid(2));
    }
}
+}