summaryrefslogtreecommitdiffstats
path: root/third_party/rust/regex-automata/src/nfa/thompson/compiler.rs
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/rust/regex-automata/src/nfa/thompson/compiler.rs')
-rw-r--r--third_party/rust/regex-automata/src/nfa/thompson/compiler.rs2257
1 files changed, 2257 insertions, 0 deletions
diff --git a/third_party/rust/regex-automata/src/nfa/thompson/compiler.rs b/third_party/rust/regex-automata/src/nfa/thompson/compiler.rs
new file mode 100644
index 0000000000..065e9ef270
--- /dev/null
+++ b/third_party/rust/regex-automata/src/nfa/thompson/compiler.rs
@@ -0,0 +1,2257 @@
+use core::{borrow::Borrow, cell::RefCell};
+
+use alloc::{sync::Arc, vec, vec::Vec};
+
+use regex_syntax::{
+ hir::{self, Hir},
+ utf8::{Utf8Range, Utf8Sequences},
+ ParserBuilder,
+};
+
+use crate::{
+ nfa::thompson::{
+ builder::Builder,
+ error::BuildError,
+ literal_trie::LiteralTrie,
+ map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap},
+ nfa::{Transition, NFA},
+ range_trie::RangeTrie,
+ },
+ util::{
+ look::{Look, LookMatcher},
+ primitives::{PatternID, StateID},
+ },
+};
+
+/// The configuration used for a Thompson NFA compiler.
+#[derive(Clone, Debug, Default)]
+pub struct Config {
+ utf8: Option<bool>,
+ reverse: Option<bool>,
+ nfa_size_limit: Option<Option<usize>>,
+ shrink: Option<bool>,
+ which_captures: Option<WhichCaptures>,
+ look_matcher: Option<LookMatcher>,
+ #[cfg(test)]
+ unanchored_prefix: Option<bool>,
+}
+
+impl Config {
+ /// Return a new default Thompson NFA compiler configuration.
+ pub fn new() -> Config {
+ Config::default()
+ }
+
+ /// Whether to enable UTF-8 mode during search or not.
+ ///
+ /// A regex engine is said to be in UTF-8 mode when it guarantees that
+ /// all matches returned by it have spans consisting of only valid UTF-8.
+ /// That is, it is impossible for a match span to be returned that
+ /// contains any invalid UTF-8.
+ ///
+ /// UTF-8 mode generally consists of two things:
+ ///
+ /// 1. Whether the NFA's states are constructed such that all paths to a
+ /// match state that consume at least one byte always correspond to valid
+ /// UTF-8.
+ /// 2. Whether all paths to a match state that do _not_ consume any bytes
+ /// should always correspond to valid UTF-8 boundaries.
+ ///
+ /// (1) is a guarantee made by whoever constructs the NFA.
+ /// If you're parsing a regex from its concrete syntax, then
+ /// [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) can make
+ /// this guarantee for you. It does it by returning an error if the regex
+ /// pattern could every report a non-empty match span that contains invalid
+ /// UTF-8. So long as `syntax::Config::utf8` mode is enabled and your regex
+ /// successfully parses, then you're guaranteed that the corresponding NFA
+ /// will only ever report non-empty match spans containing valid UTF-8.
+ ///
+ /// (2) is a trickier guarantee because it cannot be enforced by the NFA
+ /// state graph itself. Consider, for example, the regex `a*`. It matches
+ /// the empty strings in `ā˜ƒ` at positions `0`, `1`, `2` and `3`, where
+ /// positions `1` and `2` occur within the UTF-8 encoding of a codepoint,
+ /// and thus correspond to invalid UTF-8 boundaries. Therefore, this
+ /// guarantee must be made at a higher level than the NFA state graph
+ /// itself. This crate deals with this case in each regex engine. Namely,
+ /// when a zero-width match that splits a codepoint is found and UTF-8
+ /// mode enabled, then it is ignored and the engine moves on looking for
+ /// the next match.
+ ///
+ /// Thus, UTF-8 mode is both a promise that the NFA built only reports
+ /// non-empty matches that are valid UTF-8, and an *instruction* to regex
+ /// engines that empty matches that split codepoints should be banned.
+ ///
+ /// Because UTF-8 mode is fundamentally about avoiding invalid UTF-8 spans,
+ /// it only makes sense to enable this option when you *know* your haystack
+ /// is valid UTF-8. (For example, a `&str`.) Enabling UTF-8 mode and
+ /// searching a haystack that contains invalid UTF-8 leads to **unspecified
+ /// behavior**.
+ ///
+ /// Therefore, it may make sense to enable `syntax::Config::utf8` while
+ /// simultaneously *disabling* this option. That would ensure all non-empty
+ /// match spans are valid UTF-8, but that empty match spans may still split
+ /// a codepoint or match at other places that aren't valid UTF-8.
+ ///
+ /// In general, this mode is only relevant if your regex can match the
+ /// empty string. Most regexes don't.
+ ///
+ /// This is enabled by default.
+ ///
+ /// # Example
+ ///
+ /// This example shows how UTF-8 mode can impact the match spans that may
+ /// be reported in certain cases.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::{self, pikevm::PikeVM},
+ /// Match, Input,
+ /// };
+ ///
+ /// let re = PikeVM::new("")?;
+ /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
+ ///
+ /// // UTF-8 mode is enabled by default.
+ /// let mut input = Input::new("ā˜ƒ");
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(Some(Match::must(0, 0..0)), caps.get_match());
+ ///
+ /// // Even though an empty regex matches at 1..1, our next match is
+ /// // 3..3 because 1..1 and 2..2 split the snowman codepoint (which is
+ /// // three bytes long).
+ /// input.set_start(1);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
+ ///
+ /// // But if we disable UTF-8, then we'll get matches at 1..1 and 2..2:
+ /// let re = PikeVM::builder()
+ /// .thompson(thompson::Config::new().utf8(false))
+ /// .build("")?;
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(Some(Match::must(0, 1..1)), caps.get_match());
+ ///
+ /// input.set_start(2);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(Some(Match::must(0, 2..2)), caps.get_match());
+ ///
+ /// input.set_start(3);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
+ ///
+ /// input.set_start(4);
+ /// re.search(&mut cache, &input, &mut caps);
+ /// assert_eq!(None, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn utf8(mut self, yes: bool) -> Config {
+ self.utf8 = Some(yes);
+ self
+ }
+
+ /// Reverse the NFA.
+ ///
+ /// A NFA reversal is performed by reversing all of the concatenated
+ /// sub-expressions in the original pattern, recursively. (Look around
+ /// operators are also inverted.) The resulting NFA can be used to match
+ /// the pattern starting from the end of a string instead of the beginning
+ /// of a string.
+ ///
+ /// Reversing the NFA is useful for building a reverse DFA, which is most
+ /// useful for finding the start of a match after its ending position has
+ /// been found. NFA execution engines typically do not work on reverse
+ /// NFAs. For example, currently, the Pike VM reports the starting location
+ /// of matches without a reverse NFA.
+ ///
+ /// Currently, enabling this setting requires disabling the
+ /// [`captures`](Config::captures) setting. If both are enabled, then the
+ /// compiler will return an error. It is expected that this limitation will
+ /// be lifted in the future.
+ ///
+ /// This is disabled by default.
+ ///
+ /// # Example
+ ///
+ /// This example shows how to build a DFA from a reverse NFA, and then use
+ /// the DFA to search backwards.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// dfa::{self, Automaton},
+ /// nfa::thompson::{NFA, WhichCaptures},
+ /// HalfMatch, Input,
+ /// };
+ ///
+ /// let dfa = dfa::dense::Builder::new()
+ /// .thompson(NFA::config()
+ /// .which_captures(WhichCaptures::None)
+ /// .reverse(true)
+ /// )
+ /// .build("baz[0-9]+")?;
+ /// let expected = Some(HalfMatch::must(0, 3));
+ /// assert_eq!(
+ /// expected,
+ /// dfa.try_search_rev(&Input::new("foobaz12345bar"))?,
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn reverse(mut self, yes: bool) -> Config {
+ self.reverse = Some(yes);
+ self
+ }
+
+ /// Sets an approximate size limit on the total heap used by the NFA being
+ /// compiled.
+ ///
+ /// This permits imposing constraints on the size of a compiled NFA. This
+ /// may be useful in contexts where the regex pattern is untrusted and one
+ /// wants to avoid using too much memory.
+ ///
+ /// This size limit does not apply to auxiliary heap used during
+ /// compilation that is not part of the built NFA.
+ ///
+ /// Note that this size limit is applied during compilation in order for
+ /// the limit to prevent too much heap from being used. However, the
+ /// implementation may use an intermediate NFA representation that is
+ /// otherwise slightly bigger than the final public form. Since the size
+ /// limit may be applied to an intermediate representation, there is not
+ /// necessarily a precise correspondence between the configured size limit
+ /// and the heap usage of the final NFA.
+ ///
+ /// There is no size limit by default.
+ ///
+ /// # Example
+ ///
+ /// This example demonstrates how Unicode mode can greatly increase the
+ /// size of the NFA.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::nfa::thompson::NFA;
+ ///
+ /// // 300KB isn't enough!
+ /// NFA::compiler()
+ /// .configure(NFA::config().nfa_size_limit(Some(300_000)))
+ /// .build(r"\w{20}")
+ /// .unwrap_err();
+ ///
+ /// // ... but 400KB probably is.
+ /// let nfa = NFA::compiler()
+ /// .configure(NFA::config().nfa_size_limit(Some(400_000)))
+ /// .build(r"\w{20}")?;
+ ///
+ /// assert_eq!(nfa.pattern_len(), 1);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn nfa_size_limit(mut self, bytes: Option<usize>) -> Config {
+ self.nfa_size_limit = Some(bytes);
+ self
+ }
+
+ /// Apply best effort heuristics to shrink the NFA at the expense of more
+ /// time/memory.
+ ///
+ /// Generally speaking, if one is using an NFA to compile a DFA, then the
+ /// extra time used to shrink the NFA will be more than made up for during
+ /// DFA construction (potentially by a lot). In other words, enabling this
+ /// can substantially decrease the overall amount of time it takes to build
+ /// a DFA.
+ ///
+ /// A reason to keep this disabled is if you want to compile an NFA and
+ /// start using it as quickly as possible without needing to build a DFA,
+ /// and you don't mind using a bit of extra memory for the NFA. e.g., for
+ /// an NFA simulation or for a lazy DFA.
+ ///
+ /// NFA shrinking is currently most useful when compiling a reverse
+ /// NFA with large Unicode character classes. In particular, it trades
+ /// additional CPU time during NFA compilation in favor of generating fewer
+ /// NFA states.
+ ///
+ /// This is disabled by default because it can increase compile times
+ /// quite a bit if you aren't building a full DFA.
+ ///
+ /// # Example
+ ///
+ /// This example shows that NFA shrinking can lead to substantial space
+ /// savings in some cases. Notice that, as noted above, we build a reverse
+ /// DFA and use a pattern with a large Unicode character class.
+ ///
+ /// ```
+ /// # if cfg!(miri) { return Ok(()); } // miri takes too long
+ /// use regex_automata::nfa::thompson::{NFA, WhichCaptures};
+ ///
+ /// // Currently we have to disable captures when enabling reverse NFA.
+ /// let config = NFA::config()
+ /// .which_captures(WhichCaptures::None)
+ /// .reverse(true);
+ /// let not_shrunk = NFA::compiler()
+ /// .configure(config.clone().shrink(false))
+ /// .build(r"\w")?;
+ /// let shrunk = NFA::compiler()
+ /// .configure(config.clone().shrink(true))
+ /// .build(r"\w")?;
+ ///
+ /// // While a specific shrink factor is not guaranteed, the savings can be
+ /// // considerable in some cases.
+ /// assert!(shrunk.states().len() * 2 < not_shrunk.states().len());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn shrink(mut self, yes: bool) -> Config {
+ self.shrink = Some(yes);
+ self
+ }
+
+ /// Whether to include 'Capture' states in the NFA.
+ ///
+ /// Currently, enabling this setting requires disabling the
+ /// [`reverse`](Config::reverse) setting. If both are enabled, then the
+ /// compiler will return an error. It is expected that this limitation will
+ /// be lifted in the future.
+ ///
+ /// This is enabled by default.
+ ///
+ /// # Example
+ ///
+ /// This example demonstrates that some regex engines, like the Pike VM,
+ /// require capturing states to be present in the NFA to report match
+ /// offsets.
+ ///
+ /// (Note that since this method is deprecated, the example below uses
+ /// [`Config::which_captures`] to disable capture states.)
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::{
+ /// pikevm::PikeVM,
+ /// NFA,
+ /// WhichCaptures,
+ /// };
+ ///
+ /// let re = PikeVM::builder()
+ /// .thompson(NFA::config().which_captures(WhichCaptures::None))
+ /// .build(r"[a-z]+")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// assert!(re.is_match(&mut cache, "abc"));
+ /// assert_eq!(None, re.find(&mut cache, "abc"));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ #[deprecated(since = "0.3.5", note = "use which_captures instead")]
+ pub fn captures(self, yes: bool) -> Config {
+ self.which_captures(if yes {
+ WhichCaptures::All
+ } else {
+ WhichCaptures::None
+ })
+ }
+
+ /// Configures what kinds of capture groups are compiled into
+ /// [`State::Capture`](crate::nfa::thompson::State::Capture) states in a
+ /// Thompson NFA.
+ ///
+ /// Currently, using any option except for [`WhichCaptures::None`] requires
+ /// disabling the [`reverse`](Config::reverse) setting. If both are
+ /// enabled, then the compiler will return an error. It is expected that
+ /// this limitation will be lifted in the future.
+ ///
+ /// This is set to [`WhichCaptures::All`] by default. Callers may wish to
+ /// use [`WhichCaptures::Implicit`] in cases where one wants avoid the
+ /// overhead of capture states for explicit groups. Usually this occurs
+ /// when one wants to use the `PikeVM` only for determining the overall
+ /// match. Otherwise, the `PikeVM` could use much more memory than is
+ /// necessary.
+ ///
+ /// # Example
+ ///
+ /// This example demonstrates that some regex engines, like the Pike VM,
+ /// require capturing states to be present in the NFA to report match
+ /// offsets.
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::{
+ /// pikevm::PikeVM,
+ /// NFA,
+ /// WhichCaptures,
+ /// };
+ ///
+ /// let re = PikeVM::builder()
+ /// .thompson(NFA::config().which_captures(WhichCaptures::None))
+ /// .build(r"[a-z]+")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// assert!(re.is_match(&mut cache, "abc"));
+ /// assert_eq!(None, re.find(&mut cache, "abc"));
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ ///
+ /// The same applies to the bounded backtracker:
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::{
+ /// backtrack::BoundedBacktracker,
+ /// NFA,
+ /// WhichCaptures,
+ /// };
+ ///
+ /// let re = BoundedBacktracker::builder()
+ /// .thompson(NFA::config().which_captures(WhichCaptures::None))
+ /// .build(r"[a-z]+")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// assert!(re.try_is_match(&mut cache, "abc")?);
+ /// assert_eq!(None, re.try_find(&mut cache, "abc")?);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn which_captures(mut self, which_captures: WhichCaptures) -> Config {
+ self.which_captures = Some(which_captures);
+ self
+ }
+
+ /// Sets the look-around matcher that should be used with this NFA.
+ ///
+ /// A look-around matcher determines how to match look-around assertions.
+ /// In particular, some assertions are configurable. For example, the
+ /// `(?m:^)` and `(?m:$)` assertions can have their line terminator changed
+ /// from the default of `\n` to any other byte.
+ ///
+ /// # Example
+ ///
+ /// This shows how to change the line terminator for multi-line assertions.
+ ///
+ /// ```
+ /// use regex_automata::{
+ /// nfa::thompson::{self, pikevm::PikeVM},
+ /// util::look::LookMatcher,
+ /// Match, Input,
+ /// };
+ ///
+ /// let mut lookm = LookMatcher::new();
+ /// lookm.set_line_terminator(b'\x00');
+ ///
+ /// let re = PikeVM::builder()
+ /// .thompson(thompson::Config::new().look_matcher(lookm))
+ /// .build(r"(?m)^[a-z]+$")?;
+ /// let mut cache = re.create_cache();
+ ///
+ /// // Multi-line assertions now use NUL as a terminator.
+ /// assert_eq!(
+ /// Some(Match::must(0, 1..4)),
+ /// re.find(&mut cache, b"\x00abc\x00"),
+ /// );
+ /// // ... and \n is no longer recognized as a terminator.
+ /// assert_eq!(
+ /// None,
+ /// re.find(&mut cache, b"\nabc\n"),
+ /// );
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn look_matcher(mut self, m: LookMatcher) -> Config {
+ self.look_matcher = Some(m);
+ self
+ }
+
+ /// Whether to compile an unanchored prefix into this NFA.
+ ///
+ /// This is enabled by default. It is made available for tests only to make
+ /// it easier to unit test the output of the compiler.
+ #[cfg(test)]
+ fn unanchored_prefix(mut self, yes: bool) -> Config {
+ self.unanchored_prefix = Some(yes);
+ self
+ }
+
+ /// Returns whether this configuration has enabled UTF-8 mode.
+ pub fn get_utf8(&self) -> bool {
+ self.utf8.unwrap_or(true)
+ }
+
+ /// Returns whether this configuration has enabled reverse NFA compilation.
+ pub fn get_reverse(&self) -> bool {
+ self.reverse.unwrap_or(false)
+ }
+
+ /// Return the configured NFA size limit, if it exists, in the number of
+ /// bytes of heap used.
+ pub fn get_nfa_size_limit(&self) -> Option<usize> {
+ self.nfa_size_limit.unwrap_or(None)
+ }
+
+ /// Return whether NFA shrinking is enabled.
+ pub fn get_shrink(&self) -> bool {
+ self.shrink.unwrap_or(false)
+ }
+
+ /// Return whether NFA compilation is configured to produce capture states.
+ #[deprecated(since = "0.3.5", note = "use get_which_captures instead")]
+ pub fn get_captures(&self) -> bool {
+ self.get_which_captures().is_any()
+ }
+
+ /// Return what kinds of capture states will be compiled into an NFA.
+ pub fn get_which_captures(&self) -> WhichCaptures {
+ self.which_captures.unwrap_or(WhichCaptures::All)
+ }
+
+ /// Return the look-around matcher for this NFA.
+ pub fn get_look_matcher(&self) -> LookMatcher {
+ self.look_matcher.clone().unwrap_or(LookMatcher::default())
+ }
+
+ /// Return whether NFA compilation is configured to include an unanchored
+ /// prefix.
+ ///
+ /// This is always false when not in test mode.
+ fn get_unanchored_prefix(&self) -> bool {
+ #[cfg(test)]
+ {
+ self.unanchored_prefix.unwrap_or(true)
+ }
+ #[cfg(not(test))]
+ {
+ true
+ }
+ }
+
+ /// Overwrite the default configuration such that the options in `o` are
+ /// always used. If an option in `o` is not set, then the corresponding
+ /// option in `self` is used. If it's not set in `self` either, then it
+ /// remains not set.
+ pub(crate) fn overwrite(&self, o: Config) -> Config {
+ Config {
+ utf8: o.utf8.or(self.utf8),
+ reverse: o.reverse.or(self.reverse),
+ nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit),
+ shrink: o.shrink.or(self.shrink),
+ which_captures: o.which_captures.or(self.which_captures),
+ look_matcher: o.look_matcher.or_else(|| self.look_matcher.clone()),
+ #[cfg(test)]
+ unanchored_prefix: o.unanchored_prefix.or(self.unanchored_prefix),
+ }
+ }
+}
+
+/// A configuration indicating which kinds of
+/// [`State::Capture`](crate::nfa::thompson::State::Capture) states to include.
+///
+/// This configuration can be used with [`Config::which_captures`] to control
+/// which capture states are compiled into a Thompson NFA.
+///
+/// The default configuration is [`WhichCaptures::All`].
+#[derive(Clone, Copy, Debug)]
+pub enum WhichCaptures {
+ /// All capture states, including those corresponding to both implicit and
+ /// explicit capture groups, are included in the Thompson NFA.
+ All,
+ /// Only capture states corresponding to implicit capture groups are
+ /// included. Implicit capture groups appear in every pattern implicitly
+ /// and correspond to the overall match of a pattern.
+ ///
+ /// This is useful when one only cares about the overall match of a
+ /// pattern. By excluding capture states from explicit capture groups,
+ /// one might be able to reduce the memory usage of a multi-pattern regex
+ /// substantially if it was otherwise written to have many explicit capture
+ /// groups.
+ Implicit,
+ /// No capture states are compiled into the Thompson NFA.
+ ///
+ /// This is useful when capture states are either not needed (for example,
+ /// if one is only trying to build a DFA) or if they aren't supported (for
+ /// example, a reverse NFA).
+ None,
+}
+
+impl Default for WhichCaptures {
+ fn default() -> WhichCaptures {
+ WhichCaptures::All
+ }
+}
+
+impl WhichCaptures {
+ /// Returns true if this configuration indicates that no capture states
+ /// should be produced in an NFA.
+ pub fn is_none(&self) -> bool {
+ matches!(*self, WhichCaptures::None)
+ }
+
+ /// Returns true if this configuration indicates that some capture states
+ /// should be added to an NFA. Note that this might only include capture
+ /// states for implicit capture groups.
+ pub fn is_any(&self) -> bool {
+ !self.is_none()
+ }
+}
+
+/*
+This compiler below uses Thompson's construction algorithm. The compiler takes
+a regex-syntax::Hir as input and emits an NFA graph as output. The NFA graph
+is structured in a way that permits it to be executed by a virtual machine and
+also used to efficiently build a DFA.
+
+The compiler deals with a slightly expanded set of NFA states than what is
+in a final NFA (as exhibited by builder::State and nfa::State). Notably a
+compiler state includes an empty node that has exactly one unconditional
+epsilon transition to the next state. In other words, it's a "goto" instruction
+if one views Thompson's NFA as a set of bytecode instructions. These goto
+instructions are removed in a subsequent phase before returning the NFA to the
+caller. The purpose of these empty nodes is that they make the construction
+algorithm substantially simpler to implement. We remove them before returning
+to the caller because they can represent substantial overhead when traversing
+the NFA graph (either while searching using the NFA directly or while building
+a DFA).
+
+In the future, it would be nice to provide a Glushkov compiler as well, as it
+would work well as a bit-parallel NFA for smaller regexes. But the Thompson
+construction is one I'm more familiar with and seems more straight-forward to
+deal with when it comes to large Unicode character classes.
+
+Internally, the compiler uses interior mutability to improve composition in the
+face of the borrow checker. In particular, we'd really like to be able to write
+things like this:
+
+ self.c_concat(exprs.iter().map(|e| self.c(e)))
+
+Which elegantly uses iterators to build up a sequence of compiled regex
+sub-expressions and then hands it off to the concatenating compiler routine.
+Without interior mutability, the borrow checker won't let us borrow `self`
+mutably both inside and outside the closure at the same time.
+*/
+
+/// A builder for compiling an NFA from a regex's high-level intermediate
+/// representation (HIR).
+///
+/// This compiler provides a way to translate a parsed regex pattern into an
+/// NFA state graph. The NFA state graph can either be used directly to execute
+/// a search (e.g., with a Pike VM), or it can be further used to build a DFA.
+///
+/// This compiler provides APIs both for compiling regex patterns directly from
+/// their concrete syntax, or via a [`regex_syntax::hir::Hir`].
+///
+/// This compiler has various options that may be configured via
+/// [`thompson::Config`](Config).
+///
+/// Note that a compiler is not the same as a [`thompson::Builder`](Builder).
+/// A `Builder` provides a lower level API that is uncoupled from a regex
+/// pattern's concrete syntax or even its HIR. Instead, it permits stitching
+/// together an NFA by hand. See its docs for examples.
+///
+/// # Example: compilation from concrete syntax
+///
+/// This shows how to compile an NFA from a pattern string while setting a size
+/// limit on how big the NFA is allowed to be (in terms of bytes of heap used).
+///
+/// ```
+/// use regex_automata::{
+/// nfa::thompson::{NFA, pikevm::PikeVM},
+/// Match,
+/// };
+///
+/// let config = NFA::config().nfa_size_limit(Some(1_000));
+/// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?;
+///
+/// let re = PikeVM::new_from_nfa(nfa)?;
+/// let mut cache = re.create_cache();
+/// let mut caps = re.create_captures();
+/// let expected = Some(Match::must(0, 3..4));
+/// re.captures(&mut cache, "!@#A#@!", &mut caps);
+/// assert_eq!(expected, caps.get_match());
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+///
+/// # Example: compilation from HIR
+///
+/// This shows how to hand assemble a regular expression via its HIR, and then
+/// compile an NFA directly from it.
+///
+/// ```
+/// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
+/// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
+///
+/// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
+/// ClassBytesRange::new(b'0', b'9'),
+/// ClassBytesRange::new(b'A', b'Z'),
+/// ClassBytesRange::new(b'_', b'_'),
+/// ClassBytesRange::new(b'a', b'z'),
+/// ])));
+///
+/// let config = NFA::config().nfa_size_limit(Some(1_000));
+/// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
+///
+/// let re = PikeVM::new_from_nfa(nfa)?;
+/// let mut cache = re.create_cache();
+/// let mut caps = re.create_captures();
+/// let expected = Some(Match::must(0, 3..4));
+/// re.captures(&mut cache, "!@#A#@!", &mut caps);
+/// assert_eq!(expected, caps.get_match());
+///
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```
+#[derive(Clone, Debug)]
+pub struct Compiler {
+ /// A regex parser, used when compiling an NFA directly from a pattern
+ /// string.
+ parser: ParserBuilder,
+ /// The compiler configuration.
+ config: Config,
+ /// The builder for actually constructing an NFA. This provides a
+ /// convenient abstraction for writing a compiler.
+ builder: RefCell<Builder>,
+ /// State used for compiling character classes to UTF-8 byte automata.
+ /// State is not retained between character class compilations. This just
+ /// serves to amortize allocation to the extent possible.
+ utf8_state: RefCell<Utf8State>,
+ /// State used for arranging character classes in reverse into a trie.
+ trie_state: RefCell<RangeTrie>,
+ /// State used for caching common suffixes when compiling reverse UTF-8
+ /// automata (for Unicode character classes).
+ utf8_suffix: RefCell<Utf8SuffixMap>,
+}
+
+impl Compiler {
+ /// Create a new NFA builder with its default configuration.
+ pub fn new() -> Compiler {
+ Compiler {
+ parser: ParserBuilder::new(),
+ config: Config::default(),
+ builder: RefCell::new(Builder::new()),
+ utf8_state: RefCell::new(Utf8State::new()),
+ trie_state: RefCell::new(RangeTrie::new()),
+ utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
+ }
+ }
+
+ /// Compile the given regular expression pattern into an NFA.
+ ///
+ /// If there was a problem parsing the regex, then that error is returned.
+ ///
+ /// Otherwise, if there was a problem building the NFA, then an error is
+ /// returned. The only error that can occur is if the compiled regex would
+ /// exceed the size limits configured on this builder, or if any part of
+ /// the NFA would exceed the integer representations used. (For example,
+ /// too many states might plausibly occur on a 16-bit target.)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
+ ///
+ /// let config = NFA::config().nfa_size_limit(Some(1_000));
+ /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?;
+ ///
+ /// let re = PikeVM::new_from_nfa(nfa)?;
+ /// let mut cache = re.create_cache();
+ /// let mut caps = re.create_captures();
+ /// let expected = Some(Match::must(0, 3..4));
+ /// re.captures(&mut cache, "!@#A#@!", &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn build(&self, pattern: &str) -> Result<NFA, BuildError> {
+ self.build_many(&[pattern])
+ }
+
+ /// Compile the given regular expression patterns into a single NFA.
+ ///
+ /// When matches are returned, the pattern ID corresponds to the index of
+ /// the pattern in the slice given.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
+ ///
+ /// let config = NFA::config().nfa_size_limit(Some(1_000));
+ /// let nfa = NFA::compiler().configure(config).build_many(&[
+ /// r"(?-u)\s",
+ /// r"(?-u)\w",
+ /// ])?;
+ ///
+ /// let re = PikeVM::new_from_nfa(nfa)?;
+ /// let mut cache = re.create_cache();
+ /// let mut caps = re.create_captures();
+ /// let expected = Some(Match::must(1, 1..2));
+ /// re.captures(&mut cache, "!A! !A!", &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn build_many<P: AsRef<str>>(
+ &self,
+ patterns: &[P],
+ ) -> Result<NFA, BuildError> {
+ let mut hirs = vec![];
+ for p in patterns {
+ hirs.push(
+ self.parser
+ .build()
+ .parse(p.as_ref())
+ .map_err(BuildError::syntax)?,
+ );
+ debug!("parsed: {:?}", p.as_ref());
+ }
+ self.build_many_from_hir(&hirs)
+ }
+
+ /// Compile the given high level intermediate representation of a regular
+ /// expression into an NFA.
+ ///
+ /// If there was a problem building the NFA, then an error is returned. The
+ /// only error that can occur is if the compiled regex would exceed the
+ /// size limits configured on this builder, or if any part of the NFA would
+ /// exceed the integer representations used. (For example, too many states
+ /// might plausibly occur on a 16-bit target.)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
+ /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
+ ///
+ /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
+ /// ClassBytesRange::new(b'0', b'9'),
+ /// ClassBytesRange::new(b'A', b'Z'),
+ /// ClassBytesRange::new(b'_', b'_'),
+ /// ClassBytesRange::new(b'a', b'z'),
+ /// ])));
+ ///
+ /// let config = NFA::config().nfa_size_limit(Some(1_000));
+ /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
+ ///
+ /// let re = PikeVM::new_from_nfa(nfa)?;
+ /// let mut cache = re.create_cache();
+ /// let mut caps = re.create_captures();
+ /// let expected = Some(Match::must(0, 3..4));
+ /// re.captures(&mut cache, "!@#A#@!", &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn build_from_hir(&self, expr: &Hir) -> Result<NFA, BuildError> {
+ self.build_many_from_hir(&[expr])
+ }
+
+ /// Compile the given high level intermediate representations of regular
+ /// expressions into a single NFA.
+ ///
+ /// When matches are returned, the pattern ID corresponds to the index of
+ /// the pattern in the slice given.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
+ /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
+ ///
+ /// let hirs = &[
+ /// Hir::class(Class::Bytes(ClassBytes::new(vec![
+ /// ClassBytesRange::new(b'\t', b'\r'),
+ /// ClassBytesRange::new(b' ', b' '),
+ /// ]))),
+ /// Hir::class(Class::Bytes(ClassBytes::new(vec![
+ /// ClassBytesRange::new(b'0', b'9'),
+ /// ClassBytesRange::new(b'A', b'Z'),
+ /// ClassBytesRange::new(b'_', b'_'),
+ /// ClassBytesRange::new(b'a', b'z'),
+ /// ]))),
+ /// ];
+ ///
+ /// let config = NFA::config().nfa_size_limit(Some(1_000));
+ /// let nfa = NFA::compiler().configure(config).build_many_from_hir(hirs)?;
+ ///
+ /// let re = PikeVM::new_from_nfa(nfa)?;
+ /// let mut cache = re.create_cache();
+ /// let mut caps = re.create_captures();
+ /// let expected = Some(Match::must(1, 1..2));
+ /// re.captures(&mut cache, "!A! !A!", &mut caps);
+ /// assert_eq!(expected, caps.get_match());
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn build_many_from_hir<H: Borrow<Hir>>(
+ &self,
+ exprs: &[H],
+ ) -> Result<NFA, BuildError> {
+ self.compile(exprs)
+ }
+
+ /// Apply the given NFA configuration options to this builder.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::nfa::thompson::NFA;
+ ///
+ /// let config = NFA::config().nfa_size_limit(Some(1_000));
+ /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?;
+ /// assert_eq!(nfa.pattern_len(), 1);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn configure(&mut self, config: Config) -> &mut Compiler {
+ self.config = self.config.overwrite(config);
+ self
+ }
+
+ /// Set the syntax configuration for this builder using
+ /// [`syntax::Config`](crate::util::syntax::Config).
+ ///
+ /// This permits setting things like case insensitivity, Unicode and multi
+ /// line mode.
+ ///
+ /// This syntax configuration only applies when an NFA is built directly
+ /// from a pattern string. If an NFA is built from an HIR, then all syntax
+ /// settings are ignored.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex_automata::{nfa::thompson::NFA, util::syntax};
+ ///
+ /// let syntax_config = syntax::Config::new().unicode(false);
+ /// let nfa = NFA::compiler().syntax(syntax_config).build(r"\w")?;
+ /// // If Unicode were enabled, the number of states would be much bigger.
+ /// assert!(nfa.states().len() < 15);
+ ///
+ /// # Ok::<(), Box<dyn std::error::Error>>(())
+ /// ```
+ pub fn syntax(
+ &mut self,
+ config: crate::util::syntax::Config,
+ ) -> &mut Compiler {
+ config.apply(&mut self.parser);
+ self
+ }
+}
+
+impl Compiler {
+ /// Compile the sequence of HIR expressions given. Pattern IDs are
+ /// allocated starting from 0, in correspondence with the slice given.
+ ///
+ /// It is legal to provide an empty slice. In that case, the NFA returned
+ /// has no patterns and will never match anything.
+ fn compile<H: Borrow<Hir>>(&self, exprs: &[H]) -> Result<NFA, BuildError> {
+ if exprs.len() > PatternID::LIMIT {
+ return Err(BuildError::too_many_patterns(exprs.len()));
+ }
+ if self.config.get_reverse()
+ && self.config.get_which_captures().is_any()
+ {
+ return Err(BuildError::unsupported_captures());
+ }
+
+ self.builder.borrow_mut().clear();
+ self.builder.borrow_mut().set_utf8(self.config.get_utf8());
+ self.builder.borrow_mut().set_reverse(self.config.get_reverse());
+ self.builder
+ .borrow_mut()
+ .set_look_matcher(self.config.get_look_matcher());
+ self.builder
+ .borrow_mut()
+ .set_size_limit(self.config.get_nfa_size_limit())?;
+
+ // We always add an unanchored prefix unless we were specifically told
+ // not to (for tests only), or if we know that the regex is anchored
+ // for all matches. When an unanchored prefix is not added, then the
+ // NFA's anchored and unanchored start states are equivalent.
+ let all_anchored = exprs.iter().all(|e| {
+ e.borrow()
+ .properties()
+ .look_set_prefix()
+ .contains(hir::Look::Start)
+ });
+ let anchored = !self.config.get_unanchored_prefix() || all_anchored;
+ let unanchored_prefix = if anchored {
+ self.c_empty()?
+ } else {
+ self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?
+ };
+
+ let compiled = self.c_alt_iter(exprs.iter().map(|e| {
+ let _ = self.start_pattern()?;
+ let one = self.c_cap(0, None, e.borrow())?;
+ let match_state_id = self.add_match()?;
+ self.patch(one.end, match_state_id)?;
+ let _ = self.finish_pattern(one.start)?;
+ Ok(ThompsonRef { start: one.start, end: match_state_id })
+ }))?;
+ self.patch(unanchored_prefix.end, compiled.start)?;
+ let nfa = self
+ .builder
+ .borrow_mut()
+ .build(compiled.start, unanchored_prefix.start)?;
+
+ debug!("HIR-to-NFA compilation complete, config: {:?}", self.config);
+ Ok(nfa)
+ }
+
+ /// Compile an arbitrary HIR expression.
+ fn c(&self, expr: &Hir) -> Result<ThompsonRef, BuildError> {
+ use regex_syntax::hir::{Class, HirKind::*};
+
+ match *expr.kind() {
+ Empty => self.c_empty(),
+ Literal(hir::Literal(ref bytes)) => self.c_literal(bytes),
+ Class(Class::Bytes(ref c)) => self.c_byte_class(c),
+ Class(Class::Unicode(ref c)) => self.c_unicode_class(c),
+ Look(ref look) => self.c_look(look),
+ Repetition(ref rep) => self.c_repetition(rep),
+ Capture(ref c) => self.c_cap(c.index, c.name.as_deref(), &c.sub),
+ Concat(ref es) => self.c_concat(es.iter().map(|e| self.c(e))),
+ Alternation(ref es) => self.c_alt_slice(es),
+ }
+ }
+
+ /// Compile a concatenation of the sub-expressions yielded by the given
+ /// iterator. If the iterator yields no elements, then this compiles down
+ /// to an "empty" state that always matches.
+ ///
+ /// If the compiler is in reverse mode, then the expressions given are
+ /// automatically compiled in reverse.
+ fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError>
+ where
+ I: DoubleEndedIterator<Item = Result<ThompsonRef, BuildError>>,
+ {
+ let first = if self.is_reverse() { it.next_back() } else { it.next() };
+ let ThompsonRef { start, mut end } = match first {
+ Some(result) => result?,
+ None => return self.c_empty(),
+ };
+ loop {
+ let next =
+ if self.is_reverse() { it.next_back() } else { it.next() };
+ let compiled = match next {
+ Some(result) => result?,
+ None => break,
+ };
+ self.patch(end, compiled.start)?;
+ end = compiled.end;
+ }
+ Ok(ThompsonRef { start, end })
+ }
+
+ /// Compile an alternation of the given HIR values.
+ ///
+ /// This is like 'c_alt_iter', but it accepts a slice of HIR values instead
+ /// of an iterator of compiled NFA subgraphs. The point of accepting a
+ /// slice here is that it opens up some optimization opportunities. For
+ /// example, if all of the HIR values are literals, then this routine might
+ /// re-shuffle them to make NFA epsilon closures substantially faster.
+ fn c_alt_slice(&self, exprs: &[Hir]) -> Result<ThompsonRef, BuildError> {
+ // self.c_alt_iter(exprs.iter().map(|e| self.c(e)))
+ let literal_count = exprs
+ .iter()
+ .filter(|e| {
+ matches!(*e.kind(), hir::HirKind::Literal(hir::Literal(_)))
+ })
+ .count();
+ if literal_count <= 1 || literal_count < exprs.len() {
+ return self.c_alt_iter(exprs.iter().map(|e| self.c(e)));
+ }
+
+ let mut trie = if self.is_reverse() {
+ LiteralTrie::reverse()
+ } else {
+ LiteralTrie::forward()
+ };
+ for expr in exprs.iter() {
+ let literal = match *expr.kind() {
+ hir::HirKind::Literal(hir::Literal(ref bytes)) => bytes,
+ _ => unreachable!(),
+ };
+ trie.add(literal)?;
+ }
+ trie.compile(&mut self.builder.borrow_mut())
+ }
+
+ /// Compile an alternation, where each element yielded by the given
+ /// iterator represents an item in the alternation. If the iterator yields
+ /// no elements, then this compiles down to a "fail" state.
+ ///
+ /// In an alternation, expressions appearing earlier are "preferred" at
+ /// match time over expressions appearing later. At least, this is true
+ /// when using "leftmost first" match semantics. (If "leftmost longest" are
+ /// ever added in the future, then this preference order of priority would
+ /// not apply in that mode.)
+ fn c_alt_iter<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError>
+ where
+ I: Iterator<Item = Result<ThompsonRef, BuildError>>,
+ {
+ let first = match it.next() {
+ None => return self.c_fail(),
+ Some(result) => result?,
+ };
+ let second = match it.next() {
+ None => return Ok(first),
+ Some(result) => result?,
+ };
+
+ let union = self.add_union()?;
+ let end = self.add_empty()?;
+ self.patch(union, first.start)?;
+ self.patch(first.end, end)?;
+ self.patch(union, second.start)?;
+ self.patch(second.end, end)?;
+ for result in it {
+ let compiled = result?;
+ self.patch(union, compiled.start)?;
+ self.patch(compiled.end, end)?;
+ }
+ Ok(ThompsonRef { start: union, end })
+ }
+
+ /// Compile the given capture sub-expression. `expr` should be the
+ /// sub-expression contained inside the capture. If "capture" states are
+ /// enabled, then they are added as appropriate.
+ ///
+ /// This accepts the pieces of a capture instead of a `hir::Capture` so
+ /// that it's easy to manufacture a "fake" group when necessary, e.g., for
+ /// adding the entire pattern as if it were a group in order to create
+ /// appropriate "capture" states in the NFA.
+ fn c_cap(
+ &self,
+ index: u32,
+ name: Option<&str>,
+ expr: &Hir,
+ ) -> Result<ThompsonRef, BuildError> {
+ match self.config.get_which_captures() {
+ // No capture states means we always skip them.
+ WhichCaptures::None => return self.c(expr),
+ // Implicit captures states means we only add when index==0 since
+ // index==0 implies the group is implicit.
+ WhichCaptures::Implicit if index > 0 => return self.c(expr),
+ _ => {}
+ }
+
+ let start = self.add_capture_start(index, name)?;
+ let inner = self.c(expr)?;
+ let end = self.add_capture_end(index)?;
+ self.patch(start, inner.start)?;
+ self.patch(inner.end, end)?;
+ Ok(ThompsonRef { start, end })
+ }
+
+ /// Compile the given repetition expression. This handles all types of
+ /// repetitions and greediness.
+ fn c_repetition(
+ &self,
+ rep: &hir::Repetition,
+ ) -> Result<ThompsonRef, BuildError> {
+ match (rep.min, rep.max) {
+ (0, Some(1)) => self.c_zero_or_one(&rep.sub, rep.greedy),
+ (min, None) => self.c_at_least(&rep.sub, rep.greedy, min),
+ (min, Some(max)) if min == max => self.c_exactly(&rep.sub, min),
+ (min, Some(max)) => self.c_bounded(&rep.sub, rep.greedy, min, max),
+ }
+ }
+
+ /// Compile the given expression such that it matches at least `min` times,
+ /// but no more than `max` times.
+ ///
+ /// When `greedy` is true, then the preference is for the expression to
+ /// match as much as possible. Otherwise, it will match as little as
+ /// possible.
+ fn c_bounded(
+ &self,
+ expr: &Hir,
+ greedy: bool,
+ min: u32,
+ max: u32,
+ ) -> Result<ThompsonRef, BuildError> {
+ let prefix = self.c_exactly(expr, min)?;
+ if min == max {
+ return Ok(prefix);
+ }
+
+ // It is tempting here to compile the rest here as a concatenation
+ // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it
+ // were `aaa?a?a?`. The problem here is that it leads to this program:
+ //
+ // >000000: 61 => 01
+ // 000001: 61 => 02
+ // 000002: union(03, 04)
+ // 000003: 61 => 04
+ // 000004: union(05, 06)
+ // 000005: 61 => 06
+ // 000006: union(07, 08)
+ // 000007: 61 => 08
+ // 000008: MATCH
+ //
+ // And effectively, once you hit state 2, the epsilon closure will
+ // include states 3, 5, 6, 7 and 8, which is quite a bit. It is better
+ // to instead compile it like so:
+ //
+ // >000000: 61 => 01
+ // 000001: 61 => 02
+ // 000002: union(03, 08)
+ // 000003: 61 => 04
+ // 000004: union(05, 08)
+ // 000005: 61 => 06
+ // 000006: union(07, 08)
+ // 000007: 61 => 08
+ // 000008: MATCH
+ //
+ // So that the epsilon closure of state 2 is now just 3 and 8.
+ let empty = self.add_empty()?;
+ let mut prev_end = prefix.end;
+ for _ in min..max {
+ let union = if greedy {
+ self.add_union()
+ } else {
+ self.add_union_reverse()
+ }?;
+ let compiled = self.c(expr)?;
+ self.patch(prev_end, union)?;
+ self.patch(union, compiled.start)?;
+ self.patch(union, empty)?;
+ prev_end = compiled.end;
+ }
+ self.patch(prev_end, empty)?;
+ Ok(ThompsonRef { start: prefix.start, end: empty })
+ }
+
+ /// Compile the given expression such that it may be matched `n` or more
+ /// times, where `n` can be any integer. (Although a particularly large
+ /// integer is likely to run afoul of any configured size limits.)
+ ///
+ /// When `greedy` is true, then the preference is for the expression to
+ /// match as much as possible. Otherwise, it will match as little as
+ /// possible.
+ fn c_at_least(
+ &self,
+ expr: &Hir,
+ greedy: bool,
+ n: u32,
+ ) -> Result<ThompsonRef, BuildError> {
+ if n == 0 {
+ // When the expression cannot match the empty string, then we
+ // can get away with something much simpler: just one 'alt'
+ // instruction that optionally repeats itself. But if the expr
+ // can match the empty string... see below.
+ if expr.properties().minimum_len().map_or(false, |len| len > 0) {
+ let union = if greedy {
+ self.add_union()
+ } else {
+ self.add_union_reverse()
+ }?;
+ let compiled = self.c(expr)?;
+ self.patch(union, compiled.start)?;
+ self.patch(compiled.end, union)?;
+ return Ok(ThompsonRef { start: union, end: union });
+ }
+
+ // What's going on here? Shouldn't x* be simpler than this? It
+ // turns out that when implementing leftmost-first (Perl-like)
+ // match semantics, x* results in an incorrect preference order
+ // when computing the transitive closure of states if and only if
+ // 'x' can match the empty string. So instead, we compile x* as
+ // (x+)?, which preserves the correct preference order.
+ //
+ // See: https://github.com/rust-lang/regex/issues/779
+ let compiled = self.c(expr)?;
+ let plus = if greedy {
+ self.add_union()
+ } else {
+ self.add_union_reverse()
+ }?;
+ self.patch(compiled.end, plus)?;
+ self.patch(plus, compiled.start)?;
+
+ let question = if greedy {
+ self.add_union()
+ } else {
+ self.add_union_reverse()
+ }?;
+ let empty = self.add_empty()?;
+ self.patch(question, compiled.start)?;
+ self.patch(question, empty)?;
+ self.patch(plus, empty)?;
+ Ok(ThompsonRef { start: question, end: empty })
+ } else if n == 1 {
+ let compiled = self.c(expr)?;
+ let union = if greedy {
+ self.add_union()
+ } else {
+ self.add_union_reverse()
+ }?;
+ self.patch(compiled.end, union)?;
+ self.patch(union, compiled.start)?;
+ Ok(ThompsonRef { start: compiled.start, end: union })
+ } else {
+ let prefix = self.c_exactly(expr, n - 1)?;
+ let last = self.c(expr)?;
+ let union = if greedy {
+ self.add_union()
+ } else {
+ self.add_union_reverse()
+ }?;
+ self.patch(prefix.end, last.start)?;
+ self.patch(last.end, union)?;
+ self.patch(union, last.start)?;
+ Ok(ThompsonRef { start: prefix.start, end: union })
+ }
+ }
+
+ /// Compile the given expression such that it may be matched zero or one
+ /// times.
+ ///
+ /// When `greedy` is true, then the preference is for the expression to
+ /// match as much as possible. Otherwise, it will match as little as
+ /// possible.
+ fn c_zero_or_one(
+ &self,
+ expr: &Hir,
+ greedy: bool,
+ ) -> Result<ThompsonRef, BuildError> {
+ let union =
+ if greedy { self.add_union() } else { self.add_union_reverse() }?;
+ let compiled = self.c(expr)?;
+ let empty = self.add_empty()?;
+ self.patch(union, compiled.start)?;
+ self.patch(union, empty)?;
+ self.patch(compiled.end, empty)?;
+ Ok(ThompsonRef { start: union, end: empty })
+ }
+
+ /// Compile the given HIR expression exactly `n` times.
+ fn c_exactly(
+ &self,
+ expr: &Hir,
+ n: u32,
+ ) -> Result<ThompsonRef, BuildError> {
+ let it = (0..n).map(|_| self.c(expr));
+ self.c_concat(it)
+ }
+
+ /// Compile the given byte oriented character class.
+ ///
+ /// This uses "sparse" states to represent an alternation between ranges in
+ /// this character class. We can use "sparse" states instead of stitching
+ /// together a "union" state because all ranges in a character class have
+ /// equal priority *and* are non-overlapping (thus, only one can match, so
+ /// there's never a question of priority in the first place). This saves a
+ /// fair bit of overhead when traversing an NFA.
+ ///
+ /// This routine compiles an empty character class into a "fail" state.
+ fn c_byte_class(
+ &self,
+ cls: &hir::ClassBytes,
+ ) -> Result<ThompsonRef, BuildError> {
+ let end = self.add_empty()?;
+ let mut trans = Vec::with_capacity(cls.ranges().len());
+ for r in cls.iter() {
+ trans.push(Transition {
+ start: r.start(),
+ end: r.end(),
+ next: end,
+ });
+ }
+ Ok(ThompsonRef { start: self.add_sparse(trans)?, end })
+ }
+
+ /// Compile the given Unicode character class.
+ ///
+ /// This routine specifically tries to use various types of compression,
+ /// since UTF-8 automata of large classes can get quite large. The specific
+ /// type of compression used depends on forward vs reverse compilation, and
+ /// whether NFA shrinking is enabled or not.
+ ///
+ /// Aside from repetitions causing lots of repeat group, this is like the
+ /// single most expensive part of regex compilation. Therefore, a large part
+ /// of the expense of compilation may be reduce by disabling Unicode in the
+ /// pattern.
+ ///
+ /// This routine compiles an empty character class into a "fail" state.
+ fn c_unicode_class(
+ &self,
+ cls: &hir::ClassUnicode,
+ ) -> Result<ThompsonRef, BuildError> {
+ // If all we have are ASCII ranges wrapped in a Unicode package, then
+ // there is zero reason to bring out the big guns. We can fit all ASCII
+ // ranges within a single sparse state.
+ if cls.is_ascii() {
+ let end = self.add_empty()?;
+ let mut trans = Vec::with_capacity(cls.ranges().len());
+ for r in cls.iter() {
+ // The unwraps below are OK because we've verified that this
+ // class only contains ASCII codepoints.
+ trans.push(Transition {
+ // FIXME(1.59): use the 'TryFrom<char> for u8' impl.
+ start: u8::try_from(u32::from(r.start())).unwrap(),
+ end: u8::try_from(u32::from(r.end())).unwrap(),
+ next: end,
+ });
+ }
+ Ok(ThompsonRef { start: self.add_sparse(trans)?, end })
+ } else if self.is_reverse() {
+ if !self.config.get_shrink() {
+ // When we don't want to spend the extra time shrinking, we
+ // compile the UTF-8 automaton in reverse using something like
+ // the "naive" approach, but will attempt to re-use common
+ // suffixes.
+ self.c_unicode_class_reverse_with_suffix(cls)
+ } else {
+ // When we want to shrink our NFA for reverse UTF-8 automata,
+ // we cannot feed UTF-8 sequences directly to the UTF-8
+ // compiler, since the UTF-8 compiler requires all sequences
+ // to be lexicographically sorted. Instead, we organize our
+ // sequences into a range trie, which can then output our
+ // sequences in the correct order. Unfortunately, building the
+ // range trie is fairly expensive (but not nearly as expensive
+ // as building a DFA). Hence the reason why the 'shrink' option
+ // exists, so that this path can be toggled off. For example,
+ // we might want to turn this off if we know we won't be
+ // compiling a DFA.
+ let mut trie = self.trie_state.borrow_mut();
+ trie.clear();
+
+ for rng in cls.iter() {
+ for mut seq in Utf8Sequences::new(rng.start(), rng.end()) {
+ seq.reverse();
+ trie.insert(seq.as_slice());
+ }
+ }
+ let mut builder = self.builder.borrow_mut();
+ let mut utf8_state = self.utf8_state.borrow_mut();
+ let mut utf8c =
+ Utf8Compiler::new(&mut *builder, &mut *utf8_state)?;
+ trie.iter(|seq| {
+ utf8c.add(&seq)?;
+ Ok(())
+ })?;
+ utf8c.finish()
+ }
+ } else {
+ // In the forward direction, we always shrink our UTF-8 automata
+ // because we can stream it right into the UTF-8 compiler. There
+ // is almost no downside (in either memory or time) to using this
+ // approach.
+ let mut builder = self.builder.borrow_mut();
+ let mut utf8_state = self.utf8_state.borrow_mut();
+ let mut utf8c =
+ Utf8Compiler::new(&mut *builder, &mut *utf8_state)?;
+ for rng in cls.iter() {
+ for seq in Utf8Sequences::new(rng.start(), rng.end()) {
+ utf8c.add(seq.as_slice())?;
+ }
+ }
+ utf8c.finish()
+ }
+
+ // For reference, the code below is the "naive" version of compiling a
+ // UTF-8 automaton. It is deliciously simple (and works for both the
+ // forward and reverse cases), but will unfortunately produce very
+ // large NFAs. When compiling a forward automaton, the size difference
+ // can sometimes be an order of magnitude. For example, the '\w' regex
+ // will generate about ~3000 NFA states using the naive approach below,
+ // but only 283 states when using the approach above. This is because
+ // the approach above actually compiles a *minimal* (or near minimal,
+ // because of the bounded hashmap for reusing equivalent states) UTF-8
+ // automaton.
+ //
+ // The code below is kept as a reference point in order to make it
+ // easier to understand the higher level goal here. Although, it will
+ // almost certainly bit-rot, so keep that in mind. Also, if you try to
+ // use it, some of the tests in this module will fail because they look
+ // for terser byte code produce by the more optimized handling above.
+ // But the integration test suite should still pass.
+ //
+ // One good example of the substantial difference this can make is to
+ // compare and contrast performance of the Pike VM when the code below
+ // is active vs the code above. Here's an example to try:
+ //
+ // regex-cli find match pikevm -b -p '(?m)^\w{20}' -y '@$smallishru'
+ //
+ // With Unicode classes generated below, this search takes about 45s on
+ // my machine. But with the compressed version above, the search takes
+ // only around 1.4s. The NFA is also 20% smaller. This is in part due
+ // to the compression, but also because of the utilization of 'sparse'
+ // NFA states. They lead to much less state shuffling during the NFA
+ // search.
+ /*
+ let it = cls
+ .iter()
+ .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end()))
+ .map(|seq| {
+ let it = seq
+ .as_slice()
+ .iter()
+ .map(|rng| self.c_range(rng.start, rng.end));
+ self.c_concat(it)
+ });
+ self.c_alt_iter(it)
+ */
+ }
+
+ /// Compile the given Unicode character class in reverse with suffix
+ /// caching.
+ ///
+ /// This is a "quick" way to compile large Unicode classes into reverse
+ /// UTF-8 automata while doing a small amount of compression on that
+ /// automata by reusing common suffixes.
+ ///
+ /// A more comprehensive compression scheme can be accomplished by using
+ /// a range trie to efficiently sort a reverse sequence of UTF-8 byte
+ /// rqanges, and then use Daciuk's algorithm via `Utf8Compiler`.
+ ///
+ /// This is the technique used when "NFA shrinking" is disabled.
+ ///
+ /// (This also tries to use "sparse" states where possible, just like
+ /// `c_byte_class` does.)
+ fn c_unicode_class_reverse_with_suffix(
+ &self,
+ cls: &hir::ClassUnicode,
+ ) -> Result<ThompsonRef, BuildError> {
+ // N.B. It would likely be better to cache common *prefixes* in the
+ // reverse direction, but it's not quite clear how to do that. The
+ // advantage of caching suffixes is that it does give us a win, and
+ // has a very small additional overhead.
+ let mut cache = self.utf8_suffix.borrow_mut();
+ cache.clear();
+
+ let union = self.add_union()?;
+ let alt_end = self.add_empty()?;
+ for urng in cls.iter() {
+ for seq in Utf8Sequences::new(urng.start(), urng.end()) {
+ let mut end = alt_end;
+ for brng in seq.as_slice() {
+ let key = Utf8SuffixKey {
+ from: end,
+ start: brng.start,
+ end: brng.end,
+ };
+ let hash = cache.hash(&key);
+ if let Some(id) = cache.get(&key, hash) {
+ end = id;
+ continue;
+ }
+
+ let compiled = self.c_range(brng.start, brng.end)?;
+ self.patch(compiled.end, end)?;
+ end = compiled.start;
+ cache.set(key, hash, end);
+ }
+ self.patch(union, end)?;
+ }
+ }
+ Ok(ThompsonRef { start: union, end: alt_end })
+ }
+
+ /// Compile the given HIR look-around assertion to an NFA look-around
+ /// assertion.
+ fn c_look(&self, anchor: &hir::Look) -> Result<ThompsonRef, BuildError> {
+ let look = match *anchor {
+ hir::Look::Start => Look::Start,
+ hir::Look::End => Look::End,
+ hir::Look::StartLF => Look::StartLF,
+ hir::Look::EndLF => Look::EndLF,
+ hir::Look::StartCRLF => Look::StartCRLF,
+ hir::Look::EndCRLF => Look::EndCRLF,
+ hir::Look::WordAscii => Look::WordAscii,
+ hir::Look::WordAsciiNegate => Look::WordAsciiNegate,
+ hir::Look::WordUnicode => Look::WordUnicode,
+ hir::Look::WordUnicodeNegate => Look::WordUnicodeNegate,
+ };
+ let id = self.add_look(look)?;
+ Ok(ThompsonRef { start: id, end: id })
+ }
+
+ /// Compile the given byte string to a concatenation of bytes.
+ fn c_literal(&self, bytes: &[u8]) -> Result<ThompsonRef, BuildError> {
+ self.c_concat(bytes.iter().copied().map(|b| self.c_range(b, b)))
+ }
+
+ /// Compile a "range" state with one transition that may only be followed
+ /// if the input byte is in the (inclusive) range given.
+ ///
+ /// Both the `start` and `end` locations point to the state created.
+ /// Callers will likely want to keep the `start`, but patch the `end` to
+ /// point to some other state.
+ fn c_range(&self, start: u8, end: u8) -> Result<ThompsonRef, BuildError> {
+ let id = self.add_range(start, end)?;
+ Ok(ThompsonRef { start: id, end: id })
+ }
+
+ /// Compile an "empty" state with one unconditional epsilon transition.
+ ///
+ /// Both the `start` and `end` locations point to the state created.
+ /// Callers will likely want to keep the `start`, but patch the `end` to
+ /// point to some other state.
+ fn c_empty(&self) -> Result<ThompsonRef, BuildError> {
+ let id = self.add_empty()?;
+ Ok(ThompsonRef { start: id, end: id })
+ }
+
+ /// Compile a "fail" state that can never have any outgoing transitions.
+ fn c_fail(&self) -> Result<ThompsonRef, BuildError> {
+ let id = self.add_fail()?;
+ Ok(ThompsonRef { start: id, end: id })
+ }
+
+ // The below helpers are meant to be simple wrappers around the
+ // corresponding Builder methods. For the most part, they let us write
+ // 'self.add_foo()' instead of 'self.builder.borrow_mut().add_foo()', where
+ // the latter is a mouthful. Some of the methods do inject a little bit
+ // of extra logic. e.g., Flipping look-around operators when compiling in
+ // reverse mode.
+
+ fn patch(&self, from: StateID, to: StateID) -> Result<(), BuildError> {
+ self.builder.borrow_mut().patch(from, to)
+ }
+
+ fn start_pattern(&self) -> Result<PatternID, BuildError> {
+ self.builder.borrow_mut().start_pattern()
+ }
+
+ fn finish_pattern(
+ &self,
+ start_id: StateID,
+ ) -> Result<PatternID, BuildError> {
+ self.builder.borrow_mut().finish_pattern(start_id)
+ }
+
+ fn add_empty(&self) -> Result<StateID, BuildError> {
+ self.builder.borrow_mut().add_empty()
+ }
+
+ fn add_range(&self, start: u8, end: u8) -> Result<StateID, BuildError> {
+ self.builder.borrow_mut().add_range(Transition {
+ start,
+ end,
+ next: StateID::ZERO,
+ })
+ }
+
+ fn add_sparse(
+ &self,
+ ranges: Vec<Transition>,
+ ) -> Result<StateID, BuildError> {
+ self.builder.borrow_mut().add_sparse(ranges)
+ }
+
+ fn add_look(&self, mut look: Look) -> Result<StateID, BuildError> {
+ if self.is_reverse() {
+ look = look.reversed();
+ }
+ self.builder.borrow_mut().add_look(StateID::ZERO, look)
+ }
+
+ fn add_union(&self) -> Result<StateID, BuildError> {
+ self.builder.borrow_mut().add_union(vec![])
+ }
+
+ fn add_union_reverse(&self) -> Result<StateID, BuildError> {
+ self.builder.borrow_mut().add_union_reverse(vec![])
+ }
+
+ fn add_capture_start(
+ &self,
+ capture_index: u32,
+ name: Option<&str>,
+ ) -> Result<StateID, BuildError> {
+ let name = name.map(|n| Arc::from(n));
+ self.builder.borrow_mut().add_capture_start(
+ StateID::ZERO,
+ capture_index,
+ name,
+ )
+ }
+
+ fn add_capture_end(
+ &self,
+ capture_index: u32,
+ ) -> Result<StateID, BuildError> {
+ self.builder.borrow_mut().add_capture_end(StateID::ZERO, capture_index)
+ }
+
+ fn add_fail(&self) -> Result<StateID, BuildError> {
+ self.builder.borrow_mut().add_fail()
+ }
+
+ fn add_match(&self) -> Result<StateID, BuildError> {
+ self.builder.borrow_mut().add_match()
+ }
+
+ fn is_reverse(&self) -> bool {
+ self.config.get_reverse()
+ }
+}
+
+/// A value that represents the result of compiling a sub-expression of a
+/// regex's HIR. Specifically, this represents a sub-graph of the NFA that
+/// has an initial state at `start` and a final state at `end`.
+#[derive(Clone, Copy, Debug)]
+pub(crate) struct ThompsonRef {
+ pub(crate) start: StateID,
+ pub(crate) end: StateID,
+}
+
+/// A UTF-8 compiler based on Daciuk's algorithm for compilining minimal DFAs
+/// from a lexicographically sorted sequence of strings in linear time.
+///
+/// The trick here is that any Unicode codepoint range can be converted to
+/// a sequence of byte ranges that form a UTF-8 automaton. Connecting them
+/// together via an alternation is trivial, and indeed, it works. However,
+/// there is a lot of redundant structure in many UTF-8 automatons. Since our
+/// UTF-8 ranges are in lexicographic order, we can use Daciuk's algorithm
+/// to build nearly minimal DFAs in linear time. (They are guaranteed to be
+/// minimal because we use a bounded cache of previously build DFA states.)
+///
+/// The drawback is that this sadly doesn't work for reverse automata, since
+/// the ranges are no longer in lexicographic order. For that, we invented the
+/// range trie (which gets its own module). Once a range trie is built, we then
+/// use this same Utf8Compiler to build a reverse UTF-8 automaton.
+///
+/// The high level idea is described here:
+/// https://blog.burntsushi.net/transducers/#finite-state-machines-as-data-structures
+///
+/// There is also another implementation of this in the `fst` crate.
+#[derive(Debug)]
+struct Utf8Compiler<'a> {
+ builder: &'a mut Builder,
+ state: &'a mut Utf8State,
+ target: StateID,
+}
+
+#[derive(Clone, Debug)]
+struct Utf8State {
+ compiled: Utf8BoundedMap,
+ uncompiled: Vec<Utf8Node>,
+}
+
+#[derive(Clone, Debug)]
+struct Utf8Node {
+ trans: Vec<Transition>,
+ last: Option<Utf8LastTransition>,
+}
+
+#[derive(Clone, Debug)]
+struct Utf8LastTransition {
+ start: u8,
+ end: u8,
+}
+
+impl Utf8State {
+ fn new() -> Utf8State {
+ Utf8State { compiled: Utf8BoundedMap::new(10_000), uncompiled: vec![] }
+ }
+
+ fn clear(&mut self) {
+ self.compiled.clear();
+ self.uncompiled.clear();
+ }
+}
+
+impl<'a> Utf8Compiler<'a> {
+ fn new(
+ builder: &'a mut Builder,
+ state: &'a mut Utf8State,
+ ) -> Result<Utf8Compiler<'a>, BuildError> {
+ let target = builder.add_empty()?;
+ state.clear();
+ let mut utf8c = Utf8Compiler { builder, state, target };
+ utf8c.add_empty();
+ Ok(utf8c)
+ }
+
+ fn finish(&mut self) -> Result<ThompsonRef, BuildError> {
+ self.compile_from(0)?;
+ let node = self.pop_root();
+ let start = self.compile(node)?;
+ Ok(ThompsonRef { start, end: self.target })
+ }
+
+ fn add(&mut self, ranges: &[Utf8Range]) -> Result<(), BuildError> {
+ let prefix_len = ranges
+ .iter()
+ .zip(&self.state.uncompiled)
+ .take_while(|&(range, node)| {
+ node.last.as_ref().map_or(false, |t| {
+ (t.start, t.end) == (range.start, range.end)
+ })
+ })
+ .count();
+ assert!(prefix_len < ranges.len());
+ self.compile_from(prefix_len)?;
+ self.add_suffix(&ranges[prefix_len..]);
+ Ok(())
+ }
+
+ fn compile_from(&mut self, from: usize) -> Result<(), BuildError> {
+ let mut next = self.target;
+ while from + 1 < self.state.uncompiled.len() {
+ let node = self.pop_freeze(next);
+ next = self.compile(node)?;
+ }
+ self.top_last_freeze(next);
+ Ok(())
+ }
+
+ fn compile(
+ &mut self,
+ node: Vec<Transition>,
+ ) -> Result<StateID, BuildError> {
+ let hash = self.state.compiled.hash(&node);
+ if let Some(id) = self.state.compiled.get(&node, hash) {
+ return Ok(id);
+ }
+ let id = self.builder.add_sparse(node.clone())?;
+ self.state.compiled.set(node, hash, id);
+ Ok(id)
+ }
+
+ fn add_suffix(&mut self, ranges: &[Utf8Range]) {
+ assert!(!ranges.is_empty());
+ let last = self
+ .state
+ .uncompiled
+ .len()
+ .checked_sub(1)
+ .expect("non-empty nodes");
+ assert!(self.state.uncompiled[last].last.is_none());
+ self.state.uncompiled[last].last = Some(Utf8LastTransition {
+ start: ranges[0].start,
+ end: ranges[0].end,
+ });
+ for r in &ranges[1..] {
+ self.state.uncompiled.push(Utf8Node {
+ trans: vec![],
+ last: Some(Utf8LastTransition { start: r.start, end: r.end }),
+ });
+ }
+ }
+
+ fn add_empty(&mut self) {
+ self.state.uncompiled.push(Utf8Node { trans: vec![], last: None });
+ }
+
+ fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> {
+ let mut uncompiled = self.state.uncompiled.pop().unwrap();
+ uncompiled.set_last_transition(next);
+ uncompiled.trans
+ }
+
+ fn pop_root(&mut self) -> Vec<Transition> {
+ assert_eq!(self.state.uncompiled.len(), 1);
+ assert!(self.state.uncompiled[0].last.is_none());
+ self.state.uncompiled.pop().expect("non-empty nodes").trans
+ }
+
+ fn top_last_freeze(&mut self, next: StateID) {
+ let last = self
+ .state
+ .uncompiled
+ .len()
+ .checked_sub(1)
+ .expect("non-empty nodes");
+ self.state.uncompiled[last].set_last_transition(next);
+ }
+}
+
+impl Utf8Node {
+ fn set_last_transition(&mut self, next: StateID) {
+ if let Some(last) = self.last.take() {
+ self.trans.push(Transition {
+ start: last.start,
+ end: last.end,
+ next,
+ });
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use alloc::{vec, vec::Vec};
+
+ use crate::{
+ nfa::thompson::{SparseTransitions, State, Transition, NFA},
+ util::primitives::{PatternID, SmallIndex, StateID},
+ };
+
+ use super::*;
+
+ fn build(pattern: &str) -> NFA {
+ NFA::compiler()
+ .configure(
+ NFA::config()
+ .which_captures(WhichCaptures::None)
+ .unanchored_prefix(false),
+ )
+ .build(pattern)
+ .unwrap()
+ }
+
+ fn pid(id: usize) -> PatternID {
+ PatternID::new(id).unwrap()
+ }
+
+ fn sid(id: usize) -> StateID {
+ StateID::new(id).unwrap()
+ }
+
+ fn s_byte(byte: u8, next: usize) -> State {
+ let next = sid(next);
+ let trans = Transition { start: byte, end: byte, next };
+ State::ByteRange { trans }
+ }
+
+ fn s_range(start: u8, end: u8, next: usize) -> State {
+ let next = sid(next);
+ let trans = Transition { start, end, next };
+ State::ByteRange { trans }
+ }
+
+ fn s_sparse(transitions: &[(u8, u8, usize)]) -> State {
+ let transitions = transitions
+ .iter()
+ .map(|&(start, end, next)| Transition {
+ start,
+ end,
+ next: sid(next),
+ })
+ .collect();
+ State::Sparse(SparseTransitions { transitions })
+ }
+
+ fn s_bin_union(alt1: usize, alt2: usize) -> State {
+ State::BinaryUnion { alt1: sid(alt1), alt2: sid(alt2) }
+ }
+
+ fn s_union(alts: &[usize]) -> State {
+ State::Union {
+ alternates: alts
+ .iter()
+ .map(|&id| sid(id))
+ .collect::<Vec<StateID>>()
+ .into_boxed_slice(),
+ }
+ }
+
+ fn s_cap(next: usize, pattern: usize, index: usize, slot: usize) -> State {
+ State::Capture {
+ next: sid(next),
+ pattern_id: pid(pattern),
+ group_index: SmallIndex::new(index).unwrap(),
+ slot: SmallIndex::new(slot).unwrap(),
+ }
+ }
+
+ fn s_fail() -> State {
+ State::Fail
+ }
+
+ fn s_match(id: usize) -> State {
+ State::Match { pattern_id: pid(id) }
+ }
+
+ // Test that building an unanchored NFA has an appropriate `(?s:.)*?`
+ // prefix.
+ #[test]
+ fn compile_unanchored_prefix() {
+ let nfa = NFA::compiler()
+ .configure(NFA::config().which_captures(WhichCaptures::None))
+ .build(r"a")
+ .unwrap();
+ assert_eq!(
+ nfa.states(),
+ &[
+ s_bin_union(2, 1),
+ s_range(0, 255, 0),
+ s_byte(b'a', 3),
+ s_match(0),
+ ]
+ );
+ }
+
+ #[test]
+ fn compile_empty() {
+ assert_eq!(build("").states(), &[s_match(0),]);
+ }
+
+ #[test]
+ fn compile_literal() {
+ assert_eq!(build("a").states(), &[s_byte(b'a', 1), s_match(0),]);
+ assert_eq!(
+ build("ab").states(),
+ &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0),]
+ );
+ assert_eq!(
+ build("ā˜ƒ").states(),
+ &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(0)]
+ );
+
+ // Check that non-UTF-8 literals work.
+ let nfa = NFA::compiler()
+ .configure(
+ NFA::config()
+ .which_captures(WhichCaptures::None)
+ .unanchored_prefix(false),
+ )
+ .syntax(crate::util::syntax::Config::new().utf8(false))
+ .build(r"(?-u)\xFF")
+ .unwrap();
+ assert_eq!(nfa.states(), &[s_byte(b'\xFF', 1), s_match(0),]);
+ }
+
+ #[test]
+ fn compile_class_ascii() {
+ assert_eq!(
+ build(r"[a-z]").states(),
+ &[s_range(b'a', b'z', 1), s_match(0),]
+ );
+ assert_eq!(
+ build(r"[x-za-c]").states(),
+ &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match(0)]
+ );
+ }
+
+ #[test]
+ #[cfg(not(miri))]
+ fn compile_class_unicode() {
+ assert_eq!(
+ build(r"[\u03B1-\u03B4]").states(),
+ &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match(0)]
+ );
+ assert_eq!(
+ build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states(),
+ &[
+ s_range(0xB1, 0xB4, 5),
+ s_range(0x99, 0x9E, 5),
+ s_byte(0xA4, 1),
+ s_byte(0x9F, 2),
+ s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]),
+ s_match(0),
+ ]
+ );
+ assert_eq!(
+ build(r"[a-zā˜ƒ]").states(),
+ &[
+ s_byte(0x83, 3),
+ s_byte(0x98, 0),
+ s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]),
+ s_match(0),
+ ]
+ );
+ }
+
+ #[test]
+ fn compile_repetition() {
+ assert_eq!(
+ build(r"a?").states(),
+ &[s_bin_union(1, 2), s_byte(b'a', 2), s_match(0),]
+ );
+ assert_eq!(
+ build(r"a??").states(),
+ &[s_bin_union(2, 1), s_byte(b'a', 2), s_match(0),]
+ );
+ }
+
+ #[test]
+ fn compile_group() {
+ assert_eq!(
+ build(r"ab+").states(),
+ &[s_byte(b'a', 1), s_byte(b'b', 2), s_bin_union(1, 3), s_match(0)]
+ );
+ assert_eq!(
+ build(r"(ab)").states(),
+ &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0)]
+ );
+ assert_eq!(
+ build(r"(ab)+").states(),
+ &[s_byte(b'a', 1), s_byte(b'b', 2), s_bin_union(0, 3), s_match(0)]
+ );
+ }
+
+ #[test]
+ fn compile_alternation() {
+ assert_eq!(
+ build(r"a|b").states(),
+ &[s_range(b'a', b'b', 1), s_match(0)]
+ );
+ assert_eq!(
+ build(r"ab|cd").states(),
+ &[
+ s_byte(b'b', 3),
+ s_byte(b'd', 3),
+ s_sparse(&[(b'a', b'a', 0), (b'c', b'c', 1)]),
+ s_match(0)
+ ],
+ );
+ assert_eq!(
+ build(r"|b").states(),
+ &[s_byte(b'b', 2), s_bin_union(2, 0), s_match(0)]
+ );
+ assert_eq!(
+ build(r"a|").states(),
+ &[s_byte(b'a', 2), s_bin_union(0, 2), s_match(0)]
+ );
+ }
+
+ // This tests the use of a non-binary union, i.e., a state with more than
+ // 2 unconditional epsilon transitions. The only place they tend to appear
+ // is in reverse NFAs when shrinking is disabled. Otherwise, 'binary-union'
+ // and 'sparse' tend to cover all other cases of alternation.
+ #[test]
+ fn compile_non_binary_union() {
+ let nfa = NFA::compiler()
+ .configure(
+ NFA::config()
+ .which_captures(WhichCaptures::None)
+ .reverse(true)
+ .shrink(false)
+ .unanchored_prefix(false),
+ )
+ .build(r"[\u1000\u2000\u3000]")
+ .unwrap();
+ assert_eq!(
+ nfa.states(),
+ &[
+ s_union(&[3, 6, 9]),
+ s_byte(0xE1, 10),
+ s_byte(0x80, 1),
+ s_byte(0x80, 2),
+ s_byte(0xE2, 10),
+ s_byte(0x80, 4),
+ s_byte(0x80, 5),
+ s_byte(0xE3, 10),
+ s_byte(0x80, 7),
+ s_byte(0x80, 8),
+ s_match(0),
+ ]
+ );
+ }
+
+ #[test]
+ fn compile_many_start_pattern() {
+ let nfa = NFA::compiler()
+ .configure(
+ NFA::config()
+ .which_captures(WhichCaptures::None)
+ .unanchored_prefix(false),
+ )
+ .build_many(&["a", "b"])
+ .unwrap();
+ assert_eq!(
+ nfa.states(),
+ &[
+ s_byte(b'a', 1),
+ s_match(0),
+ s_byte(b'b', 3),
+ s_match(1),
+ s_bin_union(0, 2),
+ ]
+ );
+ assert_eq!(nfa.start_anchored().as_usize(), 4);
+ assert_eq!(nfa.start_unanchored().as_usize(), 4);
+ // Test that the start states for each individual pattern are correct.
+ assert_eq!(nfa.start_pattern(pid(0)).unwrap(), sid(0));
+ assert_eq!(nfa.start_pattern(pid(1)).unwrap(), sid(2));
+ }
+
+ // This tests that our compiler can handle an empty character class. At the
+ // time of writing, the regex parser forbids it, so the only way to test it
+ // is to provide a hand written HIR.
+ #[test]
+ fn empty_class_bytes() {
+ use regex_syntax::hir::{Class, ClassBytes, Hir};
+
+ let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![])));
+ let config = NFA::config()
+ .which_captures(WhichCaptures::None)
+ .unanchored_prefix(false);
+ let nfa =
+ NFA::compiler().configure(config).build_from_hir(&hir).unwrap();
+ assert_eq!(nfa.states(), &[s_fail(), s_match(0)]);
+ }
+
+ // Like empty_class_bytes, but for a Unicode class.
+ #[test]
+ fn empty_class_unicode() {
+ use regex_syntax::hir::{Class, ClassUnicode, Hir};
+
+ let hir = Hir::class(Class::Unicode(ClassUnicode::new(vec![])));
+ let config = NFA::config()
+ .which_captures(WhichCaptures::None)
+ .unanchored_prefix(false);
+ let nfa =
+ NFA::compiler().configure(config).build_from_hir(&hir).unwrap();
+ assert_eq!(nfa.states(), &[s_fail(), s_match(0)]);
+ }
+
+ #[test]
+ fn compile_captures_all() {
+ let nfa = NFA::compiler()
+ .configure(
+ NFA::config()
+ .unanchored_prefix(false)
+ .which_captures(WhichCaptures::All),
+ )
+ .build("a(b)c")
+ .unwrap();
+ assert_eq!(
+ nfa.states(),
+ &[
+ s_cap(1, 0, 0, 0),
+ s_byte(b'a', 2),
+ s_cap(3, 0, 1, 2),
+ s_byte(b'b', 4),
+ s_cap(5, 0, 1, 3),
+ s_byte(b'c', 6),
+ s_cap(7, 0, 0, 1),
+ s_match(0)
+ ]
+ );
+ let ginfo = nfa.group_info();
+ assert_eq!(2, ginfo.all_group_len());
+ }
+
+ #[test]
+ fn compile_captures_implicit() {
+ let nfa = NFA::compiler()
+ .configure(
+ NFA::config()
+ .unanchored_prefix(false)
+ .which_captures(WhichCaptures::Implicit),
+ )
+ .build("a(b)c")
+ .unwrap();
+ assert_eq!(
+ nfa.states(),
+ &[
+ s_cap(1, 0, 0, 0),
+ s_byte(b'a', 2),
+ s_byte(b'b', 3),
+ s_byte(b'c', 4),
+ s_cap(5, 0, 0, 1),
+ s_match(0)
+ ]
+ );
+ let ginfo = nfa.group_info();
+ assert_eq!(1, ginfo.all_group_len());
+ }
+
+ #[test]
+ fn compile_captures_none() {
+ let nfa = NFA::compiler()
+ .configure(
+ NFA::config()
+ .unanchored_prefix(false)
+ .which_captures(WhichCaptures::None),
+ )
+ .build("a(b)c")
+ .unwrap();
+ assert_eq!(
+ nfa.states(),
+ &[s_byte(b'a', 1), s_byte(b'b', 2), s_byte(b'c', 3), s_match(0)]
+ );
+ let ginfo = nfa.group_info();
+ assert_eq!(0, ginfo.all_group_len());
+ }
+}