summaryrefslogtreecommitdiffstats
path: root/third_party/rust/regex/src/exec.rs
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/rust/regex/src/exec.rs')
-rw-r--r--third_party/rust/regex/src/exec.rs1655
1 files changed, 1655 insertions, 0 deletions
diff --git a/third_party/rust/regex/src/exec.rs b/third_party/rust/regex/src/exec.rs
new file mode 100644
index 0000000000..e75ca083a0
--- /dev/null
+++ b/third_party/rust/regex/src/exec.rs
@@ -0,0 +1,1655 @@
+use std::cell::RefCell;
+use std::collections::HashMap;
+use std::panic::AssertUnwindSafe;
+use std::sync::Arc;
+
+#[cfg(feature = "perf-literal")]
+use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
+use regex_syntax::hir::literal::Literals;
+use regex_syntax::hir::Hir;
+use regex_syntax::ParserBuilder;
+
+use crate::backtrack;
+use crate::compile::Compiler;
+#[cfg(feature = "perf-dfa")]
+use crate::dfa;
+use crate::error::Error;
+use crate::input::{ByteInput, CharInput};
+use crate::literal::LiteralSearcher;
+use crate::pikevm;
+use crate::pool::{Pool, PoolGuard};
+use crate::prog::Program;
+use crate::re_builder::RegexOptions;
+use crate::re_bytes;
+use crate::re_set;
+use crate::re_trait::{Locations, RegularExpression, Slot};
+use crate::re_unicode;
+use crate::utf8::next_utf8;
+
+/// `Exec` manages the execution of a regular expression.
+///
+/// In particular, this manages the various compiled forms of a single regular
+/// expression and the choice of which matching engine to use to execute a
+/// regular expression.
+#[derive(Debug)]
+pub struct Exec {
+ /// All read only state.
+ ro: Arc<ExecReadOnly>,
+ /// A pool of reusable values for the various matching engines.
+ ///
+ /// Note that boxing this value is not strictly necessary, but it is an
+ /// easy way to ensure that T does not bloat the stack sized used by a pool
+ /// in the case where T is big. And this turns out to be the case at the
+ /// time of writing for regex's use of this pool. At the time of writing,
+ /// the size of a Regex on the stack is 856 bytes. Boxing this value
+ /// reduces that size to 16 bytes.
+ pool: Box<Pool<ProgramCache>>,
+}
+
+/// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This
+/// means it is no longer Sync, but we can now avoid the overhead of
+/// synchronization to fetch the cache.
+#[derive(Debug)]
+pub struct ExecNoSync<'c> {
+ /// All read only state.
+ ro: &'c Arc<ExecReadOnly>,
+ /// Caches for the various matching engines.
+ cache: PoolGuard<'c, ProgramCache>,
+}
+
+/// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8].
+#[derive(Debug)]
+pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>);
+
+/// `ExecReadOnly` comprises all read only state for a regex. Namely, all such
+/// state is determined at compile time and never changes during search.
+#[derive(Debug)]
+struct ExecReadOnly {
+ /// The original regular expressions given by the caller to compile.
+ res: Vec<String>,
+ /// A compiled program that is used in the NFA simulation and backtracking.
+ /// It can be byte-based or Unicode codepoint based.
+ ///
+ /// N.B. It is not possibly to make this byte-based from the public API.
+ /// It is only used for testing byte based programs in the NFA simulations.
+ nfa: Program,
+ /// A compiled byte based program for DFA execution. This is only used
+ /// if a DFA can be executed. (Currently, only word boundary assertions are
+ /// not supported.) Note that this program contains an embedded `.*?`
+ /// preceding the first capture group, unless the regex is anchored at the
+ /// beginning.
+ dfa: Program,
+ /// The same as above, except the program is reversed (and there is no
+ /// preceding `.*?`). This is used by the DFA to find the starting location
+ /// of matches.
+ dfa_reverse: Program,
+ /// A set of suffix literals extracted from the regex.
+ ///
+ /// Prefix literals are stored on the `Program`, since they are used inside
+ /// the matching engines.
+ suffixes: LiteralSearcher,
+ /// An Aho-Corasick automaton with leftmost-first match semantics.
+ ///
+ /// This is only set when the entire regex is a simple unanchored
+ /// alternation of literals. We could probably use it more circumstances,
+ /// but this is already hacky enough in this architecture.
+ ///
+ /// N.B. We use u32 as a state ID representation under the assumption that
+ /// if we were to exhaust the ID space, we probably would have long
+ /// surpassed the compilation size limit.
+ #[cfg(feature = "perf-literal")]
+ ac: Option<AhoCorasick<u32>>,
+ /// match_type encodes as much upfront knowledge about how we're going to
+ /// execute a search as possible.
+ match_type: MatchType,
+}
+
+/// Facilitates the construction of an executor by exposing various knobs
+/// to control how a regex is executed and what kinds of resources it's
+/// permitted to use.
+// `ExecBuilder` is only public via the `internal` module, so avoid deriving
+// `Debug`.
+#[allow(missing_debug_implementations)]
+pub struct ExecBuilder {
+ options: RegexOptions,
+ match_type: Option<MatchType>,
+ bytes: bool,
+ only_utf8: bool,
+}
+
+/// Parsed represents a set of parsed regular expressions and their detected
+/// literals.
+struct Parsed {
+ exprs: Vec<Hir>,
+ prefixes: Literals,
+ suffixes: Literals,
+ bytes: bool,
+}
+
+impl ExecBuilder {
+ /// Create a regex execution builder.
+ ///
+ /// This uses default settings for everything except the regex itself,
+ /// which must be provided. Further knobs can be set by calling methods,
+ /// and then finally, `build` to actually create the executor.
+ pub fn new(re: &str) -> Self {
+ Self::new_many(&[re])
+ }
+
+ /// Like new, but compiles the union of the given regular expressions.
+ ///
+ /// Note that when compiling 2 or more regular expressions, capture groups
+ /// are completely unsupported. (This means both `find` and `captures`
+ /// won't work.)
+ pub fn new_many<I, S>(res: I) -> Self
+ where
+ S: AsRef<str>,
+ I: IntoIterator<Item = S>,
+ {
+ let mut opts = RegexOptions::default();
+ opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect();
+ Self::new_options(opts)
+ }
+
+ /// Create a regex execution builder.
+ pub fn new_options(opts: RegexOptions) -> Self {
+ ExecBuilder {
+ options: opts,
+ match_type: None,
+ bytes: false,
+ only_utf8: true,
+ }
+ }
+
+ /// Set the matching engine to be automatically determined.
+ ///
+ /// This is the default state and will apply whatever optimizations are
+ /// possible, such as running a DFA.
+ ///
+ /// This overrides whatever was previously set via the `nfa` or
+ /// `bounded_backtracking` methods.
+ pub fn automatic(mut self) -> Self {
+ self.match_type = None;
+ self
+ }
+
+ /// Sets the matching engine to use the NFA algorithm no matter what
+ /// optimizations are possible.
+ ///
+ /// This overrides whatever was previously set via the `automatic` or
+ /// `bounded_backtracking` methods.
+ pub fn nfa(mut self) -> Self {
+ self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM));
+ self
+ }
+
+ /// Sets the matching engine to use a bounded backtracking engine no
+ /// matter what optimizations are possible.
+ ///
+ /// One must use this with care, since the bounded backtracking engine
+ /// uses memory proportion to `len(regex) * len(text)`.
+ ///
+ /// This overrides whatever was previously set via the `automatic` or
+ /// `nfa` methods.
+ pub fn bounded_backtracking(mut self) -> Self {
+ self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack));
+ self
+ }
+
+ /// Compiles byte based programs for use with the NFA matching engines.
+ ///
+ /// By default, the NFA engines match on Unicode scalar values. They can
+ /// be made to use byte based programs instead. In general, the byte based
+ /// programs are slower because of a less efficient encoding of character
+ /// classes.
+ ///
+ /// Note that this does not impact DFA matching engines, which always
+ /// execute on bytes.
+ pub fn bytes(mut self, yes: bool) -> Self {
+ self.bytes = yes;
+ self
+ }
+
+ /// When disabled, the program compiled may match arbitrary bytes.
+ ///
+ /// When enabled (the default), all compiled programs exclusively match
+ /// valid UTF-8 bytes.
+ pub fn only_utf8(mut self, yes: bool) -> Self {
+ self.only_utf8 = yes;
+ self
+ }
+
+ /// Set the Unicode flag.
+ pub fn unicode(mut self, yes: bool) -> Self {
+ self.options.unicode = yes;
+ self
+ }
+
+ /// Parse the current set of patterns into their AST and extract literals.
+ fn parse(&self) -> Result<Parsed, Error> {
+ let mut exprs = Vec::with_capacity(self.options.pats.len());
+ let mut prefixes = Some(Literals::empty());
+ let mut suffixes = Some(Literals::empty());
+ let mut bytes = false;
+ let is_set = self.options.pats.len() > 1;
+ // If we're compiling a regex set and that set has any anchored
+ // expressions, then disable all literal optimizations.
+ for pat in &self.options.pats {
+ let mut parser = ParserBuilder::new()
+ .octal(self.options.octal)
+ .case_insensitive(self.options.case_insensitive)
+ .multi_line(self.options.multi_line)
+ .dot_matches_new_line(self.options.dot_matches_new_line)
+ .swap_greed(self.options.swap_greed)
+ .ignore_whitespace(self.options.ignore_whitespace)
+ .unicode(self.options.unicode)
+ .allow_invalid_utf8(!self.only_utf8)
+ .nest_limit(self.options.nest_limit)
+ .build();
+ let expr =
+ parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?;
+ bytes = bytes || !expr.is_always_utf8();
+
+ if cfg!(feature = "perf-literal") {
+ if !expr.is_anchored_start() && expr.is_any_anchored_start() {
+ // Partial anchors unfortunately make it hard to use
+ // prefixes, so disable them.
+ prefixes = None;
+ } else if is_set && expr.is_anchored_start() {
+ // Regex sets with anchors do not go well with literal
+ // optimizations.
+ prefixes = None;
+ }
+ prefixes = prefixes.and_then(|mut prefixes| {
+ if !prefixes.union_prefixes(&expr) {
+ None
+ } else {
+ Some(prefixes)
+ }
+ });
+
+ if !expr.is_anchored_end() && expr.is_any_anchored_end() {
+ // Partial anchors unfortunately make it hard to use
+ // suffixes, so disable them.
+ suffixes = None;
+ } else if is_set && expr.is_anchored_end() {
+ // Regex sets with anchors do not go well with literal
+ // optimizations.
+ suffixes = None;
+ }
+ suffixes = suffixes.and_then(|mut suffixes| {
+ if !suffixes.union_suffixes(&expr) {
+ None
+ } else {
+ Some(suffixes)
+ }
+ });
+ }
+ exprs.push(expr);
+ }
+ Ok(Parsed {
+ exprs,
+ prefixes: prefixes.unwrap_or_else(Literals::empty),
+ suffixes: suffixes.unwrap_or_else(Literals::empty),
+ bytes,
+ })
+ }
+
+ /// Build an executor that can run a regular expression.
+ pub fn build(self) -> Result<Exec, Error> {
+ // Special case when we have no patterns to compile.
+ // This can happen when compiling a regex set.
+ if self.options.pats.is_empty() {
+ let ro = Arc::new(ExecReadOnly {
+ res: vec![],
+ nfa: Program::new(),
+ dfa: Program::new(),
+ dfa_reverse: Program::new(),
+ suffixes: LiteralSearcher::empty(),
+ #[cfg(feature = "perf-literal")]
+ ac: None,
+ match_type: MatchType::Nothing,
+ });
+ let pool = ExecReadOnly::new_pool(&ro);
+ return Ok(Exec { ro, pool });
+ }
+ let parsed = self.parse()?;
+ let mut nfa = Compiler::new()
+ .size_limit(self.options.size_limit)
+ .bytes(self.bytes || parsed.bytes)
+ .only_utf8(self.only_utf8)
+ .compile(&parsed.exprs)?;
+ let mut dfa = Compiler::new()
+ .size_limit(self.options.size_limit)
+ .dfa(true)
+ .only_utf8(self.only_utf8)
+ .compile(&parsed.exprs)?;
+ let mut dfa_reverse = Compiler::new()
+ .size_limit(self.options.size_limit)
+ .dfa(true)
+ .only_utf8(self.only_utf8)
+ .reverse(true)
+ .compile(&parsed.exprs)?;
+
+ #[cfg(feature = "perf-literal")]
+ let ac = self.build_aho_corasick(&parsed);
+ nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes);
+ dfa.prefixes = nfa.prefixes.clone();
+ dfa.dfa_size_limit = self.options.dfa_size_limit;
+ dfa_reverse.dfa_size_limit = self.options.dfa_size_limit;
+
+ let mut ro = ExecReadOnly {
+ res: self.options.pats,
+ nfa,
+ dfa,
+ dfa_reverse,
+ suffixes: LiteralSearcher::suffixes(parsed.suffixes),
+ #[cfg(feature = "perf-literal")]
+ ac,
+ match_type: MatchType::Nothing,
+ };
+ ro.match_type = ro.choose_match_type(self.match_type);
+
+ let ro = Arc::new(ro);
+ let pool = ExecReadOnly::new_pool(&ro);
+ Ok(Exec { ro, pool })
+ }
+
+ #[cfg(feature = "perf-literal")]
+ fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>> {
+ if parsed.exprs.len() != 1 {
+ return None;
+ }
+ let lits = match alternation_literals(&parsed.exprs[0]) {
+ None => return None,
+ Some(lits) => lits,
+ };
+ // If we have a small number of literals, then let Teddy handle
+ // things (see literal/mod.rs).
+ if lits.len() <= 32 {
+ return None;
+ }
+ Some(
+ AhoCorasickBuilder::new()
+ .match_kind(MatchKind::LeftmostFirst)
+ .auto_configure(&lits)
+ .build_with_size::<u32, _, _>(&lits)
+ // This should never happen because we'd long exceed the
+ // compilation limit for regexes first.
+ .expect("AC automaton too big"),
+ )
+ }
+}
+
+impl<'c> RegularExpression for ExecNoSyncStr<'c> {
+ type Text = str;
+
+ fn slots_len(&self) -> usize {
+ self.0.slots_len()
+ }
+
+ fn next_after_empty(&self, text: &str, i: usize) -> usize {
+ next_utf8(text.as_bytes(), i)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> {
+ self.0.shortest_match_at(text.as_bytes(), start)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn is_match_at(&self, text: &str, start: usize) -> bool {
+ self.0.is_match_at(text.as_bytes(), start)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
+ self.0.find_at(text.as_bytes(), start)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn captures_read_at(
+ &self,
+ locs: &mut Locations,
+ text: &str,
+ start: usize,
+ ) -> Option<(usize, usize)> {
+ self.0.captures_read_at(locs, text.as_bytes(), start)
+ }
+}
+
+impl<'c> RegularExpression for ExecNoSync<'c> {
+ type Text = [u8];
+
+ /// Returns the number of capture slots in the regular expression. (There
+ /// are two slots for every capture group, corresponding to possibly empty
+ /// start and end locations of the capture.)
+ fn slots_len(&self) -> usize {
+ self.ro.nfa.captures.len() * 2
+ }
+
+ fn next_after_empty(&self, _text: &[u8], i: usize) -> usize {
+ i + 1
+ }
+
+ /// Returns the end of a match location, possibly occurring before the
+ /// end location of the correct leftmost-first match.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> {
+ if !self.is_anchor_end_match(text) {
+ return None;
+ }
+ match self.ro.match_type {
+ #[cfg(feature = "perf-literal")]
+ MatchType::Literal(ty) => {
+ self.find_literals(ty, text, start).map(|(_, e)| e)
+ }
+ #[cfg(feature = "perf-dfa")]
+ MatchType::Dfa | MatchType::DfaMany => {
+ match self.shortest_dfa(text, start) {
+ dfa::Result::Match(end) => Some(end),
+ dfa::Result::NoMatch(_) => None,
+ dfa::Result::Quit => self.shortest_nfa(text, start),
+ }
+ }
+ #[cfg(feature = "perf-dfa")]
+ MatchType::DfaAnchoredReverse => {
+ match dfa::Fsm::reverse(
+ &self.ro.dfa_reverse,
+ self.cache.value(),
+ true,
+ &text[start..],
+ text.len(),
+ ) {
+ dfa::Result::Match(_) => Some(text.len()),
+ dfa::Result::NoMatch(_) => None,
+ dfa::Result::Quit => self.shortest_nfa(text, start),
+ }
+ }
+ #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
+ MatchType::DfaSuffix => {
+ match self.shortest_dfa_reverse_suffix(text, start) {
+ dfa::Result::Match(e) => Some(e),
+ dfa::Result::NoMatch(_) => None,
+ dfa::Result::Quit => self.shortest_nfa(text, start),
+ }
+ }
+ MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start),
+ MatchType::Nothing => None,
+ }
+ }
+
+ /// Returns true if and only if the regex matches text.
+ ///
+ /// For single regular expressions, this is equivalent to calling
+ /// shortest_match(...).is_some().
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn is_match_at(&self, text: &[u8], start: usize) -> bool {
+ if !self.is_anchor_end_match(text) {
+ return false;
+ }
+ // We need to do this dance because shortest_match relies on the NFA
+ // filling in captures[1], but a RegexSet has no captures. In other
+ // words, a RegexSet can't (currently) use shortest_match. ---AG
+ match self.ro.match_type {
+ #[cfg(feature = "perf-literal")]
+ MatchType::Literal(ty) => {
+ self.find_literals(ty, text, start).is_some()
+ }
+ #[cfg(feature = "perf-dfa")]
+ MatchType::Dfa | MatchType::DfaMany => {
+ match self.shortest_dfa(text, start) {
+ dfa::Result::Match(_) => true,
+ dfa::Result::NoMatch(_) => false,
+ dfa::Result::Quit => self.match_nfa(text, start),
+ }
+ }
+ #[cfg(feature = "perf-dfa")]
+ MatchType::DfaAnchoredReverse => {
+ match dfa::Fsm::reverse(
+ &self.ro.dfa_reverse,
+ self.cache.value(),
+ true,
+ &text[start..],
+ text.len(),
+ ) {
+ dfa::Result::Match(_) => true,
+ dfa::Result::NoMatch(_) => false,
+ dfa::Result::Quit => self.match_nfa(text, start),
+ }
+ }
+ #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
+ MatchType::DfaSuffix => {
+ match self.shortest_dfa_reverse_suffix(text, start) {
+ dfa::Result::Match(_) => true,
+ dfa::Result::NoMatch(_) => false,
+ dfa::Result::Quit => self.match_nfa(text, start),
+ }
+ }
+ MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start),
+ MatchType::Nothing => false,
+ }
+ }
+
+ /// Finds the start and end location of the leftmost-first match, starting
+ /// at the given location.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> {
+ if !self.is_anchor_end_match(text) {
+ return None;
+ }
+ match self.ro.match_type {
+ #[cfg(feature = "perf-literal")]
+ MatchType::Literal(ty) => self.find_literals(ty, text, start),
+ #[cfg(feature = "perf-dfa")]
+ MatchType::Dfa => match self.find_dfa_forward(text, start) {
+ dfa::Result::Match((s, e)) => Some((s, e)),
+ dfa::Result::NoMatch(_) => None,
+ dfa::Result::Quit => {
+ self.find_nfa(MatchNfaType::Auto, text, start)
+ }
+ },
+ #[cfg(feature = "perf-dfa")]
+ MatchType::DfaAnchoredReverse => {
+ match self.find_dfa_anchored_reverse(text, start) {
+ dfa::Result::Match((s, e)) => Some((s, e)),
+ dfa::Result::NoMatch(_) => None,
+ dfa::Result::Quit => {
+ self.find_nfa(MatchNfaType::Auto, text, start)
+ }
+ }
+ }
+ #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
+ MatchType::DfaSuffix => {
+ match self.find_dfa_reverse_suffix(text, start) {
+ dfa::Result::Match((s, e)) => Some((s, e)),
+ dfa::Result::NoMatch(_) => None,
+ dfa::Result::Quit => {
+ self.find_nfa(MatchNfaType::Auto, text, start)
+ }
+ }
+ }
+ MatchType::Nfa(ty) => self.find_nfa(ty, text, start),
+ MatchType::Nothing => None,
+ #[cfg(feature = "perf-dfa")]
+ MatchType::DfaMany => {
+ unreachable!("BUG: RegexSet cannot be used with find")
+ }
+ }
+ }
+
+ /// Finds the start and end location of the leftmost-first match and also
+ /// fills in all matching capture groups.
+ ///
+ /// The number of capture slots given should be equal to the total number
+ /// of capture slots in the compiled program.
+ ///
+ /// Note that the first two slots always correspond to the start and end
+ /// locations of the overall match.
+ fn captures_read_at(
+ &self,
+ locs: &mut Locations,
+ text: &[u8],
+ start: usize,
+ ) -> Option<(usize, usize)> {
+ let slots = locs.as_slots();
+ for slot in slots.iter_mut() {
+ *slot = None;
+ }
+ // If the caller unnecessarily uses this, then we try to save them
+ // from themselves.
+ match slots.len() {
+ 0 => return self.find_at(text, start),
+ 2 => {
+ return self.find_at(text, start).map(|(s, e)| {
+ slots[0] = Some(s);
+ slots[1] = Some(e);
+ (s, e)
+ });
+ }
+ _ => {} // fallthrough
+ }
+ if !self.is_anchor_end_match(text) {
+ return None;
+ }
+ match self.ro.match_type {
+ #[cfg(feature = "perf-literal")]
+ MatchType::Literal(ty) => {
+ self.find_literals(ty, text, start).and_then(|(s, e)| {
+ self.captures_nfa_type(
+ MatchNfaType::Auto,
+ slots,
+ text,
+ s,
+ e,
+ )
+ })
+ }
+ #[cfg(feature = "perf-dfa")]
+ MatchType::Dfa => {
+ if self.ro.nfa.is_anchored_start {
+ self.captures_nfa(slots, text, start)
+ } else {
+ match self.find_dfa_forward(text, start) {
+ dfa::Result::Match((s, e)) => self.captures_nfa_type(
+ MatchNfaType::Auto,
+ slots,
+ text,
+ s,
+ e,
+ ),
+ dfa::Result::NoMatch(_) => None,
+ dfa::Result::Quit => {
+ self.captures_nfa(slots, text, start)
+ }
+ }
+ }
+ }
+ #[cfg(feature = "perf-dfa")]
+ MatchType::DfaAnchoredReverse => {
+ match self.find_dfa_anchored_reverse(text, start) {
+ dfa::Result::Match((s, e)) => self.captures_nfa_type(
+ MatchNfaType::Auto,
+ slots,
+ text,
+ s,
+ e,
+ ),
+ dfa::Result::NoMatch(_) => None,
+ dfa::Result::Quit => self.captures_nfa(slots, text, start),
+ }
+ }
+ #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
+ MatchType::DfaSuffix => {
+ match self.find_dfa_reverse_suffix(text, start) {
+ dfa::Result::Match((s, e)) => self.captures_nfa_type(
+ MatchNfaType::Auto,
+ slots,
+ text,
+ s,
+ e,
+ ),
+ dfa::Result::NoMatch(_) => None,
+ dfa::Result::Quit => self.captures_nfa(slots, text, start),
+ }
+ }
+ MatchType::Nfa(ty) => {
+ self.captures_nfa_type(ty, slots, text, start, text.len())
+ }
+ MatchType::Nothing => None,
+ #[cfg(feature = "perf-dfa")]
+ MatchType::DfaMany => {
+ unreachable!("BUG: RegexSet cannot be used with captures")
+ }
+ }
+ }
+}
+
+impl<'c> ExecNoSync<'c> {
+ /// Finds the leftmost-first match using only literal search.
+ #[cfg(feature = "perf-literal")]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn find_literals(
+ &self,
+ ty: MatchLiteralType,
+ text: &[u8],
+ start: usize,
+ ) -> Option<(usize, usize)> {
+ use self::MatchLiteralType::*;
+ match ty {
+ Unanchored => {
+ let lits = &self.ro.nfa.prefixes;
+ lits.find(&text[start..]).map(|(s, e)| (start + s, start + e))
+ }
+ AnchoredStart => {
+ let lits = &self.ro.nfa.prefixes;
+ if start == 0 || !self.ro.nfa.is_anchored_start {
+ lits.find_start(&text[start..])
+ .map(|(s, e)| (start + s, start + e))
+ } else {
+ None
+ }
+ }
+ AnchoredEnd => {
+ let lits = &self.ro.suffixes;
+ lits.find_end(&text[start..])
+ .map(|(s, e)| (start + s, start + e))
+ }
+ AhoCorasick => self
+ .ro
+ .ac
+ .as_ref()
+ .unwrap()
+ .find(&text[start..])
+ .map(|m| (start + m.start(), start + m.end())),
+ }
+ }
+
+ /// Finds the leftmost-first match (start and end) using only the DFA.
+ ///
+ /// If the result returned indicates that the DFA quit, then another
+ /// matching engine should be used.
+ #[cfg(feature = "perf-dfa")]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn find_dfa_forward(
+ &self,
+ text: &[u8],
+ start: usize,
+ ) -> dfa::Result<(usize, usize)> {
+ use crate::dfa::Result::*;
+ let end = match dfa::Fsm::forward(
+ &self.ro.dfa,
+ self.cache.value(),
+ false,
+ text,
+ start,
+ ) {
+ NoMatch(i) => return NoMatch(i),
+ Quit => return Quit,
+ Match(end) if start == end => return Match((start, start)),
+ Match(end) => end,
+ };
+ // Now run the DFA in reverse to find the start of the match.
+ match dfa::Fsm::reverse(
+ &self.ro.dfa_reverse,
+ self.cache.value(),
+ false,
+ &text[start..],
+ end - start,
+ ) {
+ Match(s) => Match((start + s, end)),
+ NoMatch(i) => NoMatch(i),
+ Quit => Quit,
+ }
+ }
+
+ /// Finds the leftmost-first match (start and end) using only the DFA,
+ /// but assumes the regex is anchored at the end and therefore starts at
+ /// the end of the regex and matches in reverse.
+ ///
+ /// If the result returned indicates that the DFA quit, then another
+ /// matching engine should be used.
+ #[cfg(feature = "perf-dfa")]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn find_dfa_anchored_reverse(
+ &self,
+ text: &[u8],
+ start: usize,
+ ) -> dfa::Result<(usize, usize)> {
+ use crate::dfa::Result::*;
+ match dfa::Fsm::reverse(
+ &self.ro.dfa_reverse,
+ self.cache.value(),
+ false,
+ &text[start..],
+ text.len() - start,
+ ) {
+ Match(s) => Match((start + s, text.len())),
+ NoMatch(i) => NoMatch(i),
+ Quit => Quit,
+ }
+ }
+
+ /// Finds the end of the shortest match using only the DFA.
+ #[cfg(feature = "perf-dfa")]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> {
+ dfa::Fsm::forward(&self.ro.dfa, self.cache.value(), true, text, start)
+ }
+
+ /// Finds the end of the shortest match using only the DFA by scanning for
+ /// suffix literals.
+ #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn shortest_dfa_reverse_suffix(
+ &self,
+ text: &[u8],
+ start: usize,
+ ) -> dfa::Result<usize> {
+ match self.exec_dfa_reverse_suffix(text, start) {
+ None => self.shortest_dfa(text, start),
+ Some(r) => r.map(|(_, end)| end),
+ }
+ }
+
+ /// Finds the end of the shortest match using only the DFA by scanning for
+ /// suffix literals. It also reports the start of the match.
+ ///
+ /// Note that if None is returned, then the optimization gave up to avoid
+ /// worst case quadratic behavior. A forward scanning DFA should be tried
+ /// next.
+ ///
+ /// If a match is returned and the full leftmost-first match is desired,
+ /// then a forward scan starting from the beginning of the match must be
+ /// done.
+ ///
+ /// If the result returned indicates that the DFA quit, then another
+ /// matching engine should be used.
+ #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn exec_dfa_reverse_suffix(
+ &self,
+ text: &[u8],
+ original_start: usize,
+ ) -> Option<dfa::Result<(usize, usize)>> {
+ use crate::dfa::Result::*;
+
+ let lcs = self.ro.suffixes.lcs();
+ debug_assert!(lcs.len() >= 1);
+ let mut start = original_start;
+ let mut end = start;
+ let mut last_literal = start;
+ while end <= text.len() {
+ last_literal += match lcs.find(&text[last_literal..]) {
+ None => return Some(NoMatch(text.len())),
+ Some(i) => i,
+ };
+ end = last_literal + lcs.len();
+ match dfa::Fsm::reverse(
+ &self.ro.dfa_reverse,
+ self.cache.value(),
+ false,
+ &text[start..end],
+ end - start,
+ ) {
+ Match(0) | NoMatch(0) => return None,
+ Match(i) => return Some(Match((start + i, end))),
+ NoMatch(i) => {
+ start += i;
+ last_literal += 1;
+ continue;
+ }
+ Quit => return Some(Quit),
+ };
+ }
+ Some(NoMatch(text.len()))
+ }
+
+ /// Finds the leftmost-first match (start and end) using only the DFA
+ /// by scanning for suffix literals.
+ ///
+ /// If the result returned indicates that the DFA quit, then another
+ /// matching engine should be used.
+ #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn find_dfa_reverse_suffix(
+ &self,
+ text: &[u8],
+ start: usize,
+ ) -> dfa::Result<(usize, usize)> {
+ use crate::dfa::Result::*;
+
+ let match_start = match self.exec_dfa_reverse_suffix(text, start) {
+ None => return self.find_dfa_forward(text, start),
+ Some(Match((start, _))) => start,
+ Some(r) => return r,
+ };
+ // At this point, we've found a match. The only way to quit now
+ // without a match is if the DFA gives up (seems unlikely).
+ //
+ // Now run the DFA forwards to find the proper end of the match.
+ // (The suffix literal match can only indicate the earliest
+ // possible end location, which may appear before the end of the
+ // leftmost-first match.)
+ match dfa::Fsm::forward(
+ &self.ro.dfa,
+ self.cache.value(),
+ false,
+ text,
+ match_start,
+ ) {
+ NoMatch(_) => panic!("BUG: reverse match implies forward match"),
+ Quit => Quit,
+ Match(e) => Match((match_start, e)),
+ }
+ }
+
+ /// Executes the NFA engine to return whether there is a match or not.
+ ///
+ /// Ideally, we could use shortest_nfa(...).is_some() and get the same
+ /// performance characteristics, but regex sets don't have captures, which
+ /// shortest_nfa depends on.
+ #[cfg(feature = "perf-dfa")]
+ fn match_nfa(&self, text: &[u8], start: usize) -> bool {
+ self.match_nfa_type(MatchNfaType::Auto, text, start)
+ }
+
+ /// Like match_nfa, but allows specification of the type of NFA engine.
+ fn match_nfa_type(
+ &self,
+ ty: MatchNfaType,
+ text: &[u8],
+ start: usize,
+ ) -> bool {
+ self.exec_nfa(
+ ty,
+ &mut [false],
+ &mut [],
+ true,
+ false,
+ text,
+ start,
+ text.len(),
+ )
+ }
+
+ /// Finds the shortest match using an NFA.
+ #[cfg(feature = "perf-dfa")]
+ fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> {
+ self.shortest_nfa_type(MatchNfaType::Auto, text, start)
+ }
+
+ /// Like shortest_nfa, but allows specification of the type of NFA engine.
+ fn shortest_nfa_type(
+ &self,
+ ty: MatchNfaType,
+ text: &[u8],
+ start: usize,
+ ) -> Option<usize> {
+ let mut slots = [None, None];
+ if self.exec_nfa(
+ ty,
+ &mut [false],
+ &mut slots,
+ true,
+ true,
+ text,
+ start,
+ text.len(),
+ ) {
+ slots[1]
+ } else {
+ None
+ }
+ }
+
+ /// Like find, but executes an NFA engine.
+ fn find_nfa(
+ &self,
+ ty: MatchNfaType,
+ text: &[u8],
+ start: usize,
+ ) -> Option<(usize, usize)> {
+ let mut slots = [None, None];
+ if self.exec_nfa(
+ ty,
+ &mut [false],
+ &mut slots,
+ false,
+ false,
+ text,
+ start,
+ text.len(),
+ ) {
+ match (slots[0], slots[1]) {
+ (Some(s), Some(e)) => Some((s, e)),
+ _ => None,
+ }
+ } else {
+ None
+ }
+ }
+
+ /// Like find_nfa, but fills in captures.
+ ///
+ /// `slots` should have length equal to `2 * nfa.captures.len()`.
+ #[cfg(feature = "perf-dfa")]
+ fn captures_nfa(
+ &self,
+ slots: &mut [Slot],
+ text: &[u8],
+ start: usize,
+ ) -> Option<(usize, usize)> {
+ self.captures_nfa_type(
+ MatchNfaType::Auto,
+ slots,
+ text,
+ start,
+ text.len(),
+ )
+ }
+
+ /// Like captures_nfa, but allows specification of type of NFA engine.
+ fn captures_nfa_type(
+ &self,
+ ty: MatchNfaType,
+ slots: &mut [Slot],
+ text: &[u8],
+ start: usize,
+ end: usize,
+ ) -> Option<(usize, usize)> {
+ if self.exec_nfa(
+ ty,
+ &mut [false],
+ slots,
+ false,
+ false,
+ text,
+ start,
+ end,
+ ) {
+ match (slots[0], slots[1]) {
+ (Some(s), Some(e)) => Some((s, e)),
+ _ => None,
+ }
+ } else {
+ None
+ }
+ }
+
+ fn exec_nfa(
+ &self,
+ mut ty: MatchNfaType,
+ matches: &mut [bool],
+ slots: &mut [Slot],
+ quit_after_match: bool,
+ quit_after_match_with_pos: bool,
+ text: &[u8],
+ start: usize,
+ end: usize,
+ ) -> bool {
+ use self::MatchNfaType::*;
+ if let Auto = ty {
+ if backtrack::should_exec(self.ro.nfa.len(), text.len()) {
+ ty = Backtrack;
+ } else {
+ ty = PikeVM;
+ }
+ }
+ // The backtracker can't return the shortest match position as it is
+ // implemented today. So if someone calls `shortest_match` and we need
+ // to run an NFA, then use the PikeVM.
+ if quit_after_match_with_pos || ty == PikeVM {
+ self.exec_pikevm(
+ matches,
+ slots,
+ quit_after_match,
+ text,
+ start,
+ end,
+ )
+ } else {
+ self.exec_backtrack(matches, slots, text, start, end)
+ }
+ }
+
+ /// Always run the NFA algorithm.
+ fn exec_pikevm(
+ &self,
+ matches: &mut [bool],
+ slots: &mut [Slot],
+ quit_after_match: bool,
+ text: &[u8],
+ start: usize,
+ end: usize,
+ ) -> bool {
+ if self.ro.nfa.uses_bytes() {
+ pikevm::Fsm::exec(
+ &self.ro.nfa,
+ self.cache.value(),
+ matches,
+ slots,
+ quit_after_match,
+ ByteInput::new(text, self.ro.nfa.only_utf8),
+ start,
+ end,
+ )
+ } else {
+ pikevm::Fsm::exec(
+ &self.ro.nfa,
+ self.cache.value(),
+ matches,
+ slots,
+ quit_after_match,
+ CharInput::new(text),
+ start,
+ end,
+ )
+ }
+ }
+
+ /// Always runs the NFA using bounded backtracking.
+ fn exec_backtrack(
+ &self,
+ matches: &mut [bool],
+ slots: &mut [Slot],
+ text: &[u8],
+ start: usize,
+ end: usize,
+ ) -> bool {
+ if self.ro.nfa.uses_bytes() {
+ backtrack::Bounded::exec(
+ &self.ro.nfa,
+ self.cache.value(),
+ matches,
+ slots,
+ ByteInput::new(text, self.ro.nfa.only_utf8),
+ start,
+ end,
+ )
+ } else {
+ backtrack::Bounded::exec(
+ &self.ro.nfa,
+ self.cache.value(),
+ matches,
+ slots,
+ CharInput::new(text),
+ start,
+ end,
+ )
+ }
+ }
+
+ /// Finds which regular expressions match the given text.
+ ///
+ /// `matches` should have length equal to the number of regexes being
+ /// searched.
+ ///
+ /// This is only useful when one wants to know which regexes in a set
+ /// match some text.
+ pub fn many_matches_at(
+ &self,
+ matches: &mut [bool],
+ text: &[u8],
+ start: usize,
+ ) -> bool {
+ use self::MatchType::*;
+ if !self.is_anchor_end_match(text) {
+ return false;
+ }
+ match self.ro.match_type {
+ #[cfg(feature = "perf-literal")]
+ Literal(ty) => {
+ debug_assert_eq!(matches.len(), 1);
+ matches[0] = self.find_literals(ty, text, start).is_some();
+ matches[0]
+ }
+ #[cfg(feature = "perf-dfa")]
+ Dfa | DfaAnchoredReverse | DfaMany => {
+ match dfa::Fsm::forward_many(
+ &self.ro.dfa,
+ self.cache.value(),
+ matches,
+ text,
+ start,
+ ) {
+ dfa::Result::Match(_) => true,
+ dfa::Result::NoMatch(_) => false,
+ dfa::Result::Quit => self.exec_nfa(
+ MatchNfaType::Auto,
+ matches,
+ &mut [],
+ false,
+ false,
+ text,
+ start,
+ text.len(),
+ ),
+ }
+ }
+ #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
+ DfaSuffix => {
+ match dfa::Fsm::forward_many(
+ &self.ro.dfa,
+ self.cache.value(),
+ matches,
+ text,
+ start,
+ ) {
+ dfa::Result::Match(_) => true,
+ dfa::Result::NoMatch(_) => false,
+ dfa::Result::Quit => self.exec_nfa(
+ MatchNfaType::Auto,
+ matches,
+ &mut [],
+ false,
+ false,
+ text,
+ start,
+ text.len(),
+ ),
+ }
+ }
+ Nfa(ty) => self.exec_nfa(
+ ty,
+ matches,
+ &mut [],
+ false,
+ false,
+ text,
+ start,
+ text.len(),
+ ),
+ Nothing => false,
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn is_anchor_end_match(&self, text: &[u8]) -> bool {
+ #[cfg(not(feature = "perf-literal"))]
+ fn imp(_: &ExecReadOnly, _: &[u8]) -> bool {
+ true
+ }
+
+ #[cfg(feature = "perf-literal")]
+ fn imp(ro: &ExecReadOnly, text: &[u8]) -> bool {
+ // Only do this check if the haystack is big (>1MB).
+ if text.len() > (1 << 20) && ro.nfa.is_anchored_end {
+ let lcs = ro.suffixes.lcs();
+ if lcs.len() >= 1 && !lcs.is_suffix(text) {
+ return false;
+ }
+ }
+ true
+ }
+
+ imp(&self.ro, text)
+ }
+
+ pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
+ &self.ro.nfa.capture_name_idx
+ }
+}
+
+impl<'c> ExecNoSyncStr<'c> {
+ pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
+ self.0.capture_name_idx()
+ }
+}
+
+impl Exec {
+ /// Get a searcher that isn't Sync.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub fn searcher(&self) -> ExecNoSync<'_> {
+ ExecNoSync {
+ ro: &self.ro, // a clone is too expensive here! (and not needed)
+ cache: self.pool.get(),
+ }
+ }
+
+ /// Get a searcher that isn't Sync and can match on &str.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub fn searcher_str(&self) -> ExecNoSyncStr<'_> {
+ ExecNoSyncStr(self.searcher())
+ }
+
+ /// Build a Regex from this executor.
+ pub fn into_regex(self) -> re_unicode::Regex {
+ re_unicode::Regex::from(self)
+ }
+
+ /// Build a RegexSet from this executor.
+ pub fn into_regex_set(self) -> re_set::unicode::RegexSet {
+ re_set::unicode::RegexSet::from(self)
+ }
+
+ /// Build a Regex from this executor that can match arbitrary bytes.
+ pub fn into_byte_regex(self) -> re_bytes::Regex {
+ re_bytes::Regex::from(self)
+ }
+
+ /// Build a RegexSet from this executor that can match arbitrary bytes.
+ pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet {
+ re_set::bytes::RegexSet::from(self)
+ }
+
+ /// The original regular expressions given by the caller that were
+ /// compiled.
+ pub fn regex_strings(&self) -> &[String] {
+ &self.ro.res
+ }
+
+ /// Return a slice of capture names.
+ ///
+ /// Any capture that isn't named is None.
+ pub fn capture_names(&self) -> &[Option<String>] {
+ &self.ro.nfa.captures
+ }
+
+ /// Return a reference to named groups mapping (from group name to
+ /// group position).
+ pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
+ &self.ro.nfa.capture_name_idx
+ }
+}
+
+impl Clone for Exec {
+ fn clone(&self) -> Exec {
+ let pool = ExecReadOnly::new_pool(&self.ro);
+ Exec { ro: self.ro.clone(), pool }
+ }
+}
+
+impl ExecReadOnly {
+ fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType {
+ if let Some(MatchType::Nfa(_)) = hint {
+ return hint.unwrap();
+ }
+ // If the NFA is empty, then we'll never match anything.
+ if self.nfa.insts.is_empty() {
+ return MatchType::Nothing;
+ }
+ if let Some(literalty) = self.choose_literal_match_type() {
+ return literalty;
+ }
+ if let Some(dfaty) = self.choose_dfa_match_type() {
+ return dfaty;
+ }
+ // We're so totally hosed.
+ MatchType::Nfa(MatchNfaType::Auto)
+ }
+
+ /// If a plain literal scan can be used, then a corresponding literal
+ /// search type is returned.
+ fn choose_literal_match_type(&self) -> Option<MatchType> {
+ #[cfg(not(feature = "perf-literal"))]
+ fn imp(_: &ExecReadOnly) -> Option<MatchType> {
+ None
+ }
+
+ #[cfg(feature = "perf-literal")]
+ fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
+ // If our set of prefixes is complete, then we can use it to find
+ // a match in lieu of a regex engine. This doesn't quite work well
+ // in the presence of multiple regexes, so only do it when there's
+ // one.
+ //
+ // TODO(burntsushi): Also, don't try to match literals if the regex
+ // is partially anchored. We could technically do it, but we'd need
+ // to create two sets of literals: all of them and then the subset
+ // that aren't anchored. We would then only search for all of them
+ // when at the beginning of the input and use the subset in all
+ // other cases.
+ if ro.res.len() != 1 {
+ return None;
+ }
+ if ro.ac.is_some() {
+ return Some(MatchType::Literal(
+ MatchLiteralType::AhoCorasick,
+ ));
+ }
+ if ro.nfa.prefixes.complete() {
+ return if ro.nfa.is_anchored_start {
+ Some(MatchType::Literal(MatchLiteralType::AnchoredStart))
+ } else {
+ Some(MatchType::Literal(MatchLiteralType::Unanchored))
+ };
+ }
+ if ro.suffixes.complete() {
+ return if ro.nfa.is_anchored_end {
+ Some(MatchType::Literal(MatchLiteralType::AnchoredEnd))
+ } else {
+ // This case shouldn't happen. When the regex isn't
+ // anchored, then complete prefixes should imply complete
+ // suffixes.
+ Some(MatchType::Literal(MatchLiteralType::Unanchored))
+ };
+ }
+ None
+ }
+
+ imp(self)
+ }
+
+ /// If a DFA scan can be used, then choose the appropriate DFA strategy.
+ fn choose_dfa_match_type(&self) -> Option<MatchType> {
+ #[cfg(not(feature = "perf-dfa"))]
+ fn imp(_: &ExecReadOnly) -> Option<MatchType> {
+ None
+ }
+
+ #[cfg(feature = "perf-dfa")]
+ fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
+ if !dfa::can_exec(&ro.dfa) {
+ return None;
+ }
+ // Regex sets require a slightly specialized path.
+ if ro.res.len() >= 2 {
+ return Some(MatchType::DfaMany);
+ }
+ // If the regex is anchored at the end but not the start, then
+ // just match in reverse from the end of the haystack.
+ if !ro.nfa.is_anchored_start && ro.nfa.is_anchored_end {
+ return Some(MatchType::DfaAnchoredReverse);
+ }
+ #[cfg(feature = "perf-literal")]
+ {
+ // If there's a longish suffix literal, then it might be faster
+ // to look for that first.
+ if ro.should_suffix_scan() {
+ return Some(MatchType::DfaSuffix);
+ }
+ }
+ // Fall back to your garden variety forward searching lazy DFA.
+ Some(MatchType::Dfa)
+ }
+
+ imp(self)
+ }
+
+ /// Returns true if the program is amenable to suffix scanning.
+ ///
+ /// When this is true, as a heuristic, we assume it is OK to quickly scan
+ /// for suffix literals and then do a *reverse* DFA match from any matches
+ /// produced by the literal scan. (And then followed by a forward DFA
+ /// search, since the previously found suffix literal maybe not actually be
+ /// the end of a match.)
+ ///
+ /// This is a bit of a specialized optimization, but can result in pretty
+ /// big performance wins if 1) there are no prefix literals and 2) the
+ /// suffix literals are pretty rare in the text. (1) is obviously easy to
+ /// account for but (2) is harder. As a proxy, we assume that longer
+ /// strings are generally rarer, so we only enable this optimization when
+ /// we have a meaty suffix.
+ #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
+ fn should_suffix_scan(&self) -> bool {
+ if self.suffixes.is_empty() {
+ return false;
+ }
+ let lcs_len = self.suffixes.lcs().char_len();
+ lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len()
+ }
+
+ fn new_pool(ro: &Arc<ExecReadOnly>) -> Box<Pool<ProgramCache>> {
+ let ro = ro.clone();
+ Box::new(Pool::new(Box::new(move || {
+ AssertUnwindSafe(RefCell::new(ProgramCacheInner::new(&ro)))
+ })))
+ }
+}
+
+#[derive(Clone, Copy, Debug)]
+enum MatchType {
+ /// A single or multiple literal search. This is only used when the regex
+ /// can be decomposed into a literal search.
+ #[cfg(feature = "perf-literal")]
+ Literal(MatchLiteralType),
+ /// A normal DFA search.
+ #[cfg(feature = "perf-dfa")]
+ Dfa,
+ /// A reverse DFA search starting from the end of a haystack.
+ #[cfg(feature = "perf-dfa")]
+ DfaAnchoredReverse,
+ /// A reverse DFA search with suffix literal scanning.
+ #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
+ DfaSuffix,
+ /// Use the DFA on two or more regular expressions.
+ #[cfg(feature = "perf-dfa")]
+ DfaMany,
+ /// An NFA variant.
+ Nfa(MatchNfaType),
+ /// No match is ever possible, so don't ever try to search.
+ Nothing,
+}
+
+#[derive(Clone, Copy, Debug)]
+#[cfg(feature = "perf-literal")]
+enum MatchLiteralType {
+ /// Match literals anywhere in text.
+ Unanchored,
+ /// Match literals only at the start of text.
+ AnchoredStart,
+ /// Match literals only at the end of text.
+ AnchoredEnd,
+ /// Use an Aho-Corasick automaton. This requires `ac` to be Some on
+ /// ExecReadOnly.
+ AhoCorasick,
+}
+
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+enum MatchNfaType {
+ /// Choose between Backtrack and PikeVM.
+ Auto,
+ /// NFA bounded backtracking.
+ ///
+ /// (This is only set by tests, since it never makes sense to always want
+ /// backtracking.)
+ Backtrack,
+ /// The Pike VM.
+ ///
+ /// (This is only set by tests, since it never makes sense to always want
+ /// the Pike VM.)
+ PikeVM,
+}
+
+/// `ProgramCache` maintains reusable allocations for each matching engine
+/// available to a particular program.
+///
+/// We declare this as unwind safe since it's a cache that's only used for
+/// performance purposes. If a panic occurs, it is (or should be) always safe
+/// to continue using the same regex object.
+pub type ProgramCache = AssertUnwindSafe<RefCell<ProgramCacheInner>>;
+
+#[derive(Debug)]
+pub struct ProgramCacheInner {
+ pub pikevm: pikevm::Cache,
+ pub backtrack: backtrack::Cache,
+ #[cfg(feature = "perf-dfa")]
+ pub dfa: dfa::Cache,
+ #[cfg(feature = "perf-dfa")]
+ pub dfa_reverse: dfa::Cache,
+}
+
+impl ProgramCacheInner {
+ fn new(ro: &ExecReadOnly) -> Self {
+ ProgramCacheInner {
+ pikevm: pikevm::Cache::new(&ro.nfa),
+ backtrack: backtrack::Cache::new(&ro.nfa),
+ #[cfg(feature = "perf-dfa")]
+ dfa: dfa::Cache::new(&ro.dfa),
+ #[cfg(feature = "perf-dfa")]
+ dfa_reverse: dfa::Cache::new(&ro.dfa_reverse),
+ }
+ }
+}
+
+/// Alternation literals checks if the given HIR is a simple alternation of
+/// literals, and if so, returns them. Otherwise, this returns None.
+#[cfg(feature = "perf-literal")]
+fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
+ use regex_syntax::hir::{HirKind, Literal};
+
+ // This is pretty hacky, but basically, if `is_alternation_literal` is
+ // true, then we can make several assumptions about the structure of our
+ // HIR. This is what justifies the `unreachable!` statements below.
+ //
+ // This code should be refactored once we overhaul this crate's
+ // optimization pipeline, because this is a terribly inflexible way to go
+ // about things.
+
+ if !expr.is_alternation_literal() {
+ return None;
+ }
+ let alts = match *expr.kind() {
+ HirKind::Alternation(ref alts) => alts,
+ _ => return None, // one literal isn't worth it
+ };
+
+ let extendlit = |lit: &Literal, dst: &mut Vec<u8>| match *lit {
+ Literal::Unicode(c) => {
+ let mut buf = [0; 4];
+ dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
+ }
+ Literal::Byte(b) => {
+ dst.push(b);
+ }
+ };
+
+ let mut lits = vec![];
+ for alt in alts {
+ let mut lit = vec![];
+ match *alt.kind() {
+ HirKind::Literal(ref x) => extendlit(x, &mut lit),
+ HirKind::Concat(ref exprs) => {
+ for e in exprs {
+ match *e.kind() {
+ HirKind::Literal(ref x) => extendlit(x, &mut lit),
+ _ => unreachable!("expected literal, got {:?}", e),
+ }
+ }
+ }
+ _ => unreachable!("expected literal or concat, got {:?}", alt),
+ }
+ lits.push(lit);
+ }
+ Some(lits)
+}
+
+#[cfg(test)]
+mod test {
+ #[test]
+ fn uppercut_s_backtracking_bytes_default_bytes_mismatch() {
+ use crate::internal::ExecBuilder;
+
+ let backtrack_bytes_re = ExecBuilder::new("^S")
+ .bounded_backtracking()
+ .only_utf8(false)
+ .build()
+ .map(|exec| exec.into_byte_regex())
+ .map_err(|err| format!("{}", err))
+ .unwrap();
+
+ let default_bytes_re = ExecBuilder::new("^S")
+ .only_utf8(false)
+ .build()
+ .map(|exec| exec.into_byte_regex())
+ .map_err(|err| format!("{}", err))
+ .unwrap();
+
+ let input = vec![83, 83];
+
+ let s1 = backtrack_bytes_re.split(&input);
+ let s2 = default_bytes_re.split(&input);
+ for (chunk1, chunk2) in s1.zip(s2) {
+ assert_eq!(chunk1, chunk2);
+ }
+ }
+
+ #[test]
+ fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() {
+ use crate::internal::ExecBuilder;
+
+ let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)")
+ .bounded_backtracking()
+ .bytes(true)
+ .build()
+ .map(|exec| exec.into_regex())
+ .map_err(|err| format!("{}", err))
+ .unwrap();
+
+ let default_bytes_re = ExecBuilder::new(r"^(?u:\*)")
+ .bytes(true)
+ .build()
+ .map(|exec| exec.into_regex())
+ .map_err(|err| format!("{}", err))
+ .unwrap();
+
+ let input = "**";
+
+ let s1 = backtrack_bytes_re.split(input);
+ let s2 = default_bytes_re.split(input);
+ for (chunk1, chunk2) in s1.zip(s2) {
+ assert_eq!(chunk1, chunk2);
+ }
+ }
+}