summaryrefslogtreecommitdiffstats
path: root/vendor/regex/src/exec.rs
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/regex/src/exec.rs')
-rw-r--r--vendor/regex/src/exec.rs1759
1 files changed, 0 insertions, 1759 deletions
diff --git a/vendor/regex/src/exec.rs b/vendor/regex/src/exec.rs
deleted file mode 100644
index ee8b589d2..000000000
--- a/vendor/regex/src/exec.rs
+++ /dev/null
@@ -1,1759 +0,0 @@
-use std::cell::RefCell;
-use std::collections::HashMap;
-use std::panic::AssertUnwindSafe;
-use std::sync::Arc;
-
-#[cfg(feature = "perf-literal")]
-use aho_corasick::{AhoCorasick, MatchKind};
-use regex_syntax::hir::literal;
-use regex_syntax::hir::{Hir, Look};
-use regex_syntax::ParserBuilder;
-
-use crate::backtrack;
-use crate::compile::Compiler;
-#[cfg(feature = "perf-dfa")]
-use crate::dfa;
-use crate::error::Error;
-use crate::input::{ByteInput, CharInput};
-use crate::literal::LiteralSearcher;
-use crate::pikevm;
-use crate::pool::{Pool, PoolGuard};
-use crate::prog::Program;
-use crate::re_builder::RegexOptions;
-use crate::re_bytes;
-use crate::re_set;
-use crate::re_trait::{Locations, RegularExpression, Slot};
-use crate::re_unicode;
-use crate::utf8::next_utf8;
-
-/// `Exec` manages the execution of a regular expression.
-///
-/// In particular, this manages the various compiled forms of a single regular
-/// expression and the choice of which matching engine to use to execute a
-/// regular expression.
-#[derive(Debug)]
-pub struct Exec {
- /// All read only state.
- ro: Arc<ExecReadOnly>,
- /// A pool of reusable values for the various matching engines.
- ///
- /// Note that boxing this value is not strictly necessary, but it is an
- /// easy way to ensure that T does not bloat the stack sized used by a pool
- /// in the case where T is big. And this turns out to be the case at the
- /// time of writing for regex's use of this pool. At the time of writing,
- /// the size of a Regex on the stack is 856 bytes. Boxing this value
- /// reduces that size to 16 bytes.
- pool: Box<Pool<ProgramCache>>,
-}
-
-/// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This
-/// means it is no longer Sync, but we can now avoid the overhead of
-/// synchronization to fetch the cache.
-#[derive(Debug)]
-pub struct ExecNoSync<'c> {
- /// All read only state.
- ro: &'c Arc<ExecReadOnly>,
- /// Caches for the various matching engines.
- cache: PoolGuard<'c, ProgramCache>,
-}
-
-/// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8].
-#[derive(Debug)]
-pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>);
-
-/// `ExecReadOnly` comprises all read only state for a regex. Namely, all such
-/// state is determined at compile time and never changes during search.
-#[derive(Debug)]
-struct ExecReadOnly {
- /// The original regular expressions given by the caller to compile.
- res: Vec<String>,
- /// A compiled program that is used in the NFA simulation and backtracking.
- /// It can be byte-based or Unicode codepoint based.
- ///
- /// N.B. It is not possibly to make this byte-based from the public API.
- /// It is only used for testing byte based programs in the NFA simulations.
- nfa: Program,
- /// A compiled byte based program for DFA execution. This is only used
- /// if a DFA can be executed. (Currently, only word boundary assertions are
- /// not supported.) Note that this program contains an embedded `.*?`
- /// preceding the first capture group, unless the regex is anchored at the
- /// beginning.
- #[allow(dead_code)]
- dfa: Program,
- /// The same as above, except the program is reversed (and there is no
- /// preceding `.*?`). This is used by the DFA to find the starting location
- /// of matches.
- #[allow(dead_code)]
- dfa_reverse: Program,
- /// A set of suffix literals extracted from the regex.
- ///
- /// Prefix literals are stored on the `Program`, since they are used inside
- /// the matching engines.
- #[allow(dead_code)]
- suffixes: LiteralSearcher,
- /// An Aho-Corasick automaton with leftmost-first match semantics.
- ///
- /// This is only set when the entire regex is a simple unanchored
- /// alternation of literals. We could probably use it more circumstances,
- /// but this is already hacky enough in this architecture.
- ///
- /// N.B. We use u32 as a state ID representation under the assumption that
- /// if we were to exhaust the ID space, we probably would have long
- /// surpassed the compilation size limit.
- #[cfg(feature = "perf-literal")]
- ac: Option<AhoCorasick>,
- /// match_type encodes as much upfront knowledge about how we're going to
- /// execute a search as possible.
- match_type: MatchType,
-}
-
-/// Facilitates the construction of an executor by exposing various knobs
-/// to control how a regex is executed and what kinds of resources it's
-/// permitted to use.
-// `ExecBuilder` is only public via the `internal` module, so avoid deriving
-// `Debug`.
-#[allow(missing_debug_implementations)]
-pub struct ExecBuilder {
- options: RegexOptions,
- match_type: Option<MatchType>,
- bytes: bool,
- only_utf8: bool,
-}
-
-/// Parsed represents a set of parsed regular expressions and their detected
-/// literals.
-struct Parsed {
- exprs: Vec<Hir>,
- prefixes: literal::Seq,
- suffixes: literal::Seq,
- bytes: bool,
-}
-
-impl ExecBuilder {
- /// Create a regex execution builder.
- ///
- /// This uses default settings for everything except the regex itself,
- /// which must be provided. Further knobs can be set by calling methods,
- /// and then finally, `build` to actually create the executor.
- pub fn new(re: &str) -> Self {
- Self::new_many(&[re])
- }
-
- /// Like new, but compiles the union of the given regular expressions.
- ///
- /// Note that when compiling 2 or more regular expressions, capture groups
- /// are completely unsupported. (This means both `find` and `captures`
- /// won't work.)
- pub fn new_many<I, S>(res: I) -> Self
- where
- S: AsRef<str>,
- I: IntoIterator<Item = S>,
- {
- let mut opts = RegexOptions::default();
- opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect();
- Self::new_options(opts)
- }
-
- /// Create a regex execution builder.
- pub fn new_options(opts: RegexOptions) -> Self {
- ExecBuilder {
- options: opts,
- match_type: None,
- bytes: false,
- only_utf8: true,
- }
- }
-
- /// Set the matching engine to be automatically determined.
- ///
- /// This is the default state and will apply whatever optimizations are
- /// possible, such as running a DFA.
- ///
- /// This overrides whatever was previously set via the `nfa` or
- /// `bounded_backtracking` methods.
- pub fn automatic(mut self) -> Self {
- self.match_type = None;
- self
- }
-
- /// Sets the matching engine to use the NFA algorithm no matter what
- /// optimizations are possible.
- ///
- /// This overrides whatever was previously set via the `automatic` or
- /// `bounded_backtracking` methods.
- pub fn nfa(mut self) -> Self {
- self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM));
- self
- }
-
- /// Sets the matching engine to use a bounded backtracking engine no
- /// matter what optimizations are possible.
- ///
- /// One must use this with care, since the bounded backtracking engine
- /// uses memory proportion to `len(regex) * len(text)`.
- ///
- /// This overrides whatever was previously set via the `automatic` or
- /// `nfa` methods.
- pub fn bounded_backtracking(mut self) -> Self {
- self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack));
- self
- }
-
- /// Compiles byte based programs for use with the NFA matching engines.
- ///
- /// By default, the NFA engines match on Unicode scalar values. They can
- /// be made to use byte based programs instead. In general, the byte based
- /// programs are slower because of a less efficient encoding of character
- /// classes.
- ///
- /// Note that this does not impact DFA matching engines, which always
- /// execute on bytes.
- pub fn bytes(mut self, yes: bool) -> Self {
- self.bytes = yes;
- self
- }
-
- /// When disabled, the program compiled may match arbitrary bytes.
- ///
- /// When enabled (the default), all compiled programs exclusively match
- /// valid UTF-8 bytes.
- pub fn only_utf8(mut self, yes: bool) -> Self {
- self.only_utf8 = yes;
- self
- }
-
- /// Set the Unicode flag.
- pub fn unicode(mut self, yes: bool) -> Self {
- self.options.unicode = yes;
- self
- }
-
- /// Parse the current set of patterns into their AST and extract literals.
- fn parse(&self) -> Result<Parsed, Error> {
- let mut exprs = Vec::with_capacity(self.options.pats.len());
- let mut prefixes = Some(literal::Seq::empty());
- let mut suffixes = Some(literal::Seq::empty());
- let mut bytes = false;
- let is_set = self.options.pats.len() > 1;
- // If we're compiling a regex set and that set has any anchored
- // expressions, then disable all literal optimizations.
- for pat in &self.options.pats {
- let mut parser = ParserBuilder::new()
- .octal(self.options.octal)
- .case_insensitive(self.options.case_insensitive)
- .multi_line(self.options.multi_line)
- .dot_matches_new_line(self.options.dot_matches_new_line)
- .swap_greed(self.options.swap_greed)
- .ignore_whitespace(self.options.ignore_whitespace)
- .unicode(self.options.unicode)
- .utf8(self.only_utf8)
- .nest_limit(self.options.nest_limit)
- .build();
- let expr =
- parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?;
- let props = expr.properties();
- // This used to just check whether the HIR matched valid UTF-8
- // or not, but in regex-syntax 0.7, we changed our definition of
- // "matches valid UTF-8" to exclude zero-width matches. And in
- // particular, previously, we considered WordAsciiNegate (that
- // is '(?-u:\B)') to be capable of matching invalid UTF-8. Our
- // matcher engines were built under this assumption and fixing
- // them is not worth it with the imminent plan to switch over to
- // regex-automata. So for now, we retain the previous behavior by
- // just explicitly treating the presence of a negated ASCII word
- // boundary as forcing use to use a byte oriented automaton.
- bytes = bytes
- || !props.is_utf8()
- || props.look_set().contains(Look::WordAsciiNegate);
-
- if cfg!(feature = "perf-literal") {
- if !props.look_set_prefix().contains(Look::Start)
- && props.look_set().contains(Look::Start)
- {
- // Partial anchors unfortunately make it hard to use
- // prefixes, so disable them.
- prefixes = None;
- } else if is_set
- && props.look_set_prefix_any().contains(Look::Start)
- {
- // Regex sets with anchors do not go well with literal
- // optimizations.
- prefixes = None;
- } else if props.look_set_prefix_any().contains_word() {
- // The new literal extractor ignores look-around while
- // the old one refused to extract prefixes from regexes
- // that began with a \b. These old creaky regex internals
- // can't deal with it, so we drop it.
- prefixes = None;
- } else if props.look_set_prefix_any().contains(Look::StartLF) {
- // Similar to the reasoning for word boundaries, this old
- // regex engine can't handle literal prefixes with '(?m:^)'
- // at the beginning of a regex.
- prefixes = None;
- }
-
- if !props.look_set_suffix().contains(Look::End)
- && props.look_set().contains(Look::End)
- {
- // Partial anchors unfortunately make it hard to use
- // suffixes, so disable them.
- suffixes = None;
- } else if is_set
- && props.look_set_suffix_any().contains(Look::End)
- {
- // Regex sets with anchors do not go well with literal
- // optimizations.
- suffixes = None;
- } else if props.look_set_suffix_any().contains_word() {
- // See the prefix case for reasoning here.
- suffixes = None;
- } else if props.look_set_suffix_any().contains(Look::EndLF) {
- // See the prefix case for reasoning here.
- suffixes = None;
- }
-
- let (mut pres, mut suffs) =
- if prefixes.is_none() && suffixes.is_none() {
- (literal::Seq::infinite(), literal::Seq::infinite())
- } else {
- literal_analysis(&expr)
- };
- // These old creaky regex internals can't handle cases where
- // the literal sequences are exact but there are look-around
- // assertions. So we make sure the sequences are inexact if
- // there are look-around assertions anywhere. This forces the
- // regex engines to run instead of assuming that a literal
- // match implies an overall match.
- if !props.look_set().is_empty() {
- pres.make_inexact();
- suffs.make_inexact();
- }
- prefixes = prefixes.and_then(|mut prefixes| {
- prefixes.union(&mut pres);
- Some(prefixes)
- });
- suffixes = suffixes.and_then(|mut suffixes| {
- suffixes.union(&mut suffs);
- Some(suffixes)
- });
- }
- exprs.push(expr);
- }
- Ok(Parsed {
- exprs,
- prefixes: prefixes.unwrap_or_else(literal::Seq::empty),
- suffixes: suffixes.unwrap_or_else(literal::Seq::empty),
- bytes,
- })
- }
-
- /// Build an executor that can run a regular expression.
- pub fn build(self) -> Result<Exec, Error> {
- // Special case when we have no patterns to compile.
- // This can happen when compiling a regex set.
- if self.options.pats.is_empty() {
- let ro = Arc::new(ExecReadOnly {
- res: vec![],
- nfa: Program::new(),
- dfa: Program::new(),
- dfa_reverse: Program::new(),
- suffixes: LiteralSearcher::empty(),
- #[cfg(feature = "perf-literal")]
- ac: None,
- match_type: MatchType::Nothing,
- });
- let pool = ExecReadOnly::new_pool(&ro);
- return Ok(Exec { ro, pool });
- }
- let parsed = self.parse()?;
- let mut nfa = Compiler::new()
- .size_limit(self.options.size_limit)
- .bytes(self.bytes || parsed.bytes)
- .only_utf8(self.only_utf8)
- .compile(&parsed.exprs)?;
- let mut dfa = Compiler::new()
- .size_limit(self.options.size_limit)
- .dfa(true)
- .only_utf8(self.only_utf8)
- .compile(&parsed.exprs)?;
- let mut dfa_reverse = Compiler::new()
- .size_limit(self.options.size_limit)
- .dfa(true)
- .only_utf8(self.only_utf8)
- .reverse(true)
- .compile(&parsed.exprs)?;
-
- #[cfg(feature = "perf-literal")]
- let ac = self.build_aho_corasick(&parsed);
- nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes);
- dfa.prefixes = nfa.prefixes.clone();
- dfa.dfa_size_limit = self.options.dfa_size_limit;
- dfa_reverse.dfa_size_limit = self.options.dfa_size_limit;
-
- let mut ro = ExecReadOnly {
- res: self.options.pats,
- nfa,
- dfa,
- dfa_reverse,
- suffixes: LiteralSearcher::suffixes(parsed.suffixes),
- #[cfg(feature = "perf-literal")]
- ac,
- match_type: MatchType::Nothing,
- };
- ro.match_type = ro.choose_match_type(self.match_type);
-
- let ro = Arc::new(ro);
- let pool = ExecReadOnly::new_pool(&ro);
- Ok(Exec { ro, pool })
- }
-
- #[cfg(feature = "perf-literal")]
- fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick> {
- if parsed.exprs.len() != 1 {
- return None;
- }
- let lits = match alternation_literals(&parsed.exprs[0]) {
- None => return None,
- Some(lits) => lits,
- };
- // If we have a small number of literals, then let Teddy handle
- // things (see literal/mod.rs).
- if lits.len() <= 32 {
- return None;
- }
- Some(
- AhoCorasick::builder()
- .match_kind(MatchKind::LeftmostFirst)
- .build(&lits)
- // This should never happen because we'd long exceed the
- // compilation limit for regexes first.
- .expect("AC automaton too big"),
- )
- }
-}
-
-impl<'c> RegularExpression for ExecNoSyncStr<'c> {
- type Text = str;
-
- fn slots_len(&self) -> usize {
- self.0.slots_len()
- }
-
- fn next_after_empty(&self, text: &str, i: usize) -> usize {
- next_utf8(text.as_bytes(), i)
- }
-
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> {
- self.0.shortest_match_at(text.as_bytes(), start)
- }
-
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn is_match_at(&self, text: &str, start: usize) -> bool {
- self.0.is_match_at(text.as_bytes(), start)
- }
-
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
- self.0.find_at(text.as_bytes(), start)
- }
-
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn captures_read_at(
- &self,
- locs: &mut Locations,
- text: &str,
- start: usize,
- ) -> Option<(usize, usize)> {
- self.0.captures_read_at(locs, text.as_bytes(), start)
- }
-}
-
-impl<'c> RegularExpression for ExecNoSync<'c> {
- type Text = [u8];
-
- /// Returns the number of capture slots in the regular expression. (There
- /// are two slots for every capture group, corresponding to possibly empty
- /// start and end locations of the capture.)
- fn slots_len(&self) -> usize {
- self.ro.nfa.captures.len() * 2
- }
-
- fn next_after_empty(&self, _text: &[u8], i: usize) -> usize {
- i + 1
- }
-
- /// Returns the end of a match location, possibly occurring before the
- /// end location of the correct leftmost-first match.
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> {
- if !self.is_anchor_end_match(text) {
- return None;
- }
- match self.ro.match_type {
- #[cfg(feature = "perf-literal")]
- MatchType::Literal(ty) => {
- self.find_literals(ty, text, start).map(|(_, e)| e)
- }
- #[cfg(feature = "perf-dfa")]
- MatchType::Dfa | MatchType::DfaMany => {
- match self.shortest_dfa(text, start) {
- dfa::Result::Match(end) => Some(end),
- dfa::Result::NoMatch(_) => None,
- dfa::Result::Quit => self.shortest_nfa(text, start),
- }
- }
- #[cfg(feature = "perf-dfa")]
- MatchType::DfaAnchoredReverse => {
- match dfa::Fsm::reverse(
- &self.ro.dfa_reverse,
- self.cache.value(),
- true,
- &text[start..],
- text.len() - start,
- ) {
- dfa::Result::Match(_) => Some(text.len()),
- dfa::Result::NoMatch(_) => None,
- dfa::Result::Quit => self.shortest_nfa(text, start),
- }
- }
- #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
- MatchType::DfaSuffix => {
- match self.shortest_dfa_reverse_suffix(text, start) {
- dfa::Result::Match(e) => Some(e),
- dfa::Result::NoMatch(_) => None,
- dfa::Result::Quit => self.shortest_nfa(text, start),
- }
- }
- MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start),
- MatchType::Nothing => None,
- }
- }
-
- /// Returns true if and only if the regex matches text.
- ///
- /// For single regular expressions, this is equivalent to calling
- /// shortest_match(...).is_some().
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn is_match_at(&self, text: &[u8], start: usize) -> bool {
- if !self.is_anchor_end_match(text) {
- return false;
- }
- // We need to do this dance because shortest_match relies on the NFA
- // filling in captures[1], but a RegexSet has no captures. In other
- // words, a RegexSet can't (currently) use shortest_match. ---AG
- match self.ro.match_type {
- #[cfg(feature = "perf-literal")]
- MatchType::Literal(ty) => {
- self.find_literals(ty, text, start).is_some()
- }
- #[cfg(feature = "perf-dfa")]
- MatchType::Dfa | MatchType::DfaMany => {
- match self.shortest_dfa(text, start) {
- dfa::Result::Match(_) => true,
- dfa::Result::NoMatch(_) => false,
- dfa::Result::Quit => self.match_nfa(text, start),
- }
- }
- #[cfg(feature = "perf-dfa")]
- MatchType::DfaAnchoredReverse => {
- match dfa::Fsm::reverse(
- &self.ro.dfa_reverse,
- self.cache.value(),
- true,
- &text[start..],
- text.len() - start,
- ) {
- dfa::Result::Match(_) => true,
- dfa::Result::NoMatch(_) => false,
- dfa::Result::Quit => self.match_nfa(text, start),
- }
- }
- #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
- MatchType::DfaSuffix => {
- match self.shortest_dfa_reverse_suffix(text, start) {
- dfa::Result::Match(_) => true,
- dfa::Result::NoMatch(_) => false,
- dfa::Result::Quit => self.match_nfa(text, start),
- }
- }
- MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start),
- MatchType::Nothing => false,
- }
- }
-
- /// Finds the start and end location of the leftmost-first match, starting
- /// at the given location.
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> {
- if !self.is_anchor_end_match(text) {
- return None;
- }
- match self.ro.match_type {
- #[cfg(feature = "perf-literal")]
- MatchType::Literal(ty) => self.find_literals(ty, text, start),
- #[cfg(feature = "perf-dfa")]
- MatchType::Dfa => match self.find_dfa_forward(text, start) {
- dfa::Result::Match((s, e)) => Some((s, e)),
- dfa::Result::NoMatch(_) => None,
- dfa::Result::Quit => {
- self.find_nfa(MatchNfaType::Auto, text, start)
- }
- },
- #[cfg(feature = "perf-dfa")]
- MatchType::DfaAnchoredReverse => {
- match self.find_dfa_anchored_reverse(text, start) {
- dfa::Result::Match((s, e)) => Some((s, e)),
- dfa::Result::NoMatch(_) => None,
- dfa::Result::Quit => {
- self.find_nfa(MatchNfaType::Auto, text, start)
- }
- }
- }
- #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
- MatchType::DfaSuffix => {
- match self.find_dfa_reverse_suffix(text, start) {
- dfa::Result::Match((s, e)) => Some((s, e)),
- dfa::Result::NoMatch(_) => None,
- dfa::Result::Quit => {
- self.find_nfa(MatchNfaType::Auto, text, start)
- }
- }
- }
- MatchType::Nfa(ty) => self.find_nfa(ty, text, start),
- MatchType::Nothing => None,
- #[cfg(feature = "perf-dfa")]
- MatchType::DfaMany => {
- unreachable!("BUG: RegexSet cannot be used with find")
- }
- }
- }
-
- /// Finds the start and end location of the leftmost-first match and also
- /// fills in all matching capture groups.
- ///
- /// The number of capture slots given should be equal to the total number
- /// of capture slots in the compiled program.
- ///
- /// Note that the first two slots always correspond to the start and end
- /// locations of the overall match.
- fn captures_read_at(
- &self,
- locs: &mut Locations,
- text: &[u8],
- start: usize,
- ) -> Option<(usize, usize)> {
- let slots = locs.as_slots();
- for slot in slots.iter_mut() {
- *slot = None;
- }
- // If the caller unnecessarily uses this, then we try to save them
- // from themselves.
- match slots.len() {
- 0 => return self.find_at(text, start),
- 2 => {
- return self.find_at(text, start).map(|(s, e)| {
- slots[0] = Some(s);
- slots[1] = Some(e);
- (s, e)
- });
- }
- _ => {} // fallthrough
- }
- if !self.is_anchor_end_match(text) {
- return None;
- }
- match self.ro.match_type {
- #[cfg(feature = "perf-literal")]
- MatchType::Literal(ty) => {
- self.find_literals(ty, text, start).and_then(|(s, e)| {
- self.captures_nfa_type(
- MatchNfaType::Auto,
- slots,
- text,
- s,
- e,
- )
- })
- }
- #[cfg(feature = "perf-dfa")]
- MatchType::Dfa => {
- if self.ro.nfa.is_anchored_start {
- self.captures_nfa(slots, text, start)
- } else {
- match self.find_dfa_forward(text, start) {
- dfa::Result::Match((s, e)) => self.captures_nfa_type(
- MatchNfaType::Auto,
- slots,
- text,
- s,
- e,
- ),
- dfa::Result::NoMatch(_) => None,
- dfa::Result::Quit => {
- self.captures_nfa(slots, text, start)
- }
- }
- }
- }
- #[cfg(feature = "perf-dfa")]
- MatchType::DfaAnchoredReverse => {
- match self.find_dfa_anchored_reverse(text, start) {
- dfa::Result::Match((s, e)) => self.captures_nfa_type(
- MatchNfaType::Auto,
- slots,
- text,
- s,
- e,
- ),
- dfa::Result::NoMatch(_) => None,
- dfa::Result::Quit => self.captures_nfa(slots, text, start),
- }
- }
- #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
- MatchType::DfaSuffix => {
- match self.find_dfa_reverse_suffix(text, start) {
- dfa::Result::Match((s, e)) => self.captures_nfa_type(
- MatchNfaType::Auto,
- slots,
- text,
- s,
- e,
- ),
- dfa::Result::NoMatch(_) => None,
- dfa::Result::Quit => self.captures_nfa(slots, text, start),
- }
- }
- MatchType::Nfa(ty) => {
- self.captures_nfa_type(ty, slots, text, start, text.len())
- }
- MatchType::Nothing => None,
- #[cfg(feature = "perf-dfa")]
- MatchType::DfaMany => {
- unreachable!("BUG: RegexSet cannot be used with captures")
- }
- }
- }
-}
-
-impl<'c> ExecNoSync<'c> {
- /// Finds the leftmost-first match using only literal search.
- #[cfg(feature = "perf-literal")]
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn find_literals(
- &self,
- ty: MatchLiteralType,
- text: &[u8],
- start: usize,
- ) -> Option<(usize, usize)> {
- use self::MatchLiteralType::*;
- match ty {
- Unanchored => {
- let lits = &self.ro.nfa.prefixes;
- lits.find(&text[start..]).map(|(s, e)| (start + s, start + e))
- }
- AnchoredStart => {
- let lits = &self.ro.nfa.prefixes;
- if start == 0 || !self.ro.nfa.is_anchored_start {
- lits.find_start(&text[start..])
- .map(|(s, e)| (start + s, start + e))
- } else {
- None
- }
- }
- AnchoredEnd => {
- let lits = &self.ro.suffixes;
- lits.find_end(&text[start..])
- .map(|(s, e)| (start + s, start + e))
- }
- AhoCorasick => self
- .ro
- .ac
- .as_ref()
- .unwrap()
- .find(&text[start..])
- .map(|m| (start + m.start(), start + m.end())),
- }
- }
-
- /// Finds the leftmost-first match (start and end) using only the DFA.
- ///
- /// If the result returned indicates that the DFA quit, then another
- /// matching engine should be used.
- #[cfg(feature = "perf-dfa")]
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn find_dfa_forward(
- &self,
- text: &[u8],
- start: usize,
- ) -> dfa::Result<(usize, usize)> {
- use crate::dfa::Result::*;
- let end = match dfa::Fsm::forward(
- &self.ro.dfa,
- self.cache.value(),
- false,
- text,
- start,
- ) {
- NoMatch(i) => return NoMatch(i),
- Quit => return Quit,
- Match(end) if start == end => return Match((start, start)),
- Match(end) => end,
- };
- // Now run the DFA in reverse to find the start of the match.
- match dfa::Fsm::reverse(
- &self.ro.dfa_reverse,
- self.cache.value(),
- false,
- &text[start..],
- end - start,
- ) {
- Match(s) => Match((start + s, end)),
- NoMatch(i) => NoMatch(i),
- Quit => Quit,
- }
- }
-
- /// Finds the leftmost-first match (start and end) using only the DFA,
- /// but assumes the regex is anchored at the end and therefore starts at
- /// the end of the regex and matches in reverse.
- ///
- /// If the result returned indicates that the DFA quit, then another
- /// matching engine should be used.
- #[cfg(feature = "perf-dfa")]
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn find_dfa_anchored_reverse(
- &self,
- text: &[u8],
- start: usize,
- ) -> dfa::Result<(usize, usize)> {
- use crate::dfa::Result::*;
- match dfa::Fsm::reverse(
- &self.ro.dfa_reverse,
- self.cache.value(),
- false,
- &text[start..],
- text.len() - start,
- ) {
- Match(s) => Match((start + s, text.len())),
- NoMatch(i) => NoMatch(i),
- Quit => Quit,
- }
- }
-
- /// Finds the end of the shortest match using only the DFA.
- #[cfg(feature = "perf-dfa")]
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> {
- dfa::Fsm::forward(&self.ro.dfa, self.cache.value(), true, text, start)
- }
-
- /// Finds the end of the shortest match using only the DFA by scanning for
- /// suffix literals.
- #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn shortest_dfa_reverse_suffix(
- &self,
- text: &[u8],
- start: usize,
- ) -> dfa::Result<usize> {
- match self.exec_dfa_reverse_suffix(text, start) {
- None => self.shortest_dfa(text, start),
- Some(r) => r.map(|(_, end)| end),
- }
- }
-
- /// Finds the end of the shortest match using only the DFA by scanning for
- /// suffix literals. It also reports the start of the match.
- ///
- /// Note that if None is returned, then the optimization gave up to avoid
- /// worst case quadratic behavior. A forward scanning DFA should be tried
- /// next.
- ///
- /// If a match is returned and the full leftmost-first match is desired,
- /// then a forward scan starting from the beginning of the match must be
- /// done.
- ///
- /// If the result returned indicates that the DFA quit, then another
- /// matching engine should be used.
- #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn exec_dfa_reverse_suffix(
- &self,
- text: &[u8],
- original_start: usize,
- ) -> Option<dfa::Result<(usize, usize)>> {
- use crate::dfa::Result::*;
-
- let lcs = self.ro.suffixes.lcs();
- debug_assert!(lcs.len() >= 1);
- let mut start = original_start;
- let mut end = start;
- let mut last_literal = start;
- while end <= text.len() {
- last_literal += match lcs.find(&text[last_literal..]) {
- None => return Some(NoMatch(text.len())),
- Some(i) => i,
- };
- end = last_literal + lcs.len();
- match dfa::Fsm::reverse(
- &self.ro.dfa_reverse,
- self.cache.value(),
- false,
- &text[start..end],
- end - start,
- ) {
- Match(0) | NoMatch(0) => return None,
- Match(i) => return Some(Match((start + i, end))),
- NoMatch(i) => {
- start += i;
- last_literal += 1;
- continue;
- }
- Quit => return Some(Quit),
- };
- }
- Some(NoMatch(text.len()))
- }
-
- /// Finds the leftmost-first match (start and end) using only the DFA
- /// by scanning for suffix literals.
- ///
- /// If the result returned indicates that the DFA quit, then another
- /// matching engine should be used.
- #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn find_dfa_reverse_suffix(
- &self,
- text: &[u8],
- start: usize,
- ) -> dfa::Result<(usize, usize)> {
- use crate::dfa::Result::*;
-
- let match_start = match self.exec_dfa_reverse_suffix(text, start) {
- None => return self.find_dfa_forward(text, start),
- Some(Match((start, _))) => start,
- Some(r) => return r,
- };
- // At this point, we've found a match. The only way to quit now
- // without a match is if the DFA gives up (seems unlikely).
- //
- // Now run the DFA forwards to find the proper end of the match.
- // (The suffix literal match can only indicate the earliest
- // possible end location, which may appear before the end of the
- // leftmost-first match.)
- match dfa::Fsm::forward(
- &self.ro.dfa,
- self.cache.value(),
- false,
- text,
- match_start,
- ) {
- NoMatch(_) => panic!("BUG: reverse match implies forward match"),
- Quit => Quit,
- Match(e) => Match((match_start, e)),
- }
- }
-
- /// Executes the NFA engine to return whether there is a match or not.
- ///
- /// Ideally, we could use shortest_nfa(...).is_some() and get the same
- /// performance characteristics, but regex sets don't have captures, which
- /// shortest_nfa depends on.
- #[cfg(feature = "perf-dfa")]
- fn match_nfa(&self, text: &[u8], start: usize) -> bool {
- self.match_nfa_type(MatchNfaType::Auto, text, start)
- }
-
- /// Like match_nfa, but allows specification of the type of NFA engine.
- fn match_nfa_type(
- &self,
- ty: MatchNfaType,
- text: &[u8],
- start: usize,
- ) -> bool {
- self.exec_nfa(
- ty,
- &mut [false],
- &mut [],
- true,
- false,
- text,
- start,
- text.len(),
- )
- }
-
- /// Finds the shortest match using an NFA.
- #[cfg(feature = "perf-dfa")]
- fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> {
- self.shortest_nfa_type(MatchNfaType::Auto, text, start)
- }
-
- /// Like shortest_nfa, but allows specification of the type of NFA engine.
- fn shortest_nfa_type(
- &self,
- ty: MatchNfaType,
- text: &[u8],
- start: usize,
- ) -> Option<usize> {
- let mut slots = [None, None];
- if self.exec_nfa(
- ty,
- &mut [false],
- &mut slots,
- true,
- true,
- text,
- start,
- text.len(),
- ) {
- slots[1]
- } else {
- None
- }
- }
-
- /// Like find, but executes an NFA engine.
- fn find_nfa(
- &self,
- ty: MatchNfaType,
- text: &[u8],
- start: usize,
- ) -> Option<(usize, usize)> {
- let mut slots = [None, None];
- if self.exec_nfa(
- ty,
- &mut [false],
- &mut slots,
- false,
- false,
- text,
- start,
- text.len(),
- ) {
- match (slots[0], slots[1]) {
- (Some(s), Some(e)) => Some((s, e)),
- _ => None,
- }
- } else {
- None
- }
- }
-
- /// Like find_nfa, but fills in captures.
- ///
- /// `slots` should have length equal to `2 * nfa.captures.len()`.
- #[cfg(feature = "perf-dfa")]
- fn captures_nfa(
- &self,
- slots: &mut [Slot],
- text: &[u8],
- start: usize,
- ) -> Option<(usize, usize)> {
- self.captures_nfa_type(
- MatchNfaType::Auto,
- slots,
- text,
- start,
- text.len(),
- )
- }
-
- /// Like captures_nfa, but allows specification of type of NFA engine.
- fn captures_nfa_type(
- &self,
- ty: MatchNfaType,
- slots: &mut [Slot],
- text: &[u8],
- start: usize,
- end: usize,
- ) -> Option<(usize, usize)> {
- if self.exec_nfa(
- ty,
- &mut [false],
- slots,
- false,
- false,
- text,
- start,
- end,
- ) {
- match (slots[0], slots[1]) {
- (Some(s), Some(e)) => Some((s, e)),
- _ => None,
- }
- } else {
- None
- }
- }
-
- fn exec_nfa(
- &self,
- mut ty: MatchNfaType,
- matches: &mut [bool],
- slots: &mut [Slot],
- quit_after_match: bool,
- quit_after_match_with_pos: bool,
- text: &[u8],
- start: usize,
- end: usize,
- ) -> bool {
- use self::MatchNfaType::*;
- if let Auto = ty {
- if backtrack::should_exec(self.ro.nfa.len(), text.len()) {
- ty = Backtrack;
- } else {
- ty = PikeVM;
- }
- }
- // The backtracker can't return the shortest match position as it is
- // implemented today. So if someone calls `shortest_match` and we need
- // to run an NFA, then use the PikeVM.
- if quit_after_match_with_pos || ty == PikeVM {
- self.exec_pikevm(
- matches,
- slots,
- quit_after_match,
- text,
- start,
- end,
- )
- } else {
- self.exec_backtrack(matches, slots, text, start, end)
- }
- }
-
- /// Always run the NFA algorithm.
- fn exec_pikevm(
- &self,
- matches: &mut [bool],
- slots: &mut [Slot],
- quit_after_match: bool,
- text: &[u8],
- start: usize,
- end: usize,
- ) -> bool {
- if self.ro.nfa.uses_bytes() {
- pikevm::Fsm::exec(
- &self.ro.nfa,
- self.cache.value(),
- matches,
- slots,
- quit_after_match,
- ByteInput::new(text, self.ro.nfa.only_utf8),
- start,
- end,
- )
- } else {
- pikevm::Fsm::exec(
- &self.ro.nfa,
- self.cache.value(),
- matches,
- slots,
- quit_after_match,
- CharInput::new(text),
- start,
- end,
- )
- }
- }
-
- /// Always runs the NFA using bounded backtracking.
- fn exec_backtrack(
- &self,
- matches: &mut [bool],
- slots: &mut [Slot],
- text: &[u8],
- start: usize,
- end: usize,
- ) -> bool {
- if self.ro.nfa.uses_bytes() {
- backtrack::Bounded::exec(
- &self.ro.nfa,
- self.cache.value(),
- matches,
- slots,
- ByteInput::new(text, self.ro.nfa.only_utf8),
- start,
- end,
- )
- } else {
- backtrack::Bounded::exec(
- &self.ro.nfa,
- self.cache.value(),
- matches,
- slots,
- CharInput::new(text),
- start,
- end,
- )
- }
- }
-
- /// Finds which regular expressions match the given text.
- ///
- /// `matches` should have length equal to the number of regexes being
- /// searched.
- ///
- /// This is only useful when one wants to know which regexes in a set
- /// match some text.
- pub fn many_matches_at(
- &self,
- matches: &mut [bool],
- text: &[u8],
- start: usize,
- ) -> bool {
- use self::MatchType::*;
- if !self.is_anchor_end_match(text) {
- return false;
- }
- match self.ro.match_type {
- #[cfg(feature = "perf-literal")]
- Literal(ty) => {
- debug_assert_eq!(matches.len(), 1);
- matches[0] = self.find_literals(ty, text, start).is_some();
- matches[0]
- }
- #[cfg(feature = "perf-dfa")]
- Dfa | DfaAnchoredReverse | DfaMany => {
- match dfa::Fsm::forward_many(
- &self.ro.dfa,
- self.cache.value(),
- matches,
- text,
- start,
- ) {
- dfa::Result::Match(_) => true,
- dfa::Result::NoMatch(_) => false,
- dfa::Result::Quit => self.exec_nfa(
- MatchNfaType::Auto,
- matches,
- &mut [],
- false,
- false,
- text,
- start,
- text.len(),
- ),
- }
- }
- #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
- DfaSuffix => {
- match dfa::Fsm::forward_many(
- &self.ro.dfa,
- self.cache.value(),
- matches,
- text,
- start,
- ) {
- dfa::Result::Match(_) => true,
- dfa::Result::NoMatch(_) => false,
- dfa::Result::Quit => self.exec_nfa(
- MatchNfaType::Auto,
- matches,
- &mut [],
- false,
- false,
- text,
- start,
- text.len(),
- ),
- }
- }
- Nfa(ty) => self.exec_nfa(
- ty,
- matches,
- &mut [],
- false,
- false,
- text,
- start,
- text.len(),
- ),
- Nothing => false,
- }
- }
-
- #[cfg_attr(feature = "perf-inline", inline(always))]
- fn is_anchor_end_match(&self, text: &[u8]) -> bool {
- #[cfg(not(feature = "perf-literal"))]
- fn imp(_: &ExecReadOnly, _: &[u8]) -> bool {
- true
- }
-
- #[cfg(feature = "perf-literal")]
- fn imp(ro: &ExecReadOnly, text: &[u8]) -> bool {
- // Only do this check if the haystack is big (>1MB).
- if text.len() > (1 << 20) && ro.nfa.is_anchored_end {
- let lcs = ro.suffixes.lcs();
- if lcs.len() >= 1 && !lcs.is_suffix(text) {
- return false;
- }
- }
- true
- }
-
- imp(&self.ro, text)
- }
-
- pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
- &self.ro.nfa.capture_name_idx
- }
-}
-
-impl<'c> ExecNoSyncStr<'c> {
- pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
- self.0.capture_name_idx()
- }
-}
-
-impl Exec {
- /// Get a searcher that isn't Sync.
- #[cfg_attr(feature = "perf-inline", inline(always))]
- pub fn searcher(&self) -> ExecNoSync<'_> {
- ExecNoSync {
- ro: &self.ro, // a clone is too expensive here! (and not needed)
- cache: self.pool.get(),
- }
- }
-
- /// Get a searcher that isn't Sync and can match on &str.
- #[cfg_attr(feature = "perf-inline", inline(always))]
- pub fn searcher_str(&self) -> ExecNoSyncStr<'_> {
- ExecNoSyncStr(self.searcher())
- }
-
- /// Build a Regex from this executor.
- pub fn into_regex(self) -> re_unicode::Regex {
- re_unicode::Regex::from(self)
- }
-
- /// Build a RegexSet from this executor.
- pub fn into_regex_set(self) -> re_set::unicode::RegexSet {
- re_set::unicode::RegexSet::from(self)
- }
-
- /// Build a Regex from this executor that can match arbitrary bytes.
- pub fn into_byte_regex(self) -> re_bytes::Regex {
- re_bytes::Regex::from(self)
- }
-
- /// Build a RegexSet from this executor that can match arbitrary bytes.
- pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet {
- re_set::bytes::RegexSet::from(self)
- }
-
- /// The original regular expressions given by the caller that were
- /// compiled.
- pub fn regex_strings(&self) -> &[String] {
- &self.ro.res
- }
-
- /// Return a slice of capture names.
- ///
- /// Any capture that isn't named is None.
- pub fn capture_names(&self) -> &[Option<String>] {
- &self.ro.nfa.captures
- }
-
- /// Return a reference to named groups mapping (from group name to
- /// group position).
- pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
- &self.ro.nfa.capture_name_idx
- }
-
- /// If the number of capture groups in every match is always the same, then
- /// return that number. Otherwise return `None`.
- pub fn static_captures_len(&self) -> Option<usize> {
- self.ro.nfa.static_captures_len
- }
-}
-
-impl Clone for Exec {
- fn clone(&self) -> Exec {
- let pool = ExecReadOnly::new_pool(&self.ro);
- Exec { ro: self.ro.clone(), pool }
- }
-}
-
-impl ExecReadOnly {
- fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType {
- if let Some(MatchType::Nfa(_)) = hint {
- return hint.unwrap();
- }
- // If the NFA is empty, then we'll never match anything.
- if self.nfa.insts.is_empty() {
- return MatchType::Nothing;
- }
- if let Some(literalty) = self.choose_literal_match_type() {
- return literalty;
- }
- if let Some(dfaty) = self.choose_dfa_match_type() {
- return dfaty;
- }
- // We're so totally hosed.
- MatchType::Nfa(MatchNfaType::Auto)
- }
-
- /// If a plain literal scan can be used, then a corresponding literal
- /// search type is returned.
- fn choose_literal_match_type(&self) -> Option<MatchType> {
- #[cfg(not(feature = "perf-literal"))]
- fn imp(_: &ExecReadOnly) -> Option<MatchType> {
- None
- }
-
- #[cfg(feature = "perf-literal")]
- fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
- // If our set of prefixes is complete, then we can use it to find
- // a match in lieu of a regex engine. This doesn't quite work well
- // in the presence of multiple regexes, so only do it when there's
- // one.
- //
- // TODO(burntsushi): Also, don't try to match literals if the regex
- // is partially anchored. We could technically do it, but we'd need
- // to create two sets of literals: all of them and then the subset
- // that aren't anchored. We would then only search for all of them
- // when at the beginning of the input and use the subset in all
- // other cases.
- if ro.res.len() != 1 {
- return None;
- }
- if ro.ac.is_some() {
- return Some(MatchType::Literal(
- MatchLiteralType::AhoCorasick,
- ));
- }
- if ro.nfa.prefixes.complete() {
- return if ro.nfa.is_anchored_start {
- Some(MatchType::Literal(MatchLiteralType::AnchoredStart))
- } else {
- Some(MatchType::Literal(MatchLiteralType::Unanchored))
- };
- }
- if ro.suffixes.complete() {
- return if ro.nfa.is_anchored_end {
- Some(MatchType::Literal(MatchLiteralType::AnchoredEnd))
- } else {
- // This case shouldn't happen. When the regex isn't
- // anchored, then complete prefixes should imply complete
- // suffixes.
- //
- // The above is wrong! This case can happen. While
- // complete prefixes should imply complete suffixes
- // here, that doesn't necessarily mean we have a useful
- // prefix matcher! It could be the case that the literal
- // searcher decided the prefixes---even though they are
- // "complete"---weren't good enough and thus created an
- // empty matcher. If that happens and we return Unanchored
- // here, then we'll end up using that matcher, which is
- // very bad because it matches at every position. So...
- // return None.
- None
- };
- }
- None
- }
-
- imp(self)
- }
-
- /// If a DFA scan can be used, then choose the appropriate DFA strategy.
- fn choose_dfa_match_type(&self) -> Option<MatchType> {
- #[cfg(not(feature = "perf-dfa"))]
- fn imp(_: &ExecReadOnly) -> Option<MatchType> {
- None
- }
-
- #[cfg(feature = "perf-dfa")]
- fn imp(ro: &ExecReadOnly) -> Option<MatchType> {
- if !dfa::can_exec(&ro.dfa) {
- return None;
- }
- // Regex sets require a slightly specialized path.
- if ro.res.len() >= 2 {
- return Some(MatchType::DfaMany);
- }
- // If the regex is anchored at the end but not the start, then
- // just match in reverse from the end of the haystack.
- if !ro.nfa.is_anchored_start && ro.nfa.is_anchored_end {
- return Some(MatchType::DfaAnchoredReverse);
- }
- #[cfg(feature = "perf-literal")]
- {
- // If there's a longish suffix literal, then it might be faster
- // to look for that first.
- if ro.should_suffix_scan() {
- return Some(MatchType::DfaSuffix);
- }
- }
- // Fall back to your garden variety forward searching lazy DFA.
- Some(MatchType::Dfa)
- }
-
- imp(self)
- }
-
- /// Returns true if the program is amenable to suffix scanning.
- ///
- /// When this is true, as a heuristic, we assume it is OK to quickly scan
- /// for suffix literals and then do a *reverse* DFA match from any matches
- /// produced by the literal scan. (And then followed by a forward DFA
- /// search, since the previously found suffix literal maybe not actually be
- /// the end of a match.)
- ///
- /// This is a bit of a specialized optimization, but can result in pretty
- /// big performance wins if 1) there are no prefix literals and 2) the
- /// suffix literals are pretty rare in the text. (1) is obviously easy to
- /// account for but (2) is harder. As a proxy, we assume that longer
- /// strings are generally rarer, so we only enable this optimization when
- /// we have a meaty suffix.
- #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
- fn should_suffix_scan(&self) -> bool {
- if self.suffixes.is_empty() {
- return false;
- }
- let lcs_len = self.suffixes.lcs().char_len();
- lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len()
- }
-
- fn new_pool(ro: &Arc<ExecReadOnly>) -> Box<Pool<ProgramCache>> {
- let ro = ro.clone();
- Box::new(Pool::new(Box::new(move || {
- AssertUnwindSafe(RefCell::new(ProgramCacheInner::new(&ro)))
- })))
- }
-}
-
-#[derive(Clone, Copy, Debug)]
-enum MatchType {
- /// A single or multiple literal search. This is only used when the regex
- /// can be decomposed into a literal search.
- #[cfg(feature = "perf-literal")]
- Literal(MatchLiteralType),
- /// A normal DFA search.
- #[cfg(feature = "perf-dfa")]
- Dfa,
- /// A reverse DFA search starting from the end of a haystack.
- #[cfg(feature = "perf-dfa")]
- DfaAnchoredReverse,
- /// A reverse DFA search with suffix literal scanning.
- #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))]
- DfaSuffix,
- /// Use the DFA on two or more regular expressions.
- #[cfg(feature = "perf-dfa")]
- DfaMany,
- /// An NFA variant.
- Nfa(MatchNfaType),
- /// No match is ever possible, so don't ever try to search.
- Nothing,
-}
-
-#[derive(Clone, Copy, Debug)]
-#[cfg(feature = "perf-literal")]
-enum MatchLiteralType {
- /// Match literals anywhere in text.
- Unanchored,
- /// Match literals only at the start of text.
- AnchoredStart,
- /// Match literals only at the end of text.
- AnchoredEnd,
- /// Use an Aho-Corasick automaton. This requires `ac` to be Some on
- /// ExecReadOnly.
- AhoCorasick,
-}
-
-#[derive(Clone, Copy, Debug, Eq, PartialEq)]
-enum MatchNfaType {
- /// Choose between Backtrack and PikeVM.
- Auto,
- /// NFA bounded backtracking.
- ///
- /// (This is only set by tests, since it never makes sense to always want
- /// backtracking.)
- Backtrack,
- /// The Pike VM.
- ///
- /// (This is only set by tests, since it never makes sense to always want
- /// the Pike VM.)
- PikeVM,
-}
-
-/// `ProgramCache` maintains reusable allocations for each matching engine
-/// available to a particular program.
-///
-/// We declare this as unwind safe since it's a cache that's only used for
-/// performance purposes. If a panic occurs, it is (or should be) always safe
-/// to continue using the same regex object.
-pub type ProgramCache = AssertUnwindSafe<RefCell<ProgramCacheInner>>;
-
-#[derive(Debug)]
-pub struct ProgramCacheInner {
- pub pikevm: pikevm::Cache,
- pub backtrack: backtrack::Cache,
- #[cfg(feature = "perf-dfa")]
- pub dfa: dfa::Cache,
- #[cfg(feature = "perf-dfa")]
- pub dfa_reverse: dfa::Cache,
-}
-
-impl ProgramCacheInner {
- fn new(ro: &ExecReadOnly) -> Self {
- ProgramCacheInner {
- pikevm: pikevm::Cache::new(&ro.nfa),
- backtrack: backtrack::Cache::new(&ro.nfa),
- #[cfg(feature = "perf-dfa")]
- dfa: dfa::Cache::new(&ro.dfa),
- #[cfg(feature = "perf-dfa")]
- dfa_reverse: dfa::Cache::new(&ro.dfa_reverse),
- }
- }
-}
-
-/// Alternation literals checks if the given HIR is a simple alternation of
-/// literals, and if so, returns them. Otherwise, this returns None.
-#[cfg(feature = "perf-literal")]
-fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
- use regex_syntax::hir::{HirKind, Literal};
-
- // This is pretty hacky, but basically, if `is_alternation_literal` is
- // true, then we can make several assumptions about the structure of our
- // HIR. This is what justifies the `unreachable!` statements below.
- //
- // This code should be refactored once we overhaul this crate's
- // optimization pipeline, because this is a terribly inflexible way to go
- // about things.
-
- if !expr.properties().is_alternation_literal() {
- return None;
- }
- let alts = match *expr.kind() {
- HirKind::Alternation(ref alts) => alts,
- _ => return None, // one literal isn't worth it
- };
-
- let mut lits = vec![];
- for alt in alts {
- let mut lit = vec![];
- match *alt.kind() {
- HirKind::Literal(Literal(ref bytes)) => {
- lit.extend_from_slice(bytes)
- }
- HirKind::Concat(ref exprs) => {
- for e in exprs {
- match *e.kind() {
- HirKind::Literal(Literal(ref bytes)) => {
- lit.extend_from_slice(bytes);
- }
- _ => unreachable!("expected literal, got {:?}", e),
- }
- }
- }
- _ => unreachable!("expected literal or concat, got {:?}", alt),
- }
- lits.push(lit);
- }
- Some(lits)
-}
-
-#[cfg(not(feature = "perf-literal"))]
-fn literal_analysis(_: &Hir) -> (literal::Seq, literal::Seq) {
- (literal::Seq::infinite(), literal::Seq::infinite())
-}
-
-#[cfg(feature = "perf-literal")]
-fn literal_analysis(expr: &Hir) -> (literal::Seq, literal::Seq) {
- const ATTEMPTS: [(usize, usize); 3] = [(5, 50), (4, 30), (3, 20)];
-
- let mut prefixes = literal::Extractor::new()
- .kind(literal::ExtractKind::Prefix)
- .extract(expr);
- for (keep, limit) in ATTEMPTS {
- let len = match prefixes.len() {
- None => break,
- Some(len) => len,
- };
- if len <= limit {
- break;
- }
- prefixes.keep_first_bytes(keep);
- prefixes.minimize_by_preference();
- }
-
- let mut suffixes = literal::Extractor::new()
- .kind(literal::ExtractKind::Suffix)
- .extract(expr);
- for (keep, limit) in ATTEMPTS {
- let len = match suffixes.len() {
- None => break,
- Some(len) => len,
- };
- if len <= limit {
- break;
- }
- suffixes.keep_last_bytes(keep);
- suffixes.minimize_by_preference();
- }
-
- (prefixes, suffixes)
-}
-
-#[cfg(test)]
-mod test {
- #[test]
- fn uppercut_s_backtracking_bytes_default_bytes_mismatch() {
- use crate::internal::ExecBuilder;
-
- let backtrack_bytes_re = ExecBuilder::new("^S")
- .bounded_backtracking()
- .only_utf8(false)
- .build()
- .map(|exec| exec.into_byte_regex())
- .map_err(|err| format!("{}", err))
- .unwrap();
-
- let default_bytes_re = ExecBuilder::new("^S")
- .only_utf8(false)
- .build()
- .map(|exec| exec.into_byte_regex())
- .map_err(|err| format!("{}", err))
- .unwrap();
-
- let input = vec![83, 83];
-
- let s1 = backtrack_bytes_re.split(&input);
- let s2 = default_bytes_re.split(&input);
- for (chunk1, chunk2) in s1.zip(s2) {
- assert_eq!(chunk1, chunk2);
- }
- }
-
- #[test]
- fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() {
- use crate::internal::ExecBuilder;
-
- let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)")
- .bounded_backtracking()
- .bytes(true)
- .build()
- .map(|exec| exec.into_regex())
- .map_err(|err| format!("{}", err))
- .unwrap();
-
- let default_bytes_re = ExecBuilder::new(r"^(?u:\*)")
- .bytes(true)
- .build()
- .map(|exec| exec.into_regex())
- .map_err(|err| format!("{}", err))
- .unwrap();
-
- let input = "**";
-
- let s1 = backtrack_bytes_re.split(input);
- let s2 = default_bytes_re.split(input);
- for (chunk1, chunk2) in s1.zip(s2) {
- assert_eq!(chunk1, chunk2);
- }
- }
-}