diff options
Diffstat (limited to 'third_party/rust/regex/src/exec.rs')
-rw-r--r-- | third_party/rust/regex/src/exec.rs | 1655 |
1 files changed, 1655 insertions, 0 deletions
diff --git a/third_party/rust/regex/src/exec.rs b/third_party/rust/regex/src/exec.rs new file mode 100644 index 0000000000..e75ca083a0 --- /dev/null +++ b/third_party/rust/regex/src/exec.rs @@ -0,0 +1,1655 @@ +use std::cell::RefCell; +use std::collections::HashMap; +use std::panic::AssertUnwindSafe; +use std::sync::Arc; + +#[cfg(feature = "perf-literal")] +use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind}; +use regex_syntax::hir::literal::Literals; +use regex_syntax::hir::Hir; +use regex_syntax::ParserBuilder; + +use crate::backtrack; +use crate::compile::Compiler; +#[cfg(feature = "perf-dfa")] +use crate::dfa; +use crate::error::Error; +use crate::input::{ByteInput, CharInput}; +use crate::literal::LiteralSearcher; +use crate::pikevm; +use crate::pool::{Pool, PoolGuard}; +use crate::prog::Program; +use crate::re_builder::RegexOptions; +use crate::re_bytes; +use crate::re_set; +use crate::re_trait::{Locations, RegularExpression, Slot}; +use crate::re_unicode; +use crate::utf8::next_utf8; + +/// `Exec` manages the execution of a regular expression. +/// +/// In particular, this manages the various compiled forms of a single regular +/// expression and the choice of which matching engine to use to execute a +/// regular expression. +#[derive(Debug)] +pub struct Exec { + /// All read only state. + ro: Arc<ExecReadOnly>, + /// A pool of reusable values for the various matching engines. + /// + /// Note that boxing this value is not strictly necessary, but it is an + /// easy way to ensure that T does not bloat the stack sized used by a pool + /// in the case where T is big. And this turns out to be the case at the + /// time of writing for regex's use of this pool. At the time of writing, + /// the size of a Regex on the stack is 856 bytes. Boxing this value + /// reduces that size to 16 bytes. + pool: Box<Pool<ProgramCache>>, +} + +/// `ExecNoSync` is like `Exec`, except it embeds a reference to a cache. This +/// means it is no longer Sync, but we can now avoid the overhead of +/// synchronization to fetch the cache. +#[derive(Debug)] +pub struct ExecNoSync<'c> { + /// All read only state. + ro: &'c Arc<ExecReadOnly>, + /// Caches for the various matching engines. + cache: PoolGuard<'c, ProgramCache>, +} + +/// `ExecNoSyncStr` is like `ExecNoSync`, but matches on &str instead of &[u8]. +#[derive(Debug)] +pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>); + +/// `ExecReadOnly` comprises all read only state for a regex. Namely, all such +/// state is determined at compile time and never changes during search. +#[derive(Debug)] +struct ExecReadOnly { + /// The original regular expressions given by the caller to compile. + res: Vec<String>, + /// A compiled program that is used in the NFA simulation and backtracking. + /// It can be byte-based or Unicode codepoint based. + /// + /// N.B. It is not possibly to make this byte-based from the public API. + /// It is only used for testing byte based programs in the NFA simulations. + nfa: Program, + /// A compiled byte based program for DFA execution. This is only used + /// if a DFA can be executed. (Currently, only word boundary assertions are + /// not supported.) Note that this program contains an embedded `.*?` + /// preceding the first capture group, unless the regex is anchored at the + /// beginning. + dfa: Program, + /// The same as above, except the program is reversed (and there is no + /// preceding `.*?`). This is used by the DFA to find the starting location + /// of matches. + dfa_reverse: Program, + /// A set of suffix literals extracted from the regex. + /// + /// Prefix literals are stored on the `Program`, since they are used inside + /// the matching engines. + suffixes: LiteralSearcher, + /// An Aho-Corasick automaton with leftmost-first match semantics. + /// + /// This is only set when the entire regex is a simple unanchored + /// alternation of literals. We could probably use it more circumstances, + /// but this is already hacky enough in this architecture. + /// + /// N.B. We use u32 as a state ID representation under the assumption that + /// if we were to exhaust the ID space, we probably would have long + /// surpassed the compilation size limit. + #[cfg(feature = "perf-literal")] + ac: Option<AhoCorasick<u32>>, + /// match_type encodes as much upfront knowledge about how we're going to + /// execute a search as possible. + match_type: MatchType, +} + +/// Facilitates the construction of an executor by exposing various knobs +/// to control how a regex is executed and what kinds of resources it's +/// permitted to use. +// `ExecBuilder` is only public via the `internal` module, so avoid deriving +// `Debug`. +#[allow(missing_debug_implementations)] +pub struct ExecBuilder { + options: RegexOptions, + match_type: Option<MatchType>, + bytes: bool, + only_utf8: bool, +} + +/// Parsed represents a set of parsed regular expressions and their detected +/// literals. +struct Parsed { + exprs: Vec<Hir>, + prefixes: Literals, + suffixes: Literals, + bytes: bool, +} + +impl ExecBuilder { + /// Create a regex execution builder. + /// + /// This uses default settings for everything except the regex itself, + /// which must be provided. Further knobs can be set by calling methods, + /// and then finally, `build` to actually create the executor. + pub fn new(re: &str) -> Self { + Self::new_many(&[re]) + } + + /// Like new, but compiles the union of the given regular expressions. + /// + /// Note that when compiling 2 or more regular expressions, capture groups + /// are completely unsupported. (This means both `find` and `captures` + /// won't work.) + pub fn new_many<I, S>(res: I) -> Self + where + S: AsRef<str>, + I: IntoIterator<Item = S>, + { + let mut opts = RegexOptions::default(); + opts.pats = res.into_iter().map(|s| s.as_ref().to_owned()).collect(); + Self::new_options(opts) + } + + /// Create a regex execution builder. + pub fn new_options(opts: RegexOptions) -> Self { + ExecBuilder { + options: opts, + match_type: None, + bytes: false, + only_utf8: true, + } + } + + /// Set the matching engine to be automatically determined. + /// + /// This is the default state and will apply whatever optimizations are + /// possible, such as running a DFA. + /// + /// This overrides whatever was previously set via the `nfa` or + /// `bounded_backtracking` methods. + pub fn automatic(mut self) -> Self { + self.match_type = None; + self + } + + /// Sets the matching engine to use the NFA algorithm no matter what + /// optimizations are possible. + /// + /// This overrides whatever was previously set via the `automatic` or + /// `bounded_backtracking` methods. + pub fn nfa(mut self) -> Self { + self.match_type = Some(MatchType::Nfa(MatchNfaType::PikeVM)); + self + } + + /// Sets the matching engine to use a bounded backtracking engine no + /// matter what optimizations are possible. + /// + /// One must use this with care, since the bounded backtracking engine + /// uses memory proportion to `len(regex) * len(text)`. + /// + /// This overrides whatever was previously set via the `automatic` or + /// `nfa` methods. + pub fn bounded_backtracking(mut self) -> Self { + self.match_type = Some(MatchType::Nfa(MatchNfaType::Backtrack)); + self + } + + /// Compiles byte based programs for use with the NFA matching engines. + /// + /// By default, the NFA engines match on Unicode scalar values. They can + /// be made to use byte based programs instead. In general, the byte based + /// programs are slower because of a less efficient encoding of character + /// classes. + /// + /// Note that this does not impact DFA matching engines, which always + /// execute on bytes. + pub fn bytes(mut self, yes: bool) -> Self { + self.bytes = yes; + self + } + + /// When disabled, the program compiled may match arbitrary bytes. + /// + /// When enabled (the default), all compiled programs exclusively match + /// valid UTF-8 bytes. + pub fn only_utf8(mut self, yes: bool) -> Self { + self.only_utf8 = yes; + self + } + + /// Set the Unicode flag. + pub fn unicode(mut self, yes: bool) -> Self { + self.options.unicode = yes; + self + } + + /// Parse the current set of patterns into their AST and extract literals. + fn parse(&self) -> Result<Parsed, Error> { + let mut exprs = Vec::with_capacity(self.options.pats.len()); + let mut prefixes = Some(Literals::empty()); + let mut suffixes = Some(Literals::empty()); + let mut bytes = false; + let is_set = self.options.pats.len() > 1; + // If we're compiling a regex set and that set has any anchored + // expressions, then disable all literal optimizations. + for pat in &self.options.pats { + let mut parser = ParserBuilder::new() + .octal(self.options.octal) + .case_insensitive(self.options.case_insensitive) + .multi_line(self.options.multi_line) + .dot_matches_new_line(self.options.dot_matches_new_line) + .swap_greed(self.options.swap_greed) + .ignore_whitespace(self.options.ignore_whitespace) + .unicode(self.options.unicode) + .allow_invalid_utf8(!self.only_utf8) + .nest_limit(self.options.nest_limit) + .build(); + let expr = + parser.parse(pat).map_err(|e| Error::Syntax(e.to_string()))?; + bytes = bytes || !expr.is_always_utf8(); + + if cfg!(feature = "perf-literal") { + if !expr.is_anchored_start() && expr.is_any_anchored_start() { + // Partial anchors unfortunately make it hard to use + // prefixes, so disable them. + prefixes = None; + } else if is_set && expr.is_anchored_start() { + // Regex sets with anchors do not go well with literal + // optimizations. + prefixes = None; + } + prefixes = prefixes.and_then(|mut prefixes| { + if !prefixes.union_prefixes(&expr) { + None + } else { + Some(prefixes) + } + }); + + if !expr.is_anchored_end() && expr.is_any_anchored_end() { + // Partial anchors unfortunately make it hard to use + // suffixes, so disable them. + suffixes = None; + } else if is_set && expr.is_anchored_end() { + // Regex sets with anchors do not go well with literal + // optimizations. + suffixes = None; + } + suffixes = suffixes.and_then(|mut suffixes| { + if !suffixes.union_suffixes(&expr) { + None + } else { + Some(suffixes) + } + }); + } + exprs.push(expr); + } + Ok(Parsed { + exprs, + prefixes: prefixes.unwrap_or_else(Literals::empty), + suffixes: suffixes.unwrap_or_else(Literals::empty), + bytes, + }) + } + + /// Build an executor that can run a regular expression. + pub fn build(self) -> Result<Exec, Error> { + // Special case when we have no patterns to compile. + // This can happen when compiling a regex set. + if self.options.pats.is_empty() { + let ro = Arc::new(ExecReadOnly { + res: vec![], + nfa: Program::new(), + dfa: Program::new(), + dfa_reverse: Program::new(), + suffixes: LiteralSearcher::empty(), + #[cfg(feature = "perf-literal")] + ac: None, + match_type: MatchType::Nothing, + }); + let pool = ExecReadOnly::new_pool(&ro); + return Ok(Exec { ro, pool }); + } + let parsed = self.parse()?; + let mut nfa = Compiler::new() + .size_limit(self.options.size_limit) + .bytes(self.bytes || parsed.bytes) + .only_utf8(self.only_utf8) + .compile(&parsed.exprs)?; + let mut dfa = Compiler::new() + .size_limit(self.options.size_limit) + .dfa(true) + .only_utf8(self.only_utf8) + .compile(&parsed.exprs)?; + let mut dfa_reverse = Compiler::new() + .size_limit(self.options.size_limit) + .dfa(true) + .only_utf8(self.only_utf8) + .reverse(true) + .compile(&parsed.exprs)?; + + #[cfg(feature = "perf-literal")] + let ac = self.build_aho_corasick(&parsed); + nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes); + dfa.prefixes = nfa.prefixes.clone(); + dfa.dfa_size_limit = self.options.dfa_size_limit; + dfa_reverse.dfa_size_limit = self.options.dfa_size_limit; + + let mut ro = ExecReadOnly { + res: self.options.pats, + nfa, + dfa, + dfa_reverse, + suffixes: LiteralSearcher::suffixes(parsed.suffixes), + #[cfg(feature = "perf-literal")] + ac, + match_type: MatchType::Nothing, + }; + ro.match_type = ro.choose_match_type(self.match_type); + + let ro = Arc::new(ro); + let pool = ExecReadOnly::new_pool(&ro); + Ok(Exec { ro, pool }) + } + + #[cfg(feature = "perf-literal")] + fn build_aho_corasick(&self, parsed: &Parsed) -> Option<AhoCorasick<u32>> { + if parsed.exprs.len() != 1 { + return None; + } + let lits = match alternation_literals(&parsed.exprs[0]) { + None => return None, + Some(lits) => lits, + }; + // If we have a small number of literals, then let Teddy handle + // things (see literal/mod.rs). + if lits.len() <= 32 { + return None; + } + Some( + AhoCorasickBuilder::new() + .match_kind(MatchKind::LeftmostFirst) + .auto_configure(&lits) + .build_with_size::<u32, _, _>(&lits) + // This should never happen because we'd long exceed the + // compilation limit for regexes first. + .expect("AC automaton too big"), + ) + } +} + +impl<'c> RegularExpression for ExecNoSyncStr<'c> { + type Text = str; + + fn slots_len(&self) -> usize { + self.0.slots_len() + } + + fn next_after_empty(&self, text: &str, i: usize) -> usize { + next_utf8(text.as_bytes(), i) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn shortest_match_at(&self, text: &str, start: usize) -> Option<usize> { + self.0.shortest_match_at(text.as_bytes(), start) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match_at(&self, text: &str, start: usize) -> bool { + self.0.is_match_at(text.as_bytes(), start) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> { + self.0.find_at(text.as_bytes(), start) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn captures_read_at( + &self, + locs: &mut Locations, + text: &str, + start: usize, + ) -> Option<(usize, usize)> { + self.0.captures_read_at(locs, text.as_bytes(), start) + } +} + +impl<'c> RegularExpression for ExecNoSync<'c> { + type Text = [u8]; + + /// Returns the number of capture slots in the regular expression. (There + /// are two slots for every capture group, corresponding to possibly empty + /// start and end locations of the capture.) + fn slots_len(&self) -> usize { + self.ro.nfa.captures.len() * 2 + } + + fn next_after_empty(&self, _text: &[u8], i: usize) -> usize { + i + 1 + } + + /// Returns the end of a match location, possibly occurring before the + /// end location of the correct leftmost-first match. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn shortest_match_at(&self, text: &[u8], start: usize) -> Option<usize> { + if !self.is_anchor_end_match(text) { + return None; + } + match self.ro.match_type { + #[cfg(feature = "perf-literal")] + MatchType::Literal(ty) => { + self.find_literals(ty, text, start).map(|(_, e)| e) + } + #[cfg(feature = "perf-dfa")] + MatchType::Dfa | MatchType::DfaMany => { + match self.shortest_dfa(text, start) { + dfa::Result::Match(end) => Some(end), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => self.shortest_nfa(text, start), + } + } + #[cfg(feature = "perf-dfa")] + MatchType::DfaAnchoredReverse => { + match dfa::Fsm::reverse( + &self.ro.dfa_reverse, + self.cache.value(), + true, + &text[start..], + text.len(), + ) { + dfa::Result::Match(_) => Some(text.len()), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => self.shortest_nfa(text, start), + } + } + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + MatchType::DfaSuffix => { + match self.shortest_dfa_reverse_suffix(text, start) { + dfa::Result::Match(e) => Some(e), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => self.shortest_nfa(text, start), + } + } + MatchType::Nfa(ty) => self.shortest_nfa_type(ty, text, start), + MatchType::Nothing => None, + } + } + + /// Returns true if and only if the regex matches text. + /// + /// For single regular expressions, this is equivalent to calling + /// shortest_match(...).is_some(). + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_match_at(&self, text: &[u8], start: usize) -> bool { + if !self.is_anchor_end_match(text) { + return false; + } + // We need to do this dance because shortest_match relies on the NFA + // filling in captures[1], but a RegexSet has no captures. In other + // words, a RegexSet can't (currently) use shortest_match. ---AG + match self.ro.match_type { + #[cfg(feature = "perf-literal")] + MatchType::Literal(ty) => { + self.find_literals(ty, text, start).is_some() + } + #[cfg(feature = "perf-dfa")] + MatchType::Dfa | MatchType::DfaMany => { + match self.shortest_dfa(text, start) { + dfa::Result::Match(_) => true, + dfa::Result::NoMatch(_) => false, + dfa::Result::Quit => self.match_nfa(text, start), + } + } + #[cfg(feature = "perf-dfa")] + MatchType::DfaAnchoredReverse => { + match dfa::Fsm::reverse( + &self.ro.dfa_reverse, + self.cache.value(), + true, + &text[start..], + text.len(), + ) { + dfa::Result::Match(_) => true, + dfa::Result::NoMatch(_) => false, + dfa::Result::Quit => self.match_nfa(text, start), + } + } + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + MatchType::DfaSuffix => { + match self.shortest_dfa_reverse_suffix(text, start) { + dfa::Result::Match(_) => true, + dfa::Result::NoMatch(_) => false, + dfa::Result::Quit => self.match_nfa(text, start), + } + } + MatchType::Nfa(ty) => self.match_nfa_type(ty, text, start), + MatchType::Nothing => false, + } + } + + /// Finds the start and end location of the leftmost-first match, starting + /// at the given location. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> { + if !self.is_anchor_end_match(text) { + return None; + } + match self.ro.match_type { + #[cfg(feature = "perf-literal")] + MatchType::Literal(ty) => self.find_literals(ty, text, start), + #[cfg(feature = "perf-dfa")] + MatchType::Dfa => match self.find_dfa_forward(text, start) { + dfa::Result::Match((s, e)) => Some((s, e)), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => { + self.find_nfa(MatchNfaType::Auto, text, start) + } + }, + #[cfg(feature = "perf-dfa")] + MatchType::DfaAnchoredReverse => { + match self.find_dfa_anchored_reverse(text, start) { + dfa::Result::Match((s, e)) => Some((s, e)), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => { + self.find_nfa(MatchNfaType::Auto, text, start) + } + } + } + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + MatchType::DfaSuffix => { + match self.find_dfa_reverse_suffix(text, start) { + dfa::Result::Match((s, e)) => Some((s, e)), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => { + self.find_nfa(MatchNfaType::Auto, text, start) + } + } + } + MatchType::Nfa(ty) => self.find_nfa(ty, text, start), + MatchType::Nothing => None, + #[cfg(feature = "perf-dfa")] + MatchType::DfaMany => { + unreachable!("BUG: RegexSet cannot be used with find") + } + } + } + + /// Finds the start and end location of the leftmost-first match and also + /// fills in all matching capture groups. + /// + /// The number of capture slots given should be equal to the total number + /// of capture slots in the compiled program. + /// + /// Note that the first two slots always correspond to the start and end + /// locations of the overall match. + fn captures_read_at( + &self, + locs: &mut Locations, + text: &[u8], + start: usize, + ) -> Option<(usize, usize)> { + let slots = locs.as_slots(); + for slot in slots.iter_mut() { + *slot = None; + } + // If the caller unnecessarily uses this, then we try to save them + // from themselves. + match slots.len() { + 0 => return self.find_at(text, start), + 2 => { + return self.find_at(text, start).map(|(s, e)| { + slots[0] = Some(s); + slots[1] = Some(e); + (s, e) + }); + } + _ => {} // fallthrough + } + if !self.is_anchor_end_match(text) { + return None; + } + match self.ro.match_type { + #[cfg(feature = "perf-literal")] + MatchType::Literal(ty) => { + self.find_literals(ty, text, start).and_then(|(s, e)| { + self.captures_nfa_type( + MatchNfaType::Auto, + slots, + text, + s, + e, + ) + }) + } + #[cfg(feature = "perf-dfa")] + MatchType::Dfa => { + if self.ro.nfa.is_anchored_start { + self.captures_nfa(slots, text, start) + } else { + match self.find_dfa_forward(text, start) { + dfa::Result::Match((s, e)) => self.captures_nfa_type( + MatchNfaType::Auto, + slots, + text, + s, + e, + ), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => { + self.captures_nfa(slots, text, start) + } + } + } + } + #[cfg(feature = "perf-dfa")] + MatchType::DfaAnchoredReverse => { + match self.find_dfa_anchored_reverse(text, start) { + dfa::Result::Match((s, e)) => self.captures_nfa_type( + MatchNfaType::Auto, + slots, + text, + s, + e, + ), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => self.captures_nfa(slots, text, start), + } + } + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + MatchType::DfaSuffix => { + match self.find_dfa_reverse_suffix(text, start) { + dfa::Result::Match((s, e)) => self.captures_nfa_type( + MatchNfaType::Auto, + slots, + text, + s, + e, + ), + dfa::Result::NoMatch(_) => None, + dfa::Result::Quit => self.captures_nfa(slots, text, start), + } + } + MatchType::Nfa(ty) => { + self.captures_nfa_type(ty, slots, text, start, text.len()) + } + MatchType::Nothing => None, + #[cfg(feature = "perf-dfa")] + MatchType::DfaMany => { + unreachable!("BUG: RegexSet cannot be used with captures") + } + } + } +} + +impl<'c> ExecNoSync<'c> { + /// Finds the leftmost-first match using only literal search. + #[cfg(feature = "perf-literal")] + #[cfg_attr(feature = "perf-inline", inline(always))] + fn find_literals( + &self, + ty: MatchLiteralType, + text: &[u8], + start: usize, + ) -> Option<(usize, usize)> { + use self::MatchLiteralType::*; + match ty { + Unanchored => { + let lits = &self.ro.nfa.prefixes; + lits.find(&text[start..]).map(|(s, e)| (start + s, start + e)) + } + AnchoredStart => { + let lits = &self.ro.nfa.prefixes; + if start == 0 || !self.ro.nfa.is_anchored_start { + lits.find_start(&text[start..]) + .map(|(s, e)| (start + s, start + e)) + } else { + None + } + } + AnchoredEnd => { + let lits = &self.ro.suffixes; + lits.find_end(&text[start..]) + .map(|(s, e)| (start + s, start + e)) + } + AhoCorasick => self + .ro + .ac + .as_ref() + .unwrap() + .find(&text[start..]) + .map(|m| (start + m.start(), start + m.end())), + } + } + + /// Finds the leftmost-first match (start and end) using only the DFA. + /// + /// If the result returned indicates that the DFA quit, then another + /// matching engine should be used. + #[cfg(feature = "perf-dfa")] + #[cfg_attr(feature = "perf-inline", inline(always))] + fn find_dfa_forward( + &self, + text: &[u8], + start: usize, + ) -> dfa::Result<(usize, usize)> { + use crate::dfa::Result::*; + let end = match dfa::Fsm::forward( + &self.ro.dfa, + self.cache.value(), + false, + text, + start, + ) { + NoMatch(i) => return NoMatch(i), + Quit => return Quit, + Match(end) if start == end => return Match((start, start)), + Match(end) => end, + }; + // Now run the DFA in reverse to find the start of the match. + match dfa::Fsm::reverse( + &self.ro.dfa_reverse, + self.cache.value(), + false, + &text[start..], + end - start, + ) { + Match(s) => Match((start + s, end)), + NoMatch(i) => NoMatch(i), + Quit => Quit, + } + } + + /// Finds the leftmost-first match (start and end) using only the DFA, + /// but assumes the regex is anchored at the end and therefore starts at + /// the end of the regex and matches in reverse. + /// + /// If the result returned indicates that the DFA quit, then another + /// matching engine should be used. + #[cfg(feature = "perf-dfa")] + #[cfg_attr(feature = "perf-inline", inline(always))] + fn find_dfa_anchored_reverse( + &self, + text: &[u8], + start: usize, + ) -> dfa::Result<(usize, usize)> { + use crate::dfa::Result::*; + match dfa::Fsm::reverse( + &self.ro.dfa_reverse, + self.cache.value(), + false, + &text[start..], + text.len() - start, + ) { + Match(s) => Match((start + s, text.len())), + NoMatch(i) => NoMatch(i), + Quit => Quit, + } + } + + /// Finds the end of the shortest match using only the DFA. + #[cfg(feature = "perf-dfa")] + #[cfg_attr(feature = "perf-inline", inline(always))] + fn shortest_dfa(&self, text: &[u8], start: usize) -> dfa::Result<usize> { + dfa::Fsm::forward(&self.ro.dfa, self.cache.value(), true, text, start) + } + + /// Finds the end of the shortest match using only the DFA by scanning for + /// suffix literals. + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + #[cfg_attr(feature = "perf-inline", inline(always))] + fn shortest_dfa_reverse_suffix( + &self, + text: &[u8], + start: usize, + ) -> dfa::Result<usize> { + match self.exec_dfa_reverse_suffix(text, start) { + None => self.shortest_dfa(text, start), + Some(r) => r.map(|(_, end)| end), + } + } + + /// Finds the end of the shortest match using only the DFA by scanning for + /// suffix literals. It also reports the start of the match. + /// + /// Note that if None is returned, then the optimization gave up to avoid + /// worst case quadratic behavior. A forward scanning DFA should be tried + /// next. + /// + /// If a match is returned and the full leftmost-first match is desired, + /// then a forward scan starting from the beginning of the match must be + /// done. + /// + /// If the result returned indicates that the DFA quit, then another + /// matching engine should be used. + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + #[cfg_attr(feature = "perf-inline", inline(always))] + fn exec_dfa_reverse_suffix( + &self, + text: &[u8], + original_start: usize, + ) -> Option<dfa::Result<(usize, usize)>> { + use crate::dfa::Result::*; + + let lcs = self.ro.suffixes.lcs(); + debug_assert!(lcs.len() >= 1); + let mut start = original_start; + let mut end = start; + let mut last_literal = start; + while end <= text.len() { + last_literal += match lcs.find(&text[last_literal..]) { + None => return Some(NoMatch(text.len())), + Some(i) => i, + }; + end = last_literal + lcs.len(); + match dfa::Fsm::reverse( + &self.ro.dfa_reverse, + self.cache.value(), + false, + &text[start..end], + end - start, + ) { + Match(0) | NoMatch(0) => return None, + Match(i) => return Some(Match((start + i, end))), + NoMatch(i) => { + start += i; + last_literal += 1; + continue; + } + Quit => return Some(Quit), + }; + } + Some(NoMatch(text.len())) + } + + /// Finds the leftmost-first match (start and end) using only the DFA + /// by scanning for suffix literals. + /// + /// If the result returned indicates that the DFA quit, then another + /// matching engine should be used. + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + #[cfg_attr(feature = "perf-inline", inline(always))] + fn find_dfa_reverse_suffix( + &self, + text: &[u8], + start: usize, + ) -> dfa::Result<(usize, usize)> { + use crate::dfa::Result::*; + + let match_start = match self.exec_dfa_reverse_suffix(text, start) { + None => return self.find_dfa_forward(text, start), + Some(Match((start, _))) => start, + Some(r) => return r, + }; + // At this point, we've found a match. The only way to quit now + // without a match is if the DFA gives up (seems unlikely). + // + // Now run the DFA forwards to find the proper end of the match. + // (The suffix literal match can only indicate the earliest + // possible end location, which may appear before the end of the + // leftmost-first match.) + match dfa::Fsm::forward( + &self.ro.dfa, + self.cache.value(), + false, + text, + match_start, + ) { + NoMatch(_) => panic!("BUG: reverse match implies forward match"), + Quit => Quit, + Match(e) => Match((match_start, e)), + } + } + + /// Executes the NFA engine to return whether there is a match or not. + /// + /// Ideally, we could use shortest_nfa(...).is_some() and get the same + /// performance characteristics, but regex sets don't have captures, which + /// shortest_nfa depends on. + #[cfg(feature = "perf-dfa")] + fn match_nfa(&self, text: &[u8], start: usize) -> bool { + self.match_nfa_type(MatchNfaType::Auto, text, start) + } + + /// Like match_nfa, but allows specification of the type of NFA engine. + fn match_nfa_type( + &self, + ty: MatchNfaType, + text: &[u8], + start: usize, + ) -> bool { + self.exec_nfa( + ty, + &mut [false], + &mut [], + true, + false, + text, + start, + text.len(), + ) + } + + /// Finds the shortest match using an NFA. + #[cfg(feature = "perf-dfa")] + fn shortest_nfa(&self, text: &[u8], start: usize) -> Option<usize> { + self.shortest_nfa_type(MatchNfaType::Auto, text, start) + } + + /// Like shortest_nfa, but allows specification of the type of NFA engine. + fn shortest_nfa_type( + &self, + ty: MatchNfaType, + text: &[u8], + start: usize, + ) -> Option<usize> { + let mut slots = [None, None]; + if self.exec_nfa( + ty, + &mut [false], + &mut slots, + true, + true, + text, + start, + text.len(), + ) { + slots[1] + } else { + None + } + } + + /// Like find, but executes an NFA engine. + fn find_nfa( + &self, + ty: MatchNfaType, + text: &[u8], + start: usize, + ) -> Option<(usize, usize)> { + let mut slots = [None, None]; + if self.exec_nfa( + ty, + &mut [false], + &mut slots, + false, + false, + text, + start, + text.len(), + ) { + match (slots[0], slots[1]) { + (Some(s), Some(e)) => Some((s, e)), + _ => None, + } + } else { + None + } + } + + /// Like find_nfa, but fills in captures. + /// + /// `slots` should have length equal to `2 * nfa.captures.len()`. + #[cfg(feature = "perf-dfa")] + fn captures_nfa( + &self, + slots: &mut [Slot], + text: &[u8], + start: usize, + ) -> Option<(usize, usize)> { + self.captures_nfa_type( + MatchNfaType::Auto, + slots, + text, + start, + text.len(), + ) + } + + /// Like captures_nfa, but allows specification of type of NFA engine. + fn captures_nfa_type( + &self, + ty: MatchNfaType, + slots: &mut [Slot], + text: &[u8], + start: usize, + end: usize, + ) -> Option<(usize, usize)> { + if self.exec_nfa( + ty, + &mut [false], + slots, + false, + false, + text, + start, + end, + ) { + match (slots[0], slots[1]) { + (Some(s), Some(e)) => Some((s, e)), + _ => None, + } + } else { + None + } + } + + fn exec_nfa( + &self, + mut ty: MatchNfaType, + matches: &mut [bool], + slots: &mut [Slot], + quit_after_match: bool, + quit_after_match_with_pos: bool, + text: &[u8], + start: usize, + end: usize, + ) -> bool { + use self::MatchNfaType::*; + if let Auto = ty { + if backtrack::should_exec(self.ro.nfa.len(), text.len()) { + ty = Backtrack; + } else { + ty = PikeVM; + } + } + // The backtracker can't return the shortest match position as it is + // implemented today. So if someone calls `shortest_match` and we need + // to run an NFA, then use the PikeVM. + if quit_after_match_with_pos || ty == PikeVM { + self.exec_pikevm( + matches, + slots, + quit_after_match, + text, + start, + end, + ) + } else { + self.exec_backtrack(matches, slots, text, start, end) + } + } + + /// Always run the NFA algorithm. + fn exec_pikevm( + &self, + matches: &mut [bool], + slots: &mut [Slot], + quit_after_match: bool, + text: &[u8], + start: usize, + end: usize, + ) -> bool { + if self.ro.nfa.uses_bytes() { + pikevm::Fsm::exec( + &self.ro.nfa, + self.cache.value(), + matches, + slots, + quit_after_match, + ByteInput::new(text, self.ro.nfa.only_utf8), + start, + end, + ) + } else { + pikevm::Fsm::exec( + &self.ro.nfa, + self.cache.value(), + matches, + slots, + quit_after_match, + CharInput::new(text), + start, + end, + ) + } + } + + /// Always runs the NFA using bounded backtracking. + fn exec_backtrack( + &self, + matches: &mut [bool], + slots: &mut [Slot], + text: &[u8], + start: usize, + end: usize, + ) -> bool { + if self.ro.nfa.uses_bytes() { + backtrack::Bounded::exec( + &self.ro.nfa, + self.cache.value(), + matches, + slots, + ByteInput::new(text, self.ro.nfa.only_utf8), + start, + end, + ) + } else { + backtrack::Bounded::exec( + &self.ro.nfa, + self.cache.value(), + matches, + slots, + CharInput::new(text), + start, + end, + ) + } + } + + /// Finds which regular expressions match the given text. + /// + /// `matches` should have length equal to the number of regexes being + /// searched. + /// + /// This is only useful when one wants to know which regexes in a set + /// match some text. + pub fn many_matches_at( + &self, + matches: &mut [bool], + text: &[u8], + start: usize, + ) -> bool { + use self::MatchType::*; + if !self.is_anchor_end_match(text) { + return false; + } + match self.ro.match_type { + #[cfg(feature = "perf-literal")] + Literal(ty) => { + debug_assert_eq!(matches.len(), 1); + matches[0] = self.find_literals(ty, text, start).is_some(); + matches[0] + } + #[cfg(feature = "perf-dfa")] + Dfa | DfaAnchoredReverse | DfaMany => { + match dfa::Fsm::forward_many( + &self.ro.dfa, + self.cache.value(), + matches, + text, + start, + ) { + dfa::Result::Match(_) => true, + dfa::Result::NoMatch(_) => false, + dfa::Result::Quit => self.exec_nfa( + MatchNfaType::Auto, + matches, + &mut [], + false, + false, + text, + start, + text.len(), + ), + } + } + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + DfaSuffix => { + match dfa::Fsm::forward_many( + &self.ro.dfa, + self.cache.value(), + matches, + text, + start, + ) { + dfa::Result::Match(_) => true, + dfa::Result::NoMatch(_) => false, + dfa::Result::Quit => self.exec_nfa( + MatchNfaType::Auto, + matches, + &mut [], + false, + false, + text, + start, + text.len(), + ), + } + } + Nfa(ty) => self.exec_nfa( + ty, + matches, + &mut [], + false, + false, + text, + start, + text.len(), + ), + Nothing => false, + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + fn is_anchor_end_match(&self, text: &[u8]) -> bool { + #[cfg(not(feature = "perf-literal"))] + fn imp(_: &ExecReadOnly, _: &[u8]) -> bool { + true + } + + #[cfg(feature = "perf-literal")] + fn imp(ro: &ExecReadOnly, text: &[u8]) -> bool { + // Only do this check if the haystack is big (>1MB). + if text.len() > (1 << 20) && ro.nfa.is_anchored_end { + let lcs = ro.suffixes.lcs(); + if lcs.len() >= 1 && !lcs.is_suffix(text) { + return false; + } + } + true + } + + imp(&self.ro, text) + } + + pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { + &self.ro.nfa.capture_name_idx + } +} + +impl<'c> ExecNoSyncStr<'c> { + pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { + self.0.capture_name_idx() + } +} + +impl Exec { + /// Get a searcher that isn't Sync. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn searcher(&self) -> ExecNoSync<'_> { + ExecNoSync { + ro: &self.ro, // a clone is too expensive here! (and not needed) + cache: self.pool.get(), + } + } + + /// Get a searcher that isn't Sync and can match on &str. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn searcher_str(&self) -> ExecNoSyncStr<'_> { + ExecNoSyncStr(self.searcher()) + } + + /// Build a Regex from this executor. + pub fn into_regex(self) -> re_unicode::Regex { + re_unicode::Regex::from(self) + } + + /// Build a RegexSet from this executor. + pub fn into_regex_set(self) -> re_set::unicode::RegexSet { + re_set::unicode::RegexSet::from(self) + } + + /// Build a Regex from this executor that can match arbitrary bytes. + pub fn into_byte_regex(self) -> re_bytes::Regex { + re_bytes::Regex::from(self) + } + + /// Build a RegexSet from this executor that can match arbitrary bytes. + pub fn into_byte_regex_set(self) -> re_set::bytes::RegexSet { + re_set::bytes::RegexSet::from(self) + } + + /// The original regular expressions given by the caller that were + /// compiled. + pub fn regex_strings(&self) -> &[String] { + &self.ro.res + } + + /// Return a slice of capture names. + /// + /// Any capture that isn't named is None. + pub fn capture_names(&self) -> &[Option<String>] { + &self.ro.nfa.captures + } + + /// Return a reference to named groups mapping (from group name to + /// group position). + pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { + &self.ro.nfa.capture_name_idx + } +} + +impl Clone for Exec { + fn clone(&self) -> Exec { + let pool = ExecReadOnly::new_pool(&self.ro); + Exec { ro: self.ro.clone(), pool } + } +} + +impl ExecReadOnly { + fn choose_match_type(&self, hint: Option<MatchType>) -> MatchType { + if let Some(MatchType::Nfa(_)) = hint { + return hint.unwrap(); + } + // If the NFA is empty, then we'll never match anything. + if self.nfa.insts.is_empty() { + return MatchType::Nothing; + } + if let Some(literalty) = self.choose_literal_match_type() { + return literalty; + } + if let Some(dfaty) = self.choose_dfa_match_type() { + return dfaty; + } + // We're so totally hosed. + MatchType::Nfa(MatchNfaType::Auto) + } + + /// If a plain literal scan can be used, then a corresponding literal + /// search type is returned. + fn choose_literal_match_type(&self) -> Option<MatchType> { + #[cfg(not(feature = "perf-literal"))] + fn imp(_: &ExecReadOnly) -> Option<MatchType> { + None + } + + #[cfg(feature = "perf-literal")] + fn imp(ro: &ExecReadOnly) -> Option<MatchType> { + // If our set of prefixes is complete, then we can use it to find + // a match in lieu of a regex engine. This doesn't quite work well + // in the presence of multiple regexes, so only do it when there's + // one. + // + // TODO(burntsushi): Also, don't try to match literals if the regex + // is partially anchored. We could technically do it, but we'd need + // to create two sets of literals: all of them and then the subset + // that aren't anchored. We would then only search for all of them + // when at the beginning of the input and use the subset in all + // other cases. + if ro.res.len() != 1 { + return None; + } + if ro.ac.is_some() { + return Some(MatchType::Literal( + MatchLiteralType::AhoCorasick, + )); + } + if ro.nfa.prefixes.complete() { + return if ro.nfa.is_anchored_start { + Some(MatchType::Literal(MatchLiteralType::AnchoredStart)) + } else { + Some(MatchType::Literal(MatchLiteralType::Unanchored)) + }; + } + if ro.suffixes.complete() { + return if ro.nfa.is_anchored_end { + Some(MatchType::Literal(MatchLiteralType::AnchoredEnd)) + } else { + // This case shouldn't happen. When the regex isn't + // anchored, then complete prefixes should imply complete + // suffixes. + Some(MatchType::Literal(MatchLiteralType::Unanchored)) + }; + } + None + } + + imp(self) + } + + /// If a DFA scan can be used, then choose the appropriate DFA strategy. + fn choose_dfa_match_type(&self) -> Option<MatchType> { + #[cfg(not(feature = "perf-dfa"))] + fn imp(_: &ExecReadOnly) -> Option<MatchType> { + None + } + + #[cfg(feature = "perf-dfa")] + fn imp(ro: &ExecReadOnly) -> Option<MatchType> { + if !dfa::can_exec(&ro.dfa) { + return None; + } + // Regex sets require a slightly specialized path. + if ro.res.len() >= 2 { + return Some(MatchType::DfaMany); + } + // If the regex is anchored at the end but not the start, then + // just match in reverse from the end of the haystack. + if !ro.nfa.is_anchored_start && ro.nfa.is_anchored_end { + return Some(MatchType::DfaAnchoredReverse); + } + #[cfg(feature = "perf-literal")] + { + // If there's a longish suffix literal, then it might be faster + // to look for that first. + if ro.should_suffix_scan() { + return Some(MatchType::DfaSuffix); + } + } + // Fall back to your garden variety forward searching lazy DFA. + Some(MatchType::Dfa) + } + + imp(self) + } + + /// Returns true if the program is amenable to suffix scanning. + /// + /// When this is true, as a heuristic, we assume it is OK to quickly scan + /// for suffix literals and then do a *reverse* DFA match from any matches + /// produced by the literal scan. (And then followed by a forward DFA + /// search, since the previously found suffix literal maybe not actually be + /// the end of a match.) + /// + /// This is a bit of a specialized optimization, but can result in pretty + /// big performance wins if 1) there are no prefix literals and 2) the + /// suffix literals are pretty rare in the text. (1) is obviously easy to + /// account for but (2) is harder. As a proxy, we assume that longer + /// strings are generally rarer, so we only enable this optimization when + /// we have a meaty suffix. + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + fn should_suffix_scan(&self) -> bool { + if self.suffixes.is_empty() { + return false; + } + let lcs_len = self.suffixes.lcs().char_len(); + lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len() + } + + fn new_pool(ro: &Arc<ExecReadOnly>) -> Box<Pool<ProgramCache>> { + let ro = ro.clone(); + Box::new(Pool::new(Box::new(move || { + AssertUnwindSafe(RefCell::new(ProgramCacheInner::new(&ro))) + }))) + } +} + +#[derive(Clone, Copy, Debug)] +enum MatchType { + /// A single or multiple literal search. This is only used when the regex + /// can be decomposed into a literal search. + #[cfg(feature = "perf-literal")] + Literal(MatchLiteralType), + /// A normal DFA search. + #[cfg(feature = "perf-dfa")] + Dfa, + /// A reverse DFA search starting from the end of a haystack. + #[cfg(feature = "perf-dfa")] + DfaAnchoredReverse, + /// A reverse DFA search with suffix literal scanning. + #[cfg(all(feature = "perf-dfa", feature = "perf-literal"))] + DfaSuffix, + /// Use the DFA on two or more regular expressions. + #[cfg(feature = "perf-dfa")] + DfaMany, + /// An NFA variant. + Nfa(MatchNfaType), + /// No match is ever possible, so don't ever try to search. + Nothing, +} + +#[derive(Clone, Copy, Debug)] +#[cfg(feature = "perf-literal")] +enum MatchLiteralType { + /// Match literals anywhere in text. + Unanchored, + /// Match literals only at the start of text. + AnchoredStart, + /// Match literals only at the end of text. + AnchoredEnd, + /// Use an Aho-Corasick automaton. This requires `ac` to be Some on + /// ExecReadOnly. + AhoCorasick, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum MatchNfaType { + /// Choose between Backtrack and PikeVM. + Auto, + /// NFA bounded backtracking. + /// + /// (This is only set by tests, since it never makes sense to always want + /// backtracking.) + Backtrack, + /// The Pike VM. + /// + /// (This is only set by tests, since it never makes sense to always want + /// the Pike VM.) + PikeVM, +} + +/// `ProgramCache` maintains reusable allocations for each matching engine +/// available to a particular program. +/// +/// We declare this as unwind safe since it's a cache that's only used for +/// performance purposes. If a panic occurs, it is (or should be) always safe +/// to continue using the same regex object. +pub type ProgramCache = AssertUnwindSafe<RefCell<ProgramCacheInner>>; + +#[derive(Debug)] +pub struct ProgramCacheInner { + pub pikevm: pikevm::Cache, + pub backtrack: backtrack::Cache, + #[cfg(feature = "perf-dfa")] + pub dfa: dfa::Cache, + #[cfg(feature = "perf-dfa")] + pub dfa_reverse: dfa::Cache, +} + +impl ProgramCacheInner { + fn new(ro: &ExecReadOnly) -> Self { + ProgramCacheInner { + pikevm: pikevm::Cache::new(&ro.nfa), + backtrack: backtrack::Cache::new(&ro.nfa), + #[cfg(feature = "perf-dfa")] + dfa: dfa::Cache::new(&ro.dfa), + #[cfg(feature = "perf-dfa")] + dfa_reverse: dfa::Cache::new(&ro.dfa_reverse), + } + } +} + +/// Alternation literals checks if the given HIR is a simple alternation of +/// literals, and if so, returns them. Otherwise, this returns None. +#[cfg(feature = "perf-literal")] +fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> { + use regex_syntax::hir::{HirKind, Literal}; + + // This is pretty hacky, but basically, if `is_alternation_literal` is + // true, then we can make several assumptions about the structure of our + // HIR. This is what justifies the `unreachable!` statements below. + // + // This code should be refactored once we overhaul this crate's + // optimization pipeline, because this is a terribly inflexible way to go + // about things. + + if !expr.is_alternation_literal() { + return None; + } + let alts = match *expr.kind() { + HirKind::Alternation(ref alts) => alts, + _ => return None, // one literal isn't worth it + }; + + let extendlit = |lit: &Literal, dst: &mut Vec<u8>| match *lit { + Literal::Unicode(c) => { + let mut buf = [0; 4]; + dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes()); + } + Literal::Byte(b) => { + dst.push(b); + } + }; + + let mut lits = vec![]; + for alt in alts { + let mut lit = vec![]; + match *alt.kind() { + HirKind::Literal(ref x) => extendlit(x, &mut lit), + HirKind::Concat(ref exprs) => { + for e in exprs { + match *e.kind() { + HirKind::Literal(ref x) => extendlit(x, &mut lit), + _ => unreachable!("expected literal, got {:?}", e), + } + } + } + _ => unreachable!("expected literal or concat, got {:?}", alt), + } + lits.push(lit); + } + Some(lits) +} + +#[cfg(test)] +mod test { + #[test] + fn uppercut_s_backtracking_bytes_default_bytes_mismatch() { + use crate::internal::ExecBuilder; + + let backtrack_bytes_re = ExecBuilder::new("^S") + .bounded_backtracking() + .only_utf8(false) + .build() + .map(|exec| exec.into_byte_regex()) + .map_err(|err| format!("{}", err)) + .unwrap(); + + let default_bytes_re = ExecBuilder::new("^S") + .only_utf8(false) + .build() + .map(|exec| exec.into_byte_regex()) + .map_err(|err| format!("{}", err)) + .unwrap(); + + let input = vec![83, 83]; + + let s1 = backtrack_bytes_re.split(&input); + let s2 = default_bytes_re.split(&input); + for (chunk1, chunk2) in s1.zip(s2) { + assert_eq!(chunk1, chunk2); + } + } + + #[test] + fn unicode_lit_star_backtracking_utf8bytes_default_utf8bytes_mismatch() { + use crate::internal::ExecBuilder; + + let backtrack_bytes_re = ExecBuilder::new(r"^(?u:\*)") + .bounded_backtracking() + .bytes(true) + .build() + .map(|exec| exec.into_regex()) + .map_err(|err| format!("{}", err)) + .unwrap(); + + let default_bytes_re = ExecBuilder::new(r"^(?u:\*)") + .bytes(true) + .build() + .map(|exec| exec.into_regex()) + .map_err(|err| format!("{}", err)) + .unwrap(); + + let input = "**"; + + let s1 = backtrack_bytes_re.split(input); + let s2 = default_bytes_re.split(input); + for (chunk1, chunk2) in s1.zip(s2) { + assert_eq!(chunk1, chunk2); + } + } +} |