diff options
Diffstat (limited to 'third_party/rust/regex-automata/src/meta/literal.rs')
-rw-r--r-- | third_party/rust/regex-automata/src/meta/literal.rs | 81 |
1 files changed, 81 insertions, 0 deletions
diff --git a/third_party/rust/regex-automata/src/meta/literal.rs b/third_party/rust/regex-automata/src/meta/literal.rs new file mode 100644 index 0000000000..a68b93b7a6 --- /dev/null +++ b/third_party/rust/regex-automata/src/meta/literal.rs @@ -0,0 +1,81 @@ +use alloc::{vec, vec::Vec}; + +use regex_syntax::hir::Hir; + +use crate::{meta::regex::RegexInfo, util::search::MatchKind}; + +/// Pull out an alternation of literals from the given sequence of HIR +/// expressions. +/// +/// There are numerous ways for this to fail. Generally, this only applies +/// to regexes of the form 'foo|bar|baz|...|quux'. It can also fail if there +/// are "too few" alternates, in which case, the regex engine is likely faster. +/// +/// And currently, this only returns something when 'hirs.len() == 1'. +pub(crate) fn alternation_literals( + info: &RegexInfo, + hirs: &[&Hir], +) -> Option<Vec<Vec<u8>>> { + use regex_syntax::hir::{HirKind, Literal}; + + // Might as well skip the work below if we know we can't build an + // Aho-Corasick searcher. + if !cfg!(feature = "perf-literal-multisubstring") { + return None; + } + // This is pretty hacky, but basically, if `is_alternation_literal` is + // true, then we can make several assumptions about the structure of our + // HIR. This is what justifies the `unreachable!` statements below. + if hirs.len() != 1 + || !info.props()[0].look_set().is_empty() + || info.props()[0].explicit_captures_len() > 0 + || !info.props()[0].is_alternation_literal() + || info.config().get_match_kind() != MatchKind::LeftmostFirst + { + return None; + } + let hir = &hirs[0]; + let alts = match *hir.kind() { + HirKind::Alternation(ref alts) => alts, + _ => return None, // one literal isn't worth it + }; + + let mut lits = vec![]; + for alt in alts { + let mut lit = vec![]; + match *alt.kind() { + HirKind::Literal(Literal(ref bytes)) => { + lit.extend_from_slice(bytes) + } + HirKind::Concat(ref exprs) => { + for e in exprs { + match *e.kind() { + HirKind::Literal(Literal(ref bytes)) => { + lit.extend_from_slice(bytes); + } + _ => unreachable!("expected literal, got {:?}", e), + } + } + } + _ => unreachable!("expected literal or concat, got {:?}", alt), + } + lits.push(lit); + } + // Why do this? Well, when the number of literals is small, it's likely + // that we'll use the lazy DFA which is in turn likely to be faster than + // Aho-Corasick in such cases. Primarily because Aho-Corasick doesn't have + // a "lazy DFA" but either a contiguous NFA or a full DFA. We rarely use + // the latter because it is so hungry (in time and space), and the former + // is decently fast, but not as fast as a well oiled lazy DFA. + // + // However, once the number starts getting large, the lazy DFA is likely + // to start thrashing because of the modest default cache size. When + // exactly does this happen? Dunno. But at whatever point that is (we make + // a guess below based on ad hoc benchmarking), we'll want to cut over to + // Aho-Corasick, where even the contiguous NFA is likely to do much better. + if lits.len() < 3000 { + debug!("skipping Aho-Corasick because there are too few literals"); + return None; + } + Some(lits) +} |