diff options
Diffstat (limited to 'third_party/rust/regex/src/literal')
-rw-r--r-- | third_party/rust/regex/src/literal/imp.rs | 402 | ||||
-rw-r--r-- | third_party/rust/regex/src/literal/mod.rs | 55 |
2 files changed, 457 insertions, 0 deletions
diff --git a/third_party/rust/regex/src/literal/imp.rs b/third_party/rust/regex/src/literal/imp.rs new file mode 100644 index 0000000000..90b2f11606 --- /dev/null +++ b/third_party/rust/regex/src/literal/imp.rs @@ -0,0 +1,402 @@ +use std::mem; + +use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder}; +use memchr::{memchr, memchr2, memchr3, memmem}; +use regex_syntax::hir::literal::{Literal, Literals}; + +/// A prefix extracted from a compiled regular expression. +/// +/// A regex prefix is a set of literal strings that *must* be matched at the +/// beginning of a regex in order for the entire regex to match. Similarly +/// for a regex suffix. +#[derive(Clone, Debug)] +pub struct LiteralSearcher { + complete: bool, + lcp: Memmem, + lcs: Memmem, + matcher: Matcher, +} + +#[derive(Clone, Debug)] +enum Matcher { + /// No literals. (Never advances through the input.) + Empty, + /// A set of four or more single byte literals. + Bytes(SingleByteSet), + /// A single substring, using vector accelerated routines when available. + Memmem(Memmem), + /// An Aho-Corasick automaton. + AC { ac: AhoCorasick<u32>, lits: Vec<Literal> }, + /// A packed multiple substring searcher, using SIMD. + /// + /// Note that Aho-Corasick will actually use this packed searcher + /// internally automatically, however, there is some overhead associated + /// with going through the Aho-Corasick machinery. So using the packed + /// searcher directly results in some gains. + Packed { s: packed::Searcher, lits: Vec<Literal> }, +} + +impl LiteralSearcher { + /// Returns a matcher that never matches and never advances the input. + pub fn empty() -> Self { + Self::new(Literals::empty(), Matcher::Empty) + } + + /// Returns a matcher for literal prefixes from the given set. + pub fn prefixes(lits: Literals) -> Self { + let matcher = Matcher::prefixes(&lits); + Self::new(lits, matcher) + } + + /// Returns a matcher for literal suffixes from the given set. + pub fn suffixes(lits: Literals) -> Self { + let matcher = Matcher::suffixes(&lits); + Self::new(lits, matcher) + } + + fn new(lits: Literals, matcher: Matcher) -> Self { + let complete = lits.all_complete(); + LiteralSearcher { + complete, + lcp: Memmem::new(lits.longest_common_prefix()), + lcs: Memmem::new(lits.longest_common_suffix()), + matcher, + } + } + + /// Returns true if all matches comprise the entire regular expression. + /// + /// This does not necessarily mean that a literal match implies a match + /// of the regular expression. For example, the regular expression `^a` + /// is comprised of a single complete literal `a`, but the regular + /// expression demands that it only match at the beginning of a string. + pub fn complete(&self) -> bool { + self.complete && !self.is_empty() + } + + /// Find the position of a literal in `haystack` if it exists. + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn find(&self, haystack: &[u8]) -> Option<(usize, usize)> { + use self::Matcher::*; + match self.matcher { + Empty => Some((0, 0)), + Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)), + Memmem(ref s) => s.find(haystack).map(|i| (i, i + s.len())), + AC { ref ac, .. } => { + ac.find(haystack).map(|m| (m.start(), m.end())) + } + Packed { ref s, .. } => { + s.find(haystack).map(|m| (m.start(), m.end())) + } + } + } + + /// Like find, except matches must start at index `0`. + pub fn find_start(&self, haystack: &[u8]) -> Option<(usize, usize)> { + for lit in self.iter() { + if lit.len() > haystack.len() { + continue; + } + if lit == &haystack[0..lit.len()] { + return Some((0, lit.len())); + } + } + None + } + + /// Like find, except matches must end at index `haystack.len()`. + pub fn find_end(&self, haystack: &[u8]) -> Option<(usize, usize)> { + for lit in self.iter() { + if lit.len() > haystack.len() { + continue; + } + if lit == &haystack[haystack.len() - lit.len()..] { + return Some((haystack.len() - lit.len(), haystack.len())); + } + } + None + } + + /// Returns an iterator over all literals to be matched. + pub fn iter(&self) -> LiteralIter<'_> { + match self.matcher { + Matcher::Empty => LiteralIter::Empty, + Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense), + Matcher::Memmem(ref s) => LiteralIter::Single(&s.finder.needle()), + Matcher::AC { ref lits, .. } => LiteralIter::AC(lits), + Matcher::Packed { ref lits, .. } => LiteralIter::Packed(lits), + } + } + + /// Returns a matcher for the longest common prefix of this matcher. + pub fn lcp(&self) -> &Memmem { + &self.lcp + } + + /// Returns a matcher for the longest common suffix of this matcher. + pub fn lcs(&self) -> &Memmem { + &self.lcs + } + + /// Returns true iff this prefix is empty. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Returns the number of prefixes in this machine. + pub fn len(&self) -> usize { + use self::Matcher::*; + match self.matcher { + Empty => 0, + Bytes(ref sset) => sset.dense.len(), + Memmem(_) => 1, + AC { ref ac, .. } => ac.pattern_count(), + Packed { ref lits, .. } => lits.len(), + } + } + + /// Return the approximate heap usage of literals in bytes. + pub fn approximate_size(&self) -> usize { + use self::Matcher::*; + match self.matcher { + Empty => 0, + Bytes(ref sset) => sset.approximate_size(), + Memmem(ref single) => single.approximate_size(), + AC { ref ac, .. } => ac.heap_bytes(), + Packed { ref s, .. } => s.heap_bytes(), + } + } +} + +impl Matcher { + fn prefixes(lits: &Literals) -> Self { + let sset = SingleByteSet::prefixes(lits); + Matcher::new(lits, sset) + } + + fn suffixes(lits: &Literals) -> Self { + let sset = SingleByteSet::suffixes(lits); + Matcher::new(lits, sset) + } + + fn new(lits: &Literals, sset: SingleByteSet) -> Self { + if lits.literals().is_empty() { + return Matcher::Empty; + } + if sset.dense.len() >= 26 { + // Avoid trying to match a large number of single bytes. + // This is *very* sensitive to a frequency analysis comparison + // between the bytes in sset and the composition of the haystack. + // No matter the size of sset, if its members all are rare in the + // haystack, then it'd be worth using it. How to tune this... IDK. + // ---AG + return Matcher::Empty; + } + if sset.complete { + return Matcher::Bytes(sset); + } + if lits.literals().len() == 1 { + return Matcher::Memmem(Memmem::new(&lits.literals()[0])); + } + + let pats = lits.literals().to_owned(); + let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii; + if lits.literals().len() <= 100 && !is_aho_corasick_fast { + let mut builder = packed::Config::new() + .match_kind(packed::MatchKind::LeftmostFirst) + .builder(); + if let Some(s) = builder.extend(&pats).build() { + return Matcher::Packed { s, lits: pats }; + } + } + let ac = AhoCorasickBuilder::new() + .match_kind(aho_corasick::MatchKind::LeftmostFirst) + .dfa(true) + .build_with_size::<u32, _, _>(&pats) + .unwrap(); + Matcher::AC { ac, lits: pats } + } +} + +#[derive(Debug)] +pub enum LiteralIter<'a> { + Empty, + Bytes(&'a [u8]), + Single(&'a [u8]), + AC(&'a [Literal]), + Packed(&'a [Literal]), +} + +impl<'a> Iterator for LiteralIter<'a> { + type Item = &'a [u8]; + + fn next(&mut self) -> Option<Self::Item> { + match *self { + LiteralIter::Empty => None, + LiteralIter::Bytes(ref mut many) => { + if many.is_empty() { + None + } else { + let next = &many[0..1]; + *many = &many[1..]; + Some(next) + } + } + LiteralIter::Single(ref mut one) => { + if one.is_empty() { + None + } else { + let next = &one[..]; + *one = &[]; + Some(next) + } + } + LiteralIter::AC(ref mut lits) => { + if lits.is_empty() { + None + } else { + let next = &lits[0]; + *lits = &lits[1..]; + Some(&**next) + } + } + LiteralIter::Packed(ref mut lits) => { + if lits.is_empty() { + None + } else { + let next = &lits[0]; + *lits = &lits[1..]; + Some(&**next) + } + } + } + } +} + +#[derive(Clone, Debug)] +struct SingleByteSet { + sparse: Vec<bool>, + dense: Vec<u8>, + complete: bool, + all_ascii: bool, +} + +impl SingleByteSet { + fn new() -> SingleByteSet { + SingleByteSet { + sparse: vec![false; 256], + dense: vec![], + complete: true, + all_ascii: true, + } + } + + fn prefixes(lits: &Literals) -> SingleByteSet { + let mut sset = SingleByteSet::new(); + for lit in lits.literals() { + sset.complete = sset.complete && lit.len() == 1; + if let Some(&b) = lit.get(0) { + if !sset.sparse[b as usize] { + if b > 0x7F { + sset.all_ascii = false; + } + sset.dense.push(b); + sset.sparse[b as usize] = true; + } + } + } + sset + } + + fn suffixes(lits: &Literals) -> SingleByteSet { + let mut sset = SingleByteSet::new(); + for lit in lits.literals() { + sset.complete = sset.complete && lit.len() == 1; + if let Some(&b) = lit.get(lit.len().checked_sub(1).unwrap()) { + if !sset.sparse[b as usize] { + if b > 0x7F { + sset.all_ascii = false; + } + sset.dense.push(b); + sset.sparse[b as usize] = true; + } + } + } + sset + } + + /// Faster find that special cases certain sizes to use memchr. + #[cfg_attr(feature = "perf-inline", inline(always))] + fn find(&self, text: &[u8]) -> Option<usize> { + match self.dense.len() { + 0 => None, + 1 => memchr(self.dense[0], text), + 2 => memchr2(self.dense[0], self.dense[1], text), + 3 => memchr3(self.dense[0], self.dense[1], self.dense[2], text), + _ => self._find(text), + } + } + + /// Generic find that works on any sized set. + fn _find(&self, haystack: &[u8]) -> Option<usize> { + for (i, &b) in haystack.iter().enumerate() { + if self.sparse[b as usize] { + return Some(i); + } + } + None + } + + fn approximate_size(&self) -> usize { + (self.dense.len() * mem::size_of::<u8>()) + + (self.sparse.len() * mem::size_of::<bool>()) + } +} + +/// A simple wrapper around the memchr crate's memmem implementation. +/// +/// The API this exposes mirrors the API of previous substring searchers that +/// this supplanted. +#[derive(Clone, Debug)] +pub struct Memmem { + finder: memmem::Finder<'static>, + char_len: usize, +} + +impl Memmem { + fn new(pat: &[u8]) -> Memmem { + Memmem { + finder: memmem::Finder::new(pat).into_owned(), + char_len: char_len_lossy(pat), + } + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn find(&self, haystack: &[u8]) -> Option<usize> { + self.finder.find(haystack) + } + + #[cfg_attr(feature = "perf-inline", inline(always))] + pub fn is_suffix(&self, text: &[u8]) -> bool { + if text.len() < self.len() { + return false; + } + &text[text.len() - self.len()..] == self.finder.needle() + } + + pub fn len(&self) -> usize { + self.finder.needle().len() + } + + pub fn char_len(&self) -> usize { + self.char_len + } + + fn approximate_size(&self) -> usize { + self.finder.needle().len() * mem::size_of::<u8>() + } +} + +fn char_len_lossy(bytes: &[u8]) -> usize { + String::from_utf8_lossy(bytes).chars().count() +} diff --git a/third_party/rust/regex/src/literal/mod.rs b/third_party/rust/regex/src/literal/mod.rs new file mode 100644 index 0000000000..980f523309 --- /dev/null +++ b/third_party/rust/regex/src/literal/mod.rs @@ -0,0 +1,55 @@ +pub use self::imp::*; + +#[cfg(feature = "perf-literal")] +mod imp; + +#[allow(missing_docs)] +#[cfg(not(feature = "perf-literal"))] +mod imp { + use regex_syntax::hir::literal::Literals; + + #[derive(Clone, Debug)] + pub struct LiteralSearcher(()); + + impl LiteralSearcher { + pub fn empty() -> Self { + LiteralSearcher(()) + } + + pub fn prefixes(_: Literals) -> Self { + LiteralSearcher(()) + } + + pub fn suffixes(_: Literals) -> Self { + LiteralSearcher(()) + } + + pub fn complete(&self) -> bool { + false + } + + pub fn find(&self, _: &[u8]) -> Option<(usize, usize)> { + unreachable!() + } + + pub fn find_start(&self, _: &[u8]) -> Option<(usize, usize)> { + unreachable!() + } + + pub fn find_end(&self, _: &[u8]) -> Option<(usize, usize)> { + unreachable!() + } + + pub fn is_empty(&self) -> bool { + true + } + + pub fn len(&self) -> usize { + 0 + } + + pub fn approximate_size(&self) -> usize { + 0 + } + } +} |