summaryrefslogtreecommitdiffstats
path: root/third_party/rust/regex/src/literal/imp.rs
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/rust/regex/src/literal/imp.rs')
-rw-r--r--third_party/rust/regex/src/literal/imp.rs402
1 files changed, 402 insertions, 0 deletions
diff --git a/third_party/rust/regex/src/literal/imp.rs b/third_party/rust/regex/src/literal/imp.rs
new file mode 100644
index 0000000000..90b2f11606
--- /dev/null
+++ b/third_party/rust/regex/src/literal/imp.rs
@@ -0,0 +1,402 @@
+use std::mem;
+
+use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder};
+use memchr::{memchr, memchr2, memchr3, memmem};
+use regex_syntax::hir::literal::{Literal, Literals};
+
+/// A prefix extracted from a compiled regular expression.
+///
+/// A regex prefix is a set of literal strings that *must* be matched at the
+/// beginning of a regex in order for the entire regex to match. Similarly
+/// for a regex suffix.
+#[derive(Clone, Debug)]
+pub struct LiteralSearcher {
+ complete: bool,
+ lcp: Memmem,
+ lcs: Memmem,
+ matcher: Matcher,
+}
+
+#[derive(Clone, Debug)]
+enum Matcher {
+ /// No literals. (Never advances through the input.)
+ Empty,
+ /// A set of four or more single byte literals.
+ Bytes(SingleByteSet),
+ /// A single substring, using vector accelerated routines when available.
+ Memmem(Memmem),
+ /// An Aho-Corasick automaton.
+ AC { ac: AhoCorasick<u32>, lits: Vec<Literal> },
+ /// A packed multiple substring searcher, using SIMD.
+ ///
+ /// Note that Aho-Corasick will actually use this packed searcher
+ /// internally automatically, however, there is some overhead associated
+ /// with going through the Aho-Corasick machinery. So using the packed
+ /// searcher directly results in some gains.
+ Packed { s: packed::Searcher, lits: Vec<Literal> },
+}
+
+impl LiteralSearcher {
+ /// Returns a matcher that never matches and never advances the input.
+ pub fn empty() -> Self {
+ Self::new(Literals::empty(), Matcher::Empty)
+ }
+
+ /// Returns a matcher for literal prefixes from the given set.
+ pub fn prefixes(lits: Literals) -> Self {
+ let matcher = Matcher::prefixes(&lits);
+ Self::new(lits, matcher)
+ }
+
+ /// Returns a matcher for literal suffixes from the given set.
+ pub fn suffixes(lits: Literals) -> Self {
+ let matcher = Matcher::suffixes(&lits);
+ Self::new(lits, matcher)
+ }
+
+ fn new(lits: Literals, matcher: Matcher) -> Self {
+ let complete = lits.all_complete();
+ LiteralSearcher {
+ complete,
+ lcp: Memmem::new(lits.longest_common_prefix()),
+ lcs: Memmem::new(lits.longest_common_suffix()),
+ matcher,
+ }
+ }
+
+ /// Returns true if all matches comprise the entire regular expression.
+ ///
+ /// This does not necessarily mean that a literal match implies a match
+ /// of the regular expression. For example, the regular expression `^a`
+ /// is comprised of a single complete literal `a`, but the regular
+ /// expression demands that it only match at the beginning of a string.
+ pub fn complete(&self) -> bool {
+ self.complete && !self.is_empty()
+ }
+
+ /// Find the position of a literal in `haystack` if it exists.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub fn find(&self, haystack: &[u8]) -> Option<(usize, usize)> {
+ use self::Matcher::*;
+ match self.matcher {
+ Empty => Some((0, 0)),
+ Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)),
+ Memmem(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
+ AC { ref ac, .. } => {
+ ac.find(haystack).map(|m| (m.start(), m.end()))
+ }
+ Packed { ref s, .. } => {
+ s.find(haystack).map(|m| (m.start(), m.end()))
+ }
+ }
+ }
+
+ /// Like find, except matches must start at index `0`.
+ pub fn find_start(&self, haystack: &[u8]) -> Option<(usize, usize)> {
+ for lit in self.iter() {
+ if lit.len() > haystack.len() {
+ continue;
+ }
+ if lit == &haystack[0..lit.len()] {
+ return Some((0, lit.len()));
+ }
+ }
+ None
+ }
+
+ /// Like find, except matches must end at index `haystack.len()`.
+ pub fn find_end(&self, haystack: &[u8]) -> Option<(usize, usize)> {
+ for lit in self.iter() {
+ if lit.len() > haystack.len() {
+ continue;
+ }
+ if lit == &haystack[haystack.len() - lit.len()..] {
+ return Some((haystack.len() - lit.len(), haystack.len()));
+ }
+ }
+ None
+ }
+
+ /// Returns an iterator over all literals to be matched.
+ pub fn iter(&self) -> LiteralIter<'_> {
+ match self.matcher {
+ Matcher::Empty => LiteralIter::Empty,
+ Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense),
+ Matcher::Memmem(ref s) => LiteralIter::Single(&s.finder.needle()),
+ Matcher::AC { ref lits, .. } => LiteralIter::AC(lits),
+ Matcher::Packed { ref lits, .. } => LiteralIter::Packed(lits),
+ }
+ }
+
+ /// Returns a matcher for the longest common prefix of this matcher.
+ pub fn lcp(&self) -> &Memmem {
+ &self.lcp
+ }
+
+ /// Returns a matcher for the longest common suffix of this matcher.
+ pub fn lcs(&self) -> &Memmem {
+ &self.lcs
+ }
+
+ /// Returns true iff this prefix is empty.
+ pub fn is_empty(&self) -> bool {
+ self.len() == 0
+ }
+
+ /// Returns the number of prefixes in this machine.
+ pub fn len(&self) -> usize {
+ use self::Matcher::*;
+ match self.matcher {
+ Empty => 0,
+ Bytes(ref sset) => sset.dense.len(),
+ Memmem(_) => 1,
+ AC { ref ac, .. } => ac.pattern_count(),
+ Packed { ref lits, .. } => lits.len(),
+ }
+ }
+
+ /// Return the approximate heap usage of literals in bytes.
+ pub fn approximate_size(&self) -> usize {
+ use self::Matcher::*;
+ match self.matcher {
+ Empty => 0,
+ Bytes(ref sset) => sset.approximate_size(),
+ Memmem(ref single) => single.approximate_size(),
+ AC { ref ac, .. } => ac.heap_bytes(),
+ Packed { ref s, .. } => s.heap_bytes(),
+ }
+ }
+}
+
+impl Matcher {
+ fn prefixes(lits: &Literals) -> Self {
+ let sset = SingleByteSet::prefixes(lits);
+ Matcher::new(lits, sset)
+ }
+
+ fn suffixes(lits: &Literals) -> Self {
+ let sset = SingleByteSet::suffixes(lits);
+ Matcher::new(lits, sset)
+ }
+
+ fn new(lits: &Literals, sset: SingleByteSet) -> Self {
+ if lits.literals().is_empty() {
+ return Matcher::Empty;
+ }
+ if sset.dense.len() >= 26 {
+ // Avoid trying to match a large number of single bytes.
+ // This is *very* sensitive to a frequency analysis comparison
+ // between the bytes in sset and the composition of the haystack.
+ // No matter the size of sset, if its members all are rare in the
+ // haystack, then it'd be worth using it. How to tune this... IDK.
+ // ---AG
+ return Matcher::Empty;
+ }
+ if sset.complete {
+ return Matcher::Bytes(sset);
+ }
+ if lits.literals().len() == 1 {
+ return Matcher::Memmem(Memmem::new(&lits.literals()[0]));
+ }
+
+ let pats = lits.literals().to_owned();
+ let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii;
+ if lits.literals().len() <= 100 && !is_aho_corasick_fast {
+ let mut builder = packed::Config::new()
+ .match_kind(packed::MatchKind::LeftmostFirst)
+ .builder();
+ if let Some(s) = builder.extend(&pats).build() {
+ return Matcher::Packed { s, lits: pats };
+ }
+ }
+ let ac = AhoCorasickBuilder::new()
+ .match_kind(aho_corasick::MatchKind::LeftmostFirst)
+ .dfa(true)
+ .build_with_size::<u32, _, _>(&pats)
+ .unwrap();
+ Matcher::AC { ac, lits: pats }
+ }
+}
+
+#[derive(Debug)]
+pub enum LiteralIter<'a> {
+ Empty,
+ Bytes(&'a [u8]),
+ Single(&'a [u8]),
+ AC(&'a [Literal]),
+ Packed(&'a [Literal]),
+}
+
+impl<'a> Iterator for LiteralIter<'a> {
+ type Item = &'a [u8];
+
+ fn next(&mut self) -> Option<Self::Item> {
+ match *self {
+ LiteralIter::Empty => None,
+ LiteralIter::Bytes(ref mut many) => {
+ if many.is_empty() {
+ None
+ } else {
+ let next = &many[0..1];
+ *many = &many[1..];
+ Some(next)
+ }
+ }
+ LiteralIter::Single(ref mut one) => {
+ if one.is_empty() {
+ None
+ } else {
+ let next = &one[..];
+ *one = &[];
+ Some(next)
+ }
+ }
+ LiteralIter::AC(ref mut lits) => {
+ if lits.is_empty() {
+ None
+ } else {
+ let next = &lits[0];
+ *lits = &lits[1..];
+ Some(&**next)
+ }
+ }
+ LiteralIter::Packed(ref mut lits) => {
+ if lits.is_empty() {
+ None
+ } else {
+ let next = &lits[0];
+ *lits = &lits[1..];
+ Some(&**next)
+ }
+ }
+ }
+ }
+}
+
+#[derive(Clone, Debug)]
+struct SingleByteSet {
+ sparse: Vec<bool>,
+ dense: Vec<u8>,
+ complete: bool,
+ all_ascii: bool,
+}
+
+impl SingleByteSet {
+ fn new() -> SingleByteSet {
+ SingleByteSet {
+ sparse: vec![false; 256],
+ dense: vec![],
+ complete: true,
+ all_ascii: true,
+ }
+ }
+
+ fn prefixes(lits: &Literals) -> SingleByteSet {
+ let mut sset = SingleByteSet::new();
+ for lit in lits.literals() {
+ sset.complete = sset.complete && lit.len() == 1;
+ if let Some(&b) = lit.get(0) {
+ if !sset.sparse[b as usize] {
+ if b > 0x7F {
+ sset.all_ascii = false;
+ }
+ sset.dense.push(b);
+ sset.sparse[b as usize] = true;
+ }
+ }
+ }
+ sset
+ }
+
+ fn suffixes(lits: &Literals) -> SingleByteSet {
+ let mut sset = SingleByteSet::new();
+ for lit in lits.literals() {
+ sset.complete = sset.complete && lit.len() == 1;
+ if let Some(&b) = lit.get(lit.len().checked_sub(1).unwrap()) {
+ if !sset.sparse[b as usize] {
+ if b > 0x7F {
+ sset.all_ascii = false;
+ }
+ sset.dense.push(b);
+ sset.sparse[b as usize] = true;
+ }
+ }
+ }
+ sset
+ }
+
+ /// Faster find that special cases certain sizes to use memchr.
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ fn find(&self, text: &[u8]) -> Option<usize> {
+ match self.dense.len() {
+ 0 => None,
+ 1 => memchr(self.dense[0], text),
+ 2 => memchr2(self.dense[0], self.dense[1], text),
+ 3 => memchr3(self.dense[0], self.dense[1], self.dense[2], text),
+ _ => self._find(text),
+ }
+ }
+
+ /// Generic find that works on any sized set.
+ fn _find(&self, haystack: &[u8]) -> Option<usize> {
+ for (i, &b) in haystack.iter().enumerate() {
+ if self.sparse[b as usize] {
+ return Some(i);
+ }
+ }
+ None
+ }
+
+ fn approximate_size(&self) -> usize {
+ (self.dense.len() * mem::size_of::<u8>())
+ + (self.sparse.len() * mem::size_of::<bool>())
+ }
+}
+
+/// A simple wrapper around the memchr crate's memmem implementation.
+///
+/// The API this exposes mirrors the API of previous substring searchers that
+/// this supplanted.
+#[derive(Clone, Debug)]
+pub struct Memmem {
+ finder: memmem::Finder<'static>,
+ char_len: usize,
+}
+
+impl Memmem {
+ fn new(pat: &[u8]) -> Memmem {
+ Memmem {
+ finder: memmem::Finder::new(pat).into_owned(),
+ char_len: char_len_lossy(pat),
+ }
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub fn find(&self, haystack: &[u8]) -> Option<usize> {
+ self.finder.find(haystack)
+ }
+
+ #[cfg_attr(feature = "perf-inline", inline(always))]
+ pub fn is_suffix(&self, text: &[u8]) -> bool {
+ if text.len() < self.len() {
+ return false;
+ }
+ &text[text.len() - self.len()..] == self.finder.needle()
+ }
+
+ pub fn len(&self) -> usize {
+ self.finder.needle().len()
+ }
+
+ pub fn char_len(&self) -> usize {
+ self.char_len
+ }
+
+ fn approximate_size(&self) -> usize {
+ self.finder.needle().len() * mem::size_of::<u8>()
+ }
+}
+
+fn char_len_lossy(bytes: &[u8]) -> usize {
+ String::from_utf8_lossy(bytes).chars().count()
+}