diff options
Diffstat (limited to 'vendor/aho-corasick/src/packed/pattern.rs')
-rw-r--r-- | vendor/aho-corasick/src/packed/pattern.rs | 318 |
1 files changed, 318 insertions, 0 deletions
diff --git a/vendor/aho-corasick/src/packed/pattern.rs b/vendor/aho-corasick/src/packed/pattern.rs new file mode 100644 index 000000000..f4c6756fc --- /dev/null +++ b/vendor/aho-corasick/src/packed/pattern.rs @@ -0,0 +1,318 @@ +use std::cmp; +use std::fmt; +use std::mem; +use std::u16; +use std::usize; + +use crate::packed::api::MatchKind; + +/// The type used for representing a pattern identifier. +/// +/// We don't use `usize` here because our packed searchers don't scale to +/// huge numbers of patterns, so we keep things a bit smaller. +pub type PatternID = u16; + +/// A non-empty collection of non-empty patterns to search for. +/// +/// This collection of patterns is what is passed around to both execute +/// searches and to construct the searchers themselves. Namely, this permits +/// searches to avoid copying all of the patterns, and allows us to keep only +/// one copy throughout all packed searchers. +/// +/// Note that this collection is not a set. The same pattern can appear more +/// than once. +#[derive(Clone, Debug)] +pub struct Patterns { + /// The match semantics supported by this collection of patterns. + /// + /// The match semantics determines the order of the iterator over patterns. + /// For leftmost-first, patterns are provided in the same order as were + /// provided by the caller. For leftmost-longest, patterns are provided in + /// descending order of length, with ties broken by the order in which they + /// were provided by the caller. + kind: MatchKind, + /// The collection of patterns, indexed by their identifier. + by_id: Vec<Vec<u8>>, + /// The order of patterns defined for iteration, given by pattern + /// identifiers. The order of `by_id` and `order` is always the same for + /// leftmost-first semantics, but may be different for leftmost-longest + /// semantics. + order: Vec<PatternID>, + /// The length of the smallest pattern, in bytes. + minimum_len: usize, + /// The largest pattern identifier. This should always be equivalent to + /// the number of patterns minus one in this collection. + max_pattern_id: PatternID, + /// The total number of pattern bytes across the entire collection. This + /// is used for reporting total heap usage in constant time. + total_pattern_bytes: usize, +} + +impl Patterns { + /// Create a new collection of patterns for the given match semantics. The + /// ID of each pattern is the index of the pattern at which it occurs in + /// the `by_id` slice. + /// + /// If any of the patterns in the slice given are empty, then this panics. + /// Similarly, if the number of patterns given is zero, then this also + /// panics. + pub fn new() -> Patterns { + Patterns { + kind: MatchKind::default(), + by_id: vec![], + order: vec![], + minimum_len: usize::MAX, + max_pattern_id: 0, + total_pattern_bytes: 0, + } + } + + /// Add a pattern to this collection. + /// + /// This panics if the pattern given is empty. + pub fn add(&mut self, bytes: &[u8]) { + assert!(!bytes.is_empty()); + assert!(self.by_id.len() <= u16::MAX as usize); + + let id = self.by_id.len() as u16; + self.max_pattern_id = id; + self.order.push(id); + self.by_id.push(bytes.to_vec()); + self.minimum_len = cmp::min(self.minimum_len, bytes.len()); + self.total_pattern_bytes += bytes.len(); + } + + /// Set the match kind semantics for this collection of patterns. + /// + /// If the kind is not set, then the default is leftmost-first. + pub fn set_match_kind(&mut self, kind: MatchKind) { + match kind { + MatchKind::LeftmostFirst => { + self.order.sort(); + } + MatchKind::LeftmostLongest => { + let (order, by_id) = (&mut self.order, &mut self.by_id); + order.sort_by(|&id1, &id2| { + by_id[id1 as usize] + .len() + .cmp(&by_id[id2 as usize].len()) + .reverse() + }); + } + MatchKind::__Nonexhaustive => unreachable!(), + } + } + + /// Return the number of patterns in this collection. + /// + /// This is guaranteed to be greater than zero. + pub fn len(&self) -> usize { + self.by_id.len() + } + + /// Returns true if and only if this collection of patterns is empty. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Returns the approximate total amount of heap used by these patterns, in + /// units of bytes. + pub fn heap_bytes(&self) -> usize { + self.order.len() * mem::size_of::<PatternID>() + + self.by_id.len() * mem::size_of::<Vec<u8>>() + + self.total_pattern_bytes + } + + /// Clears all heap memory associated with this collection of patterns and + /// resets all state such that it is a valid empty collection. + pub fn reset(&mut self) { + self.kind = MatchKind::default(); + self.by_id.clear(); + self.order.clear(); + self.minimum_len = usize::MAX; + self.max_pattern_id = 0; + } + + /// Return the maximum pattern identifier in this collection. This can be + /// useful in searchers for ensuring that the collection of patterns they + /// are provided at search time and at build time have the same size. + pub fn max_pattern_id(&self) -> PatternID { + assert_eq!((self.max_pattern_id + 1) as usize, self.len()); + self.max_pattern_id + } + + /// Returns the length, in bytes, of the smallest pattern. + /// + /// This is guaranteed to be at least one. + pub fn minimum_len(&self) -> usize { + self.minimum_len + } + + /// Returns the match semantics used by these patterns. + pub fn match_kind(&self) -> &MatchKind { + &self.kind + } + + /// Return the pattern with the given identifier. If such a pattern does + /// not exist, then this panics. + pub fn get(&self, id: PatternID) -> Pattern<'_> { + Pattern(&self.by_id[id as usize]) + } + + /// Return the pattern with the given identifier without performing bounds + /// checks. + /// + /// # Safety + /// + /// Callers must ensure that a pattern with the given identifier exists + /// before using this method. + #[cfg(target_arch = "x86_64")] + pub unsafe fn get_unchecked(&self, id: PatternID) -> Pattern<'_> { + Pattern(self.by_id.get_unchecked(id as usize)) + } + + /// Return an iterator over all the patterns in this collection, in the + /// order in which they should be matched. + /// + /// Specifically, in a naive multi-pattern matcher, the following is + /// guaranteed to satisfy the match semantics of this collection of + /// patterns: + /// + /// ```ignore + /// for i in 0..haystack.len(): + /// for p in patterns.iter(): + /// if haystack[i..].starts_with(p.bytes()): + /// return Match(p.id(), i, i + p.bytes().len()) + /// ``` + /// + /// Namely, among the patterns in a collection, if they are matched in + /// the order provided by this iterator, then the result is guaranteed + /// to satisfy the correct match semantics. (Either leftmost-first or + /// leftmost-longest.) + pub fn iter(&self) -> PatternIter<'_> { + PatternIter { patterns: self, i: 0 } + } +} + +/// An iterator over the patterns in the `Patterns` collection. +/// +/// The order of the patterns provided by this iterator is consistent with the +/// match semantics of the originating collection of patterns. +/// +/// The lifetime `'p` corresponds to the lifetime of the collection of patterns +/// this is iterating over. +#[derive(Debug)] +pub struct PatternIter<'p> { + patterns: &'p Patterns, + i: usize, +} + +impl<'p> Iterator for PatternIter<'p> { + type Item = (PatternID, Pattern<'p>); + + fn next(&mut self) -> Option<(PatternID, Pattern<'p>)> { + if self.i >= self.patterns.len() { + return None; + } + let id = self.patterns.order[self.i]; + let p = self.patterns.get(id); + self.i += 1; + Some((id, p)) + } +} + +/// A pattern that is used in packed searching. +#[derive(Clone)] +pub struct Pattern<'a>(&'a [u8]); + +impl<'a> fmt::Debug for Pattern<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Pattern") + .field("lit", &String::from_utf8_lossy(&self.0)) + .finish() + } +} + +impl<'p> Pattern<'p> { + /// Returns the length of this pattern, in bytes. + pub fn len(&self) -> usize { + self.0.len() + } + + /// Returns the bytes of this pattern. + pub fn bytes(&self) -> &[u8] { + &self.0 + } + + /// Returns the first `len` low nybbles from this pattern. If this pattern + /// is shorter than `len`, then this panics. + #[cfg(target_arch = "x86_64")] + pub fn low_nybbles(&self, len: usize) -> Vec<u8> { + let mut nybs = vec![]; + for &b in self.bytes().iter().take(len) { + nybs.push(b & 0xF); + } + nybs + } + + /// Returns true if this pattern is a prefix of the given bytes. + #[inline(always)] + pub fn is_prefix(&self, bytes: &[u8]) -> bool { + self.len() <= bytes.len() && self.equals(&bytes[..self.len()]) + } + + /// Returns true if and only if this pattern equals the given bytes. + #[inline(always)] + pub fn equals(&self, bytes: &[u8]) -> bool { + // Why not just use memcmp for this? Well, memcmp requires calling out + // to libc, and this routine is called in fairly hot code paths. Other + // than just calling out to libc, it also seems to result in worse + // codegen. By rolling our own memcpy in pure Rust, it seems to appear + // more friendly to the optimizer. + // + // This results in an improvement in just about every benchmark. Some + // smaller than others, but in some cases, up to 30% faster. + + if self.len() != bytes.len() { + return false; + } + if self.len() < 8 { + for (&b1, &b2) in self.bytes().iter().zip(bytes) { + if b1 != b2 { + return false; + } + } + return true; + } + // When we have 8 or more bytes to compare, then proceed in chunks of + // 8 at a time using unaligned loads. + let mut p1 = self.bytes().as_ptr(); + let mut p2 = bytes.as_ptr(); + let p1end = self.bytes()[self.len() - 8..].as_ptr(); + let p2end = bytes[bytes.len() - 8..].as_ptr(); + // SAFETY: Via the conditional above, we know that both `p1` and `p2` + // have the same length, so `p1 < p1end` implies that `p2 < p2end`. + // Thus, derefencing both `p1` and `p2` in the loop below is safe. + // + // Moreover, we set `p1end` and `p2end` to be 8 bytes before the actual + // end of of `p1` and `p2`. Thus, the final dereference outside of the + // loop is guaranteed to be valid. + // + // Finally, we needn't worry about 64-bit alignment here, since we + // do unaligned loads. + unsafe { + while p1 < p1end { + let v1 = (p1 as *const u64).read_unaligned(); + let v2 = (p2 as *const u64).read_unaligned(); + if v1 != v2 { + return false; + } + p1 = p1.add(8); + p2 = p2.add(8); + } + let v1 = (p1end as *const u64).read_unaligned(); + let v2 = (p2end as *const u64).read_unaligned(); + v1 == v2 + } + } +} |