use core::{ cmp, fmt::Debug, panic::{RefUnwindSafe, UnwindSafe}, u8, }; use alloc::{sync::Arc, vec, vec::Vec}; use crate::{ packed, util::{ alphabet::ByteSet, search::{Match, MatchKind, Span}, }, }; /// A prefilter for accelerating a search. /// /// This crate uses prefilters in the core search implementations to accelerate /// common cases. They typically only apply to cases where there are a small /// number of patterns (less than 100 or so), but when they do, thoughput can /// be boosted considerably, perhaps by an order of magnitude. When a prefilter /// is active, it is used whenever a search enters an automaton's start state. /// /// Currently, prefilters cannot be constructed by /// callers. A `Prefilter` can only be accessed via the /// [`Automaton::prefilter`](crate::automaton::Automaton::prefilter) /// method and used to execute a search. In other words, a prefilter can be /// used to optimize your own search implementation if necessary, but cannot do /// much else. If you have a use case for more APIs, please submit an issue. #[derive(Clone, Debug)] pub struct Prefilter { finder: Arc, memory_usage: usize, } impl Prefilter { /// Execute a search in the haystack within the span given. If a match or /// a possible match is returned, then it is guaranteed to occur within /// the bounds of the span. /// /// If the span provided is invalid for the given haystack, then behavior /// is unspecified. #[inline] pub fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { self.finder.find_in(haystack, span) } #[inline] pub(crate) fn memory_usage(&self) -> usize { self.memory_usage } } /// A candidate is the result of running a prefilter on a haystack at a /// particular position. /// /// The result is either no match, a confirmed match or a possible match. /// /// When no match is returned, the prefilter is guaranteeing that no possible /// match can be found in the haystack, and the caller may trust this. That is, /// all correct prefilters must never report false negatives. /// /// In some cases, a prefilter can confirm a match very quickly, in which case, /// the caller may use this to stop what it's doing and report the match. In /// this case, prefilter implementations must never report a false positive. /// In other cases, the prefilter can only report a potential match, in which /// case the callers must attempt to confirm the match. In this case, prefilter /// implementations are permitted to return false positives. #[derive(Clone, Debug)] pub enum Candidate { /// No match was found. Since false negatives are not possible, this means /// the search can quit as it is guaranteed not to find another match. None, /// A confirmed match was found. Callers do not need to confirm it. Match(Match), /// The start of a possible match was found. Callers must confirm it before /// reporting it as a match. PossibleStartOfMatch(usize), } impl Candidate { /// Convert this candidate into an option. This is useful when callers /// do not distinguish between true positives and false positives (i.e., /// the caller must always confirm the match). pub fn into_option(self) -> Option { match self { Candidate::None => None, Candidate::Match(ref m) => Some(m.start()), Candidate::PossibleStartOfMatch(start) => Some(start), } } } /// A prefilter describes the behavior of fast literal scanners for quickly /// skipping past bytes in the haystack that we know cannot possibly /// participate in a match. trait PrefilterI: Send + Sync + RefUnwindSafe + UnwindSafe + Debug + 'static { /// Returns the next possible match candidate. This may yield false /// positives, so callers must confirm a match starting at the position /// returned. This, however, must never produce false negatives. That is, /// this must, at minimum, return the starting position of the next match /// in the given haystack after or at the given position. fn find_in(&self, haystack: &[u8], span: Span) -> Candidate; } impl PrefilterI for Arc

{ #[inline(always)] fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { (**self).find_in(haystack, span) } } /// A builder for constructing the best possible prefilter. When constructed, /// this builder will heuristically select the best prefilter it can build, /// if any, and discard the rest. #[derive(Debug)] pub(crate) struct Builder { count: usize, ascii_case_insensitive: bool, start_bytes: StartBytesBuilder, rare_bytes: RareBytesBuilder, memmem: MemmemBuilder, packed: Option, // If we run across a condition that suggests we shouldn't use a prefilter // at all (like an empty pattern), then disable prefilters entirely. enabled: bool, } impl Builder { /// Create a new builder for constructing the best possible prefilter. pub(crate) fn new(kind: MatchKind) -> Builder { let pbuilder = kind .as_packed() .map(|kind| packed::Config::new().match_kind(kind).builder()); Builder { count: 0, ascii_case_insensitive: false, start_bytes: StartBytesBuilder::new(), rare_bytes: RareBytesBuilder::new(), memmem: MemmemBuilder::default(), packed: pbuilder, enabled: true, } } /// Enable ASCII case insensitivity. When set, byte strings added to this /// builder will be interpreted without respect to ASCII case. pub(crate) fn ascii_case_insensitive(mut self, yes: bool) -> Builder { self.ascii_case_insensitive = yes; self.start_bytes = self.start_bytes.ascii_case_insensitive(yes); self.rare_bytes = self.rare_bytes.ascii_case_insensitive(yes); self } /// Return a prefilter suitable for quickly finding potential matches. /// /// All patterns added to an Aho-Corasick automaton should be added to this /// builder before attempting to construct the prefilter. pub(crate) fn build(&self) -> Option { if !self.enabled { debug!("prefilter not enabled, skipping"); return None; } // If we only have one pattern, then deferring to memmem is always // the best choice. This is kind of a weird case, because, well, why // use Aho-Corasick if you only have one pattern? But maybe you don't // know exactly how many patterns you'll get up front, and you need to // support the option of multiple patterns. So instead of relying on // the caller to branch and use memmem explicitly, we just do it for // them. if !self.ascii_case_insensitive { if let Some(pre) = self.memmem.build() { debug!("using memmem prefilter"); return Some(pre); } } let (packed, patlen, minlen) = if self.ascii_case_insensitive { (None, usize::MAX, 0) } else { let patlen = self.packed.as_ref().map_or(usize::MAX, |p| p.len()); let minlen = self.packed.as_ref().map_or(0, |p| p.minimum_len()); let packed = self.packed.as_ref().and_then(|b| b.build()).map(|s| { let memory_usage = s.memory_usage(); debug!( "built packed prefilter (len: {}, \ minimum pattern len: {}, memory usage: {}) \ for consideration", patlen, minlen, memory_usage, ); Prefilter { finder: Arc::new(Packed(s)), memory_usage } }); (packed, patlen, minlen) }; match (self.start_bytes.build(), self.rare_bytes.build()) { // If we could build both start and rare prefilters, then there are // a few cases in which we'd want to use the start-byte prefilter // over the rare-byte prefilter, since the former has lower // overhead. (prestart @ Some(_), prerare @ Some(_)) => { debug!( "both start (len={}, rank={}) and \ rare (len={}, rank={}) byte prefilters \ are available", self.start_bytes.count, self.start_bytes.rank_sum, self.rare_bytes.count, self.rare_bytes.rank_sum, ); if patlen <= 16 && minlen >= 2 && self.start_bytes.count >= 3 && self.rare_bytes.count >= 3 { debug!( "start and rare byte prefilters available, but \ they're probably slower than packed so using \ packed" ); return packed; } // If the start-byte prefilter can scan for a smaller number // of bytes than the rare-byte prefilter, then it's probably // faster. let has_fewer_bytes = self.start_bytes.count < self.rare_bytes.count; // Otherwise, if the combined frequency rank of the detected // bytes in the start-byte prefilter is "close" to the combined // frequency rank of the rare-byte prefilter, then we pick // the start-byte prefilter even if the rare-byte prefilter // heuristically searches for rare bytes. This is because the // rare-byte prefilter has higher constant costs, so we tend to // prefer the start-byte prefilter when we can. let has_rarer_bytes = self.start_bytes.rank_sum <= self.rare_bytes.rank_sum + 50; if has_fewer_bytes { debug!( "using start byte prefilter because it has fewer bytes to search for than the rare byte prefilter", ); prestart } else if has_rarer_bytes { debug!( "using start byte prefilter because its byte \ frequency rank was determined to be \ \"good enough\" relative to the rare byte prefilter \ byte frequency rank", ); prestart } else { debug!("using rare byte prefilter"); prerare } } (prestart @ Some(_), None) => { if patlen <= 16 && minlen >= 2 && self.start_bytes.count >= 3 { debug!( "start byte prefilter available, but \ it's probably slower than packed so using \ packed" ); return packed; } debug!( "have start byte prefilter but not rare byte prefilter, \ so using start byte prefilter", ); prestart } (None, prerare @ Some(_)) => { if patlen <= 16 && minlen >= 2 && self.rare_bytes.count >= 3 { debug!( "rare byte prefilter available, but \ it's probably slower than packed so using \ packed" ); return packed; } debug!( "have rare byte prefilter but not start byte prefilter, \ so using rare byte prefilter", ); prerare } (None, None) if self.ascii_case_insensitive => { debug!( "no start or rare byte prefilter and ASCII case \ insensitivity was enabled, so skipping prefilter", ); None } (None, None) => { if packed.is_some() { debug!("falling back to packed prefilter"); } else { debug!("no prefilter available"); } packed } } } /// Add a literal string to this prefilter builder. pub(crate) fn add(&mut self, bytes: &[u8]) { if bytes.is_empty() { self.enabled = false; } if !self.enabled { return; } self.count += 1; self.start_bytes.add(bytes); self.rare_bytes.add(bytes); self.memmem.add(bytes); if let Some(ref mut pbuilder) = self.packed { pbuilder.add(bytes); } } } /// A type that wraps a packed searcher and implements the `Prefilter` /// interface. #[derive(Clone, Debug)] struct Packed(packed::Searcher); impl PrefilterI for Packed { fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { self.0 .find_in(&haystack, span) .map_or(Candidate::None, Candidate::Match) } } /// A builder for constructing a prefilter that uses memmem. #[derive(Debug, Default)] struct MemmemBuilder { /// The number of patterns that have been added. count: usize, /// The singular pattern to search for. This is only set when count==1. one: Option>, } impl MemmemBuilder { fn build(&self) -> Option { #[cfg(all(feature = "std", feature = "perf-literal"))] fn imp(builder: &MemmemBuilder) -> Option { let pattern = builder.one.as_ref()?; assert_eq!(1, builder.count); let finder = Arc::new(Memmem( memchr::memmem::Finder::new(pattern).into_owned(), )); let memory_usage = pattern.len(); Some(Prefilter { finder, memory_usage }) } #[cfg(not(all(feature = "std", feature = "perf-literal")))] fn imp(_: &MemmemBuilder) -> Option { None } imp(self) } fn add(&mut self, bytes: &[u8]) { self.count += 1; if self.count == 1 { self.one = Some(bytes.to_vec()); } else { self.one = None; } } } /// A type that wraps a SIMD accelerated single substring search from the /// `memchr` crate for use as a prefilter. /// /// Currently, this prefilter is only active for Aho-Corasick searchers with /// a single pattern. In theory, this could be extended to support searchers /// that have a common prefix of more than one byte (for one byte, we would use /// memchr), but it's not clear if it's worth it or not. /// /// Also, unfortunately, this currently also requires the 'std' feature to /// be enabled. That's because memchr doesn't have a no-std-but-with-alloc /// mode, and so APIs like Finder::into_owned aren't available when 'std' is /// disabled. But there should be an 'alloc' feature that brings in APIs like /// Finder::into_owned but doesn't use std-only features like runtime CPU /// feature detection. #[cfg(all(feature = "std", feature = "perf-literal"))] #[derive(Clone, Debug)] struct Memmem(memchr::memmem::Finder<'static>); #[cfg(all(feature = "std", feature = "perf-literal"))] impl PrefilterI for Memmem { fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { use crate::util::primitives::PatternID; self.0.find(&haystack[span]).map_or(Candidate::None, |i| { let start = span.start + i; let end = start + self.0.needle().len(); // N.B. We can declare a match and use a fixed pattern ID here // because a Memmem prefilter is only ever created for searchers // with exactly one pattern. Thus, every match is always a match // and it is always for the first and only pattern. Candidate::Match(Match::new(PatternID::ZERO, start..end)) }) } } /// A builder for constructing a rare byte prefilter. /// /// A rare byte prefilter attempts to pick out a small set of rare bytes that /// occurr in the patterns, and then quickly scan to matches of those rare /// bytes. #[derive(Clone, Debug)] struct RareBytesBuilder { /// Whether this prefilter should account for ASCII case insensitivity or /// not. ascii_case_insensitive: bool, /// A set of rare bytes, indexed by byte value. rare_set: ByteSet, /// A set of byte offsets associated with bytes in a pattern. An entry /// corresponds to a particular bytes (its index) and is only non-zero if /// the byte occurred at an offset greater than 0 in at least one pattern. /// /// If a byte's offset is not representable in 8 bits, then the rare bytes /// prefilter becomes inert. byte_offsets: RareByteOffsets, /// Whether this is available as a prefilter or not. This can be set to /// false during construction if a condition is seen that invalidates the /// use of the rare-byte prefilter. available: bool, /// The number of bytes set to an active value in `byte_offsets`. count: usize, /// The sum of frequency ranks for the rare bytes detected. This is /// intended to give a heuristic notion of how rare the bytes are. rank_sum: u16, } /// A set of byte offsets, keyed by byte. #[derive(Clone, Copy)] struct RareByteOffsets { /// Each entry corresponds to the maximum offset of the corresponding /// byte across all patterns seen. set: [RareByteOffset; 256], } impl RareByteOffsets { /// Create a new empty set of rare byte offsets. pub(crate) fn empty() -> RareByteOffsets { RareByteOffsets { set: [RareByteOffset::default(); 256] } } /// Add the given offset for the given byte to this set. If the offset is /// greater than the existing offset, then it overwrites the previous /// value and returns false. If there is no previous value set, then this /// sets it and returns true. pub(crate) fn set(&mut self, byte: u8, off: RareByteOffset) { self.set[byte as usize].max = cmp::max(self.set[byte as usize].max, off.max); } } impl core::fmt::Debug for RareByteOffsets { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut offsets = vec![]; for off in self.set.iter() { if off.max > 0 { offsets.push(off); } } f.debug_struct("RareByteOffsets").field("set", &offsets).finish() } } /// Offsets associated with an occurrence of a "rare" byte in any of the /// patterns used to construct a single Aho-Corasick automaton. #[derive(Clone, Copy, Debug)] struct RareByteOffset { /// The maximum offset at which a particular byte occurs from the start /// of any pattern. This is used as a shift amount. That is, when an /// occurrence of this byte is found, the candidate position reported by /// the prefilter is `position_of_byte - max`, such that the automaton /// will begin its search at a position that is guaranteed to observe a /// match. /// /// To avoid accidentally quadratic behavior, a prefilter is considered /// ineffective when it is asked to start scanning from a position that it /// has already scanned past. /// /// Using a `u8` here means that if we ever see a pattern that's longer /// than 255 bytes, then the entire rare byte prefilter is disabled. max: u8, } impl Default for RareByteOffset { fn default() -> RareByteOffset { RareByteOffset { max: 0 } } } impl RareByteOffset { /// Create a new rare byte offset. If the given offset is too big, then /// None is returned. In that case, callers should render the rare bytes /// prefilter inert. fn new(max: usize) -> Option { if max > u8::MAX as usize { None } else { Some(RareByteOffset { max: max as u8 }) } } } impl RareBytesBuilder { /// Create a new builder for constructing a rare byte prefilter. fn new() -> RareBytesBuilder { RareBytesBuilder { ascii_case_insensitive: false, rare_set: ByteSet::empty(), byte_offsets: RareByteOffsets::empty(), available: true, count: 0, rank_sum: 0, } } /// Enable ASCII case insensitivity. When set, byte strings added to this /// builder will be interpreted without respect to ASCII case. fn ascii_case_insensitive(mut self, yes: bool) -> RareBytesBuilder { self.ascii_case_insensitive = yes; self } /// Build the rare bytes prefilter. /// /// If there are more than 3 distinct rare bytes found, or if heuristics /// otherwise determine that this prefilter should not be used, then `None` /// is returned. fn build(&self) -> Option { #[cfg(feature = "perf-literal")] fn imp(builder: &RareBytesBuilder) -> Option { if !builder.available || builder.count > 3 { return None; } let (mut bytes, mut len) = ([0; 3], 0); for b in 0..=255 { if builder.rare_set.contains(b) { bytes[len] = b as u8; len += 1; } } let finder: Arc = match len { 0 => return None, 1 => Arc::new(RareBytesOne { byte1: bytes[0], offset: builder.byte_offsets.set[bytes[0] as usize], }), 2 => Arc::new(RareBytesTwo { offsets: builder.byte_offsets, byte1: bytes[0], byte2: bytes[1], }), 3 => Arc::new(RareBytesThree { offsets: builder.byte_offsets, byte1: bytes[0], byte2: bytes[1], byte3: bytes[2], }), _ => unreachable!(), }; Some(Prefilter { finder, memory_usage: 0 }) } #[cfg(not(feature = "perf-literal"))] fn imp(_: &RareBytesBuilder) -> Option { None } imp(self) } /// Add a byte string to this builder. /// /// All patterns added to an Aho-Corasick automaton should be added to this /// builder before attempting to construct the prefilter. fn add(&mut self, bytes: &[u8]) { // If we've already given up, then do nothing. if !self.available { return; } // If we've already blown our budget, then don't waste time looking // for more rare bytes. if self.count > 3 { self.available = false; return; } // If the pattern is too long, then our offset table is bunk, so // give up. if bytes.len() >= 256 { self.available = false; return; } let mut rarest = match bytes.get(0) { None => return, Some(&b) => (b, freq_rank(b)), }; // The idea here is to look for the rarest byte in each pattern, and // add that to our set. As a special exception, if we see a byte that // we've already added, then we immediately stop and choose that byte, // even if there's another rare byte in the pattern. This helps us // apply the rare byte optimization in more cases by attempting to pick // bytes that are in common between patterns. So for example, if we // were searching for `Sherlock` and `lockjaw`, then this would pick // `k` for both patterns, resulting in the use of `memchr` instead of // `memchr2` for `k` and `j`. let mut found = false; for (pos, &b) in bytes.iter().enumerate() { self.set_offset(pos, b); if found { continue; } if self.rare_set.contains(b) { found = true; continue; } let rank = freq_rank(b); if rank < rarest.1 { rarest = (b, rank); } } if !found { self.add_rare_byte(rarest.0); } } fn set_offset(&mut self, pos: usize, byte: u8) { // This unwrap is OK because pos is never bigger than our max. let offset = RareByteOffset::new(pos).unwrap(); self.byte_offsets.set(byte, offset); if self.ascii_case_insensitive { self.byte_offsets.set(opposite_ascii_case(byte), offset); } } fn add_rare_byte(&mut self, byte: u8) { self.add_one_rare_byte(byte); if self.ascii_case_insensitive { self.add_one_rare_byte(opposite_ascii_case(byte)); } } fn add_one_rare_byte(&mut self, byte: u8) { if !self.rare_set.contains(byte) { self.rare_set.add(byte); self.count += 1; self.rank_sum += freq_rank(byte) as u16; } } } /// A prefilter for scanning for a single "rare" byte. #[cfg(feature = "perf-literal")] #[derive(Clone, Debug)] struct RareBytesOne { byte1: u8, offset: RareByteOffset, } #[cfg(feature = "perf-literal")] impl PrefilterI for RareBytesOne { fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { memchr::memchr(self.byte1, &haystack[span]) .map(|i| { let pos = span.start + i; cmp::max( span.start, pos.saturating_sub(usize::from(self.offset.max)), ) }) .map_or(Candidate::None, Candidate::PossibleStartOfMatch) } } /// A prefilter for scanning for two "rare" bytes. #[cfg(feature = "perf-literal")] #[derive(Clone, Debug)] struct RareBytesTwo { offsets: RareByteOffsets, byte1: u8, byte2: u8, } #[cfg(feature = "perf-literal")] impl PrefilterI for RareBytesTwo { fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { memchr::memchr2(self.byte1, self.byte2, &haystack[span]) .map(|i| { let pos = span.start + i; let offset = self.offsets.set[usize::from(haystack[pos])].max; cmp::max(span.start, pos.saturating_sub(usize::from(offset))) }) .map_or(Candidate::None, Candidate::PossibleStartOfMatch) } } /// A prefilter for scanning for three "rare" bytes. #[cfg(feature = "perf-literal")] #[derive(Clone, Debug)] struct RareBytesThree { offsets: RareByteOffsets, byte1: u8, byte2: u8, byte3: u8, } #[cfg(feature = "perf-literal")] impl PrefilterI for RareBytesThree { fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { memchr::memchr3(self.byte1, self.byte2, self.byte3, &haystack[span]) .map(|i| { let pos = span.start + i; let offset = self.offsets.set[usize::from(haystack[pos])].max; cmp::max(span.start, pos.saturating_sub(usize::from(offset))) }) .map_or(Candidate::None, Candidate::PossibleStartOfMatch) } } /// A builder for constructing a starting byte prefilter. /// /// A starting byte prefilter is a simplistic prefilter that looks for possible /// matches by reporting all positions corresponding to a particular byte. This /// generally only takes affect when there are at most 3 distinct possible /// starting bytes. e.g., the patterns `foo`, `bar`, and `baz` have two /// distinct starting bytes (`f` and `b`), and this prefilter returns all /// occurrences of either `f` or `b`. /// /// In some cases, a heuristic frequency analysis may determine that it would /// be better not to use this prefilter even when there are 3 or fewer distinct /// starting bytes. #[derive(Clone, Debug)] struct StartBytesBuilder { /// Whether this prefilter should account for ASCII case insensitivity or /// not. ascii_case_insensitive: bool, /// The set of starting bytes observed. byteset: Vec, /// The number of bytes set to true in `byteset`. count: usize, /// The sum of frequency ranks for the rare bytes detected. This is /// intended to give a heuristic notion of how rare the bytes are. rank_sum: u16, } impl StartBytesBuilder { /// Create a new builder for constructing a start byte prefilter. fn new() -> StartBytesBuilder { StartBytesBuilder { ascii_case_insensitive: false, byteset: vec![false; 256], count: 0, rank_sum: 0, } } /// Enable ASCII case insensitivity. When set, byte strings added to this /// builder will be interpreted without respect to ASCII case. fn ascii_case_insensitive(mut self, yes: bool) -> StartBytesBuilder { self.ascii_case_insensitive = yes; self } /// Build the starting bytes prefilter. /// /// If there are more than 3 distinct starting bytes, or if heuristics /// otherwise determine that this prefilter should not be used, then `None` /// is returned. fn build(&self) -> Option { #[cfg(feature = "perf-literal")] fn imp(builder: &StartBytesBuilder) -> Option { if builder.count > 3 { return None; } let (mut bytes, mut len) = ([0; 3], 0); for b in 0..256 { if !builder.byteset[b] { continue; } // We don't handle non-ASCII bytes for now. Getting non-ASCII // bytes right is trickier, since we generally don't want to put // a leading UTF-8 code unit into a prefilter that isn't ASCII, // since they can frequently. Instead, it would be better to use a // continuation byte, but this requires more sophisticated analysis // of the automaton and a richer prefilter API. if b > 0x7F { return None; } bytes[len] = b as u8; len += 1; } let finder: Arc = match len { 0 => return None, 1 => Arc::new(StartBytesOne { byte1: bytes[0] }), 2 => Arc::new(StartBytesTwo { byte1: bytes[0], byte2: bytes[1], }), 3 => Arc::new(StartBytesThree { byte1: bytes[0], byte2: bytes[1], byte3: bytes[2], }), _ => unreachable!(), }; Some(Prefilter { finder, memory_usage: 0 }) } #[cfg(not(feature = "perf-literal"))] fn imp(_: &StartBytesBuilder) -> Option { None } imp(self) } /// Add a byte string to this builder. /// /// All patterns added to an Aho-Corasick automaton should be added to this /// builder before attempting to construct the prefilter. fn add(&mut self, bytes: &[u8]) { if self.count > 3 { return; } if let Some(&byte) = bytes.get(0) { self.add_one_byte(byte); if self.ascii_case_insensitive { self.add_one_byte(opposite_ascii_case(byte)); } } } fn add_one_byte(&mut self, byte: u8) { if !self.byteset[byte as usize] { self.byteset[byte as usize] = true; self.count += 1; self.rank_sum += freq_rank(byte) as u16; } } } /// A prefilter for scanning for a single starting byte. #[cfg(feature = "perf-literal")] #[derive(Clone, Debug)] struct StartBytesOne { byte1: u8, } #[cfg(feature = "perf-literal")] impl PrefilterI for StartBytesOne { fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { memchr::memchr(self.byte1, &haystack[span]) .map(|i| span.start + i) .map_or(Candidate::None, Candidate::PossibleStartOfMatch) } } /// A prefilter for scanning for two starting bytes. #[cfg(feature = "perf-literal")] #[derive(Clone, Debug)] struct StartBytesTwo { byte1: u8, byte2: u8, } #[cfg(feature = "perf-literal")] impl PrefilterI for StartBytesTwo { fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { memchr::memchr2(self.byte1, self.byte2, &haystack[span]) .map(|i| span.start + i) .map_or(Candidate::None, Candidate::PossibleStartOfMatch) } } /// A prefilter for scanning for three starting bytes. #[cfg(feature = "perf-literal")] #[derive(Clone, Debug)] struct StartBytesThree { byte1: u8, byte2: u8, byte3: u8, } #[cfg(feature = "perf-literal")] impl PrefilterI for StartBytesThree { fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { memchr::memchr3(self.byte1, self.byte2, self.byte3, &haystack[span]) .map(|i| span.start + i) .map_or(Candidate::None, Candidate::PossibleStartOfMatch) } } /// If the given byte is an ASCII letter, then return it in the opposite case. /// e.g., Given `b'A'`, this returns `b'a'`, and given `b'a'`, this returns /// `b'A'`. If a non-ASCII letter is given, then the given byte is returned. pub(crate) fn opposite_ascii_case(b: u8) -> u8 { if b'A' <= b && b <= b'Z' { b.to_ascii_lowercase() } else if b'a' <= b && b <= b'z' { b.to_ascii_uppercase() } else { b } } /// Return the frequency rank of the given byte. The higher the rank, the more /// common the byte (heuristically speaking). fn freq_rank(b: u8) -> u8 { use crate::util::byte_frequencies::BYTE_FREQUENCIES; BYTE_FREQUENCIES[b as usize] }