diff options
Diffstat (limited to 'third_party/rust/memchr/src/memmem/mod.rs')
-rw-r--r-- | third_party/rust/memchr/src/memmem/mod.rs | 1321 |
1 files changed, 1321 insertions, 0 deletions
diff --git a/third_party/rust/memchr/src/memmem/mod.rs b/third_party/rust/memchr/src/memmem/mod.rs new file mode 100644 index 0000000000..e1cd1aec76 --- /dev/null +++ b/third_party/rust/memchr/src/memmem/mod.rs @@ -0,0 +1,1321 @@ +/*! +This module provides forward and reverse substring search routines. + +Unlike the standard library's substring search routines, these work on +arbitrary bytes. For all non-empty needles, these routines will report exactly +the same values as the corresponding routines in the standard library. For +the empty needle, the standard library reports matches only at valid UTF-8 +boundaries, where as these routines will report matches at every position. + +Other than being able to work on arbitrary bytes, the primary reason to prefer +these routines over the standard library routines is that these will generally +be faster. In some cases, significantly so. + +# Example: iterating over substring matches + +This example shows how to use [`find_iter`] to find occurrences of a substring +in a haystack. + +``` +use memchr::memmem; + +let haystack = b"foo bar foo baz foo"; + +let mut it = memmem::find_iter(haystack, "foo"); +assert_eq!(Some(0), it.next()); +assert_eq!(Some(8), it.next()); +assert_eq!(Some(16), it.next()); +assert_eq!(None, it.next()); +``` + +# Example: iterating over substring matches in reverse + +This example shows how to use [`rfind_iter`] to find occurrences of a substring +in a haystack starting from the end of the haystack. + +**NOTE:** This module does not implement double ended iterators, so reverse +searches aren't done by calling `rev` on a forward iterator. + +``` +use memchr::memmem; + +let haystack = b"foo bar foo baz foo"; + +let mut it = memmem::rfind_iter(haystack, "foo"); +assert_eq!(Some(16), it.next()); +assert_eq!(Some(8), it.next()); +assert_eq!(Some(0), it.next()); +assert_eq!(None, it.next()); +``` + +# Example: repeating a search for the same needle + +It may be possible for the overhead of constructing a substring searcher to be +measurable in some workloads. In cases where the same needle is used to search +many haystacks, it is possible to do construction once and thus to avoid it for +subsequent searches. This can be done with a [`Finder`] (or a [`FinderRev`] for +reverse searches). + +``` +use memchr::memmem; + +let finder = memmem::Finder::new("foo"); + +assert_eq!(Some(4), finder.find(b"baz foo quux")); +assert_eq!(None, finder.find(b"quux baz bar")); +``` +*/ + +pub use self::prefilter::Prefilter; + +use crate::{ + cow::CowBytes, + memmem::{ + prefilter::{Pre, PrefilterFn, PrefilterState}, + rabinkarp::NeedleHash, + rarebytes::RareNeedleBytes, + }, +}; + +/// Defines a suite of quickcheck properties for forward and reverse +/// substring searching. +/// +/// This is defined in this specific spot so that it can be used freely among +/// the different substring search implementations. I couldn't be bothered to +/// fight with the macro-visibility rules enough to figure out how to stuff it +/// somewhere more convenient. +#[cfg(all(test, feature = "std"))] +macro_rules! define_memmem_quickcheck_tests { + ($fwd:expr, $rev:expr) => { + use crate::memmem::proptests; + + quickcheck::quickcheck! { + fn qc_fwd_prefix_is_substring(bs: Vec<u8>) -> bool { + proptests::prefix_is_substring(false, &bs, $fwd) + } + + fn qc_fwd_suffix_is_substring(bs: Vec<u8>) -> bool { + proptests::suffix_is_substring(false, &bs, $fwd) + } + + fn qc_fwd_matches_naive( + haystack: Vec<u8>, + needle: Vec<u8> + ) -> bool { + proptests::matches_naive(false, &haystack, &needle, $fwd) + } + + fn qc_rev_prefix_is_substring(bs: Vec<u8>) -> bool { + proptests::prefix_is_substring(true, &bs, $rev) + } + + fn qc_rev_suffix_is_substring(bs: Vec<u8>) -> bool { + proptests::suffix_is_substring(true, &bs, $rev) + } + + fn qc_rev_matches_naive( + haystack: Vec<u8>, + needle: Vec<u8> + ) -> bool { + proptests::matches_naive(true, &haystack, &needle, $rev) + } + } + }; +} + +/// Defines a suite of "simple" hand-written tests for a substring +/// implementation. +/// +/// This is defined here for the same reason that +/// define_memmem_quickcheck_tests is defined here. +#[cfg(test)] +macro_rules! define_memmem_simple_tests { + ($fwd:expr, $rev:expr) => { + use crate::memmem::testsimples; + + #[test] + fn simple_forward() { + testsimples::run_search_tests_fwd($fwd); + } + + #[test] + fn simple_reverse() { + testsimples::run_search_tests_rev($rev); + } + }; +} + +mod byte_frequencies; +#[cfg(memchr_runtime_simd)] +mod genericsimd; +mod prefilter; +mod rabinkarp; +mod rarebytes; +mod twoway; +mod util; +#[cfg(memchr_runtime_simd)] +mod vector; +#[cfg(all(memchr_runtime_wasm128))] +mod wasm; +#[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))] +mod x86; + +/// Returns an iterator over all non-overlapping occurrences of a substring in +/// a haystack. +/// +/// # Complexity +/// +/// This routine is guaranteed to have worst case linear time complexity +/// with respect to both the needle and the haystack. That is, this runs +/// in `O(needle.len() + haystack.len())` time. +/// +/// This routine is also guaranteed to have worst case constant space +/// complexity. +/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// use memchr::memmem; +/// +/// let haystack = b"foo bar foo baz foo"; +/// let mut it = memmem::find_iter(haystack, b"foo"); +/// assert_eq!(Some(0), it.next()); +/// assert_eq!(Some(8), it.next()); +/// assert_eq!(Some(16), it.next()); +/// assert_eq!(None, it.next()); +/// ``` +#[inline] +pub fn find_iter<'h, 'n, N: 'n + ?Sized + AsRef<[u8]>>( + haystack: &'h [u8], + needle: &'n N, +) -> FindIter<'h, 'n> { + FindIter::new(haystack, Finder::new(needle)) +} + +/// Returns a reverse iterator over all non-overlapping occurrences of a +/// substring in a haystack. +/// +/// # Complexity +/// +/// This routine is guaranteed to have worst case linear time complexity +/// with respect to both the needle and the haystack. That is, this runs +/// in `O(needle.len() + haystack.len())` time. +/// +/// This routine is also guaranteed to have worst case constant space +/// complexity. +/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// use memchr::memmem; +/// +/// let haystack = b"foo bar foo baz foo"; +/// let mut it = memmem::rfind_iter(haystack, b"foo"); +/// assert_eq!(Some(16), it.next()); +/// assert_eq!(Some(8), it.next()); +/// assert_eq!(Some(0), it.next()); +/// assert_eq!(None, it.next()); +/// ``` +#[inline] +pub fn rfind_iter<'h, 'n, N: 'n + ?Sized + AsRef<[u8]>>( + haystack: &'h [u8], + needle: &'n N, +) -> FindRevIter<'h, 'n> { + FindRevIter::new(haystack, FinderRev::new(needle)) +} + +/// Returns the index of the first occurrence of the given needle. +/// +/// Note that if you're are searching for the same needle in many different +/// small haystacks, it may be faster to initialize a [`Finder`] once, +/// and reuse it for each search. +/// +/// # Complexity +/// +/// This routine is guaranteed to have worst case linear time complexity +/// with respect to both the needle and the haystack. That is, this runs +/// in `O(needle.len() + haystack.len())` time. +/// +/// This routine is also guaranteed to have worst case constant space +/// complexity. +/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// use memchr::memmem; +/// +/// let haystack = b"foo bar baz"; +/// assert_eq!(Some(0), memmem::find(haystack, b"foo")); +/// assert_eq!(Some(4), memmem::find(haystack, b"bar")); +/// assert_eq!(None, memmem::find(haystack, b"quux")); +/// ``` +#[inline] +pub fn find(haystack: &[u8], needle: &[u8]) -> Option<usize> { + if haystack.len() < 64 { + rabinkarp::find(haystack, needle) + } else { + Finder::new(needle).find(haystack) + } +} + +/// Returns the index of the last occurrence of the given needle. +/// +/// Note that if you're are searching for the same needle in many different +/// small haystacks, it may be faster to initialize a [`FinderRev`] once, +/// and reuse it for each search. +/// +/// # Complexity +/// +/// This routine is guaranteed to have worst case linear time complexity +/// with respect to both the needle and the haystack. That is, this runs +/// in `O(needle.len() + haystack.len())` time. +/// +/// This routine is also guaranteed to have worst case constant space +/// complexity. +/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// use memchr::memmem; +/// +/// let haystack = b"foo bar baz"; +/// assert_eq!(Some(0), memmem::rfind(haystack, b"foo")); +/// assert_eq!(Some(4), memmem::rfind(haystack, b"bar")); +/// assert_eq!(Some(8), memmem::rfind(haystack, b"ba")); +/// assert_eq!(None, memmem::rfind(haystack, b"quux")); +/// ``` +#[inline] +pub fn rfind(haystack: &[u8], needle: &[u8]) -> Option<usize> { + if haystack.len() < 64 { + rabinkarp::rfind(haystack, needle) + } else { + FinderRev::new(needle).rfind(haystack) + } +} + +/// An iterator over non-overlapping substring matches. +/// +/// Matches are reported by the byte offset at which they begin. +/// +/// `'h` is the lifetime of the haystack while `'n` is the lifetime of the +/// needle. +#[derive(Debug)] +pub struct FindIter<'h, 'n> { + haystack: &'h [u8], + prestate: PrefilterState, + finder: Finder<'n>, + pos: usize, +} + +impl<'h, 'n> FindIter<'h, 'n> { + #[inline(always)] + pub(crate) fn new( + haystack: &'h [u8], + finder: Finder<'n>, + ) -> FindIter<'h, 'n> { + let prestate = finder.searcher.prefilter_state(); + FindIter { haystack, prestate, finder, pos: 0 } + } + + /// Convert this iterator into its owned variant, such that it no longer + /// borrows the finder and needle. + /// + /// If this is already an owned iterator, then this is a no-op. Otherwise, + /// this copies the needle. + /// + /// This is only available when the `std` feature is enabled. + #[cfg(feature = "std")] + #[inline] + pub fn into_owned(self) -> FindIter<'h, 'static> { + FindIter { + haystack: self.haystack, + prestate: self.prestate, + finder: self.finder.into_owned(), + pos: self.pos, + } + } +} + +impl<'h, 'n> Iterator for FindIter<'h, 'n> { + type Item = usize; + + fn next(&mut self) -> Option<usize> { + if self.pos > self.haystack.len() { + return None; + } + let result = self + .finder + .searcher + .find(&mut self.prestate, &self.haystack[self.pos..]); + match result { + None => None, + Some(i) => { + let pos = self.pos + i; + self.pos = pos + core::cmp::max(1, self.finder.needle().len()); + Some(pos) + } + } + } +} + +/// An iterator over non-overlapping substring matches in reverse. +/// +/// Matches are reported by the byte offset at which they begin. +/// +/// `'h` is the lifetime of the haystack while `'n` is the lifetime of the +/// needle. +#[derive(Debug)] +pub struct FindRevIter<'h, 'n> { + haystack: &'h [u8], + finder: FinderRev<'n>, + /// When searching with an empty needle, this gets set to `None` after + /// we've yielded the last element at `0`. + pos: Option<usize>, +} + +impl<'h, 'n> FindRevIter<'h, 'n> { + #[inline(always)] + pub(crate) fn new( + haystack: &'h [u8], + finder: FinderRev<'n>, + ) -> FindRevIter<'h, 'n> { + let pos = Some(haystack.len()); + FindRevIter { haystack, finder, pos } + } + + /// Convert this iterator into its owned variant, such that it no longer + /// borrows the finder and needle. + /// + /// If this is already an owned iterator, then this is a no-op. Otherwise, + /// this copies the needle. + /// + /// This is only available when the `std` feature is enabled. + #[cfg(feature = "std")] + #[inline] + pub fn into_owned(self) -> FindRevIter<'h, 'static> { + FindRevIter { + haystack: self.haystack, + finder: self.finder.into_owned(), + pos: self.pos, + } + } +} + +impl<'h, 'n> Iterator for FindRevIter<'h, 'n> { + type Item = usize; + + fn next(&mut self) -> Option<usize> { + let pos = match self.pos { + None => return None, + Some(pos) => pos, + }; + let result = self.finder.rfind(&self.haystack[..pos]); + match result { + None => None, + Some(i) => { + if pos == i { + self.pos = pos.checked_sub(1); + } else { + self.pos = Some(i); + } + Some(i) + } + } + } +} + +/// A single substring searcher fixed to a particular needle. +/// +/// The purpose of this type is to permit callers to construct a substring +/// searcher that can be used to search haystacks without the overhead of +/// constructing the searcher in the first place. This is a somewhat niche +/// concern when it's necessary to re-use the same needle to search multiple +/// different haystacks with as little overhead as possible. In general, using +/// [`find`] is good enough, but `Finder` is useful when you can meaningfully +/// observe searcher construction time in a profile. +/// +/// When the `std` feature is enabled, then this type has an `into_owned` +/// version which permits building a `Finder` that is not connected to +/// the lifetime of its needle. +#[derive(Clone, Debug)] +pub struct Finder<'n> { + searcher: Searcher<'n>, +} + +impl<'n> Finder<'n> { + /// Create a new finder for the given needle. + #[inline] + pub fn new<B: ?Sized + AsRef<[u8]>>(needle: &'n B) -> Finder<'n> { + FinderBuilder::new().build_forward(needle) + } + + /// Returns the index of the first occurrence of this needle in the given + /// haystack. + /// + /// # Complexity + /// + /// This routine is guaranteed to have worst case linear time complexity + /// with respect to both the needle and the haystack. That is, this runs + /// in `O(needle.len() + haystack.len())` time. + /// + /// This routine is also guaranteed to have worst case constant space + /// complexity. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use memchr::memmem::Finder; + /// + /// let haystack = b"foo bar baz"; + /// assert_eq!(Some(0), Finder::new("foo").find(haystack)); + /// assert_eq!(Some(4), Finder::new("bar").find(haystack)); + /// assert_eq!(None, Finder::new("quux").find(haystack)); + /// ``` + pub fn find(&self, haystack: &[u8]) -> Option<usize> { + self.searcher.find(&mut self.searcher.prefilter_state(), haystack) + } + + /// Returns an iterator over all occurrences of a substring in a haystack. + /// + /// # Complexity + /// + /// This routine is guaranteed to have worst case linear time complexity + /// with respect to both the needle and the haystack. That is, this runs + /// in `O(needle.len() + haystack.len())` time. + /// + /// This routine is also guaranteed to have worst case constant space + /// complexity. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use memchr::memmem::Finder; + /// + /// let haystack = b"foo bar foo baz foo"; + /// let finder = Finder::new(b"foo"); + /// let mut it = finder.find_iter(haystack); + /// assert_eq!(Some(0), it.next()); + /// assert_eq!(Some(8), it.next()); + /// assert_eq!(Some(16), it.next()); + /// assert_eq!(None, it.next()); + /// ``` + #[inline] + pub fn find_iter<'a, 'h>( + &'a self, + haystack: &'h [u8], + ) -> FindIter<'h, 'a> { + FindIter::new(haystack, self.as_ref()) + } + + /// Convert this finder into its owned variant, such that it no longer + /// borrows the needle. + /// + /// If this is already an owned finder, then this is a no-op. Otherwise, + /// this copies the needle. + /// + /// This is only available when the `std` feature is enabled. + #[cfg(feature = "std")] + #[inline] + pub fn into_owned(self) -> Finder<'static> { + Finder { searcher: self.searcher.into_owned() } + } + + /// Convert this finder into its borrowed variant. + /// + /// This is primarily useful if your finder is owned and you'd like to + /// store its borrowed variant in some intermediate data structure. + /// + /// Note that the lifetime parameter of the returned finder is tied to the + /// lifetime of `self`, and may be shorter than the `'n` lifetime of the + /// needle itself. Namely, a finder's needle can be either borrowed or + /// owned, so the lifetime of the needle returned must necessarily be the + /// shorter of the two. + #[inline] + pub fn as_ref(&self) -> Finder<'_> { + Finder { searcher: self.searcher.as_ref() } + } + + /// Returns the needle that this finder searches for. + /// + /// Note that the lifetime of the needle returned is tied to the lifetime + /// of the finder, and may be shorter than the `'n` lifetime. Namely, a + /// finder's needle can be either borrowed or owned, so the lifetime of the + /// needle returned must necessarily be the shorter of the two. + #[inline] + pub fn needle(&self) -> &[u8] { + self.searcher.needle() + } +} + +/// A single substring reverse searcher fixed to a particular needle. +/// +/// The purpose of this type is to permit callers to construct a substring +/// searcher that can be used to search haystacks without the overhead of +/// constructing the searcher in the first place. This is a somewhat niche +/// concern when it's necessary to re-use the same needle to search multiple +/// different haystacks with as little overhead as possible. In general, +/// using [`rfind`] is good enough, but `FinderRev` is useful when you can +/// meaningfully observe searcher construction time in a profile. +/// +/// When the `std` feature is enabled, then this type has an `into_owned` +/// version which permits building a `FinderRev` that is not connected to +/// the lifetime of its needle. +#[derive(Clone, Debug)] +pub struct FinderRev<'n> { + searcher: SearcherRev<'n>, +} + +impl<'n> FinderRev<'n> { + /// Create a new reverse finder for the given needle. + #[inline] + pub fn new<B: ?Sized + AsRef<[u8]>>(needle: &'n B) -> FinderRev<'n> { + FinderBuilder::new().build_reverse(needle) + } + + /// Returns the index of the last occurrence of this needle in the given + /// haystack. + /// + /// The haystack may be any type that can be cheaply converted into a + /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`. + /// + /// # Complexity + /// + /// This routine is guaranteed to have worst case linear time complexity + /// with respect to both the needle and the haystack. That is, this runs + /// in `O(needle.len() + haystack.len())` time. + /// + /// This routine is also guaranteed to have worst case constant space + /// complexity. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use memchr::memmem::FinderRev; + /// + /// let haystack = b"foo bar baz"; + /// assert_eq!(Some(0), FinderRev::new("foo").rfind(haystack)); + /// assert_eq!(Some(4), FinderRev::new("bar").rfind(haystack)); + /// assert_eq!(None, FinderRev::new("quux").rfind(haystack)); + /// ``` + pub fn rfind<B: AsRef<[u8]>>(&self, haystack: B) -> Option<usize> { + self.searcher.rfind(haystack.as_ref()) + } + + /// Returns a reverse iterator over all occurrences of a substring in a + /// haystack. + /// + /// # Complexity + /// + /// This routine is guaranteed to have worst case linear time complexity + /// with respect to both the needle and the haystack. That is, this runs + /// in `O(needle.len() + haystack.len())` time. + /// + /// This routine is also guaranteed to have worst case constant space + /// complexity. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use memchr::memmem::FinderRev; + /// + /// let haystack = b"foo bar foo baz foo"; + /// let finder = FinderRev::new(b"foo"); + /// let mut it = finder.rfind_iter(haystack); + /// assert_eq!(Some(16), it.next()); + /// assert_eq!(Some(8), it.next()); + /// assert_eq!(Some(0), it.next()); + /// assert_eq!(None, it.next()); + /// ``` + #[inline] + pub fn rfind_iter<'a, 'h>( + &'a self, + haystack: &'h [u8], + ) -> FindRevIter<'h, 'a> { + FindRevIter::new(haystack, self.as_ref()) + } + + /// Convert this finder into its owned variant, such that it no longer + /// borrows the needle. + /// + /// If this is already an owned finder, then this is a no-op. Otherwise, + /// this copies the needle. + /// + /// This is only available when the `std` feature is enabled. + #[cfg(feature = "std")] + #[inline] + pub fn into_owned(self) -> FinderRev<'static> { + FinderRev { searcher: self.searcher.into_owned() } + } + + /// Convert this finder into its borrowed variant. + /// + /// This is primarily useful if your finder is owned and you'd like to + /// store its borrowed variant in some intermediate data structure. + /// + /// Note that the lifetime parameter of the returned finder is tied to the + /// lifetime of `self`, and may be shorter than the `'n` lifetime of the + /// needle itself. Namely, a finder's needle can be either borrowed or + /// owned, so the lifetime of the needle returned must necessarily be the + /// shorter of the two. + #[inline] + pub fn as_ref(&self) -> FinderRev<'_> { + FinderRev { searcher: self.searcher.as_ref() } + } + + /// Returns the needle that this finder searches for. + /// + /// Note that the lifetime of the needle returned is tied to the lifetime + /// of the finder, and may be shorter than the `'n` lifetime. Namely, a + /// finder's needle can be either borrowed or owned, so the lifetime of the + /// needle returned must necessarily be the shorter of the two. + #[inline] + pub fn needle(&self) -> &[u8] { + self.searcher.needle() + } +} + +/// A builder for constructing non-default forward or reverse memmem finders. +/// +/// A builder is primarily useful for configuring a substring searcher. +/// Currently, the only configuration exposed is the ability to disable +/// heuristic prefilters used to speed up certain searches. +#[derive(Clone, Debug, Default)] +pub struct FinderBuilder { + config: SearcherConfig, +} + +impl FinderBuilder { + /// Create a new finder builder with default settings. + pub fn new() -> FinderBuilder { + FinderBuilder::default() + } + + /// Build a forward finder using the given needle from the current + /// settings. + pub fn build_forward<'n, B: ?Sized + AsRef<[u8]>>( + &self, + needle: &'n B, + ) -> Finder<'n> { + Finder { searcher: Searcher::new(self.config, needle.as_ref()) } + } + + /// Build a reverse finder using the given needle from the current + /// settings. + pub fn build_reverse<'n, B: ?Sized + AsRef<[u8]>>( + &self, + needle: &'n B, + ) -> FinderRev<'n> { + FinderRev { searcher: SearcherRev::new(needle.as_ref()) } + } + + /// Configure the prefilter setting for the finder. + /// + /// See the documentation for [`Prefilter`] for more discussion on why + /// you might want to configure this. + pub fn prefilter(&mut self, prefilter: Prefilter) -> &mut FinderBuilder { + self.config.prefilter = prefilter; + self + } +} + +/// The internal implementation of a forward substring searcher. +/// +/// The reality is that this is a "meta" searcher. Namely, depending on a +/// variety of parameters (CPU support, target, needle size, haystack size and +/// even dynamic properties such as prefilter effectiveness), the actual +/// algorithm employed to do substring search may change. +#[derive(Clone, Debug)] +struct Searcher<'n> { + /// The actual needle we're searching for. + /// + /// A CowBytes is like a Cow<[u8]>, except in no_std environments, it is + /// specialized to a single variant (the borrowed form). + needle: CowBytes<'n>, + /// A collection of facts computed on the needle that are useful for more + /// than one substring search algorithm. + ninfo: NeedleInfo, + /// A prefilter function, if it was deemed appropriate. + /// + /// Some substring search implementations (like Two-Way) benefit greatly + /// if we can quickly find candidate starting positions for a match. + prefn: Option<PrefilterFn>, + /// The actual substring implementation in use. + kind: SearcherKind, +} + +/// A collection of facts computed about a search needle. +/// +/// We group these things together because it's useful to be able to hand them +/// to prefilters or substring algorithms that want them. +#[derive(Clone, Copy, Debug)] +pub(crate) struct NeedleInfo { + /// The offsets of "rare" bytes detected in the needle. + /// + /// This is meant to be a heuristic in order to maximize the effectiveness + /// of vectorized code. Namely, vectorized code tends to focus on only + /// one or two bytes. If we pick bytes from the needle that occur + /// infrequently, then more time will be spent in the vectorized code and + /// will likely make the overall search (much) faster. + /// + /// Of course, this is only a heuristic based on a background frequency + /// distribution of bytes. But it tends to work very well in practice. + pub(crate) rarebytes: RareNeedleBytes, + /// A Rabin-Karp hash of the needle. + /// + /// This is store here instead of in a more specific Rabin-Karp search + /// since Rabin-Karp may be used even if another SearchKind corresponds + /// to some other search implementation. e.g., If measurements suggest RK + /// is faster in some cases or if a search implementation can't handle + /// particularly small haystack. (Moreover, we cannot use RK *generally*, + /// since its worst case time is multiplicative. Instead, we only use it + /// some small haystacks, where "small" is a constant.) + pub(crate) nhash: NeedleHash, +} + +/// Configuration for substring search. +#[derive(Clone, Copy, Debug, Default)] +struct SearcherConfig { + /// This permits changing the behavior of the prefilter, since it can have + /// a variable impact on performance. + prefilter: Prefilter, +} + +#[derive(Clone, Debug)] +enum SearcherKind { + /// A special case for empty needles. An empty needle always matches, even + /// in an empty haystack. + Empty, + /// This is used whenever the needle is a single byte. In this case, we + /// always use memchr. + OneByte(u8), + /// Two-Way is the generic work horse and is what provides our additive + /// linear time guarantee. In general, it's used when the needle is bigger + /// than 8 bytes or so. + TwoWay(twoway::Forward), + #[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))] + GenericSIMD128(x86::sse::Forward), + #[cfg(memchr_runtime_wasm128)] + GenericSIMD128(wasm::Forward), + #[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))] + GenericSIMD256(x86::avx::Forward), +} + +impl<'n> Searcher<'n> { + fn new(config: SearcherConfig, needle: &'n [u8]) -> Searcher<'n> { + use self::SearcherKind::*; + + let ninfo = NeedleInfo::new(needle); + let mk = |kind: SearcherKind| { + let prefn = prefilter::forward( + &config.prefilter, + &ninfo.rarebytes, + needle, + ); + Searcher { needle: CowBytes::new(needle), ninfo, prefn, kind } + }; + if needle.len() == 0 { + return mk(Empty); + } + if needle.len() == 1 { + return mk(OneByte(needle[0])); + } + #[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))] + { + if let Some(fwd) = x86::avx::Forward::new(&ninfo, needle) { + return mk(GenericSIMD256(fwd)); + } else if let Some(fwd) = x86::sse::Forward::new(&ninfo, needle) { + return mk(GenericSIMD128(fwd)); + } + } + #[cfg(all(target_arch = "wasm32", memchr_runtime_simd))] + { + if let Some(fwd) = wasm::Forward::new(&ninfo, needle) { + return mk(GenericSIMD128(fwd)); + } + } + + mk(TwoWay(twoway::Forward::new(needle))) + } + + /// Return a fresh prefilter state that can be used with this searcher. + /// A prefilter state is used to track the effectiveness of a searcher's + /// prefilter for speeding up searches. Therefore, the prefilter state + /// should generally be reused on subsequent searches (such as in an + /// iterator). For searches on a different haystack, then a new prefilter + /// state should be used. + /// + /// This always initializes a valid (but possibly inert) prefilter state + /// even if this searcher does not have a prefilter enabled. + fn prefilter_state(&self) -> PrefilterState { + if self.prefn.is_none() { + PrefilterState::inert() + } else { + PrefilterState::new() + } + } + + fn needle(&self) -> &[u8] { + self.needle.as_slice() + } + + fn as_ref(&self) -> Searcher<'_> { + use self::SearcherKind::*; + + let kind = match self.kind { + Empty => Empty, + OneByte(b) => OneByte(b), + TwoWay(tw) => TwoWay(tw), + #[cfg(all(not(miri), memchr_runtime_simd))] + GenericSIMD128(gs) => GenericSIMD128(gs), + #[cfg(all( + not(miri), + target_arch = "x86_64", + memchr_runtime_simd + ))] + GenericSIMD256(gs) => GenericSIMD256(gs), + }; + Searcher { + needle: CowBytes::new(self.needle()), + ninfo: self.ninfo, + prefn: self.prefn, + kind, + } + } + + #[cfg(feature = "std")] + fn into_owned(self) -> Searcher<'static> { + use self::SearcherKind::*; + + let kind = match self.kind { + Empty => Empty, + OneByte(b) => OneByte(b), + TwoWay(tw) => TwoWay(tw), + #[cfg(all(not(miri), memchr_runtime_simd))] + GenericSIMD128(gs) => GenericSIMD128(gs), + #[cfg(all( + not(miri), + target_arch = "x86_64", + memchr_runtime_simd + ))] + GenericSIMD256(gs) => GenericSIMD256(gs), + }; + Searcher { + needle: self.needle.into_owned(), + ninfo: self.ninfo, + prefn: self.prefn, + kind, + } + } + + /// Implements forward substring search by selecting the implementation + /// chosen at construction and executing it on the given haystack with the + /// prefilter's current state of effectiveness. + #[inline(always)] + fn find( + &self, + state: &mut PrefilterState, + haystack: &[u8], + ) -> Option<usize> { + use self::SearcherKind::*; + + let needle = self.needle(); + if haystack.len() < needle.len() { + return None; + } + match self.kind { + Empty => Some(0), + OneByte(b) => crate::memchr(b, haystack), + TwoWay(ref tw) => { + // For very short haystacks (e.g., where the prefilter probably + // can't run), it's faster to just run RK. + if rabinkarp::is_fast(haystack, needle) { + rabinkarp::find_with(&self.ninfo.nhash, haystack, needle) + } else { + self.find_tw(tw, state, haystack, needle) + } + } + #[cfg(all(not(miri), memchr_runtime_simd))] + GenericSIMD128(ref gs) => { + // The SIMD matcher can't handle particularly short haystacks, + // so we fall back to RK in these cases. + if haystack.len() < gs.min_haystack_len() { + rabinkarp::find_with(&self.ninfo.nhash, haystack, needle) + } else { + gs.find(haystack, needle) + } + } + #[cfg(all( + not(miri), + target_arch = "x86_64", + memchr_runtime_simd + ))] + GenericSIMD256(ref gs) => { + // The SIMD matcher can't handle particularly short haystacks, + // so we fall back to RK in these cases. + if haystack.len() < gs.min_haystack_len() { + rabinkarp::find_with(&self.ninfo.nhash, haystack, needle) + } else { + gs.find(haystack, needle) + } + } + } + } + + /// Calls Two-Way on the given haystack/needle. + /// + /// This is marked as unlineable since it seems to have a better overall + /// effect on benchmarks. However, this is one of those cases where + /// inlining it results an improvement in other benchmarks too, so I + /// suspect we just don't have enough data yet to make the right call here. + /// + /// I suspect the main problem is that this function contains two different + /// inlined copies of Two-Way: one with and one without prefilters enabled. + #[inline(never)] + fn find_tw( + &self, + tw: &twoway::Forward, + state: &mut PrefilterState, + haystack: &[u8], + needle: &[u8], + ) -> Option<usize> { + if let Some(prefn) = self.prefn { + // We used to look at the length of a haystack here. That is, if + // it was too small, then don't bother with the prefilter. But two + // things changed: the prefilter falls back to memchr for small + // haystacks, and, above, Rabin-Karp is employed for tiny haystacks + // anyway. + if state.is_effective() { + let mut pre = Pre { state, prefn, ninfo: &self.ninfo }; + return tw.find(Some(&mut pre), haystack, needle); + } + } + tw.find(None, haystack, needle) + } +} + +impl NeedleInfo { + pub(crate) fn new(needle: &[u8]) -> NeedleInfo { + NeedleInfo { + rarebytes: RareNeedleBytes::forward(needle), + nhash: NeedleHash::forward(needle), + } + } +} + +/// The internal implementation of a reverse substring searcher. +/// +/// See the forward searcher docs for more details. Currently, the reverse +/// searcher is considerably simpler since it lacks prefilter support. This +/// was done because it adds a lot of code, and more surface area to test. And +/// in particular, it's not clear whether a prefilter on reverse searching is +/// worth it. (If you have a compelling use case, please file an issue!) +#[derive(Clone, Debug)] +struct SearcherRev<'n> { + /// The actual needle we're searching for. + needle: CowBytes<'n>, + /// A Rabin-Karp hash of the needle. + nhash: NeedleHash, + /// The actual substring implementation in use. + kind: SearcherRevKind, +} + +#[derive(Clone, Debug)] +enum SearcherRevKind { + /// A special case for empty needles. An empty needle always matches, even + /// in an empty haystack. + Empty, + /// This is used whenever the needle is a single byte. In this case, we + /// always use memchr. + OneByte(u8), + /// Two-Way is the generic work horse and is what provides our additive + /// linear time guarantee. In general, it's used when the needle is bigger + /// than 8 bytes or so. + TwoWay(twoway::Reverse), +} + +impl<'n> SearcherRev<'n> { + fn new(needle: &'n [u8]) -> SearcherRev<'n> { + use self::SearcherRevKind::*; + + let kind = if needle.len() == 0 { + Empty + } else if needle.len() == 1 { + OneByte(needle[0]) + } else { + TwoWay(twoway::Reverse::new(needle)) + }; + SearcherRev { + needle: CowBytes::new(needle), + nhash: NeedleHash::reverse(needle), + kind, + } + } + + fn needle(&self) -> &[u8] { + self.needle.as_slice() + } + + fn as_ref(&self) -> SearcherRev<'_> { + use self::SearcherRevKind::*; + + let kind = match self.kind { + Empty => Empty, + OneByte(b) => OneByte(b), + TwoWay(tw) => TwoWay(tw), + }; + SearcherRev { + needle: CowBytes::new(self.needle()), + nhash: self.nhash, + kind, + } + } + + #[cfg(feature = "std")] + fn into_owned(self) -> SearcherRev<'static> { + use self::SearcherRevKind::*; + + let kind = match self.kind { + Empty => Empty, + OneByte(b) => OneByte(b), + TwoWay(tw) => TwoWay(tw), + }; + SearcherRev { + needle: self.needle.into_owned(), + nhash: self.nhash, + kind, + } + } + + /// Implements reverse substring search by selecting the implementation + /// chosen at construction and executing it on the given haystack with the + /// prefilter's current state of effectiveness. + #[inline(always)] + fn rfind(&self, haystack: &[u8]) -> Option<usize> { + use self::SearcherRevKind::*; + + let needle = self.needle(); + if haystack.len() < needle.len() { + return None; + } + match self.kind { + Empty => Some(haystack.len()), + OneByte(b) => crate::memrchr(b, haystack), + TwoWay(ref tw) => { + // For very short haystacks (e.g., where the prefilter probably + // can't run), it's faster to just run RK. + if rabinkarp::is_fast(haystack, needle) { + rabinkarp::rfind_with(&self.nhash, haystack, needle) + } else { + tw.rfind(haystack, needle) + } + } + } + } +} + +/// This module defines some generic quickcheck properties useful for testing +/// any substring search algorithm. It also runs those properties for the +/// top-level public API memmem routines. (The properties are also used to +/// test various substring search implementations more granularly elsewhere as +/// well.) +#[cfg(all(test, feature = "std", not(miri)))] +mod proptests { + // N.B. This defines the quickcheck tests using the properties defined + // below. Because of macro-visibility weirdness, the actual macro is + // defined at the top of this file. + define_memmem_quickcheck_tests!(super::find, super::rfind); + + /// Check that every prefix of the given byte string is a substring. + pub(crate) fn prefix_is_substring( + reverse: bool, + bs: &[u8], + mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>, + ) -> bool { + if bs.is_empty() { + return true; + } + for i in 0..(bs.len() - 1) { + let prefix = &bs[..i]; + if reverse { + assert_eq!(naive_rfind(bs, prefix), search(bs, prefix)); + } else { + assert_eq!(naive_find(bs, prefix), search(bs, prefix)); + } + } + true + } + + /// Check that every suffix of the given byte string is a substring. + pub(crate) fn suffix_is_substring( + reverse: bool, + bs: &[u8], + mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>, + ) -> bool { + if bs.is_empty() { + return true; + } + for i in 0..(bs.len() - 1) { + let suffix = &bs[i..]; + if reverse { + assert_eq!(naive_rfind(bs, suffix), search(bs, suffix)); + } else { + assert_eq!(naive_find(bs, suffix), search(bs, suffix)); + } + } + true + } + + /// Check that naive substring search matches the result of the given search + /// algorithm. + pub(crate) fn matches_naive( + reverse: bool, + haystack: &[u8], + needle: &[u8], + mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>, + ) -> bool { + if reverse { + naive_rfind(haystack, needle) == search(haystack, needle) + } else { + naive_find(haystack, needle) == search(haystack, needle) + } + } + + /// Naively search forwards for the given needle in the given haystack. + fn naive_find(haystack: &[u8], needle: &[u8]) -> Option<usize> { + if needle.is_empty() { + return Some(0); + } else if haystack.len() < needle.len() { + return None; + } + for i in 0..(haystack.len() - needle.len() + 1) { + if needle == &haystack[i..i + needle.len()] { + return Some(i); + } + } + None + } + + /// Naively search in reverse for the given needle in the given haystack. + fn naive_rfind(haystack: &[u8], needle: &[u8]) -> Option<usize> { + if needle.is_empty() { + return Some(haystack.len()); + } else if haystack.len() < needle.len() { + return None; + } + for i in (0..(haystack.len() - needle.len() + 1)).rev() { + if needle == &haystack[i..i + needle.len()] { + return Some(i); + } + } + None + } +} + +/// This module defines some hand-written "simple" substring tests. It +/// also provides routines for easily running them on any substring search +/// implementation. +#[cfg(test)] +mod testsimples { + define_memmem_simple_tests!(super::find, super::rfind); + + /// Each test is a (needle, haystack, expected_fwd, expected_rev) tuple. + type SearchTest = + (&'static str, &'static str, Option<usize>, Option<usize>); + + const SEARCH_TESTS: &'static [SearchTest] = &[ + ("", "", Some(0), Some(0)), + ("", "a", Some(0), Some(1)), + ("", "ab", Some(0), Some(2)), + ("", "abc", Some(0), Some(3)), + ("a", "", None, None), + ("a", "a", Some(0), Some(0)), + ("a", "aa", Some(0), Some(1)), + ("a", "ba", Some(1), Some(1)), + ("a", "bba", Some(2), Some(2)), + ("a", "bbba", Some(3), Some(3)), + ("a", "bbbab", Some(3), Some(3)), + ("a", "bbbabb", Some(3), Some(3)), + ("a", "bbbabbb", Some(3), Some(3)), + ("a", "bbbbbb", None, None), + ("ab", "", None, None), + ("ab", "a", None, None), + ("ab", "b", None, None), + ("ab", "ab", Some(0), Some(0)), + ("ab", "aab", Some(1), Some(1)), + ("ab", "aaab", Some(2), Some(2)), + ("ab", "abaab", Some(0), Some(3)), + ("ab", "baaab", Some(3), Some(3)), + ("ab", "acb", None, None), + ("ab", "abba", Some(0), Some(0)), + ("abc", "ab", None, None), + ("abc", "abc", Some(0), Some(0)), + ("abc", "abcz", Some(0), Some(0)), + ("abc", "abczz", Some(0), Some(0)), + ("abc", "zabc", Some(1), Some(1)), + ("abc", "zzabc", Some(2), Some(2)), + ("abc", "azbc", None, None), + ("abc", "abzc", None, None), + ("abczdef", "abczdefzzzzzzzzzzzzzzzzzzzz", Some(0), Some(0)), + ("abczdef", "zzzzzzzzzzzzzzzzzzzzabczdef", Some(20), Some(20)), + ("xyz", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaxyz", Some(32), Some(32)), + // Failures caught by quickcheck. + ("\u{0}\u{15}", "\u{0}\u{15}\u{15}\u{0}", Some(0), Some(0)), + ("\u{0}\u{1e}", "\u{1e}\u{0}", None, None), + ]; + + /// Run the substring search tests. `search` should be a closure that + /// accepts a haystack and a needle and returns the starting position + /// of the first occurrence of needle in the haystack, or `None` if one + /// doesn't exist. + pub(crate) fn run_search_tests_fwd( + mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>, + ) { + for &(needle, haystack, expected_fwd, _) in SEARCH_TESTS { + let (n, h) = (needle.as_bytes(), haystack.as_bytes()); + assert_eq!( + expected_fwd, + search(h, n), + "needle: {:?}, haystack: {:?}, expected: {:?}", + n, + h, + expected_fwd + ); + } + } + + /// Run the substring search tests. `search` should be a closure that + /// accepts a haystack and a needle and returns the starting position of + /// the last occurrence of needle in the haystack, or `None` if one doesn't + /// exist. + pub(crate) fn run_search_tests_rev( + mut search: impl FnMut(&[u8], &[u8]) -> Option<usize>, + ) { + for &(needle, haystack, _, expected_rev) in SEARCH_TESTS { + let (n, h) = (needle.as_bytes(), haystack.as_bytes()); + assert_eq!( + expected_rev, + search(h, n), + "needle: {:?}, haystack: {:?}, expected: {:?}", + n, + h, + expected_rev + ); + } + } +} |