Diffstat (limited to 'vendor/memchr/src/memmem/prefilter/genericsimd.rs')
-rw-r--r--  vendor/memchr/src/memmem/prefilter/genericsimd.rs  207
1 file changed, 0 insertions, 207 deletions
diff --git a/vendor/memchr/src/memmem/prefilter/genericsimd.rs b/vendor/memchr/src/memmem/prefilter/genericsimd.rs
deleted file mode 100644
index 1a6e38734..000000000
--- a/vendor/memchr/src/memmem/prefilter/genericsimd.rs
+++ /dev/null
@@ -1,207 +0,0 @@
-use core::mem::size_of;
-
-use crate::memmem::{
-    prefilter::{PrefilterFnTy, PrefilterState},
-    vector::Vector,
-    NeedleInfo,
-};
-
-/// The implementation of the forward vector accelerated candidate finder.
-///
-/// This is inspired by the "generic SIMD" algorithm described here:
-/// http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd
-///
-/// The main difference is that this is just a prefilter. That is, it reports
-/// candidates once they are seen and doesn't attempt to confirm them. Also,
-/// the bytes this routine uses to check for candidates are selected based on
-/// an a priori background frequency distribution. This means that on most
-/// haystacks, this will on average spend more time in vectorized code than you
-/// would if you just selected the first and last bytes of the needle.
-///
-/// Note that a non-prefilter variant of this algorithm can be found in the
-/// parent module, but it only works on smaller needles.
-///
-/// `prestate`, `ninfo`, `haystack` and `needle` are the four prefilter
-/// function parameters. `fallback` is a prefilter that is used if the haystack
-/// is too small to be handled with the given vector size.
-///
-/// This routine is not safe because it is intended for callers to specialize
-/// this with a particular vector (e.g., __m256i) and then call it with the
-/// relevant target feature (e.g., avx2) enabled.
-///
-/// # Panics
-///
-/// If `needle.len() <= 1`, then this panics.
-///
-/// # Safety
-///
-/// Since this is meant to be used with vector functions, callers need to
-/// specialize this inside of a function with a `target_feature` attribute.
-/// Therefore, callers must ensure that whatever target feature is being used
-/// supports the vector functions that this function is specialized for. (For
-/// the specific vector functions used, see the Vector trait implementations.)
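-///
-/// For example, a caller might specialize this for AVX2 with a thin wrapper
-/// like the sketch below. (The wrapper is illustrative only; the name
-/// `find_avx2` and its exact shape are not defined in this file.)
-///
-/// ```ignore
-/// use core::arch::x86_64::__m256i;
-///
-/// #[target_feature(enable = "avx2")]
-/// unsafe fn find_avx2(
-///     prestate: &mut PrefilterState,
-///     ninfo: &NeedleInfo,
-///     haystack: &[u8],
-///     needle: &[u8],
-///     fallback: PrefilterFnTy,
-/// ) -> Option<usize> {
-///     // Sound only because the avx2 target feature is enabled here and
-///     // the Vector impl for __m256i uses only AVX2 operations.
-///     find::<__m256i>(prestate, ninfo, haystack, needle, fallback)
-/// }
-/// ```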
-#[inline(always)]
-pub(crate) unsafe fn find<V: Vector>(
-    prestate: &mut PrefilterState,
-    ninfo: &NeedleInfo,
-    haystack: &[u8],
-    needle: &[u8],
-    fallback: PrefilterFnTy,
-) -> Option<usize> {
-    assert!(needle.len() >= 2, "needle must be at least 2 bytes");
-    let (rare1i, rare2i) = ninfo.rarebytes.as_rare_ordered_usize();
-    let min_haystack_len = rare2i + size_of::<V>();
-    if haystack.len() < min_haystack_len {
-        return fallback(prestate, ninfo, haystack, needle);
-    }
-
-    let start_ptr = haystack.as_ptr();
-    let end_ptr = start_ptr.add(haystack.len());
-    let max_ptr = end_ptr.sub(min_haystack_len);
-    let mut ptr = start_ptr;
-
-    let rare1chunk = V::splat(needle[rare1i]);
-    let rare2chunk = V::splat(needle[rare2i]);
-
-    // N.B. I did experiment with unrolling the loop to deal with size(V)
-    // bytes at a time and 2*size(V) bytes at a time. The double unroll
-    // was marginally faster while the quadruple unroll was unambiguously
-    // slower. In the end, I decided the complexity from unrolling wasn't
-    // worth it. I used the memmem/krate/prebuilt/huge-en/ benchmarks to
-    // compare.
-    while ptr <= max_ptr {
-        let m = find_in_chunk2(ptr, rare1i, rare2i, rare1chunk, rare2chunk);
-        if let Some(chunki) = m {
-            return Some(matched(prestate, start_ptr, ptr, chunki));
-        }
-        ptr = ptr.add(size_of::<V>());
-    }
-    if ptr < end_ptr {
-        // This routine immediately quits if a candidate match is found.
-        // That means that if we're here, no candidate matches have been
-        // found at or before 'ptr'. Thus, we don't need to mask anything
-        // out even though we might technically search part of the haystack
-        // that we've already searched (because we know it can't match).
-        ptr = max_ptr;
-        let m = find_in_chunk2(ptr, rare1i, rare2i, rare1chunk, rare2chunk);
-        if let Some(chunki) = m {
-            return Some(matched(prestate, start_ptr, ptr, chunki));
-        }
-    }
-    prestate.update(haystack.len());
-    None
-}
-
-// Below are two different techniques for checking whether a candidate
-// match exists in a given chunk or not. find_in_chunk2 checks two bytes
-// whereas find_in_chunk3 checks three bytes. The idea behind checking
-// three bytes is that while we do a bit more work per iteration, we
-// decrease the chances of a false positive match being reported and thus
-// make the search faster overall. This actually works out for the
-// memmem/krate/prebuilt/huge-en/never-all-common-bytes benchmark, where
-// using find_in_chunk3 is about 25% faster than find_in_chunk2. However,
-// it turns out that find_in_chunk2 is faster for all other benchmarks, so
-// perhaps the extra check isn't worth it in practice.
-//
-// For now, we go with find_in_chunk2, but we leave find_in_chunk3 around
-// to make it easy to switch to and benchmark when possible.
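-//
-// Concretely, switching would amount to splatting the first needle byte
-// in `find` and threading it through to the chunk check. A sketch of the
-// two changed lines (illustrative, not a drop-in patch):
-//
-//     let firstchunk = V::splat(needle[0]);
-//     let m = find_in_chunk3(ptr, rare1i, rare2i, firstchunk, rare1chunk, rare2chunk);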
-
-/// Search for an occurrence of two rare bytes from the needle in the current
-/// chunk pointed to by ptr.
-///
-/// rare1chunk and rare2chunk correspond to vectors with the rare1 and rare2
-/// bytes repeated in each 8-bit lane, respectively.
-///
-/// # Safety
-///
-/// It must be safe to do an unaligned read of size(V) bytes starting at both
-/// (ptr + rare1i) and (ptr + rare2i).
-#[inline(always)]
-unsafe fn find_in_chunk2<V: Vector>(
-    ptr: *const u8,
-    rare1i: usize,
-    rare2i: usize,
-    rare1chunk: V,
-    rare2chunk: V,
-) -> Option<usize> {
-    let chunk0 = V::load_unaligned(ptr.add(rare1i));
-    let chunk1 = V::load_unaligned(ptr.add(rare2i));
-
-    let eq0 = chunk0.cmpeq(rare1chunk);
-    let eq1 = chunk1.cmpeq(rare2chunk);
-
-    let match_offsets = eq0.and(eq1).movemask();
-    if match_offsets == 0 {
-        return None;
-    }
-    Some(match_offsets.trailing_zeros() as usize)
-}
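-
-/// For intuition, a scalar model of the vector check in find_in_chunk2: it
-/// returns the smallest lane `i` at which both rare bytes line up with the
-/// haystack. (This helper is an illustrative sketch, not part of the
-/// original file, and is never called.)
-///
-/// # Safety
-///
-/// As with find_in_chunk2, it must be valid to read `lanes` bytes starting
-/// at both (ptr + rare1i) and (ptr + rare2i).
-#[allow(dead_code)]
-#[inline(always)]
-unsafe fn find_in_chunk2_scalar(
-    ptr: *const u8,
-    rare1i: usize,
-    rare2i: usize,
-    rare1: u8,
-    rare2: u8,
-    lanes: usize, // stands in for size_of::<V>()
-) -> Option<usize> {
-    for i in 0..lanes {
-        // Mirrors eq0.and(eq1) for a single lane.
-        if *ptr.add(rare1i + i) == rare1 && *ptr.add(rare2i + i) == rare2 {
-            return Some(i);
-        }
-    }
-    None
-}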
-
-/// Search for an occurrence of two rare bytes and the first byte (even if one
-/// of the rare bytes is equivalent to the first byte) from the needle in the
-/// current chunk pointed to by ptr.
-///
-/// firstchunk, rare1chunk and rare2chunk correspond to vectors with the first,
-/// rare1 and rare2 bytes repeated in each 8-bit lane, respectively.
-///
-/// # Safety
-///
-/// It must be safe to do an unaligned read of size(V) bytes starting at ptr,
-/// (ptr + rare1i) and (ptr + rare2i).
-#[allow(dead_code)]
-#[inline(always)]
-unsafe fn find_in_chunk3<V: Vector>(
-    ptr: *const u8,
-    rare1i: usize,
-    rare2i: usize,
-    firstchunk: V,
-    rare1chunk: V,
-    rare2chunk: V,
-) -> Option<usize> {
-    let chunk0 = V::load_unaligned(ptr);
-    let chunk1 = V::load_unaligned(ptr.add(rare1i));
-    let chunk2 = V::load_unaligned(ptr.add(rare2i));
-
-    let eq0 = chunk0.cmpeq(firstchunk);
-    let eq1 = chunk1.cmpeq(rare1chunk);
-    let eq2 = chunk2.cmpeq(rare2chunk);
-
-    let match_offsets = eq0.and(eq1).and(eq2).movemask();
-    if match_offsets == 0 {
-        return None;
-    }
-    Some(match_offsets.trailing_zeros() as usize)
-}
-
-/// Accepts a chunk-relative offset and returns a haystack relative offset
-/// after updating the prefilter state.
-///
-/// Why do we use this non-inlineable function when a search completes? Well,
-/// I don't know. Really. Obviously this function was not here initially.
-/// When doing profiling, the codegen for the inner loop here looked bad and
-/// I didn't know why. There were a couple extra 'add' instructions and an
-/// extra 'lea' instruction that I couldn't explain. I hypothesized that the
-/// optimizer was having trouble untangling the hot code in the loop from the
-/// code that deals with a candidate match. By putting the latter into an
-/// non-inlineable function, it kind of forces the issue and it had the intended
-/// effect: codegen improved measurably. It's good for a ~10% improvement
-/// across the board on the memmem/krate/prebuilt/huge-en/ benchmarks.
-#[cold]
-#[inline(never)]
-fn matched(
-    prestate: &mut PrefilterState,
-    start_ptr: *const u8,
-    ptr: *const u8,
-    chunki: usize,
-) -> usize {
-    let found = diff(ptr, start_ptr) + chunki;
-    prestate.update(found);
-    found
-}
-
-/// Subtract `b` from `a` and return the difference. `a` must be greater than
-/// or equal to `b`.
-fn diff(a: *const u8, b: *const u8) -> usize {
-    debug_assert!(a >= b);
-    (a as usize) - (b as usize)
-}