From 43a97878ce14b72f0981164f87f2e35e14151312 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 11:22:09 +0200 Subject: Adding upstream version 110.0.1. Signed-off-by: Daniel Baumann --- third_party/rust/memchr/src/cow.rs | 97 ++ third_party/rust/memchr/src/lib.rs | 181 +++ third_party/rust/memchr/src/memchr/c.rs | 44 + third_party/rust/memchr/src/memchr/fallback.rs | 329 +++++ third_party/rust/memchr/src/memchr/iter.rs | 173 +++ third_party/rust/memchr/src/memchr/mod.rs | 410 ++++++ third_party/rust/memchr/src/memchr/naive.rs | 25 + third_party/rust/memchr/src/memchr/x86/avx.rs | 755 +++++++++++ third_party/rust/memchr/src/memchr/x86/mod.rs | 148 +++ third_party/rust/memchr/src/memchr/x86/sse2.rs | 791 ++++++++++++ third_party/rust/memchr/src/memchr/x86/sse42.rs | 72 ++ .../rust/memchr/src/memmem/byte_frequencies.rs | 258 ++++ third_party/rust/memchr/src/memmem/genericsimd.rs | 266 ++++ third_party/rust/memchr/src/memmem/mod.rs | 1321 ++++++++++++++++++++ .../rust/memchr/src/memmem/prefilter/fallback.rs | 122 ++ .../memchr/src/memmem/prefilter/genericsimd.rs | 207 +++ .../rust/memchr/src/memmem/prefilter/mod.rs | 570 +++++++++ .../rust/memchr/src/memmem/prefilter/wasm.rs | 39 + .../rust/memchr/src/memmem/prefilter/x86/avx.rs | 46 + .../rust/memchr/src/memmem/prefilter/x86/mod.rs | 5 + .../rust/memchr/src/memmem/prefilter/x86/sse.rs | 42 + third_party/rust/memchr/src/memmem/rabinkarp.rs | 233 ++++ third_party/rust/memchr/src/memmem/rarebytes.rs | 136 ++ third_party/rust/memchr/src/memmem/twoway.rs | 878 +++++++++++++ third_party/rust/memchr/src/memmem/util.rs | 88 ++ third_party/rust/memchr/src/memmem/vector.rs | 131 ++ third_party/rust/memchr/src/memmem/wasm.rs | 75 ++ third_party/rust/memchr/src/memmem/x86/avx.rs | 139 ++ third_party/rust/memchr/src/memmem/x86/mod.rs | 2 + third_party/rust/memchr/src/memmem/x86/sse.rs | 89 ++ third_party/rust/memchr/src/tests/memchr/iter.rs | 230 ++++ third_party/rust/memchr/src/tests/memchr/memchr.rs | 134 ++ third_party/rust/memchr/src/tests/memchr/mod.rs | 7 + third_party/rust/memchr/src/tests/memchr/simple.rs | 23 + .../rust/memchr/src/tests/memchr/testdata.rs | 351 ++++++ third_party/rust/memchr/src/tests/mod.rs | 15 + .../rust/memchr/src/tests/x86_64-soft_float.json | 15 + 37 files changed, 8447 insertions(+) create mode 100644 third_party/rust/memchr/src/cow.rs create mode 100644 third_party/rust/memchr/src/lib.rs create mode 100644 third_party/rust/memchr/src/memchr/c.rs create mode 100644 third_party/rust/memchr/src/memchr/fallback.rs create mode 100644 third_party/rust/memchr/src/memchr/iter.rs create mode 100644 third_party/rust/memchr/src/memchr/mod.rs create mode 100644 third_party/rust/memchr/src/memchr/naive.rs create mode 100644 third_party/rust/memchr/src/memchr/x86/avx.rs create mode 100644 third_party/rust/memchr/src/memchr/x86/mod.rs create mode 100644 third_party/rust/memchr/src/memchr/x86/sse2.rs create mode 100644 third_party/rust/memchr/src/memchr/x86/sse42.rs create mode 100644 third_party/rust/memchr/src/memmem/byte_frequencies.rs create mode 100644 third_party/rust/memchr/src/memmem/genericsimd.rs create mode 100644 third_party/rust/memchr/src/memmem/mod.rs create mode 100644 third_party/rust/memchr/src/memmem/prefilter/fallback.rs create mode 100644 third_party/rust/memchr/src/memmem/prefilter/genericsimd.rs create mode 100644 third_party/rust/memchr/src/memmem/prefilter/mod.rs create mode 100644 third_party/rust/memchr/src/memmem/prefilter/wasm.rs create mode 100644 third_party/rust/memchr/src/memmem/prefilter/x86/avx.rs create mode 100644 third_party/rust/memchr/src/memmem/prefilter/x86/mod.rs create mode 100644 third_party/rust/memchr/src/memmem/prefilter/x86/sse.rs create mode 100644 third_party/rust/memchr/src/memmem/rabinkarp.rs create mode 100644 third_party/rust/memchr/src/memmem/rarebytes.rs create mode 100644 third_party/rust/memchr/src/memmem/twoway.rs create mode 100644 third_party/rust/memchr/src/memmem/util.rs create mode 100644 third_party/rust/memchr/src/memmem/vector.rs create mode 100644 third_party/rust/memchr/src/memmem/wasm.rs create mode 100644 third_party/rust/memchr/src/memmem/x86/avx.rs create mode 100644 third_party/rust/memchr/src/memmem/x86/mod.rs create mode 100644 third_party/rust/memchr/src/memmem/x86/sse.rs create mode 100644 third_party/rust/memchr/src/tests/memchr/iter.rs create mode 100644 third_party/rust/memchr/src/tests/memchr/memchr.rs create mode 100644 third_party/rust/memchr/src/tests/memchr/mod.rs create mode 100644 third_party/rust/memchr/src/tests/memchr/simple.rs create mode 100644 third_party/rust/memchr/src/tests/memchr/testdata.rs create mode 100644 third_party/rust/memchr/src/tests/mod.rs create mode 100644 third_party/rust/memchr/src/tests/x86_64-soft_float.json (limited to 'third_party/rust/memchr/src') diff --git a/third_party/rust/memchr/src/cow.rs b/third_party/rust/memchr/src/cow.rs new file mode 100644 index 0000000000..0b7d0dad09 --- /dev/null +++ b/third_party/rust/memchr/src/cow.rs @@ -0,0 +1,97 @@ +use core::ops; + +/// A specialized copy-on-write byte string. +/// +/// The purpose of this type is to permit usage of a "borrowed or owned +/// byte string" in a way that keeps std/no-std compatibility. That is, in +/// no-std mode, this type devolves into a simple &[u8] with no owned variant +/// available. We can't just use a plain Cow because Cow is not in core. +#[derive(Clone, Debug)] +pub struct CowBytes<'a>(Imp<'a>); + +// N.B. We don't use std::borrow::Cow here since we can get away with a +// Box<[u8]> for our use case, which is 1/3 smaller than the Vec that +// a Cow<[u8]> would use. +#[cfg(feature = "std")] +#[derive(Clone, Debug)] +enum Imp<'a> { + Borrowed(&'a [u8]), + Owned(Box<[u8]>), +} + +#[cfg(not(feature = "std"))] +#[derive(Clone, Debug)] +struct Imp<'a>(&'a [u8]); + +impl<'a> ops::Deref for CowBytes<'a> { + type Target = [u8]; + + #[inline(always)] + fn deref(&self) -> &[u8] { + self.as_slice() + } +} + +impl<'a> CowBytes<'a> { + /// Create a new borrowed CowBytes. + #[inline(always)] + pub fn new>(bytes: &'a B) -> CowBytes<'a> { + CowBytes(Imp::new(bytes.as_ref())) + } + + /// Create a new owned CowBytes. + #[cfg(feature = "std")] + #[inline(always)] + pub fn new_owned(bytes: Box<[u8]>) -> CowBytes<'static> { + CowBytes(Imp::Owned(bytes)) + } + + /// Return a borrowed byte string, regardless of whether this is an owned + /// or borrowed byte string internally. + #[inline(always)] + pub fn as_slice(&self) -> &[u8] { + self.0.as_slice() + } + + /// Return an owned version of this copy-on-write byte string. + /// + /// If this is already an owned byte string internally, then this is a + /// no-op. Otherwise, the internal byte string is copied. + #[cfg(feature = "std")] + #[inline(always)] + pub fn into_owned(self) -> CowBytes<'static> { + match self.0 { + Imp::Borrowed(b) => CowBytes::new_owned(Box::from(b)), + Imp::Owned(b) => CowBytes::new_owned(b), + } + } +} + +impl<'a> Imp<'a> { + #[cfg(feature = "std")] + #[inline(always)] + pub fn new(bytes: &'a [u8]) -> Imp<'a> { + Imp::Borrowed(bytes) + } + + #[cfg(not(feature = "std"))] + #[inline(always)] + pub fn new(bytes: &'a [u8]) -> Imp<'a> { + Imp(bytes) + } + + #[cfg(feature = "std")] + #[inline(always)] + pub fn as_slice(&self) -> &[u8] { + match self { + Imp::Owned(ref x) => x, + Imp::Borrowed(x) => x, + } + } + + #[cfg(not(feature = "std"))] + #[inline(always)] + pub fn as_slice(&self) -> &[u8] { + self.0 + } +} diff --git a/third_party/rust/memchr/src/lib.rs b/third_party/rust/memchr/src/lib.rs new file mode 100644 index 0000000000..e0b4ce3fda --- /dev/null +++ b/third_party/rust/memchr/src/lib.rs @@ -0,0 +1,181 @@ +/*! +This library provides heavily optimized routines for string search primitives. + +# Overview + +This section gives a brief high level overview of what this crate offers. + +* The top-level module provides routines for searching for 1, 2 or 3 bytes + in the forward or reverse direction. When searching for more than one byte, + positions are considered a match if the byte at that position matches any + of the bytes. +* The [`memmem`] sub-module provides forward and reverse substring search + routines. + +In all such cases, routines operate on `&[u8]` without regard to encoding. This +is exactly what you want when searching either UTF-8 or arbitrary bytes. + +# Example: using `memchr` + +This example shows how to use `memchr` to find the first occurrence of `z` in +a haystack: + +``` +use memchr::memchr; + +let haystack = b"foo bar baz quuz"; +assert_eq!(Some(10), memchr(b'z', haystack)); +``` + +# Example: matching one of three possible bytes + +This examples shows how to use `memrchr3` to find occurrences of `a`, `b` or +`c`, starting at the end of the haystack. + +``` +use memchr::memchr3_iter; + +let haystack = b"xyzaxyzbxyzc"; + +let mut it = memchr3_iter(b'a', b'b', b'c', haystack).rev(); +assert_eq!(Some(11), it.next()); +assert_eq!(Some(7), it.next()); +assert_eq!(Some(3), it.next()); +assert_eq!(None, it.next()); +``` + +# Example: iterating over substring matches + +This example shows how to use the [`memmem`] sub-module to find occurrences of +a substring in a haystack. + +``` +use memchr::memmem; + +let haystack = b"foo bar foo baz foo"; + +let mut it = memmem::find_iter(haystack, "foo"); +assert_eq!(Some(0), it.next()); +assert_eq!(Some(8), it.next()); +assert_eq!(Some(16), it.next()); +assert_eq!(None, it.next()); +``` + +# Example: repeating a search for the same needle + +It may be possible for the overhead of constructing a substring searcher to be +measurable in some workloads. In cases where the same needle is used to search +many haystacks, it is possible to do construction once and thus to avoid it for +subsequent searches. This can be done with a [`memmem::Finder`]: + +``` +use memchr::memmem; + +let finder = memmem::Finder::new("foo"); + +assert_eq!(Some(4), finder.find(b"baz foo quux")); +assert_eq!(None, finder.find(b"quux baz bar")); +``` + +# Why use this crate? + +At first glance, the APIs provided by this crate might seem weird. Why provide +a dedicated routine like `memchr` for something that could be implemented +clearly and trivially in one line: + +``` +fn memchr(needle: u8, haystack: &[u8]) -> Option { + haystack.iter().position(|&b| b == needle) +} +``` + +Or similarly, why does this crate provide substring search routines when Rust's +core library already provides them? + +``` +fn search(haystack: &str, needle: &str) -> Option { + haystack.find(needle) +} +``` + +The primary reason for both of them to exist is performance. When it comes to +performance, at a high level at least, there are two primary ways to look at +it: + +* **Throughput**: For this, think about it as, "given some very large haystack + and a byte that never occurs in that haystack, how long does it take to + search through it and determine that it, in fact, does not occur?" +* **Latency**: For this, think about it as, "given a tiny haystack---just a + few bytes---how long does it take to determine if a byte is in it?" + +The `memchr` routine in this crate has _slightly_ worse latency than the +solution presented above, however, its throughput can easily be over an +order of magnitude faster. This is a good general purpose trade off to make. +You rarely lose, but often gain big. + +**NOTE:** The name `memchr` comes from the corresponding routine in libc. A key +advantage of using this library is that its performance is not tied to its +quality of implementation in the libc you happen to be using, which can vary +greatly from platform to platform. + +But what about substring search? This one is a bit more complicated. The +primary reason for its existence is still indeed performance, but it's also +useful because Rust's core library doesn't actually expose any substring +search routine on arbitrary bytes. The only substring search routine that +exists works exclusively on valid UTF-8. + +So if you have valid UTF-8, is there a reason to use this over the standard +library substring search routine? Yes. This routine is faster on almost every +metric, including latency. The natural question then, is why isn't this +implementation in the standard library, even if only for searching on UTF-8? +The reason is that the implementation details for using SIMD in the standard +library haven't quite been worked out yet. + +**NOTE:** Currently, only `x86_64` targets have highly accelerated +implementations of substring search. For `memchr`, all targets have +somewhat-accelerated implementations, while only `x86_64` targets have highly +accelerated implementations. This limitation is expected to be lifted once the +standard library exposes a platform independent SIMD API. + +# Crate features + +* **std** - When enabled (the default), this will permit this crate to use + features specific to the standard library. Currently, the only thing used + from the standard library is runtime SIMD CPU feature detection. This means + that this feature must be enabled to get AVX accelerated routines. When + `std` is not enabled, this crate will still attempt to use SSE2 accelerated + routines on `x86_64`. +* **libc** - When enabled (**not** the default), this library will use your + platform's libc implementation of `memchr` (and `memrchr` on Linux). This + can be useful on non-`x86_64` targets where the fallback implementation in + this crate is not as good as the one found in your libc. All other routines + (e.g., `memchr[23]` and substring search) unconditionally use the + implementation in this crate. +*/ + +#![deny(missing_docs)] +#![cfg_attr(not(feature = "std"), no_std)] +// It's not worth trying to gate all code on just miri, so turn off relevant +// dead code warnings. +#![cfg_attr(miri, allow(dead_code, unused_macros))] + +// Supporting 8-bit (or others) would be fine. If you need it, please submit a +// bug report at https://github.com/BurntSushi/memchr +#[cfg(not(any( + target_pointer_width = "16", + target_pointer_width = "32", + target_pointer_width = "64" +)))] +compile_error!("memchr currently not supported on non-{16,32,64}"); + +pub use crate::memchr::{ + memchr, memchr2, memchr2_iter, memchr3, memchr3_iter, memchr_iter, + memrchr, memrchr2, memrchr2_iter, memrchr3, memrchr3_iter, memrchr_iter, + Memchr, Memchr2, Memchr3, +}; + +mod cow; +mod memchr; +pub mod memmem; +#[cfg(test)] +mod tests; diff --git a/third_party/rust/memchr/src/memchr/c.rs b/third_party/rust/memchr/src/memchr/c.rs new file mode 100644 index 0000000000..608aabc980 --- /dev/null +++ b/third_party/rust/memchr/src/memchr/c.rs @@ -0,0 +1,44 @@ +// This module defines safe wrappers around memchr (POSIX) and memrchr (GNU +// extension). + +#![allow(dead_code)] + +use libc::{c_int, c_void, size_t}; + +pub fn memchr(needle: u8, haystack: &[u8]) -> Option { + // SAFETY: This is safe to call since all pointers are valid. + let p = unsafe { + libc::memchr( + haystack.as_ptr() as *const c_void, + needle as c_int, + haystack.len() as size_t, + ) + }; + if p.is_null() { + None + } else { + Some(p as usize - (haystack.as_ptr() as usize)) + } +} + +// memrchr is a GNU extension. We know it's available on Linux at least. +#[cfg(target_os = "linux")] +pub fn memrchr(needle: u8, haystack: &[u8]) -> Option { + // GNU's memrchr() will - unlike memchr() - error if haystack is empty. + if haystack.is_empty() { + return None; + } + // SAFETY: This is safe to call since all pointers are valid. + let p = unsafe { + libc::memrchr( + haystack.as_ptr() as *const c_void, + needle as c_int, + haystack.len() as size_t, + ) + }; + if p.is_null() { + None + } else { + Some(p as usize - (haystack.as_ptr() as usize)) + } +} diff --git a/third_party/rust/memchr/src/memchr/fallback.rs b/third_party/rust/memchr/src/memchr/fallback.rs new file mode 100644 index 0000000000..b01f224fa5 --- /dev/null +++ b/third_party/rust/memchr/src/memchr/fallback.rs @@ -0,0 +1,329 @@ +// This module defines pure Rust platform independent implementations of all +// the memchr routines. We do our best to make them fast. Some of them may even +// get auto-vectorized. + +use core::{cmp, usize}; + +#[cfg(target_pointer_width = "16")] +const USIZE_BYTES: usize = 2; + +#[cfg(target_pointer_width = "32")] +const USIZE_BYTES: usize = 4; + +#[cfg(target_pointer_width = "64")] +const USIZE_BYTES: usize = 8; + +// The number of bytes to loop at in one iteration of memchr/memrchr. +const LOOP_SIZE: usize = 2 * USIZE_BYTES; + +/// Return `true` if `x` contains any zero byte. +/// +/// From *Matters Computational*, J. Arndt +/// +/// "The idea is to subtract one from each of the bytes and then look for +/// bytes where the borrow propagated all the way to the most significant +/// bit." +#[inline(always)] +fn contains_zero_byte(x: usize) -> bool { + const LO_U64: u64 = 0x0101010101010101; + const HI_U64: u64 = 0x8080808080808080; + + const LO_USIZE: usize = LO_U64 as usize; + const HI_USIZE: usize = HI_U64 as usize; + + x.wrapping_sub(LO_USIZE) & !x & HI_USIZE != 0 +} + +/// Repeat the given byte into a word size number. That is, every 8 bits +/// is equivalent to the given byte. For example, if `b` is `\x4E` or +/// `01001110` in binary, then the returned value on a 32-bit system would be: +/// `01001110_01001110_01001110_01001110`. +#[inline(always)] +fn repeat_byte(b: u8) -> usize { + (b as usize) * (usize::MAX / 255) +} + +pub fn memchr(n1: u8, haystack: &[u8]) -> Option { + let vn1 = repeat_byte(n1); + let confirm = |byte| byte == n1; + let loop_size = cmp::min(LOOP_SIZE, haystack.len()); + let align = USIZE_BYTES - 1; + let start_ptr = haystack.as_ptr(); + let mut ptr = start_ptr; + + unsafe { + let end_ptr = start_ptr.add(haystack.len()); + if haystack.len() < USIZE_BYTES { + return forward_search(start_ptr, end_ptr, ptr, confirm); + } + + let chunk = (ptr as *const usize).read_unaligned(); + if contains_zero_byte(chunk ^ vn1) { + return forward_search(start_ptr, end_ptr, ptr, confirm); + } + + ptr = ptr.add(USIZE_BYTES - (start_ptr as usize & align)); + debug_assert!(ptr > start_ptr); + debug_assert!(end_ptr.sub(USIZE_BYTES) >= start_ptr); + while loop_size == LOOP_SIZE && ptr <= end_ptr.sub(loop_size) { + debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES); + + let a = *(ptr as *const usize); + let b = *(ptr.add(USIZE_BYTES) as *const usize); + let eqa = contains_zero_byte(a ^ vn1); + let eqb = contains_zero_byte(b ^ vn1); + if eqa || eqb { + break; + } + ptr = ptr.add(LOOP_SIZE); + } + forward_search(start_ptr, end_ptr, ptr, confirm) + } +} + +/// Like `memchr`, but searches for two bytes instead of one. +pub fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option { + let vn1 = repeat_byte(n1); + let vn2 = repeat_byte(n2); + let confirm = |byte| byte == n1 || byte == n2; + let align = USIZE_BYTES - 1; + let start_ptr = haystack.as_ptr(); + let mut ptr = start_ptr; + + unsafe { + let end_ptr = start_ptr.add(haystack.len()); + if haystack.len() < USIZE_BYTES { + return forward_search(start_ptr, end_ptr, ptr, confirm); + } + + let chunk = (ptr as *const usize).read_unaligned(); + let eq1 = contains_zero_byte(chunk ^ vn1); + let eq2 = contains_zero_byte(chunk ^ vn2); + if eq1 || eq2 { + return forward_search(start_ptr, end_ptr, ptr, confirm); + } + + ptr = ptr.add(USIZE_BYTES - (start_ptr as usize & align)); + debug_assert!(ptr > start_ptr); + debug_assert!(end_ptr.sub(USIZE_BYTES) >= start_ptr); + while ptr <= end_ptr.sub(USIZE_BYTES) { + debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES); + + let chunk = *(ptr as *const usize); + let eq1 = contains_zero_byte(chunk ^ vn1); + let eq2 = contains_zero_byte(chunk ^ vn2); + if eq1 || eq2 { + break; + } + ptr = ptr.add(USIZE_BYTES); + } + forward_search(start_ptr, end_ptr, ptr, confirm) + } +} + +/// Like `memchr`, but searches for three bytes instead of one. +pub fn memchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option { + let vn1 = repeat_byte(n1); + let vn2 = repeat_byte(n2); + let vn3 = repeat_byte(n3); + let confirm = |byte| byte == n1 || byte == n2 || byte == n3; + let align = USIZE_BYTES - 1; + let start_ptr = haystack.as_ptr(); + let mut ptr = start_ptr; + + unsafe { + let end_ptr = start_ptr.add(haystack.len()); + if haystack.len() < USIZE_BYTES { + return forward_search(start_ptr, end_ptr, ptr, confirm); + } + + let chunk = (ptr as *const usize).read_unaligned(); + let eq1 = contains_zero_byte(chunk ^ vn1); + let eq2 = contains_zero_byte(chunk ^ vn2); + let eq3 = contains_zero_byte(chunk ^ vn3); + if eq1 || eq2 || eq3 { + return forward_search(start_ptr, end_ptr, ptr, confirm); + } + + ptr = ptr.add(USIZE_BYTES - (start_ptr as usize & align)); + debug_assert!(ptr > start_ptr); + debug_assert!(end_ptr.sub(USIZE_BYTES) >= start_ptr); + while ptr <= end_ptr.sub(USIZE_BYTES) { + debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES); + + let chunk = *(ptr as *const usize); + let eq1 = contains_zero_byte(chunk ^ vn1); + let eq2 = contains_zero_byte(chunk ^ vn2); + let eq3 = contains_zero_byte(chunk ^ vn3); + if eq1 || eq2 || eq3 { + break; + } + ptr = ptr.add(USIZE_BYTES); + } + forward_search(start_ptr, end_ptr, ptr, confirm) + } +} + +/// Return the last index matching the byte `x` in `text`. +pub fn memrchr(n1: u8, haystack: &[u8]) -> Option { + let vn1 = repeat_byte(n1); + let confirm = |byte| byte == n1; + let loop_size = cmp::min(LOOP_SIZE, haystack.len()); + let align = USIZE_BYTES - 1; + let start_ptr = haystack.as_ptr(); + + unsafe { + let end_ptr = start_ptr.add(haystack.len()); + let mut ptr = end_ptr; + if haystack.len() < USIZE_BYTES { + return reverse_search(start_ptr, end_ptr, ptr, confirm); + } + + let chunk = (ptr.sub(USIZE_BYTES) as *const usize).read_unaligned(); + if contains_zero_byte(chunk ^ vn1) { + return reverse_search(start_ptr, end_ptr, ptr, confirm); + } + + ptr = (end_ptr as usize & !align) as *const u8; + debug_assert!(start_ptr <= ptr && ptr <= end_ptr); + while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) { + debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES); + + let a = *(ptr.sub(2 * USIZE_BYTES) as *const usize); + let b = *(ptr.sub(1 * USIZE_BYTES) as *const usize); + let eqa = contains_zero_byte(a ^ vn1); + let eqb = contains_zero_byte(b ^ vn1); + if eqa || eqb { + break; + } + ptr = ptr.sub(loop_size); + } + reverse_search(start_ptr, end_ptr, ptr, confirm) + } +} + +/// Like `memrchr`, but searches for two bytes instead of one. +pub fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option { + let vn1 = repeat_byte(n1); + let vn2 = repeat_byte(n2); + let confirm = |byte| byte == n1 || byte == n2; + let align = USIZE_BYTES - 1; + let start_ptr = haystack.as_ptr(); + + unsafe { + let end_ptr = start_ptr.add(haystack.len()); + let mut ptr = end_ptr; + if haystack.len() < USIZE_BYTES { + return reverse_search(start_ptr, end_ptr, ptr, confirm); + } + + let chunk = (ptr.sub(USIZE_BYTES) as *const usize).read_unaligned(); + let eq1 = contains_zero_byte(chunk ^ vn1); + let eq2 = contains_zero_byte(chunk ^ vn2); + if eq1 || eq2 { + return reverse_search(start_ptr, end_ptr, ptr, confirm); + } + + ptr = (end_ptr as usize & !align) as *const u8; + debug_assert!(start_ptr <= ptr && ptr <= end_ptr); + while ptr >= start_ptr.add(USIZE_BYTES) { + debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES); + + let chunk = *(ptr.sub(USIZE_BYTES) as *const usize); + let eq1 = contains_zero_byte(chunk ^ vn1); + let eq2 = contains_zero_byte(chunk ^ vn2); + if eq1 || eq2 { + break; + } + ptr = ptr.sub(USIZE_BYTES); + } + reverse_search(start_ptr, end_ptr, ptr, confirm) + } +} + +/// Like `memrchr`, but searches for three bytes instead of one. +pub fn memrchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option { + let vn1 = repeat_byte(n1); + let vn2 = repeat_byte(n2); + let vn3 = repeat_byte(n3); + let confirm = |byte| byte == n1 || byte == n2 || byte == n3; + let align = USIZE_BYTES - 1; + let start_ptr = haystack.as_ptr(); + + unsafe { + let end_ptr = start_ptr.add(haystack.len()); + let mut ptr = end_ptr; + if haystack.len() < USIZE_BYTES { + return reverse_search(start_ptr, end_ptr, ptr, confirm); + } + + let chunk = (ptr.sub(USIZE_BYTES) as *const usize).read_unaligned(); + let eq1 = contains_zero_byte(chunk ^ vn1); + let eq2 = contains_zero_byte(chunk ^ vn2); + let eq3 = contains_zero_byte(chunk ^ vn3); + if eq1 || eq2 || eq3 { + return reverse_search(start_ptr, end_ptr, ptr, confirm); + } + + ptr = (end_ptr as usize & !align) as *const u8; + debug_assert!(start_ptr <= ptr && ptr <= end_ptr); + while ptr >= start_ptr.add(USIZE_BYTES) { + debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES); + + let chunk = *(ptr.sub(USIZE_BYTES) as *const usize); + let eq1 = contains_zero_byte(chunk ^ vn1); + let eq2 = contains_zero_byte(chunk ^ vn2); + let eq3 = contains_zero_byte(chunk ^ vn3); + if eq1 || eq2 || eq3 { + break; + } + ptr = ptr.sub(USIZE_BYTES); + } + reverse_search(start_ptr, end_ptr, ptr, confirm) + } +} + +#[inline(always)] +unsafe fn forward_search bool>( + start_ptr: *const u8, + end_ptr: *const u8, + mut ptr: *const u8, + confirm: F, +) -> Option { + debug_assert!(start_ptr <= ptr); + debug_assert!(ptr <= end_ptr); + + while ptr < end_ptr { + if confirm(*ptr) { + return Some(sub(ptr, start_ptr)); + } + ptr = ptr.offset(1); + } + None +} + +#[inline(always)] +unsafe fn reverse_search bool>( + start_ptr: *const u8, + end_ptr: *const u8, + mut ptr: *const u8, + confirm: F, +) -> Option { + debug_assert!(start_ptr <= ptr); + debug_assert!(ptr <= end_ptr); + + while ptr > start_ptr { + ptr = ptr.offset(-1); + if confirm(*ptr) { + return Some(sub(ptr, start_ptr)); + } + } + None +} + +/// Subtract `b` from `a` and return the difference. `a` should be greater than +/// or equal to `b`. +fn sub(a: *const u8, b: *const u8) -> usize { + debug_assert!(a >= b); + (a as usize) - (b as usize) +} diff --git a/third_party/rust/memchr/src/memchr/iter.rs b/third_party/rust/memchr/src/memchr/iter.rs new file mode 100644 index 0000000000..16e203f635 --- /dev/null +++ b/third_party/rust/memchr/src/memchr/iter.rs @@ -0,0 +1,173 @@ +use crate::{memchr, memchr2, memchr3, memrchr, memrchr2, memrchr3}; + +macro_rules! iter_next { + // Common code for the memchr iterators: + // update haystack and position and produce the index + // + // self: &mut Self where Self is the iterator + // search_result: Option which is the result of the corresponding + // memchr function. + // + // Returns Option (the next iterator element) + ($self_:expr, $search_result:expr) => { + $search_result.map(move |index| { + // split and take the remaining back half + $self_.haystack = $self_.haystack.split_at(index + 1).1; + let found_position = $self_.position + index; + $self_.position = found_position + 1; + found_position + }) + }; +} + +macro_rules! iter_next_back { + ($self_:expr, $search_result:expr) => { + $search_result.map(move |index| { + // split and take the remaining front half + $self_.haystack = $self_.haystack.split_at(index).0; + $self_.position + index + }) + }; +} + +/// An iterator for `memchr`. +pub struct Memchr<'a> { + needle: u8, + // The haystack to iterate over + haystack: &'a [u8], + // The index + position: usize, +} + +impl<'a> Memchr<'a> { + /// Creates a new iterator that yields all positions of needle in haystack. + #[inline] + pub fn new(needle: u8, haystack: &[u8]) -> Memchr<'_> { + Memchr { needle: needle, haystack: haystack, position: 0 } + } +} + +impl<'a> Iterator for Memchr<'a> { + type Item = usize; + + #[inline] + fn next(&mut self) -> Option { + iter_next!(self, memchr(self.needle, self.haystack)) + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + (0, Some(self.haystack.len())) + } +} + +impl<'a> DoubleEndedIterator for Memchr<'a> { + #[inline] + fn next_back(&mut self) -> Option { + iter_next_back!(self, memrchr(self.needle, self.haystack)) + } +} + +/// An iterator for `memchr2`. +pub struct Memchr2<'a> { + needle1: u8, + needle2: u8, + // The haystack to iterate over + haystack: &'a [u8], + // The index + position: usize, +} + +impl<'a> Memchr2<'a> { + /// Creates a new iterator that yields all positions of needle in haystack. + #[inline] + pub fn new(needle1: u8, needle2: u8, haystack: &[u8]) -> Memchr2<'_> { + Memchr2 { + needle1: needle1, + needle2: needle2, + haystack: haystack, + position: 0, + } + } +} + +impl<'a> Iterator for Memchr2<'a> { + type Item = usize; + + #[inline] + fn next(&mut self) -> Option { + iter_next!(self, memchr2(self.needle1, self.needle2, self.haystack)) + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + (0, Some(self.haystack.len())) + } +} + +impl<'a> DoubleEndedIterator for Memchr2<'a> { + #[inline] + fn next_back(&mut self) -> Option { + iter_next_back!( + self, + memrchr2(self.needle1, self.needle2, self.haystack) + ) + } +} + +/// An iterator for `memchr3`. +pub struct Memchr3<'a> { + needle1: u8, + needle2: u8, + needle3: u8, + // The haystack to iterate over + haystack: &'a [u8], + // The index + position: usize, +} + +impl<'a> Memchr3<'a> { + /// Create a new `Memchr3` that's initialized to zero with a haystack + #[inline] + pub fn new( + needle1: u8, + needle2: u8, + needle3: u8, + haystack: &[u8], + ) -> Memchr3<'_> { + Memchr3 { + needle1: needle1, + needle2: needle2, + needle3: needle3, + haystack: haystack, + position: 0, + } + } +} + +impl<'a> Iterator for Memchr3<'a> { + type Item = usize; + + #[inline] + fn next(&mut self) -> Option { + iter_next!( + self, + memchr3(self.needle1, self.needle2, self.needle3, self.haystack) + ) + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + (0, Some(self.haystack.len())) + } +} + +impl<'a> DoubleEndedIterator for Memchr3<'a> { + #[inline] + fn next_back(&mut self) -> Option { + iter_next_back!( + self, + memrchr3(self.needle1, self.needle2, self.needle3, self.haystack) + ) + } +} diff --git a/third_party/rust/memchr/src/memchr/mod.rs b/third_party/rust/memchr/src/memchr/mod.rs new file mode 100644 index 0000000000..09ce6ef3cd --- /dev/null +++ b/third_party/rust/memchr/src/memchr/mod.rs @@ -0,0 +1,410 @@ +use core::iter::Rev; + +pub use self::iter::{Memchr, Memchr2, Memchr3}; + +// N.B. If you're looking for the cfg knobs for libc, see build.rs. +#[cfg(memchr_libc)] +mod c; +#[allow(dead_code)] +pub mod fallback; +mod iter; +pub mod naive; +#[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))] +mod x86; + +/// An iterator over all occurrences of the needle in a haystack. +#[inline] +pub fn memchr_iter(needle: u8, haystack: &[u8]) -> Memchr<'_> { + Memchr::new(needle, haystack) +} + +/// An iterator over all occurrences of the needles in a haystack. +#[inline] +pub fn memchr2_iter(needle1: u8, needle2: u8, haystack: &[u8]) -> Memchr2<'_> { + Memchr2::new(needle1, needle2, haystack) +} + +/// An iterator over all occurrences of the needles in a haystack. +#[inline] +pub fn memchr3_iter( + needle1: u8, + needle2: u8, + needle3: u8, + haystack: &[u8], +) -> Memchr3<'_> { + Memchr3::new(needle1, needle2, needle3, haystack) +} + +/// An iterator over all occurrences of the needle in a haystack, in reverse. +#[inline] +pub fn memrchr_iter(needle: u8, haystack: &[u8]) -> Rev> { + Memchr::new(needle, haystack).rev() +} + +/// An iterator over all occurrences of the needles in a haystack, in reverse. +#[inline] +pub fn memrchr2_iter( + needle1: u8, + needle2: u8, + haystack: &[u8], +) -> Rev> { + Memchr2::new(needle1, needle2, haystack).rev() +} + +/// An iterator over all occurrences of the needles in a haystack, in reverse. +#[inline] +pub fn memrchr3_iter( + needle1: u8, + needle2: u8, + needle3: u8, + haystack: &[u8], +) -> Rev> { + Memchr3::new(needle1, needle2, needle3, haystack).rev() +} + +/// Search for the first occurrence of a byte in a slice. +/// +/// This returns the index corresponding to the first occurrence of `needle` in +/// `haystack`, or `None` if one is not found. If an index is returned, it is +/// guaranteed to be less than `usize::MAX`. +/// +/// While this is operationally the same as something like +/// `haystack.iter().position(|&b| b == needle)`, `memchr` will use a highly +/// optimized routine that can be up to an order of magnitude faster in some +/// cases. +/// +/// # Example +/// +/// This shows how to find the first position of a byte in a byte string. +/// +/// ``` +/// use memchr::memchr; +/// +/// let haystack = b"the quick brown fox"; +/// assert_eq!(memchr(b'k', haystack), Some(8)); +/// ``` +#[inline] +pub fn memchr(needle: u8, haystack: &[u8]) -> Option { + #[cfg(miri)] + #[inline(always)] + fn imp(n1: u8, haystack: &[u8]) -> Option { + naive::memchr(n1, haystack) + } + + #[cfg(all(target_arch = "x86_64", memchr_runtime_simd, not(miri)))] + #[inline(always)] + fn imp(n1: u8, haystack: &[u8]) -> Option { + x86::memchr(n1, haystack) + } + + #[cfg(all( + memchr_libc, + not(all(target_arch = "x86_64", memchr_runtime_simd)), + not(miri), + ))] + #[inline(always)] + fn imp(n1: u8, haystack: &[u8]) -> Option { + c::memchr(n1, haystack) + } + + #[cfg(all( + not(memchr_libc), + not(all(target_arch = "x86_64", memchr_runtime_simd)), + not(miri), + ))] + #[inline(always)] + fn imp(n1: u8, haystack: &[u8]) -> Option { + fallback::memchr(n1, haystack) + } + + if haystack.is_empty() { + None + } else { + imp(needle, haystack) + } +} + +/// Like `memchr`, but searches for either of two bytes instead of just one. +/// +/// This returns the index corresponding to the first occurrence of `needle1` +/// or the first occurrence of `needle2` in `haystack` (whichever occurs +/// earlier), or `None` if neither one is found. If an index is returned, it is +/// guaranteed to be less than `usize::MAX`. +/// +/// While this is operationally the same as something like +/// `haystack.iter().position(|&b| b == needle1 || b == needle2)`, `memchr2` +/// will use a highly optimized routine that can be up to an order of magnitude +/// faster in some cases. +/// +/// # Example +/// +/// This shows how to find the first position of either of two bytes in a byte +/// string. +/// +/// ``` +/// use memchr::memchr2; +/// +/// let haystack = b"the quick brown fox"; +/// assert_eq!(memchr2(b'k', b'q', haystack), Some(4)); +/// ``` +#[inline] +pub fn memchr2(needle1: u8, needle2: u8, haystack: &[u8]) -> Option { + #[cfg(miri)] + #[inline(always)] + fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option { + naive::memchr2(n1, n2, haystack) + } + + #[cfg(all(target_arch = "x86_64", memchr_runtime_simd, not(miri)))] + #[inline(always)] + fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option { + x86::memchr2(n1, n2, haystack) + } + + #[cfg(all( + not(all(target_arch = "x86_64", memchr_runtime_simd)), + not(miri), + ))] + #[inline(always)] + fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option { + fallback::memchr2(n1, n2, haystack) + } + + if haystack.is_empty() { + None + } else { + imp(needle1, needle2, haystack) + } +} + +/// Like `memchr`, but searches for any of three bytes instead of just one. +/// +/// This returns the index corresponding to the first occurrence of `needle1`, +/// the first occurrence of `needle2`, or the first occurrence of `needle3` in +/// `haystack` (whichever occurs earliest), or `None` if none are found. If an +/// index is returned, it is guaranteed to be less than `usize::MAX`. +/// +/// While this is operationally the same as something like +/// `haystack.iter().position(|&b| b == needle1 || b == needle2 || +/// b == needle3)`, `memchr3` will use a highly optimized routine that can be +/// up to an order of magnitude faster in some cases. +/// +/// # Example +/// +/// This shows how to find the first position of any of three bytes in a byte +/// string. +/// +/// ``` +/// use memchr::memchr3; +/// +/// let haystack = b"the quick brown fox"; +/// assert_eq!(memchr3(b'k', b'q', b'e', haystack), Some(2)); +/// ``` +#[inline] +pub fn memchr3( + needle1: u8, + needle2: u8, + needle3: u8, + haystack: &[u8], +) -> Option { + #[cfg(miri)] + #[inline(always)] + fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option { + naive::memchr3(n1, n2, n3, haystack) + } + + #[cfg(all(target_arch = "x86_64", memchr_runtime_simd, not(miri)))] + #[inline(always)] + fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option { + x86::memchr3(n1, n2, n3, haystack) + } + + #[cfg(all( + not(all(target_arch = "x86_64", memchr_runtime_simd)), + not(miri), + ))] + #[inline(always)] + fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option { + fallback::memchr3(n1, n2, n3, haystack) + } + + if haystack.is_empty() { + None + } else { + imp(needle1, needle2, needle3, haystack) + } +} + +/// Search for the last occurrence of a byte in a slice. +/// +/// This returns the index corresponding to the last occurrence of `needle` in +/// `haystack`, or `None` if one is not found. If an index is returned, it is +/// guaranteed to be less than `usize::MAX`. +/// +/// While this is operationally the same as something like +/// `haystack.iter().rposition(|&b| b == needle)`, `memrchr` will use a highly +/// optimized routine that can be up to an order of magnitude faster in some +/// cases. +/// +/// # Example +/// +/// This shows how to find the last position of a byte in a byte string. +/// +/// ``` +/// use memchr::memrchr; +/// +/// let haystack = b"the quick brown fox"; +/// assert_eq!(memrchr(b'o', haystack), Some(17)); +/// ``` +#[inline] +pub fn memrchr(needle: u8, haystack: &[u8]) -> Option { + #[cfg(miri)] + #[inline(always)] + fn imp(n1: u8, haystack: &[u8]) -> Option { + naive::memrchr(n1, haystack) + } + + #[cfg(all(target_arch = "x86_64", memchr_runtime_simd, not(miri)))] + #[inline(always)] + fn imp(n1: u8, haystack: &[u8]) -> Option { + x86::memrchr(n1, haystack) + } + + #[cfg(all( + memchr_libc, + target_os = "linux", + not(all(target_arch = "x86_64", memchr_runtime_simd)), + not(miri) + ))] + #[inline(always)] + fn imp(n1: u8, haystack: &[u8]) -> Option { + c::memrchr(n1, haystack) + } + + #[cfg(all( + not(all(memchr_libc, target_os = "linux")), + not(all(target_arch = "x86_64", memchr_runtime_simd)), + not(miri), + ))] + #[inline(always)] + fn imp(n1: u8, haystack: &[u8]) -> Option { + fallback::memrchr(n1, haystack) + } + + if haystack.is_empty() { + None + } else { + imp(needle, haystack) + } +} + +/// Like `memrchr`, but searches for either of two bytes instead of just one. +/// +/// This returns the index corresponding to the last occurrence of `needle1` or +/// the last occurrence of `needle2` in `haystack` (whichever occurs later), or +/// `None` if neither one is found. If an index is returned, it is guaranteed +/// to be less than `usize::MAX`. +/// +/// While this is operationally the same as something like +/// `haystack.iter().rposition(|&b| b == needle1 || b == needle2)`, `memrchr2` +/// will use a highly optimized routine that can be up to an order of magnitude +/// faster in some cases. +/// +/// # Example +/// +/// This shows how to find the last position of either of two bytes in a byte +/// string. +/// +/// ``` +/// use memchr::memrchr2; +/// +/// let haystack = b"the quick brown fox"; +/// assert_eq!(memrchr2(b'k', b'q', haystack), Some(8)); +/// ``` +#[inline] +pub fn memrchr2(needle1: u8, needle2: u8, haystack: &[u8]) -> Option { + #[cfg(miri)] + #[inline(always)] + fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option { + naive::memrchr2(n1, n2, haystack) + } + + #[cfg(all(target_arch = "x86_64", memchr_runtime_simd, not(miri)))] + #[inline(always)] + fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option { + x86::memrchr2(n1, n2, haystack) + } + + #[cfg(all( + not(all(target_arch = "x86_64", memchr_runtime_simd)), + not(miri), + ))] + #[inline(always)] + fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option { + fallback::memrchr2(n1, n2, haystack) + } + + if haystack.is_empty() { + None + } else { + imp(needle1, needle2, haystack) + } +} + +/// Like `memrchr`, but searches for any of three bytes instead of just one. +/// +/// This returns the index corresponding to the last occurrence of `needle1`, +/// the last occurrence of `needle2`, or the last occurrence of `needle3` in +/// `haystack` (whichever occurs later), or `None` if none are found. If an +/// index is returned, it is guaranteed to be less than `usize::MAX`. +/// +/// While this is operationally the same as something like +/// `haystack.iter().rposition(|&b| b == needle1 || b == needle2 || +/// b == needle3)`, `memrchr3` will use a highly optimized routine that can be +/// up to an order of magnitude faster in some cases. +/// +/// # Example +/// +/// This shows how to find the last position of any of three bytes in a byte +/// string. +/// +/// ``` +/// use memchr::memrchr3; +/// +/// let haystack = b"the quick brown fox"; +/// assert_eq!(memrchr3(b'k', b'q', b'e', haystack), Some(8)); +/// ``` +#[inline] +pub fn memrchr3( + needle1: u8, + needle2: u8, + needle3: u8, + haystack: &[u8], +) -> Option { + #[cfg(miri)] + #[inline(always)] + fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option { + naive::memrchr3(n1, n2, n3, haystack) + } + + #[cfg(all(target_arch = "x86_64", memchr_runtime_simd, not(miri)))] + #[inline(always)] + fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option { + x86::memrchr3(n1, n2, n3, haystack) + } + + #[cfg(all( + not(all(target_arch = "x86_64", memchr_runtime_simd)), + not(miri), + ))] + #[inline(always)] + fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option { + fallback::memrchr3(n1, n2, n3, haystack) + } + + if haystack.is_empty() { + None + } else { + imp(needle1, needle2, needle3, haystack) + } +} diff --git a/third_party/rust/memchr/src/memchr/naive.rs b/third_party/rust/memchr/src/memchr/naive.rs new file mode 100644 index 0000000000..3f3053d481 --- /dev/null +++ b/third_party/rust/memchr/src/memchr/naive.rs @@ -0,0 +1,25 @@ +#![allow(dead_code)] + +pub fn memchr(n1: u8, haystack: &[u8]) -> Option { + haystack.iter().position(|&b| b == n1) +} + +pub fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option { + haystack.iter().position(|&b| b == n1 || b == n2) +} + +pub fn memchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option { + haystack.iter().position(|&b| b == n1 || b == n2 || b == n3) +} + +pub fn memrchr(n1: u8, haystack: &[u8]) -> Option { + haystack.iter().rposition(|&b| b == n1) +} + +pub fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option { + haystack.iter().rposition(|&b| b == n1 || b == n2) +} + +pub fn memrchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option { + haystack.iter().rposition(|&b| b == n1 || b == n2 || b == n3) +} diff --git a/third_party/rust/memchr/src/memchr/x86/avx.rs b/third_party/rust/memchr/src/memchr/x86/avx.rs new file mode 100644 index 0000000000..535123097c --- /dev/null +++ b/third_party/rust/memchr/src/memchr/x86/avx.rs @@ -0,0 +1,755 @@ +use core::{arch::x86_64::*, cmp, mem::size_of}; + +use super::sse2; + +const VECTOR_SIZE: usize = size_of::<__m256i>(); +const VECTOR_ALIGN: usize = VECTOR_SIZE - 1; + +// The number of bytes to loop at in one iteration of memchr/memrchr. +const LOOP_SIZE: usize = 4 * VECTOR_SIZE; + +// The number of bytes to loop at in one iteration of memchr2/memrchr2 and +// memchr3/memrchr3. There was no observable difference between 128 and 64 +// bytes in benchmarks. memchr3 in particular only gets a very slight speed up +// from the loop unrolling. +const LOOP_SIZE2: usize = 2 * VECTOR_SIZE; + +#[target_feature(enable = "avx2")] +pub unsafe fn memchr(n1: u8, haystack: &[u8]) -> Option { + // For a high level explanation for how this algorithm works, see the + // sse2 implementation. The avx implementation here is the same, but with + // 256-bit vectors instead of 128-bit vectors. + + // This routine is called whenever a match is detected. It is specifically + // marked as unlineable because it improves the codegen of the unrolled + // loop below. Inlining this seems to cause codegen with some extra adds + // and a load that aren't necessary. This seems to result in about a 10% + // improvement for the memchr1/crate/huge/never benchmark. + // + // Interestingly, I couldn't observe a similar improvement for memrchr. + #[cold] + #[inline(never)] + #[target_feature(enable = "avx2")] + unsafe fn matched( + start_ptr: *const u8, + ptr: *const u8, + eqa: __m256i, + eqb: __m256i, + eqc: __m256i, + eqd: __m256i, + ) -> usize { + let mut at = sub(ptr, start_ptr); + let mask = _mm256_movemask_epi8(eqa); + if mask != 0 { + return at + forward_pos(mask); + } + + at += VECTOR_SIZE; + let mask = _mm256_movemask_epi8(eqb); + if mask != 0 { + return at + forward_pos(mask); + } + + at += VECTOR_SIZE; + let mask = _mm256_movemask_epi8(eqc); + if mask != 0 { + return at + forward_pos(mask); + } + + at += VECTOR_SIZE; + let mask = _mm256_movemask_epi8(eqd); + debug_assert!(mask != 0); + at + forward_pos(mask) + } + + let start_ptr = haystack.as_ptr(); + let end_ptr = start_ptr.add(haystack.len()); + let mut ptr = start_ptr; + + if haystack.len() < VECTOR_SIZE { + // For small haystacks, defer to the SSE2 implementation. Codegen + // suggests this completely avoids touching the AVX vectors. + return sse2::memchr(n1, haystack); + } + + let vn1 = _mm256_set1_epi8(n1 as i8); + let loop_size = cmp::min(LOOP_SIZE, haystack.len()); + if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) { + return Some(i); + } + + ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); + debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr); + while loop_size == LOOP_SIZE && ptr <= end_ptr.sub(loop_size) { + debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); + + let a = _mm256_load_si256(ptr as *const __m256i); + let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i); + let c = _mm256_load_si256(ptr.add(2 * VECTOR_SIZE) as *const __m256i); + let d = _mm256_load_si256(ptr.add(3 * VECTOR_SIZE) as *const __m256i); + let eqa = _mm256_cmpeq_epi8(vn1, a); + let eqb = _mm256_cmpeq_epi8(vn1, b); + let eqc = _mm256_cmpeq_epi8(vn1, c); + let eqd = _mm256_cmpeq_epi8(vn1, d); + let or1 = _mm256_or_si256(eqa, eqb); + let or2 = _mm256_or_si256(eqc, eqd); + let or3 = _mm256_or_si256(or1, or2); + + if _mm256_movemask_epi8(or3) != 0 { + return Some(matched(start_ptr, ptr, eqa, eqb, eqc, eqd)); + } + ptr = ptr.add(loop_size); + } + while ptr <= end_ptr.sub(VECTOR_SIZE) { + debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE); + + if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) { + return Some(i); + } + ptr = ptr.add(VECTOR_SIZE); + } + if ptr < end_ptr { + debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE); + ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr)); + debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE); + + return forward_search1(start_ptr, end_ptr, ptr, vn1); + } + None +} + +#[target_feature(enable = "avx2")] +pub unsafe fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option { + #[cold] + #[inline(never)] + #[target_feature(enable = "avx2")] + unsafe fn matched( + start_ptr: *const u8, + ptr: *const u8, + eqa1: __m256i, + eqa2: __m256i, + eqb1: __m256i, + eqb2: __m256i, + ) -> usize { + let mut at = sub(ptr, start_ptr); + let mask1 = _mm256_movemask_epi8(eqa1); + let mask2 = _mm256_movemask_epi8(eqa2); + if mask1 != 0 || mask2 != 0 { + return at + forward_pos2(mask1, mask2); + } + + at += VECTOR_SIZE; + let mask1 = _mm256_movemask_epi8(eqb1); + let mask2 = _mm256_movemask_epi8(eqb2); + at + forward_pos2(mask1, mask2) + } + + let vn1 = _mm256_set1_epi8(n1 as i8); + let vn2 = _mm256_set1_epi8(n2 as i8); + let len = haystack.len(); + let loop_size = cmp::min(LOOP_SIZE2, len); + let start_ptr = haystack.as_ptr(); + let end_ptr = start_ptr.add(haystack.len()); + let mut ptr = start_ptr; + + if haystack.len() < VECTOR_SIZE { + while ptr < end_ptr { + if *ptr == n1 || *ptr == n2 { + return Some(sub(ptr, start_ptr)); + } + ptr = ptr.offset(1); + } + return None; + } + + if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) { + return Some(i); + } + + ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); + debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr); + while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) { + debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); + + let a = _mm256_load_si256(ptr as *const __m256i); + let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i); + let eqa1 = _mm256_cmpeq_epi8(vn1, a); + let eqb1 = _mm256_cmpeq_epi8(vn1, b); + let eqa2 = _mm256_cmpeq_epi8(vn2, a); + let eqb2 = _mm256_cmpeq_epi8(vn2, b); + let or1 = _mm256_or_si256(eqa1, eqb1); + let or2 = _mm256_or_si256(eqa2, eqb2); + let or3 = _mm256_or_si256(or1, or2); + if _mm256_movemask_epi8(or3) != 0 { + return Some(matched(start_ptr, ptr, eqa1, eqa2, eqb1, eqb2)); + } + ptr = ptr.add(loop_size); + } + while ptr <= end_ptr.sub(VECTOR_SIZE) { + if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) { + return Some(i); + } + ptr = ptr.add(VECTOR_SIZE); + } + if ptr < end_ptr { + debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE); + ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr)); + debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE); + + return forward_search2(start_ptr, end_ptr, ptr, vn1, vn2); + } + None +} + +#[target_feature(enable = "avx2")] +pub unsafe fn memchr3( + n1: u8, + n2: u8, + n3: u8, + haystack: &[u8], +) -> Option { + #[cold] + #[inline(never)] + #[target_feature(enable = "avx2")] + unsafe fn matched( + start_ptr: *const u8, + ptr: *const u8, + eqa1: __m256i, + eqa2: __m256i, + eqa3: __m256i, + eqb1: __m256i, + eqb2: __m256i, + eqb3: __m256i, + ) -> usize { + let mut at = sub(ptr, start_ptr); + let mask1 = _mm256_movemask_epi8(eqa1); + let mask2 = _mm256_movemask_epi8(eqa2); + let mask3 = _mm256_movemask_epi8(eqa3); + if mask1 != 0 || mask2 != 0 || mask3 != 0 { + return at + forward_pos3(mask1, mask2, mask3); + } + + at += VECTOR_SIZE; + let mask1 = _mm256_movemask_epi8(eqb1); + let mask2 = _mm256_movemask_epi8(eqb2); + let mask3 = _mm256_movemask_epi8(eqb3); + at + forward_pos3(mask1, mask2, mask3) + } + + let vn1 = _mm256_set1_epi8(n1 as i8); + let vn2 = _mm256_set1_epi8(n2 as i8); + let vn3 = _mm256_set1_epi8(n3 as i8); + let len = haystack.len(); + let loop_size = cmp::min(LOOP_SIZE2, len); + let start_ptr = haystack.as_ptr(); + let end_ptr = start_ptr.add(haystack.len()); + let mut ptr = start_ptr; + + if haystack.len() < VECTOR_SIZE { + while ptr < end_ptr { + if *ptr == n1 || *ptr == n2 || *ptr == n3 { + return Some(sub(ptr, start_ptr)); + } + ptr = ptr.offset(1); + } + return None; + } + + if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) { + return Some(i); + } + + ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); + debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr); + while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) { + debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); + + let a = _mm256_load_si256(ptr as *const __m256i); + let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i); + let eqa1 = _mm256_cmpeq_epi8(vn1, a); + let eqb1 = _mm256_cmpeq_epi8(vn1, b); + let eqa2 = _mm256_cmpeq_epi8(vn2, a); + let eqb2 = _mm256_cmpeq_epi8(vn2, b); + let eqa3 = _mm256_cmpeq_epi8(vn3, a); + let eqb3 = _mm256_cmpeq_epi8(vn3, b); + let or1 = _mm256_or_si256(eqa1, eqb1); + let or2 = _mm256_or_si256(eqa2, eqb2); + let or3 = _mm256_or_si256(eqa3, eqb3); + let or4 = _mm256_or_si256(or1, or2); + let or5 = _mm256_or_si256(or3, or4); + if _mm256_movemask_epi8(or5) != 0 { + return Some(matched( + start_ptr, ptr, eqa1, eqa2, eqa3, eqb1, eqb2, eqb3, + )); + } + ptr = ptr.add(loop_size); + } + while ptr <= end_ptr.sub(VECTOR_SIZE) { + if let Some(i) = + forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) + { + return Some(i); + } + ptr = ptr.add(VECTOR_SIZE); + } + if ptr < end_ptr { + debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE); + ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr)); + debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE); + + return forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3); + } + None +} + +#[target_feature(enable = "avx2")] +pub unsafe fn memrchr(n1: u8, haystack: &[u8]) -> Option { + let vn1 = _mm256_set1_epi8(n1 as i8); + let len = haystack.len(); + let loop_size = cmp::min(LOOP_SIZE, len); + let start_ptr = haystack.as_ptr(); + let end_ptr = start_ptr.add(haystack.len()); + let mut ptr = end_ptr; + + if haystack.len() < VECTOR_SIZE { + while ptr > start_ptr { + ptr = ptr.offset(-1); + if *ptr == n1 { + return Some(sub(ptr, start_ptr)); + } + } + return None; + } + + ptr = ptr.sub(VECTOR_SIZE); + if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) { + return Some(i); + } + + ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8; + debug_assert!(start_ptr <= ptr && ptr <= end_ptr); + while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) { + debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); + + ptr = ptr.sub(loop_size); + let a = _mm256_load_si256(ptr as *const __m256i); + let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i); + let c = _mm256_load_si256(ptr.add(2 * VECTOR_SIZE) as *const __m256i); + let d = _mm256_load_si256(ptr.add(3 * VECTOR_SIZE) as *const __m256i); + let eqa = _mm256_cmpeq_epi8(vn1, a); + let eqb = _mm256_cmpeq_epi8(vn1, b); + let eqc = _mm256_cmpeq_epi8(vn1, c); + let eqd = _mm256_cmpeq_epi8(vn1, d); + let or1 = _mm256_or_si256(eqa, eqb); + let or2 = _mm256_or_si256(eqc, eqd); + let or3 = _mm256_or_si256(or1, or2); + if _mm256_movemask_epi8(or3) != 0 { + let mut at = sub(ptr.add(3 * VECTOR_SIZE), start_ptr); + let mask = _mm256_movemask_epi8(eqd); + if mask != 0 { + return Some(at + reverse_pos(mask)); + } + + at -= VECTOR_SIZE; + let mask = _mm256_movemask_epi8(eqc); + if mask != 0 { + return Some(at + reverse_pos(mask)); + } + + at -= VECTOR_SIZE; + let mask = _mm256_movemask_epi8(eqb); + if mask != 0 { + return Some(at + reverse_pos(mask)); + } + + at -= VECTOR_SIZE; + let mask = _mm256_movemask_epi8(eqa); + debug_assert!(mask != 0); + return Some(at + reverse_pos(mask)); + } + } + while ptr >= start_ptr.add(VECTOR_SIZE) { + ptr = ptr.sub(VECTOR_SIZE); + if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) { + return Some(i); + } + } + if ptr > start_ptr { + debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE); + return reverse_search1(start_ptr, end_ptr, start_ptr, vn1); + } + None +} + +#[target_feature(enable = "avx2")] +pub unsafe fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option { + let vn1 = _mm256_set1_epi8(n1 as i8); + let vn2 = _mm256_set1_epi8(n2 as i8); + let len = haystack.len(); + let loop_size = cmp::min(LOOP_SIZE2, len); + let start_ptr = haystack.as_ptr(); + let end_ptr = start_ptr.add(haystack.len()); + let mut ptr = end_ptr; + + if haystack.len() < VECTOR_SIZE { + while ptr > start_ptr { + ptr = ptr.offset(-1); + if *ptr == n1 || *ptr == n2 { + return Some(sub(ptr, start_ptr)); + } + } + return None; + } + + ptr = ptr.sub(VECTOR_SIZE); + if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) { + return Some(i); + } + + ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8; + debug_assert!(start_ptr <= ptr && ptr <= end_ptr); + while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) { + debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); + + ptr = ptr.sub(loop_size); + let a = _mm256_load_si256(ptr as *const __m256i); + let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i); + let eqa1 = _mm256_cmpeq_epi8(vn1, a); + let eqb1 = _mm256_cmpeq_epi8(vn1, b); + let eqa2 = _mm256_cmpeq_epi8(vn2, a); + let eqb2 = _mm256_cmpeq_epi8(vn2, b); + let or1 = _mm256_or_si256(eqa1, eqb1); + let or2 = _mm256_or_si256(eqa2, eqb2); + let or3 = _mm256_or_si256(or1, or2); + if _mm256_movemask_epi8(or3) != 0 { + let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr); + let mask1 = _mm256_movemask_epi8(eqb1); + let mask2 = _mm256_movemask_epi8(eqb2); + if mask1 != 0 || mask2 != 0 { + return Some(at + reverse_pos2(mask1, mask2)); + } + + at -= VECTOR_SIZE; + let mask1 = _mm256_movemask_epi8(eqa1); + let mask2 = _mm256_movemask_epi8(eqa2); + return Some(at + reverse_pos2(mask1, mask2)); + } + } + while ptr >= start_ptr.add(VECTOR_SIZE) { + ptr = ptr.sub(VECTOR_SIZE); + if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) { + return Some(i); + } + } + if ptr > start_ptr { + debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE); + return reverse_search2(start_ptr, end_ptr, start_ptr, vn1, vn2); + } + None +} + +#[target_feature(enable = "avx2")] +pub unsafe fn memrchr3( + n1: u8, + n2: u8, + n3: u8, + haystack: &[u8], +) -> Option { + let vn1 = _mm256_set1_epi8(n1 as i8); + let vn2 = _mm256_set1_epi8(n2 as i8); + let vn3 = _mm256_set1_epi8(n3 as i8); + let len = haystack.len(); + let loop_size = cmp::min(LOOP_SIZE2, len); + let start_ptr = haystack.as_ptr(); + let end_ptr = start_ptr.add(haystack.len()); + let mut ptr = end_ptr; + + if haystack.len() < VECTOR_SIZE { + while ptr > start_ptr { + ptr = ptr.offset(-1); + if *ptr == n1 || *ptr == n2 || *ptr == n3 { + return Some(sub(ptr, start_ptr)); + } + } + return None; + } + + ptr = ptr.sub(VECTOR_SIZE); + if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) { + return Some(i); + } + + ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8; + debug_assert!(start_ptr <= ptr && ptr <= end_ptr); + while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) { + debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); + + ptr = ptr.sub(loop_size); + let a = _mm256_load_si256(ptr as *const __m256i); + let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i); + let eqa1 = _mm256_cmpeq_epi8(vn1, a); + let eqb1 = _mm256_cmpeq_epi8(vn1, b); + let eqa2 = _mm256_cmpeq_epi8(vn2, a); + let eqb2 = _mm256_cmpeq_epi8(vn2, b); + let eqa3 = _mm256_cmpeq_epi8(vn3, a); + let eqb3 = _mm256_cmpeq_epi8(vn3, b); + let or1 = _mm256_or_si256(eqa1, eqb1); + let or2 = _mm256_or_si256(eqa2, eqb2); + let or3 = _mm256_or_si256(eqa3, eqb3); + let or4 = _mm256_or_si256(or1, or2); + let or5 = _mm256_or_si256(or3, or4); + if _mm256_movemask_epi8(or5) != 0 { + let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr); + let mask1 = _mm256_movemask_epi8(eqb1); + let mask2 = _mm256_movemask_epi8(eqb2); + let mask3 = _mm256_movemask_epi8(eqb3); + if mask1 != 0 || mask2 != 0 || mask3 != 0 { + return Some(at + reverse_pos3(mask1, mask2, mask3)); + } + + at -= VECTOR_SIZE; + let mask1 = _mm256_movemask_epi8(eqa1); + let mask2 = _mm256_movemask_epi8(eqa2); + let mask3 = _mm256_movemask_epi8(eqa3); + return Some(at + reverse_pos3(mask1, mask2, mask3)); + } + } + while ptr >= start_ptr.add(VECTOR_SIZE) { + ptr = ptr.sub(VECTOR_SIZE); + if let Some(i) = + reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) + { + return Some(i); + } + } + if ptr > start_ptr { + debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE); + return reverse_search3(start_ptr, end_ptr, start_ptr, vn1, vn2, vn3); + } + None +} + +#[target_feature(enable = "avx2")] +unsafe fn forward_search1( + start_ptr: *const u8, + end_ptr: *const u8, + ptr: *const u8, + vn1: __m256i, +) -> Option { + debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); + debug_assert!(start_ptr <= ptr); + debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); + + let chunk = _mm256_loadu_si256(ptr as *const __m256i); + let mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, vn1)); + if mask != 0 { + Some(sub(ptr, start_ptr) + forward_pos(mask)) + } else { + None + } +} + +#[target_feature(enable = "avx2")] +unsafe fn forward_search2( + start_ptr: *const u8, + end_ptr: *const u8, + ptr: *const u8, + vn1: __m256i, + vn2: __m256i, +) -> Option { + debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); + debug_assert!(start_ptr <= ptr); + debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); + + let chunk = _mm256_loadu_si256(ptr as *const __m256i); + let eq1 = _mm256_cmpeq_epi8(chunk, vn1); + let eq2 = _mm256_cmpeq_epi8(chunk, vn2); + if _mm256_movemask_epi8(_mm256_or_si256(eq1, eq2)) != 0 { + let mask1 = _mm256_movemask_epi8(eq1); + let mask2 = _mm256_movemask_epi8(eq2); + Some(sub(ptr, start_ptr) + forward_pos2(mask1, mask2)) + } else { + None + } +} + +#[target_feature(enable = "avx2")] +unsafe fn forward_search3( + start_ptr: *const u8, + end_ptr: *const u8, + ptr: *const u8, + vn1: __m256i, + vn2: __m256i, + vn3: __m256i, +) -> Option { + debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); + debug_assert!(start_ptr <= ptr); + debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); + + let chunk = _mm256_loadu_si256(ptr as *const __m256i); + let eq1 = _mm256_cmpeq_epi8(chunk, vn1); + let eq2 = _mm256_cmpeq_epi8(chunk, vn2); + let eq3 = _mm256_cmpeq_epi8(chunk, vn3); + let or = _mm256_or_si256(eq1, eq2); + if _mm256_movemask_epi8(_mm256_or_si256(or, eq3)) != 0 { + let mask1 = _mm256_movemask_epi8(eq1); + let mask2 = _mm256_movemask_epi8(eq2); + let mask3 = _mm256_movemask_epi8(eq3); + Some(sub(ptr, start_ptr) + forward_pos3(mask1, mask2, mask3)) + } else { + None + } +} + +#[target_feature(enable = "avx2")] +unsafe fn reverse_search1( + start_ptr: *const u8, + end_ptr: *const u8, + ptr: *const u8, + vn1: __m256i, +) -> Option { + debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); + debug_assert!(start_ptr <= ptr); + debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); + + let chunk = _mm256_loadu_si256(ptr as *const __m256i); + let mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(vn1, chunk)); + if mask != 0 { + Some(sub(ptr, start_ptr) + reverse_pos(mask)) + } else { + None + } +} + +#[target_feature(enable = "avx2")] +unsafe fn reverse_search2( + start_ptr: *const u8, + end_ptr: *const u8, + ptr: *const u8, + vn1: __m256i, + vn2: __m256i, +) -> Option { + debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); + debug_assert!(start_ptr <= ptr); + debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); + + let chunk = _mm256_loadu_si256(ptr as *const __m256i); + let eq1 = _mm256_cmpeq_epi8(chunk, vn1); + let eq2 = _mm256_cmpeq_epi8(chunk, vn2); + if _mm256_movemask_epi8(_mm256_or_si256(eq1, eq2)) != 0 { + let mask1 = _mm256_movemask_epi8(eq1); + let mask2 = _mm256_movemask_epi8(eq2); + Some(sub(ptr, start_ptr) + reverse_pos2(mask1, mask2)) + } else { + None + } +} + +#[target_feature(enable = "avx2")] +unsafe fn reverse_search3( + start_ptr: *const u8, + end_ptr: *const u8, + ptr: *const u8, + vn1: __m256i, + vn2: __m256i, + vn3: __m256i, +) -> Option { + debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); + debug_assert!(start_ptr <= ptr); + debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); + + let chunk = _mm256_loadu_si256(ptr as *const __m256i); + let eq1 = _mm256_cmpeq_epi8(chunk, vn1); + let eq2 = _mm256_cmpeq_epi8(chunk, vn2); + let eq3 = _mm256_cmpeq_epi8(chunk, vn3); + let or = _mm256_or_si256(eq1, eq2); + if _mm256_movemask_epi8(_mm256_or_si256(or, eq3)) != 0 { + let mask1 = _mm256_movemask_epi8(eq1); + let mask2 = _mm256_movemask_epi8(eq2); + let mask3 = _mm256_movemask_epi8(eq3); + Some(sub(ptr, start_ptr) + reverse_pos3(mask1, mask2, mask3)) + } else { + None + } +} + +/// Compute the position of the first matching byte from the given mask. The +/// position returned is always in the range [0, 31]. +/// +/// The mask given is expected to be the result of _mm256_movemask_epi8. +fn forward_pos(mask: i32) -> usize { + // We are dealing with little endian here, where the most significant byte + // is at a higher address. That means the least significant bit that is set + // corresponds to the position of our first matching byte. That position + // corresponds to the number of zeros after the least significant bit. + mask.trailing_zeros() as usize +} + +/// Compute the position of the first matching byte from the given masks. The +/// position returned is always in the range [0, 31]. Each mask corresponds to +/// the equality comparison of a single byte. +/// +/// The masks given are expected to be the result of _mm256_movemask_epi8, +/// where at least one of the masks is non-zero (i.e., indicates a match). +fn forward_pos2(mask1: i32, mask2: i32) -> usize { + debug_assert!(mask1 != 0 || mask2 != 0); + + forward_pos(mask1 | mask2) +} + +/// Compute the position of the first matching byte from the given masks. The +/// position returned is always in the range [0, 31]. Each mask corresponds to +/// the equality comparison of a single byte. +/// +/// The masks given are expected to be the result of _mm256_movemask_epi8, +/// where at least one of the masks is non-zero (i.e., indicates a match). +fn forward_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize { + debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0); + + forward_pos(mask1 | mask2 | mask3) +} + +/// Compute the position of the last matching byte from the given mask. The +/// position returned is always in the range [0, 31]. +/// +/// The mask given is expected to be the result of _mm256_movemask_epi8. +fn reverse_pos(mask: i32) -> usize { + // We are dealing with little endian here, where the most significant byte + // is at a higher address. That means the most significant bit that is set + // corresponds to the position of our last matching byte. The position from + // the end of the mask is therefore the number of leading zeros in a 32 + // bit integer, and the position from the start of the mask is therefore + // 32 - (leading zeros) - 1. + VECTOR_SIZE - (mask as u32).leading_zeros() as usize - 1 +} + +/// Compute the position of the last matching byte from the given masks. The +/// position returned is always in the range [0, 31]. Each mask corresponds to +/// the equality comparison of a single byte. +/// +/// The masks given are expected to be the result of _mm256_movemask_epi8, +/// where at least one of the masks is non-zero (i.e., indicates a match). +fn reverse_pos2(mask1: i32, mask2: i32) -> usize { + debug_assert!(mask1 != 0 || mask2 != 0); + + reverse_pos(mask1 | mask2) +} + +/// Compute the position of the last matching byte from the given masks. The +/// position returned is always in the range [0, 31]. Each mask corresponds to +/// the equality comparison of a single byte. +/// +/// The masks given are expected to be the result of _mm256_movemask_epi8, +/// where at least one of the masks is non-zero (i.e., indicates a match). +fn reverse_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize { + debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0); + + reverse_pos(mask1 | mask2 | mask3) +} + +/// Subtract `b` from `a` and return the difference. `a` should be greater than +/// or equal to `b`. +fn sub(a: *const u8, b: *const u8) -> usize { + debug_assert!(a >= b); + (a as usize) - (b as usize) +} diff --git a/third_party/rust/memchr/src/memchr/x86/mod.rs b/third_party/rust/memchr/src/memchr/x86/mod.rs new file mode 100644 index 0000000000..aec35dbff0 --- /dev/null +++ b/third_party/rust/memchr/src/memchr/x86/mod.rs @@ -0,0 +1,148 @@ +use super::fallback; + +// We only use AVX when we can detect at runtime whether it's available, which +// requires std. +#[cfg(feature = "std")] +mod avx; +mod sse2; + +/// This macro employs a gcc-like "ifunc" trick where by upon first calling +/// `memchr` (for example), CPU feature detection will be performed at runtime +/// to determine the best implementation to use. After CPU feature detection +/// is done, we replace `memchr`'s function pointer with the selection. Upon +/// subsequent invocations, the CPU-specific routine is invoked directly, which +/// skips the CPU feature detection and subsequent branch that's required. +/// +/// While this typically doesn't matter for rare occurrences or when used on +/// larger haystacks, `memchr` can be called in tight loops where the overhead +/// of this branch can actually add up *and is measurable*. This trick was +/// necessary to bring this implementation up to glibc's speeds for the 'tiny' +/// benchmarks, for example. +/// +/// At some point, I expect the Rust ecosystem will get a nice macro for doing +/// exactly this, at which point, we can replace our hand-jammed version of it. +/// +/// N.B. The ifunc strategy does prevent function inlining of course, but +/// on modern CPUs, you'll probably end up with the AVX2 implementation, +/// which probably can't be inlined anyway---unless you've compiled your +/// entire program with AVX2 enabled. However, even then, the various memchr +/// implementations aren't exactly small, so inlining might not help anyway! +/// +/// # Safety +/// +/// Callers must ensure that fnty is function pointer type. +#[cfg(feature = "std")] +macro_rules! unsafe_ifunc { + ($fnty:ty, $name:ident, $haystack:ident, $($needle:ident),+) => {{ + use std::{mem, sync::atomic::{AtomicPtr, Ordering}}; + + type FnRaw = *mut (); + + static FN: AtomicPtr<()> = AtomicPtr::new(detect as FnRaw); + + fn detect($($needle: u8),+, haystack: &[u8]) -> Option { + let fun = + if cfg!(memchr_runtime_avx) && is_x86_feature_detected!("avx2") { + avx::$name as FnRaw + } else if cfg!(memchr_runtime_sse2) { + sse2::$name as FnRaw + } else { + fallback::$name as FnRaw + }; + FN.store(fun as FnRaw, Ordering::Relaxed); + // SAFETY: By virtue of the caller contract, $fnty is a function + // pointer, which is always safe to transmute with a *mut (). + // Also, if 'fun is the AVX routine, then it is guaranteed to be + // supported since we checked the avx2 feature. + unsafe { + mem::transmute::(fun)($($needle),+, haystack) + } + } + + // SAFETY: By virtue of the caller contract, $fnty is a function + // pointer, which is always safe to transmute with a *mut (). Also, if + // 'fun is the AVX routine, then it is guaranteed to be supported since + // we checked the avx2 feature. + unsafe { + let fun = FN.load(Ordering::Relaxed); + mem::transmute::(fun)($($needle),+, $haystack) + } + }} +} + +/// When std isn't available to provide runtime CPU feature detection, or if +/// runtime CPU feature detection has been explicitly disabled, then just +/// call our optimized SSE2 routine directly. SSE2 is avalbale on all x86_64 +/// targets, so no CPU feature detection is necessary. +/// +/// # Safety +/// +/// There are no safety requirements for this definition of the macro. It is +/// safe for all inputs since it is restricted to either the fallback routine +/// or the SSE routine, which is always safe to call on x86_64. +#[cfg(not(feature = "std"))] +macro_rules! unsafe_ifunc { + ($fnty:ty, $name:ident, $haystack:ident, $($needle:ident),+) => {{ + if cfg!(memchr_runtime_sse2) { + unsafe { sse2::$name($($needle),+, $haystack) } + } else { + fallback::$name($($needle),+, $haystack) + } + }} +} + +#[inline(always)] +pub fn memchr(n1: u8, haystack: &[u8]) -> Option { + unsafe_ifunc!(fn(u8, &[u8]) -> Option, memchr, haystack, n1) +} + +#[inline(always)] +pub fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option { + unsafe_ifunc!( + fn(u8, u8, &[u8]) -> Option, + memchr2, + haystack, + n1, + n2 + ) +} + +#[inline(always)] +pub fn memchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option { + unsafe_ifunc!( + fn(u8, u8, u8, &[u8]) -> Option, + memchr3, + haystack, + n1, + n2, + n3 + ) +} + +#[inline(always)] +pub fn memrchr(n1: u8, haystack: &[u8]) -> Option { + unsafe_ifunc!(fn(u8, &[u8]) -> Option, memrchr, haystack, n1) +} + +#[inline(always)] +pub fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option { + unsafe_ifunc!( + fn(u8, u8, &[u8]) -> Option, + memrchr2, + haystack, + n1, + n2 + ) +} + +#[inline(always)] +pub fn memrchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option { + unsafe_ifunc!( + fn(u8, u8, u8, &[u8]) -> Option, + memrchr3, + haystack, + n1, + n2, + n3 + ) +} diff --git a/third_party/rust/memchr/src/memchr/x86/sse2.rs b/third_party/rust/memchr/src/memchr/x86/sse2.rs new file mode 100644 index 0000000000..b7b3a9328a --- /dev/null +++ b/third_party/rust/memchr/src/memchr/x86/sse2.rs @@ -0,0 +1,791 @@ +use core::{arch::x86_64::*, cmp, mem::size_of}; + +const VECTOR_SIZE: usize = size_of::<__m128i>(); +const VECTOR_ALIGN: usize = VECTOR_SIZE - 1; + +// The number of bytes to loop at in one iteration of memchr/memrchr. +const LOOP_SIZE: usize = 4 * VECTOR_SIZE; + +// The number of bytes to loop at in one iteration of memchr2/memrchr2 and +// memchr3/memrchr3. There was no observable difference between 64 and 32 bytes +// in benchmarks. memchr3 in particular only gets a very slight speed up from +// the loop unrolling. +const LOOP_SIZE2: usize = 2 * VECTOR_SIZE; + +#[target_feature(enable = "sse2")] +pub unsafe fn memchr(n1: u8, haystack: &[u8]) -> Option { + // What follows is a fast SSE2-only algorithm to detect the position of + // `n1` in `haystack` if it exists. From what I know, this is the "classic" + // algorithm. I believe it can be found in places like glibc and Go's + // standard library. It appears to be well known and is elaborated on in + // more detail here: https://gms.tf/stdfind-and-memchr-optimizations.html + // + // While this routine is very long, the basic idea is actually very simple + // and can be expressed straight-forwardly in pseudo code: + // + // needle = (n1 << 15) | (n1 << 14) | ... | (n1 << 1) | n1 + // // Note: shift amount in bytes + // + // while i <= haystack.len() - 16: + // // A 16 byte vector. Each byte in chunk corresponds to a byte in + // // the haystack. + // chunk = haystack[i:i+16] + // // Compare bytes in needle with bytes in chunk. The result is a 16 + // // byte chunk where each byte is 0xFF if the corresponding bytes + // // in needle and chunk were equal, or 0x00 otherwise. + // eqs = cmpeq(needle, chunk) + // // Return a 32 bit integer where the most significant 16 bits + // // are always 0 and the lower 16 bits correspond to whether the + // // most significant bit in the correspond byte in `eqs` is set. + // // In other words, `mask as u16` has bit i set if and only if + // // needle[i] == chunk[i]. + // mask = movemask(eqs) + // + // // Mask is 0 if there is no match, and non-zero otherwise. + // if mask != 0: + // // trailing_zeros tells us the position of the least significant + // // bit that is set. + // return i + trailing_zeros(mask) + // + // // haystack length may not be a multiple of 16, so search the rest. + // while i < haystack.len(): + // if haystack[i] == n1: + // return i + // + // // No match found. + // return NULL + // + // In fact, we could loosely translate the above code to Rust line-for-line + // and it would be a pretty fast algorithm. But, we pull out all the stops + // to go as fast as possible: + // + // 1. We use aligned loads. That is, we do some finagling to make sure our + // primary loop not only proceeds in increments of 16 bytes, but that + // the address of haystack's pointer that we dereference is aligned to + // 16 bytes. 16 is a magic number here because it is the size of SSE2 + // 128-bit vector. (For the AVX2 algorithm, 32 is the magic number.) + // Therefore, to get aligned loads, our pointer's address must be evenly + // divisible by 16. + // 2. Our primary loop proceeds 64 bytes at a time instead of 16. It's + // kind of like loop unrolling, but we combine the equality comparisons + // using a vector OR such that we only need to extract a single mask to + // determine whether a match exists or not. If so, then we do some + // book-keeping to determine the precise location but otherwise mush on. + // 3. We use our "chunk" comparison routine in as many places as possible, + // even if it means using unaligned loads. In particular, if haystack + // starts with an unaligned address, then we do an unaligned load to + // search the first 16 bytes. We then start our primary loop at the + // smallest subsequent aligned address, which will actually overlap with + // previously searched bytes. But we're OK with that. We do a similar + // dance at the end of our primary loop. Finally, to avoid a + // byte-at-a-time loop at the end, we do a final 16 byte unaligned load + // that may overlap with a previous load. This is OK because it converts + // a loop into a small number of very fast vector instructions. + // + // The primary downside of this algorithm is that it's effectively + // completely unsafe. Therefore, we have to be super careful to avoid + // undefined behavior: + // + // 1. We use raw pointers everywhere. Not only does dereferencing a pointer + // require the pointer to be valid, but we actually can't even store the + // address of an invalid pointer (unless it's 1 past the end of + // haystack) without sacrificing performance. + // 2. _mm_loadu_si128 is used when you don't care about alignment, and + // _mm_load_si128 is used when you do care. You cannot use the latter + // on unaligned pointers. + // 3. We make liberal use of debug_assert! to check assumptions. + // 4. We make a concerted effort to stick with pointers instead of indices. + // Indices are nicer because there's less to worry about with them (see + // above about pointer offsets), but I could not get the compiler to + // produce as good of code as what the below produces. In any case, + // pointers are what we really care about here, and alignment is + // expressed a bit more naturally with them. + // + // In general, most of the algorithms in this crate have a similar + // structure to what you see below, so this comment applies fairly well to + // all of them. + + let vn1 = _mm_set1_epi8(n1 as i8); + let len = haystack.len(); + let loop_size = cmp::min(LOOP_SIZE, len); + let start_ptr = haystack.as_ptr(); + let end_ptr = start_ptr.add(haystack.len()); + let mut ptr = start_ptr; + + if haystack.len() < VECTOR_SIZE { + while ptr < end_ptr { + if *ptr == n1 { + return Some(sub(ptr, start_ptr)); + } + ptr = ptr.offset(1); + } + return None; + } + + if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) { + return Some(i); + } + + ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); + debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr); + while loop_size == LOOP_SIZE && ptr <= end_ptr.sub(loop_size) { + debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); + + let a = _mm_load_si128(ptr as *const __m128i); + let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); + let c = _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i); + let d = _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i); + let eqa = _mm_cmpeq_epi8(vn1, a); + let eqb = _mm_cmpeq_epi8(vn1, b); + let eqc = _mm_cmpeq_epi8(vn1, c); + let eqd = _mm_cmpeq_epi8(vn1, d); + let or1 = _mm_or_si128(eqa, eqb); + let or2 = _mm_or_si128(eqc, eqd); + let or3 = _mm_or_si128(or1, or2); + if _mm_movemask_epi8(or3) != 0 { + let mut at = sub(ptr, start_ptr); + let mask = _mm_movemask_epi8(eqa); + if mask != 0 { + return Some(at + forward_pos(mask)); + } + + at += VECTOR_SIZE; + let mask = _mm_movemask_epi8(eqb); + if mask != 0 { + return Some(at + forward_pos(mask)); + } + + at += VECTOR_SIZE; + let mask = _mm_movemask_epi8(eqc); + if mask != 0 { + return Some(at + forward_pos(mask)); + } + + at += VECTOR_SIZE; + let mask = _mm_movemask_epi8(eqd); + debug_assert!(mask != 0); + return Some(at + forward_pos(mask)); + } + ptr = ptr.add(loop_size); + } + while ptr <= end_ptr.sub(VECTOR_SIZE) { + debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE); + + if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) { + return Some(i); + } + ptr = ptr.add(VECTOR_SIZE); + } + if ptr < end_ptr { + debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE); + ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr)); + debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE); + + return forward_search1(start_ptr, end_ptr, ptr, vn1); + } + None +} + +#[target_feature(enable = "sse2")] +pub unsafe fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option { + let vn1 = _mm_set1_epi8(n1 as i8); + let vn2 = _mm_set1_epi8(n2 as i8); + let len = haystack.len(); + let loop_size = cmp::min(LOOP_SIZE2, len); + let start_ptr = haystack.as_ptr(); + let end_ptr = start_ptr.add(haystack.len()); + let mut ptr = start_ptr; + + if haystack.len() < VECTOR_SIZE { + while ptr < end_ptr { + if *ptr == n1 || *ptr == n2 { + return Some(sub(ptr, start_ptr)); + } + ptr = ptr.offset(1); + } + return None; + } + + if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) { + return Some(i); + } + + ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); + debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr); + while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) { + debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); + + let a = _mm_load_si128(ptr as *const __m128i); + let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); + let eqa1 = _mm_cmpeq_epi8(vn1, a); + let eqb1 = _mm_cmpeq_epi8(vn1, b); + let eqa2 = _mm_cmpeq_epi8(vn2, a); + let eqb2 = _mm_cmpeq_epi8(vn2, b); + let or1 = _mm_or_si128(eqa1, eqb1); + let or2 = _mm_or_si128(eqa2, eqb2); + let or3 = _mm_or_si128(or1, or2); + if _mm_movemask_epi8(or3) != 0 { + let mut at = sub(ptr, start_ptr); + let mask1 = _mm_movemask_epi8(eqa1); + let mask2 = _mm_movemask_epi8(eqa2); + if mask1 != 0 || mask2 != 0 { + return Some(at + forward_pos2(mask1, mask2)); + } + + at += VECTOR_SIZE; + let mask1 = _mm_movemask_epi8(eqb1); + let mask2 = _mm_movemask_epi8(eqb2); + return Some(at + forward_pos2(mask1, mask2)); + } + ptr = ptr.add(loop_size); + } + while ptr <= end_ptr.sub(VECTOR_SIZE) { + if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) { + return Some(i); + } + ptr = ptr.add(VECTOR_SIZE); + } + if ptr < end_ptr { + debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE); + ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr)); + debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE); + + return forward_search2(start_ptr, end_ptr, ptr, vn1, vn2); + } + None +} + +#[target_feature(enable = "sse2")] +pub unsafe fn memchr3( + n1: u8, + n2: u8, + n3: u8, + haystack: &[u8], +) -> Option { + let vn1 = _mm_set1_epi8(n1 as i8); + let vn2 = _mm_set1_epi8(n2 as i8); + let vn3 = _mm_set1_epi8(n3 as i8); + let len = haystack.len(); + let loop_size = cmp::min(LOOP_SIZE2, len); + let start_ptr = haystack.as_ptr(); + let end_ptr = start_ptr.add(haystack.len()); + let mut ptr = start_ptr; + + if haystack.len() < VECTOR_SIZE { + while ptr < end_ptr { + if *ptr == n1 || *ptr == n2 || *ptr == n3 { + return Some(sub(ptr, start_ptr)); + } + ptr = ptr.offset(1); + } + return None; + } + + if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) { + return Some(i); + } + + ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); + debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr); + while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) { + debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); + + let a = _mm_load_si128(ptr as *const __m128i); + let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); + let eqa1 = _mm_cmpeq_epi8(vn1, a); + let eqb1 = _mm_cmpeq_epi8(vn1, b); + let eqa2 = _mm_cmpeq_epi8(vn2, a); + let eqb2 = _mm_cmpeq_epi8(vn2, b); + let eqa3 = _mm_cmpeq_epi8(vn3, a); + let eqb3 = _mm_cmpeq_epi8(vn3, b); + let or1 = _mm_or_si128(eqa1, eqb1); + let or2 = _mm_or_si128(eqa2, eqb2); + let or3 = _mm_or_si128(eqa3, eqb3); + let or4 = _mm_or_si128(or1, or2); + let or5 = _mm_or_si128(or3, or4); + if _mm_movemask_epi8(or5) != 0 { + let mut at = sub(ptr, start_ptr); + let mask1 = _mm_movemask_epi8(eqa1); + let mask2 = _mm_movemask_epi8(eqa2); + let mask3 = _mm_movemask_epi8(eqa3); + if mask1 != 0 || mask2 != 0 || mask3 != 0 { + return Some(at + forward_pos3(mask1, mask2, mask3)); + } + + at += VECTOR_SIZE; + let mask1 = _mm_movemask_epi8(eqb1); + let mask2 = _mm_movemask_epi8(eqb2); + let mask3 = _mm_movemask_epi8(eqb3); + return Some(at + forward_pos3(mask1, mask2, mask3)); + } + ptr = ptr.add(loop_size); + } + while ptr <= end_ptr.sub(VECTOR_SIZE) { + if let Some(i) = + forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) + { + return Some(i); + } + ptr = ptr.add(VECTOR_SIZE); + } + if ptr < end_ptr { + debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE); + ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr)); + debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE); + + return forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3); + } + None +} + +#[target_feature(enable = "sse2")] +pub unsafe fn memrchr(n1: u8, haystack: &[u8]) -> Option { + let vn1 = _mm_set1_epi8(n1 as i8); + let len = haystack.len(); + let loop_size = cmp::min(LOOP_SIZE, len); + let start_ptr = haystack.as_ptr(); + let end_ptr = start_ptr.add(haystack.len()); + let mut ptr = end_ptr; + + if haystack.len() < VECTOR_SIZE { + while ptr > start_ptr { + ptr = ptr.offset(-1); + if *ptr == n1 { + return Some(sub(ptr, start_ptr)); + } + } + return None; + } + + ptr = ptr.sub(VECTOR_SIZE); + if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) { + return Some(i); + } + + ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8; + debug_assert!(start_ptr <= ptr && ptr <= end_ptr); + while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) { + debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); + + ptr = ptr.sub(loop_size); + let a = _mm_load_si128(ptr as *const __m128i); + let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); + let c = _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i); + let d = _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i); + let eqa = _mm_cmpeq_epi8(vn1, a); + let eqb = _mm_cmpeq_epi8(vn1, b); + let eqc = _mm_cmpeq_epi8(vn1, c); + let eqd = _mm_cmpeq_epi8(vn1, d); + let or1 = _mm_or_si128(eqa, eqb); + let or2 = _mm_or_si128(eqc, eqd); + let or3 = _mm_or_si128(or1, or2); + if _mm_movemask_epi8(or3) != 0 { + let mut at = sub(ptr.add(3 * VECTOR_SIZE), start_ptr); + let mask = _mm_movemask_epi8(eqd); + if mask != 0 { + return Some(at + reverse_pos(mask)); + } + + at -= VECTOR_SIZE; + let mask = _mm_movemask_epi8(eqc); + if mask != 0 { + return Some(at + reverse_pos(mask)); + } + + at -= VECTOR_SIZE; + let mask = _mm_movemask_epi8(eqb); + if mask != 0 { + return Some(at + reverse_pos(mask)); + } + + at -= VECTOR_SIZE; + let mask = _mm_movemask_epi8(eqa); + debug_assert!(mask != 0); + return Some(at + reverse_pos(mask)); + } + } + while ptr >= start_ptr.add(VECTOR_SIZE) { + ptr = ptr.sub(VECTOR_SIZE); + if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) { + return Some(i); + } + } + if ptr > start_ptr { + debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE); + return reverse_search1(start_ptr, end_ptr, start_ptr, vn1); + } + None +} + +#[target_feature(enable = "sse2")] +pub unsafe fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option { + let vn1 = _mm_set1_epi8(n1 as i8); + let vn2 = _mm_set1_epi8(n2 as i8); + let len = haystack.len(); + let loop_size = cmp::min(LOOP_SIZE2, len); + let start_ptr = haystack.as_ptr(); + let end_ptr = start_ptr.add(haystack.len()); + let mut ptr = end_ptr; + + if haystack.len() < VECTOR_SIZE { + while ptr > start_ptr { + ptr = ptr.offset(-1); + if *ptr == n1 || *ptr == n2 { + return Some(sub(ptr, start_ptr)); + } + } + return None; + } + + ptr = ptr.sub(VECTOR_SIZE); + if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) { + return Some(i); + } + + ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8; + debug_assert!(start_ptr <= ptr && ptr <= end_ptr); + while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) { + debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); + + ptr = ptr.sub(loop_size); + let a = _mm_load_si128(ptr as *const __m128i); + let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); + let eqa1 = _mm_cmpeq_epi8(vn1, a); + let eqb1 = _mm_cmpeq_epi8(vn1, b); + let eqa2 = _mm_cmpeq_epi8(vn2, a); + let eqb2 = _mm_cmpeq_epi8(vn2, b); + let or1 = _mm_or_si128(eqa1, eqb1); + let or2 = _mm_or_si128(eqa2, eqb2); + let or3 = _mm_or_si128(or1, or2); + if _mm_movemask_epi8(or3) != 0 { + let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr); + let mask1 = _mm_movemask_epi8(eqb1); + let mask2 = _mm_movemask_epi8(eqb2); + if mask1 != 0 || mask2 != 0 { + return Some(at + reverse_pos2(mask1, mask2)); + } + + at -= VECTOR_SIZE; + let mask1 = _mm_movemask_epi8(eqa1); + let mask2 = _mm_movemask_epi8(eqa2); + return Some(at + reverse_pos2(mask1, mask2)); + } + } + while ptr >= start_ptr.add(VECTOR_SIZE) { + ptr = ptr.sub(VECTOR_SIZE); + if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) { + return Some(i); + } + } + if ptr > start_ptr { + debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE); + return reverse_search2(start_ptr, end_ptr, start_ptr, vn1, vn2); + } + None +} + +#[target_feature(enable = "sse2")] +pub unsafe fn memrchr3( + n1: u8, + n2: u8, + n3: u8, + haystack: &[u8], +) -> Option { + let vn1 = _mm_set1_epi8(n1 as i8); + let vn2 = _mm_set1_epi8(n2 as i8); + let vn3 = _mm_set1_epi8(n3 as i8); + let len = haystack.len(); + let loop_size = cmp::min(LOOP_SIZE2, len); + let start_ptr = haystack.as_ptr(); + let end_ptr = start_ptr.add(haystack.len()); + let mut ptr = end_ptr; + + if haystack.len() < VECTOR_SIZE { + while ptr > start_ptr { + ptr = ptr.offset(-1); + if *ptr == n1 || *ptr == n2 || *ptr == n3 { + return Some(sub(ptr, start_ptr)); + } + } + return None; + } + + ptr = ptr.sub(VECTOR_SIZE); + if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) { + return Some(i); + } + + ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8; + debug_assert!(start_ptr <= ptr && ptr <= end_ptr); + while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) { + debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); + + ptr = ptr.sub(loop_size); + let a = _mm_load_si128(ptr as *const __m128i); + let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); + let eqa1 = _mm_cmpeq_epi8(vn1, a); + let eqb1 = _mm_cmpeq_epi8(vn1, b); + let eqa2 = _mm_cmpeq_epi8(vn2, a); + let eqb2 = _mm_cmpeq_epi8(vn2, b); + let eqa3 = _mm_cmpeq_epi8(vn3, a); + let eqb3 = _mm_cmpeq_epi8(vn3, b); + let or1 = _mm_or_si128(eqa1, eqb1); + let or2 = _mm_or_si128(eqa2, eqb2); + let or3 = _mm_or_si128(eqa3, eqb3); + let or4 = _mm_or_si128(or1, or2); + let or5 = _mm_or_si128(or3, or4); + if _mm_movemask_epi8(or5) != 0 { + let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr); + let mask1 = _mm_movemask_epi8(eqb1); + let mask2 = _mm_movemask_epi8(eqb2); + let mask3 = _mm_movemask_epi8(eqb3); + if mask1 != 0 || mask2 != 0 || mask3 != 0 { + return Some(at + reverse_pos3(mask1, mask2, mask3)); + } + + at -= VECTOR_SIZE; + let mask1 = _mm_movemask_epi8(eqa1); + let mask2 = _mm_movemask_epi8(eqa2); + let mask3 = _mm_movemask_epi8(eqa3); + return Some(at + reverse_pos3(mask1, mask2, mask3)); + } + } + while ptr >= start_ptr.add(VECTOR_SIZE) { + ptr = ptr.sub(VECTOR_SIZE); + if let Some(i) = + reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) + { + return Some(i); + } + } + if ptr > start_ptr { + debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE); + return reverse_search3(start_ptr, end_ptr, start_ptr, vn1, vn2, vn3); + } + None +} + +#[target_feature(enable = "sse2")] +pub unsafe fn forward_search1( + start_ptr: *const u8, + end_ptr: *const u8, + ptr: *const u8, + vn1: __m128i, +) -> Option { + debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); + debug_assert!(start_ptr <= ptr); + debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); + + let chunk = _mm_loadu_si128(ptr as *const __m128i); + let mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, vn1)); + if mask != 0 { + Some(sub(ptr, start_ptr) + forward_pos(mask)) + } else { + None + } +} + +#[target_feature(enable = "sse2")] +unsafe fn forward_search2( + start_ptr: *const u8, + end_ptr: *const u8, + ptr: *const u8, + vn1: __m128i, + vn2: __m128i, +) -> Option { + debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); + debug_assert!(start_ptr <= ptr); + debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); + + let chunk = _mm_loadu_si128(ptr as *const __m128i); + let eq1 = _mm_cmpeq_epi8(chunk, vn1); + let eq2 = _mm_cmpeq_epi8(chunk, vn2); + if _mm_movemask_epi8(_mm_or_si128(eq1, eq2)) != 0 { + let mask1 = _mm_movemask_epi8(eq1); + let mask2 = _mm_movemask_epi8(eq2); + Some(sub(ptr, start_ptr) + forward_pos2(mask1, mask2)) + } else { + None + } +} + +#[target_feature(enable = "sse2")] +pub unsafe fn forward_search3( + start_ptr: *const u8, + end_ptr: *const u8, + ptr: *const u8, + vn1: __m128i, + vn2: __m128i, + vn3: __m128i, +) -> Option { + debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); + debug_assert!(start_ptr <= ptr); + debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); + + let chunk = _mm_loadu_si128(ptr as *const __m128i); + let eq1 = _mm_cmpeq_epi8(chunk, vn1); + let eq2 = _mm_cmpeq_epi8(chunk, vn2); + let eq3 = _mm_cmpeq_epi8(chunk, vn3); + let or = _mm_or_si128(eq1, eq2); + if _mm_movemask_epi8(_mm_or_si128(or, eq3)) != 0 { + let mask1 = _mm_movemask_epi8(eq1); + let mask2 = _mm_movemask_epi8(eq2); + let mask3 = _mm_movemask_epi8(eq3); + Some(sub(ptr, start_ptr) + forward_pos3(mask1, mask2, mask3)) + } else { + None + } +} + +#[target_feature(enable = "sse2")] +unsafe fn reverse_search1( + start_ptr: *const u8, + end_ptr: *const u8, + ptr: *const u8, + vn1: __m128i, +) -> Option { + debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); + debug_assert!(start_ptr <= ptr); + debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); + + let chunk = _mm_loadu_si128(ptr as *const __m128i); + let mask = _mm_movemask_epi8(_mm_cmpeq_epi8(vn1, chunk)); + if mask != 0 { + Some(sub(ptr, start_ptr) + reverse_pos(mask)) + } else { + None + } +} + +#[target_feature(enable = "sse2")] +unsafe fn reverse_search2( + start_ptr: *const u8, + end_ptr: *const u8, + ptr: *const u8, + vn1: __m128i, + vn2: __m128i, +) -> Option { + debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); + debug_assert!(start_ptr <= ptr); + debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); + + let chunk = _mm_loadu_si128(ptr as *const __m128i); + let eq1 = _mm_cmpeq_epi8(chunk, vn1); + let eq2 = _mm_cmpeq_epi8(chunk, vn2); + if _mm_movemask_epi8(_mm_or_si128(eq1, eq2)) != 0 { + let mask1 = _mm_movemask_epi8(eq1); + let mask2 = _mm_movemask_epi8(eq2); + Some(sub(ptr, start_ptr) + reverse_pos2(mask1, mask2)) + } else { + None + } +} + +#[target_feature(enable = "sse2")] +unsafe fn reverse_search3( + start_ptr: *const u8, + end_ptr: *const u8, + ptr: *const u8, + vn1: __m128i, + vn2: __m128i, + vn3: __m128i, +) -> Option { + debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); + debug_assert!(start_ptr <= ptr); + debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); + + let chunk = _mm_loadu_si128(ptr as *const __m128i); + let eq1 = _mm_cmpeq_epi8(chunk, vn1); + let eq2 = _mm_cmpeq_epi8(chunk, vn2); + let eq3 = _mm_cmpeq_epi8(chunk, vn3); + let or = _mm_or_si128(eq1, eq2); + if _mm_movemask_epi8(_mm_or_si128(or, eq3)) != 0 { + let mask1 = _mm_movemask_epi8(eq1); + let mask2 = _mm_movemask_epi8(eq2); + let mask3 = _mm_movemask_epi8(eq3); + Some(sub(ptr, start_ptr) + reverse_pos3(mask1, mask2, mask3)) + } else { + None + } +} + +/// Compute the position of the first matching byte from the given mask. The +/// position returned is always in the range [0, 15]. +/// +/// The mask given is expected to be the result of _mm_movemask_epi8. +fn forward_pos(mask: i32) -> usize { + // We are dealing with little endian here, where the most significant byte + // is at a higher address. That means the least significant bit that is set + // corresponds to the position of our first matching byte. That position + // corresponds to the number of zeros after the least significant bit. + mask.trailing_zeros() as usize +} + +/// Compute the position of the first matching byte from the given masks. The +/// position returned is always in the range [0, 15]. Each mask corresponds to +/// the equality comparison of a single byte. +/// +/// The masks given are expected to be the result of _mm_movemask_epi8, where +/// at least one of the masks is non-zero (i.e., indicates a match). +fn forward_pos2(mask1: i32, mask2: i32) -> usize { + debug_assert!(mask1 != 0 || mask2 != 0); + + forward_pos(mask1 | mask2) +} + +/// Compute the position of the first matching byte from the given masks. The +/// position returned is always in the range [0, 15]. Each mask corresponds to +/// the equality comparison of a single byte. +/// +/// The masks given are expected to be the result of _mm_movemask_epi8, where +/// at least one of the masks is non-zero (i.e., indicates a match). +fn forward_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize { + debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0); + + forward_pos(mask1 | mask2 | mask3) +} + +/// Compute the position of the last matching byte from the given mask. The +/// position returned is always in the range [0, 15]. +/// +/// The mask given is expected to be the result of _mm_movemask_epi8. +fn reverse_pos(mask: i32) -> usize { + // We are dealing with little endian here, where the most significant byte + // is at a higher address. That means the most significant bit that is set + // corresponds to the position of our last matching byte. The position from + // the end of the mask is therefore the number of leading zeros in a 16 + // bit integer, and the position from the start of the mask is therefore + // 16 - (leading zeros) - 1. + VECTOR_SIZE - (mask as u16).leading_zeros() as usize - 1 +} + +/// Compute the position of the last matching byte from the given masks. The +/// position returned is always in the range [0, 15]. Each mask corresponds to +/// the equality comparison of a single byte. +/// +/// The masks given are expected to be the result of _mm_movemask_epi8, where +/// at least one of the masks is non-zero (i.e., indicates a match). +fn reverse_pos2(mask1: i32, mask2: i32) -> usize { + debug_assert!(mask1 != 0 || mask2 != 0); + + reverse_pos(mask1 | mask2) +} + +/// Compute the position of the last matching byte from the given masks. The +/// position returned is always in the range [0, 15]. Each mask corresponds to +/// the equality comparison of a single byte. +/// +/// The masks given are expected to be the result of _mm_movemask_epi8, where +/// at least one of the masks is non-zero (i.e., indicates a match). +fn reverse_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize { + debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0); + + reverse_pos(mask1 | mask2 | mask3) +} + +/// Subtract `b` from `a` and return the difference. `a` should be greater than +/// or equal to `b`. +fn sub(a: *const u8, b: *const u8) -> usize { + debug_assert!(a >= b); + (a as usize) - (b as usize) +} diff --git a/third_party/rust/memchr/src/memchr/x86/sse42.rs b/third_party/rust/memchr/src/memchr/x86/sse42.rs new file mode 100644 index 0000000000..da38e50c2d --- /dev/null +++ b/third_party/rust/memchr/src/memchr/x86/sse42.rs @@ -0,0 +1,72 @@ +// This code is unused. PCMPESTRI is gratuitously slow. I imagine it might +// start winning with a hypothetical memchr4 (or greater). This technique might +// also be good for exposing searches over ranges of bytes, but that departs +// from the standard memchr API, so it's not clear whether we actually want +// that or not. +// +// N.B. PCMPISTRI appears to be about twice as fast as PCMPESTRI, which is kind +// of neat. Unfortunately, UTF-8 strings can contain NUL bytes, which means +// I don't see a way of effectively using PCMPISTRI unless there's some fast +// way to replace zero bytes with a byte that is not not a needle byte. + +use core::{arch::x86_64::*, mem::size_of}; + +use x86::sse2; + +const VECTOR_SIZE: usize = size_of::<__m128i>(); +const CONTROL_ANY: i32 = _SIDD_UBYTE_OPS + | _SIDD_CMP_EQUAL_ANY + | _SIDD_POSITIVE_POLARITY + | _SIDD_LEAST_SIGNIFICANT; + +#[target_feature(enable = "sse4.2")] +pub unsafe fn memchr3( + n1: u8, + n2: u8, + n3: u8, + haystack: &[u8], +) -> Option { + let vn1 = _mm_set1_epi8(n1 as i8); + let vn2 = _mm_set1_epi8(n2 as i8); + let vn3 = _mm_set1_epi8(n3 as i8); + let vn = _mm_setr_epi8( + n1 as i8, n2 as i8, n3 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ); + let len = haystack.len(); + let start_ptr = haystack.as_ptr(); + let end_ptr = haystack[haystack.len()..].as_ptr(); + let mut ptr = start_ptr; + + if haystack.len() < VECTOR_SIZE { + while ptr < end_ptr { + if *ptr == n1 || *ptr == n2 || *ptr == n3 { + return Some(sub(ptr, start_ptr)); + } + ptr = ptr.offset(1); + } + return None; + } + while ptr <= end_ptr.sub(VECTOR_SIZE) { + let chunk = _mm_loadu_si128(ptr as *const __m128i); + let res = _mm_cmpestri(vn, 3, chunk, 16, CONTROL_ANY); + if res < 16 { + return Some(sub(ptr, start_ptr) + res as usize); + } + ptr = ptr.add(VECTOR_SIZE); + } + if ptr < end_ptr { + debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE); + ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr)); + debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE); + + return sse2::forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3); + } + None +} + +/// Subtract `b` from `a` and return the difference. `a` should be greater than +/// or equal to `b`. +fn sub(a: *const u8, b: *const u8) -> usize { + debug_assert!(a >= b); + (a as usize) - (b as usize) +} diff --git a/third_party/rust/memchr/src/memmem/byte_frequencies.rs b/third_party/rust/memchr/src/memmem/byte_frequencies.rs new file mode 100644 index 0000000000..c313b629db --- /dev/null +++ b/third_party/rust/memchr/src/memmem/byte_frequencies.rs @@ -0,0 +1,258 @@ +pub const BYTE_FREQUENCIES: [u8; 256] = [ + 55, // '\x00' + 52, // '\x01' + 51, // '\x02' + 50, // '\x03' + 49, // '\x04' + 48, // '\x05' + 47, // '\x06' + 46, // '\x07' + 45, // '\x08' + 103, // '\t' + 242, // '\n' + 66, // '\x0b' + 67, // '\x0c' + 229, // '\r' + 44, // '\x0e' + 43, // '\x0f' + 42, // '\x10' + 41, // '\x11' + 40, // '\x12' + 39, // '\x13' + 38, // '\x14' + 37, // '\x15' + 36, // '\x16' + 35, // '\x17' + 34, // '\x18' + 33, // '\x19' + 56, // '\x1a' + 32, // '\x1b' + 31, // '\x1c' + 30, // '\x1d' + 29, // '\x1e' + 28, // '\x1f' + 255, // ' ' + 148, // '!' + 164, // '"' + 149, // '#' + 136, // '$' + 160, // '%' + 155, // '&' + 173, // "'" + 221, // '(' + 222, // ')' + 134, // '*' + 122, // '+' + 232, // ',' + 202, // '-' + 215, // '.' + 224, // '/' + 208, // '0' + 220, // '1' + 204, // '2' + 187, // '3' + 183, // '4' + 179, // '5' + 177, // '6' + 168, // '7' + 178, // '8' + 200, // '9' + 226, // ':' + 195, // ';' + 154, // '<' + 184, // '=' + 174, // '>' + 126, // '?' + 120, // '@' + 191, // 'A' + 157, // 'B' + 194, // 'C' + 170, // 'D' + 189, // 'E' + 162, // 'F' + 161, // 'G' + 150, // 'H' + 193, // 'I' + 142, // 'J' + 137, // 'K' + 171, // 'L' + 176, // 'M' + 185, // 'N' + 167, // 'O' + 186, // 'P' + 112, // 'Q' + 175, // 'R' + 192, // 'S' + 188, // 'T' + 156, // 'U' + 140, // 'V' + 143, // 'W' + 123, // 'X' + 133, // 'Y' + 128, // 'Z' + 147, // '[' + 138, // '\\' + 146, // ']' + 114, // '^' + 223, // '_' + 151, // '`' + 249, // 'a' + 216, // 'b' + 238, // 'c' + 236, // 'd' + 253, // 'e' + 227, // 'f' + 218, // 'g' + 230, // 'h' + 247, // 'i' + 135, // 'j' + 180, // 'k' + 241, // 'l' + 233, // 'm' + 246, // 'n' + 244, // 'o' + 231, // 'p' + 139, // 'q' + 245, // 'r' + 243, // 's' + 251, // 't' + 235, // 'u' + 201, // 'v' + 196, // 'w' + 240, // 'x' + 214, // 'y' + 152, // 'z' + 182, // '{' + 205, // '|' + 181, // '}' + 127, // '~' + 27, // '\x7f' + 212, // '\x80' + 211, // '\x81' + 210, // '\x82' + 213, // '\x83' + 228, // '\x84' + 197, // '\x85' + 169, // '\x86' + 159, // '\x87' + 131, // '\x88' + 172, // '\x89' + 105, // '\x8a' + 80, // '\x8b' + 98, // '\x8c' + 96, // '\x8d' + 97, // '\x8e' + 81, // '\x8f' + 207, // '\x90' + 145, // '\x91' + 116, // '\x92' + 115, // '\x93' + 144, // '\x94' + 130, // '\x95' + 153, // '\x96' + 121, // '\x97' + 107, // '\x98' + 132, // '\x99' + 109, // '\x9a' + 110, // '\x9b' + 124, // '\x9c' + 111, // '\x9d' + 82, // '\x9e' + 108, // '\x9f' + 118, // '\xa0' + 141, // '¡' + 113, // '¢' + 129, // '£' + 119, // '¤' + 125, // '¥' + 165, // '¦' + 117, // '§' + 92, // '¨' + 106, // '©' + 83, // 'ª' + 72, // '«' + 99, // '¬' + 93, // '\xad' + 65, // '®' + 79, // '¯' + 166, // '°' + 237, // '±' + 163, // '²' + 199, // '³' + 190, // '´' + 225, // 'µ' + 209, // '¶' + 203, // '·' + 198, // '¸' + 217, // '¹' + 219, // 'º' + 206, // '»' + 234, // '¼' + 248, // '½' + 158, // '¾' + 239, // '¿' + 255, // 'À' + 255, // 'Á' + 255, // 'Â' + 255, // 'Ã' + 255, // 'Ä' + 255, // 'Å' + 255, // 'Æ' + 255, // 'Ç' + 255, // 'È' + 255, // 'É' + 255, // 'Ê' + 255, // 'Ë' + 255, // 'Ì' + 255, // 'Í' + 255, // 'Î' + 255, // 'Ï' + 255, // 'Ð' + 255, // 'Ñ' + 255, // 'Ò' + 255, // 'Ó' + 255, // 'Ô' + 255, // 'Õ' + 255, // 'Ö' + 255, // '×' + 255, // 'Ø' + 255, // 'Ù' + 255, // 'Ú' + 255, // 'Û' + 255, // 'Ü' + 255, // 'Ý' + 255, // 'Þ' + 255, // 'ß' + 255, // 'à' + 255, // 'á' + 255, // 'â' + 255, // 'ã' + 255, // 'ä' + 255, // 'å' + 255, // 'æ' + 255, // 'ç' + 255, // 'è' + 255, // 'é' + 255, // 'ê' + 255, // 'ë' + 255, // 'ì' + 255, // 'í' + 255, // 'î' + 255, // 'ï' + 255, // 'ð' + 255, // 'ñ' + 255, // 'ò' + 255, // 'ó' + 255, // 'ô' + 255, // 'õ' + 255, // 'ö' + 255, // '÷' + 255, // 'ø' + 255, // 'ù' + 255, // 'ú' + 255, // 'û' + 255, // 'ü' + 255, // 'ý' + 255, // 'þ' + 255, // 'ÿ' +]; diff --git a/third_party/rust/memchr/src/memmem/genericsimd.rs b/third_party/rust/memchr/src/memmem/genericsimd.rs new file mode 100644 index 0000000000..28bfdab880 --- /dev/null +++ b/third_party/rust/memchr/src/memmem/genericsimd.rs @@ -0,0 +1,266 @@ +use core::mem::size_of; + +use crate::memmem::{util::memcmp, vector::Vector, NeedleInfo}; + +/// The minimum length of a needle required for this algorithm. The minimum +/// is 2 since a length of 1 should just use memchr and a length of 0 isn't +/// a case handled by this searcher. +pub(crate) const MIN_NEEDLE_LEN: usize = 2; + +/// The maximum length of a needle required for this algorithm. +/// +/// In reality, there is no hard max here. The code below can handle any +/// length needle. (Perhaps that suggests there are missing optimizations.) +/// Instead, this is a heuristic and a bound guaranteeing our linear time +/// complexity. +/// +/// It is a heuristic because when a candidate match is found, memcmp is run. +/// For very large needles with lots of false positives, memcmp can make the +/// code run quite slow. +/// +/// It is a bound because the worst case behavior with memcmp is multiplicative +/// in the size of the needle and haystack, and we want to keep that additive. +/// This bound ensures we still meet that bound theoretically, since it's just +/// a constant. We aren't acting in bad faith here, memcmp on tiny needles +/// is so fast that even in pathological cases (see pathological vector +/// benchmarks), this is still just as fast or faster in practice. +/// +/// This specific number was chosen by tweaking a bit and running benchmarks. +/// The rare-medium-needle, for example, gets about 5% faster by using this +/// algorithm instead of a prefilter-accelerated Two-Way. There's also a +/// theoretical desire to keep this number reasonably low, to mitigate the +/// impact of pathological cases. I did try 64, and some benchmarks got a +/// little better, and others (particularly the pathological ones), got a lot +/// worse. So... 32 it is? +pub(crate) const MAX_NEEDLE_LEN: usize = 32; + +/// The implementation of the forward vector accelerated substring search. +/// +/// This is extremely similar to the prefilter vector module by the same name. +/// The key difference is that this is not a prefilter. Instead, it handles +/// confirming its own matches. The trade off is that this only works with +/// smaller needles. The speed up here is that an inlined memcmp on a tiny +/// needle is very quick, even on pathological inputs. This is much better than +/// combining a prefilter with Two-Way, where using Two-Way to confirm the +/// match has higher latency. +/// +/// So why not use this for all needles? We could, and it would probably work +/// really well on most inputs. But its worst case is multiplicative and we +/// want to guarantee worst case additive time. Some of the benchmarks try to +/// justify this (see the pathological ones). +/// +/// The prefilter variant of this has more comments. Also note that we only +/// implement this for forward searches for now. If you have a compelling use +/// case for accelerated reverse search, please file an issue. +#[derive(Clone, Copy, Debug)] +pub(crate) struct Forward { + rare1i: u8, + rare2i: u8, +} + +impl Forward { + /// Create a new "generic simd" forward searcher. If one could not be + /// created from the given inputs, then None is returned. + pub(crate) fn new(ninfo: &NeedleInfo, needle: &[u8]) -> Option { + let (rare1i, rare2i) = ninfo.rarebytes.as_rare_ordered_u8(); + // If the needle is too short or too long, give up. Also, give up + // if the rare bytes detected are at the same position. (It likely + // suggests a degenerate case, although it should technically not be + // possible.) + if needle.len() < MIN_NEEDLE_LEN + || needle.len() > MAX_NEEDLE_LEN + || rare1i == rare2i + { + return None; + } + Some(Forward { rare1i, rare2i }) + } + + /// Returns the minimum length of haystack that is needed for this searcher + /// to work for a particular vector. Passing a haystack with a length + /// smaller than this will cause `fwd_find` to panic. + #[inline(always)] + pub(crate) fn min_haystack_len(&self) -> usize { + self.rare2i as usize + size_of::() + } +} + +/// Searches the given haystack for the given needle. The needle given should +/// be the same as the needle that this searcher was initialized with. +/// +/// # Panics +/// +/// When the given haystack has a length smaller than `min_haystack_len`. +/// +/// # Safety +/// +/// Since this is meant to be used with vector functions, callers need to +/// specialize this inside of a function with a `target_feature` attribute. +/// Therefore, callers must ensure that whatever target feature is being used +/// supports the vector functions that this function is specialized for. (For +/// the specific vector functions used, see the Vector trait implementations.) +#[inline(always)] +pub(crate) unsafe fn fwd_find( + fwd: &Forward, + haystack: &[u8], + needle: &[u8], +) -> Option { + // It would be nice if we didn't have this check here, since the meta + // searcher should handle it for us. But without this, I don't think we + // guarantee that end_ptr.sub(needle.len()) won't result in UB. We could + // put it as part of the safety contract, but it makes it more complicated + // than necessary. + if haystack.len() < needle.len() { + return None; + } + let min_haystack_len = fwd.min_haystack_len::(); + assert!(haystack.len() >= min_haystack_len, "haystack too small"); + debug_assert!(needle.len() <= haystack.len()); + debug_assert!( + needle.len() >= MIN_NEEDLE_LEN, + "needle must be at least {} bytes", + MIN_NEEDLE_LEN, + ); + debug_assert!( + needle.len() <= MAX_NEEDLE_LEN, + "needle must be at most {} bytes", + MAX_NEEDLE_LEN, + ); + + let (rare1i, rare2i) = (fwd.rare1i as usize, fwd.rare2i as usize); + let rare1chunk = V::splat(needle[rare1i]); + let rare2chunk = V::splat(needle[rare2i]); + + let start_ptr = haystack.as_ptr(); + let end_ptr = start_ptr.add(haystack.len()); + let max_ptr = end_ptr.sub(min_haystack_len); + let mut ptr = start_ptr; + + // N.B. I did experiment with unrolling the loop to deal with size(V) + // bytes at a time and 2*size(V) bytes at a time. The double unroll was + // marginally faster while the quadruple unroll was unambiguously slower. + // In the end, I decided the complexity from unrolling wasn't worth it. I + // used the memmem/krate/prebuilt/huge-en/ benchmarks to compare. + while ptr <= max_ptr { + let m = fwd_find_in_chunk( + fwd, needle, ptr, end_ptr, rare1chunk, rare2chunk, !0, + ); + if let Some(chunki) = m { + return Some(matched(start_ptr, ptr, chunki)); + } + ptr = ptr.add(size_of::()); + } + if ptr < end_ptr { + let remaining = diff(end_ptr, ptr); + debug_assert!( + remaining < min_haystack_len, + "remaining bytes should be smaller than the minimum haystack \ + length of {}, but there are {} bytes remaining", + min_haystack_len, + remaining, + ); + if remaining < needle.len() { + return None; + } + debug_assert!( + max_ptr < ptr, + "after main loop, ptr should have exceeded max_ptr", + ); + let overlap = diff(ptr, max_ptr); + debug_assert!( + overlap > 0, + "overlap ({}) must always be non-zero", + overlap, + ); + debug_assert!( + overlap < size_of::(), + "overlap ({}) cannot possibly be >= than a vector ({})", + overlap, + size_of::(), + ); + // The mask has all of its bits set except for the first N least + // significant bits, where N=overlap. This way, any matches that + // occur in find_in_chunk within the overlap are automatically + // ignored. + let mask = !((1 << overlap) - 1); + ptr = max_ptr; + let m = fwd_find_in_chunk( + fwd, needle, ptr, end_ptr, rare1chunk, rare2chunk, mask, + ); + if let Some(chunki) = m { + return Some(matched(start_ptr, ptr, chunki)); + } + } + None +} + +/// Search for an occurrence of two rare bytes from the needle in the chunk +/// pointed to by ptr, with the end of the haystack pointed to by end_ptr. When +/// an occurrence is found, memcmp is run to check if a match occurs at the +/// corresponding position. +/// +/// rare1chunk and rare2chunk correspond to vectors with the rare1 and rare2 +/// bytes repeated in each 8-bit lane, respectively. +/// +/// mask should have bits set corresponding the positions in the chunk in which +/// matches are considered. This is only used for the last vector load where +/// the beginning of the vector might have overlapped with the last load in +/// the main loop. The mask lets us avoid visiting positions that have already +/// been discarded as matches. +/// +/// # Safety +/// +/// It must be safe to do an unaligned read of size(V) bytes starting at both +/// (ptr + rare1i) and (ptr + rare2i). It must also be safe to do unaligned +/// loads on ptr up to (end_ptr - needle.len()). +#[inline(always)] +unsafe fn fwd_find_in_chunk( + fwd: &Forward, + needle: &[u8], + ptr: *const u8, + end_ptr: *const u8, + rare1chunk: V, + rare2chunk: V, + mask: u32, +) -> Option { + let chunk0 = V::load_unaligned(ptr.add(fwd.rare1i as usize)); + let chunk1 = V::load_unaligned(ptr.add(fwd.rare2i as usize)); + + let eq0 = chunk0.cmpeq(rare1chunk); + let eq1 = chunk1.cmpeq(rare2chunk); + + let mut match_offsets = eq0.and(eq1).movemask() & mask; + while match_offsets != 0 { + let offset = match_offsets.trailing_zeros() as usize; + let ptr = ptr.add(offset); + if end_ptr.sub(needle.len()) < ptr { + return None; + } + let chunk = core::slice::from_raw_parts(ptr, needle.len()); + if memcmp(needle, chunk) { + return Some(offset); + } + match_offsets &= match_offsets - 1; + } + None +} + +/// Accepts a chunk-relative offset and returns a haystack relative offset +/// after updating the prefilter state. +/// +/// See the same function with the same name in the prefilter variant of this +/// algorithm to learned why it's tagged with inline(never). Even here, where +/// the function is simpler, inlining it leads to poorer codegen. (Although +/// it does improve some benchmarks, like prebuiltiter/huge-en/common-you.) +#[cold] +#[inline(never)] +fn matched(start_ptr: *const u8, ptr: *const u8, chunki: usize) -> usize { + diff(ptr, start_ptr) + chunki +} + +/// Subtract `b` from `a` and return the difference. `a` must be greater than +/// or equal to `b`. +fn diff(a: *const u8, b: *const u8) -> usize { + debug_assert!(a >= b); + (a as usize) - (b as usize) +} diff --git a/third_party/rust/memchr/src/memmem/mod.rs b/third_party/rust/memchr/src/memmem/mod.rs new file mode 100644 index 0000000000..e1cd1aec76 --- /dev/null +++ b/third_party/rust/memchr/src/memmem/mod.rs @@ -0,0 +1,1321 @@ +/*! +This module provides forward and reverse substring search routines. + +Unlike the standard library's substring search routines, these work on +arbitrary bytes. For all non-empty needles, these routines will report exactly +the same values as the corresponding routines in the standard library. For +the empty needle, the standard library reports matches only at valid UTF-8 +boundaries, where as these routines will report matches at every position. + +Other than being able to work on arbitrary bytes, the primary reason to prefer +these routines over the standard library routines is that these will generally +be faster. In some cases, significantly so. + +# Example: iterating over substring matches + +This example shows how to use [`find_iter`] to find occurrences of a substring +in a haystack. + +``` +use memchr::memmem; + +let haystack = b"foo bar foo baz foo"; + +let mut it = memmem::find_iter(haystack, "foo"); +assert_eq!(Some(0), it.next()); +assert_eq!(Some(8), it.next()); +assert_eq!(Some(16), it.next()); +assert_eq!(None, it.next()); +``` + +# Example: iterating over substring matches in reverse + +This example shows how to use [`rfind_iter`] to find occurrences of a substring +in a haystack starting from the end of the haystack. + +**NOTE:** This module does not implement double ended iterators, so reverse +searches aren't done by calling `rev` on a forward iterator. + +``` +use memchr::memmem; + +let haystack = b"foo bar foo baz foo"; + +let mut it = memmem::rfind_iter(haystack, "foo"); +assert_eq!(Some(16), it.next()); +assert_eq!(Some(8), it.next()); +assert_eq!(Some(0), it.next()); +assert_eq!(None, it.next()); +``` + +# Example: repeating a search for the same needle + +It may be possible for the overhead of constructing a substring searcher to be +measurable in some workloads. In cases where the same needle is used to search +many haystacks, it is possible to do construction once and thus to avoid it for +subsequent searches. This can be done with a [`Finder`] (or a [`FinderRev`] for +reverse searches). + +``` +use memchr::memmem; + +let finder = memmem::Finder::new("foo"); + +assert_eq!(Some(4), finder.find(b"baz foo quux")); +assert_eq!(None, finder.find(b"quux baz bar")); +``` +*/ + +pub use self::prefilter::Prefilter; + +use crate::{ + cow::CowBytes, + memmem::{ + prefilter::{Pre, PrefilterFn, PrefilterState}, + rabinkarp::NeedleHash, + rarebytes::RareNeedleBytes, + }, +}; + +/// Defines a suite of quickcheck properties for forward and reverse +/// substring searching. +/// +/// This is defined in this specific spot so that it can be used freely among +/// the different substring search implementations. I couldn't be bothered to +/// fight with the macro-visibility rules enough to figure out how to stuff it +/// somewhere more convenient. +#[cfg(all(test, feature = "std"))] +macro_rules! define_memmem_quickcheck_tests { + ($fwd:expr, $rev:expr) => { + use crate::memmem::proptests; + + quickcheck::quickcheck! { + fn qc_fwd_prefix_is_substring(bs: Vec) -> bool { + proptests::prefix_is_substring(false, &bs, $fwd) + } + + fn qc_fwd_suffix_is_substring(bs: Vec) -> bool { + proptests::suffix_is_substring(false, &bs, $fwd) + } + + fn qc_fwd_matches_naive( + haystack: Vec, + needle: Vec + ) -> bool { + proptests::matches_naive(false, &haystack, &needle, $fwd) + } + + fn qc_rev_prefix_is_substring(bs: Vec) -> bool { + proptests::prefix_is_substring(true, &bs, $rev) + } + + fn qc_rev_suffix_is_substring(bs: Vec) -> bool { + proptests::suffix_is_substring(true, &bs, $rev) + } + + fn qc_rev_matches_naive( + haystack: Vec, + needle: Vec + ) -> bool { + proptests::matches_naive(true, &haystack, &needle, $rev) + } + } + }; +} + +/// Defines a suite of "simple" hand-written tests for a substring +/// implementation. +/// +/// This is defined here for the same reason that +/// define_memmem_quickcheck_tests is defined here. +#[cfg(test)] +macro_rules! define_memmem_simple_tests { + ($fwd:expr, $rev:expr) => { + use crate::memmem::testsimples; + + #[test] + fn simple_forward() { + testsimples::run_search_tests_fwd($fwd); + } + + #[test] + fn simple_reverse() { + testsimples::run_search_tests_rev($rev); + } + }; +} + +mod byte_frequencies; +#[cfg(memchr_runtime_simd)] +mod genericsimd; +mod prefilter; +mod rabinkarp; +mod rarebytes; +mod twoway; +mod util; +#[cfg(memchr_runtime_simd)] +mod vector; +#[cfg(all(memchr_runtime_wasm128))] +mod wasm; +#[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))] +mod x86; + +/// Returns an iterator over all non-overlapping occurrences of a substring in +/// a haystack. +/// +/// # Complexity +/// +/// This routine is guaranteed to have worst case linear time complexity +/// with respect to both the needle and the haystack. That is, this runs +/// in `O(needle.len() + haystack.len())` time. +/// +/// This routine is also guaranteed to have worst case constant space +/// complexity. +/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// use memchr::memmem; +/// +/// let haystack = b"foo bar foo baz foo"; +/// let mut it = memmem::find_iter(haystack, b"foo"); +/// assert_eq!(Some(0), it.next()); +/// assert_eq!(Some(8), it.next()); +/// assert_eq!(Some(16), it.next()); +/// assert_eq!(None, it.next()); +/// ``` +#[inline] +pub fn find_iter<'h, 'n, N: 'n + ?Sized + AsRef<[u8]>>( + haystack: &'h [u8], + needle: &'n N, +) -> FindIter<'h, 'n> { + FindIter::new(haystack, Finder::new(needle)) +} + +/// Returns a reverse iterator over all non-overlapping occurrences of a +/// substring in a haystack. +/// +/// # Complexity +/// +/// This routine is guaranteed to have worst case linear time complexity +/// with respect to both the needle and the haystack. That is, this runs +/// in `O(needle.len() + haystack.len())` time. +/// +/// This routine is also guaranteed to have worst case constant space +/// complexity. +/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// use memchr::memmem; +/// +/// let haystack = b"foo bar foo baz foo"; +/// let mut it = memmem::rfind_iter(haystack, b"foo"); +/// assert_eq!(Some(16), it.next()); +/// assert_eq!(Some(8), it.next()); +/// assert_eq!(Some(0), it.next()); +/// assert_eq!(None, it.next()); +/// ``` +#[inline] +pub fn rfind_iter<'h, 'n, N: 'n + ?Sized + AsRef<[u8]>>( + haystack: &'h [u8], + needle: &'n N, +) -> FindRevIter<'h, 'n> { + FindRevIter::new(haystack, FinderRev::new(needle)) +} + +/// Returns the index of the first occurrence of the given needle. +/// +/// Note that if you're are searching for the same needle in many different +/// small haystacks, it may be faster to initialize a [`Finder`] once, +/// and reuse it for each search. +/// +/// # Complexity +/// +/// This routine is guaranteed to have worst case linear time complexity +/// with respect to both the needle and the haystack. That is, this runs +/// in `O(needle.len() + haystack.len())` time. +/// +/// This routine is also guaranteed to have worst case constant space +/// complexity. +/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// use memchr::memmem; +/// +/// let haystack = b"foo bar baz"; +/// assert_eq!(Some(0), memmem::find(haystack, b"foo")); +/// assert_eq!(Some(4), memmem::find(haystack, b"bar")); +/// assert_eq!(None, memmem::find(haystack, b"quux")); +/// ``` +#[inline] +pub fn find(haystack: &[u8], needle: &[u8]) -> Option { + if haystack.len() < 64 { + rabinkarp::find(haystack, needle) + } else { + Finder::new(needle).find(haystack) + } +} + +/// Returns the index of the last occurrence of the given needle. +/// +/// Note that if you're are searching for the same needle in many different +/// small haystacks, it may be faster to initialize a [`FinderRev`] once, +/// and reuse it for each search. +/// +/// # Complexity +/// +/// This routine is guaranteed to have worst case linear time complexity +/// with respect to both the needle and the haystack. That is, this runs +/// in `O(needle.len() + haystack.len())` time. +/// +/// This routine is also guaranteed to have worst case constant space +/// complexity. +/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// use memchr::memmem; +/// +/// let haystack = b"foo bar baz"; +/// assert_eq!(Some(0), memmem::rfind(haystack, b"foo")); +/// assert_eq!(Some(4), memmem::rfind(haystack, b"bar")); +/// assert_eq!(Some(8), memmem::rfind(haystack, b"ba")); +/// assert_eq!(None, memmem::rfind(haystack, b"quux")); +/// ``` +#[inline] +pub fn rfind(haystack: &[u8], needle: &[u8]) -> Option { + if haystack.len() < 64 { + rabinkarp::rfind(haystack, needle) + } else { + FinderRev::new(needle).rfind(haystack) + } +} + +/// An iterator over non-overlapping substring matches. +/// +/// Matches are reported by the byte offset at which they begin. +/// +/// `'h` is the lifetime of the haystack while `'n` is the lifetime of the +/// needle. +#[derive(Debug)] +pub struct FindIter<'h, 'n> { + haystack: &'h [u8], + prestate: PrefilterState, + finder: Finder<'n>, + pos: usize, +} + +impl<'h, 'n> FindIter<'h, 'n> { + #[inline(always)] + pub(crate) fn new( + haystack: &'h [u8], + finder: Finder<'n>, + ) -> FindIter<'h, 'n> { + let prestate = finder.searcher.prefilter_state(); + FindIter { haystack, prestate, finder, pos: 0 } + } + + /// Convert this iterator into its owned variant, such that it no longer + /// borrows the finder and needle. + /// + /// If this is already an owned iterator, then this is a no-op. Otherwise, + /// this copies the needle. + /// + /// This is only available when the `std` feature is enabled. + #[cfg(feature = "std")] + #[inline] + pub fn into_owned(self) -> FindIter<'h, 'static> { + FindIter { + haystack: self.haystack, + prestate: self.prestate, + finder: self.finder.into_owned(), + pos: self.pos, + } + } +} + +impl<'h, 'n> Iterator for FindIter<'h, 'n> { + type Item = usize; + + fn next(&mut self) -> Option { + if self.pos > self.haystack.len() { + return None; + } + let result = self + .finder + .searcher + .find(&mut self.prestate, &self.haystack[self.pos..]); + match result { + None => None, + Some(i) => { + let pos = self.pos + i; + self.pos = pos + core::cmp::max(1, self.finder.needle().len()); + Some(pos) + } + } + } +} + +/// An iterator over non-overlapping substring matches in reverse. +/// +/// Matches are reported by the byte offset at which they begin. +/// +/// `'h` is the lifetime of the haystack while `'n` is the lifetime of the +/// needle. +#[derive(Debug)] +pub struct FindRevIter<'h, 'n> { + haystack: &'h [u8], + finder: FinderRev<'n>, + /// When searching with an empty needle, this gets set to `None` after + /// we've yielded the last element at `0`. + pos: Option, +} + +impl<'h, 'n> FindRevIter<'h, 'n> { + #[inline(always)] + pub(crate) fn new( + haystack: &'h [u8], + finder: FinderRev<'n>, + ) -> FindRevIter<'h, 'n> { + let pos = Some(haystack.len()); + FindRevIter { haystack, finder, pos } + } + + /// Convert this iterator into its owned variant, such that it no longer + /// borrows the finder and needle. + /// + /// If this is already an owned iterator, then this is a no-op. Otherwise, + /// this copies the needle. + /// + /// This is only available when the `std` feature is enabled. + #[cfg(feature = "std")] + #[inline] + pub fn into_owned(self) -> FindRevIter<'h, 'static> { + FindRevIter { + haystack: self.haystack, + finder: self.finder.into_owned(), + pos: self.pos, + } + } +} + +impl<'h, 'n> Iterator for FindRevIter<'h, 'n> { + type Item = usize; + + fn next(&mut self) -> Option { + let pos = match self.pos { + None => return None, + Some(pos) => pos, + }; + let result = self.finder.rfind(&self.haystack[..pos]); + match result { + None => None, + Some(i) => { + if pos == i { + self.pos = pos.checked_sub(1); + } else { + self.pos = Some(i); + } + Some(i) + } + } + } +} + +/// A single substring searcher fixed to a particular needle. +/// +/// The purpose of this type is to permit callers to construct a substring +/// searcher that can be used to search haystacks without the overhead of +/// constructing the searcher in the first place. This is a somewhat niche +/// concern when it's necessary to re-use the same needle to search multiple +/// different haystacks with as little overhead as possible. In general, using +/// [`find`] is good enough, but `Finder` is useful when you can meaningfully +/// observe searcher construction time in a profile. +/// +/// When the `std` feature is enabled, then this type has an `into_owned` +/// version which permits building a `Finder` that is not connected to +/// the lifetime of its needle. +#[derive(Clone, Debug)] +pub struct Finder<'n> { + searcher: Searcher<'n>, +} + +impl<'n> Finder<'n> { + /// Create a new finder for the given needle. + #[inline] + pub fn new>(needle: &'n B) -> Finder<'n> { + FinderBuilder::new().build_forward(needle) + } + + /// Returns the index of the first occurrence of this needle in the given + /// haystack. + /// + /// # Complexity + /// + /// This routine is guaranteed to have worst case linear time complexity + /// with respect to both the needle and the haystack. That is, this runs + /// in `O(needle.len() + haystack.len())` time. + /// + /// This routine is also guaranteed to have worst case constant space + /// complexity. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use memchr::memmem::Finder; + /// + /// let haystack = b"foo bar baz"; + /// assert_eq!(Some(0), Finder::new("foo").find(haystack)); + /// assert_eq!(Some(4), Finder::new("bar").find(haystack)); + /// assert_eq!(None, Finder::new("quux").find(haystack)); + /// ``` + pub fn find(&self, haystack: &[u8]) -> Option { + self.searcher.find(&mut self.searcher.prefilter_state(), haystack) + } + + /// Returns an iterator over all occurrences of a substring in a haystack. + /// + /// # Complexity + /// + /// This routine is guaranteed to have worst case linear time complexity + /// with respect to both the needle and the haystack. That is, this runs + /// in `O(needle.len() + haystack.len())` time. + /// + /// This routine is also guaranteed to have worst case constant space + /// complexity. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use memchr::memmem::Finder; + /// + /// let haystack = b"foo bar foo baz foo"; + /// let finder = Finder::new(b"foo"); + /// let mut it = finder.find_iter(haystack); + /// assert_eq!(Some(0), it.next()); + /// assert_eq!(Some(8), it.next()); + /// assert_eq!(Some(16), it.next()); + /// assert_eq!(None, it.next()); + /// ``` + #[inline] + pub fn find_iter<'a, 'h>( + &'a self, + haystack: &'h [u8], + ) -> FindIter<'h, 'a> { + FindIter::new(haystack, self.as_ref()) + } + + /// Convert this finder into its owned variant, such that it no longer + /// borrows the needle. + /// + /// If this is already an owned finder, then this is a no-op. Otherwise, + /// this copies the needle. + /// + /// This is only available when the `std` feature is enabled. + #[cfg(feature = "std")] + #[inline] + pub fn into_owned(self) -> Finder<'static> { + Finder { searcher: self.searcher.into_owned() } + } + + /// Convert this finder into its borrowed variant. + /// + /// This is primarily useful if your finder is owned and you'd like to + /// store its borrowed variant in some intermediate data structure. + /// + /// Note that the lifetime parameter of the returned finder is tied to the + /// lifetime of `self`, and may be shorter than the `'n` lifetime of the + /// needle itself. Namely, a finder's needle can be either borrowed or + /// owned, so the lifetime of the needle returned must necessarily be the + /// shorter of the two. + #[inline] + pub fn as_ref(&self) -> Finder<'_> { + Finder { searcher: self.searcher.as_ref() } + } + + /// Returns the needle that this finder searches for. + /// + /// Note that the lifetime of the needle returned is tied to the lifetime + /// of the finder, and may be shorter than the `'n` lifetime. Namely, a + /// finder's needle can be either borrowed or owned, so the lifetime of the + /// needle returned must necessarily be the shorter of the two. + #[inline] + pub fn needle(&self) -> &[u8] { + self.searcher.needle() + } +} + +/// A single substring reverse searcher fixed to a particular needle. +/// +/// The purpose of this type is to permit callers to construct a substring +/// searcher that can be used to search haystacks without the overhead of +/// constructing the searcher in the first place. This is a somewhat niche +/// concern when it's necessary to re-use the same needle to search multiple +/// different haystacks with as little overhead as possible. In general, +/// using [`rfind`] is good enough, but `FinderRev` is useful when you can +/// meaningfully observe searcher construction time in a profile. +/// +/// When the `std` feature is enabled, then this type has an `into_owned` +/// version which permits building a `FinderRev` that is not connected to +/// the lifetime of its needle. +#[derive(Clone, Debug)] +pub struct FinderRev<'n> { + searcher: SearcherRev<'n>, +} + +impl<'n> FinderRev<'n> { + /// Create a new reverse finder for the given needle. + #[inline] + pub fn new>(needle: &'n B) -> FinderRev<'n> { + FinderBuilder::new().build_reverse(needle) + } + + /// Returns the index of the last occurrence of this needle in the given + /// haystack. + /// + /// The haystack may be any type that can be cheaply converted into a + /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`. + /// + /// # Complexity + /// + /// This routine is guaranteed to have worst case linear time complexity + /// with respect to both the needle and the haystack. That is, this runs + /// in `O(needle.len() + haystack.len())` time. + /// + /// This routine is also guaranteed to have worst case constant space + /// complexity. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use memchr::memmem::FinderRev; + /// + /// let haystack = b"foo bar baz"; + /// assert_eq!(Some(0), FinderRev::new("foo").rfind(haystack)); + /// assert_eq!(Some(4), FinderRev::new("bar").rfind(haystack)); + /// assert_eq!(None, FinderRev::new("quux").rfind(haystack)); + /// ``` + pub fn rfind>(&self, haystack: B) -> Option { + self.searcher.rfind(haystack.as_ref()) + } + + /// Returns a reverse iterator over all occurrences of a substring in a + /// haystack. + /// + /// # Complexity + /// + /// This routine is guaranteed to have worst case linear time complexity + /// with respect to both the needle and the haystack. That is, this runs + /// in `O(needle.len() + haystack.len())` time. + /// + /// This routine is also guaranteed to have worst case constant space + /// complexity. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use memchr::memmem::FinderRev; + /// + /// let haystack = b"foo bar foo baz foo"; + /// let finder = FinderRev::new(b"foo"); + /// let mut it = finder.rfind_iter(haystack); + /// assert_eq!(Some(16), it.next()); + /// assert_eq!(Some(8), it.next()); + /// assert_eq!(Some(0), it.next()); + /// assert_eq!(None, it.next()); + /// ``` + #[inline] + pub fn rfind_iter<'a, 'h>( + &'a self, + haystack: &'h [u8], + ) -> FindRevIter<'h, 'a> { + FindRevIter::new(haystack, self.as_ref()) + } + + /// Convert this finder into its owned variant, such that it no longer + /// borrows the needle. + /// + /// If this is already an owned finder, then this is a no-op. Otherwise, + /// this copies the needle. + /// + /// This is only available when the `std` feature is enabled. + #[cfg(feature = "std")] + #[inline] + pub fn into_owned(self) -> FinderRev<'static> { + FinderRev { searcher: self.searcher.into_owned() } + } + + /// Convert this finder into its borrowed variant. + /// + /// This is primarily useful if your finder is owned and you'd like to + /// store its borrowed variant in some intermediate data structure. + /// + /// Note that the lifetime parameter of the returned finder is tied to the + /// lifetime of `self`, and may be shorter than the `'n` lifetime of the + /// needle itself. Namely, a finder's needle can be either borrowed or + /// owned, so the lifetime of the needle returned must necessarily be the + /// shorter of the two. + #[inline] + pub fn as_ref(&self) -> FinderRev<'_> { + FinderRev { searcher: self.searcher.as_ref() } + } + + /// Returns the needle that this finder searches for. + /// + /// Note that the lifetime of the needle returned is tied to the lifetime + /// of the finder, and may be shorter than the `'n` lifetime. Namely, a + /// finder's needle can be either borrowed or owned, so the lifetime of the + /// needle returned must necessarily be the shorter of the two. + #[inline] + pub fn needle(&self) -> &[u8] { + self.searcher.needle() + } +} + +/// A builder for constructing non-default forward or reverse memmem finders. +/// +/// A builder is primarily useful for configuring a substring searcher. +/// Currently, the only configuration exposed is the ability to disable +/// heuristic prefilters used to speed up certain searches. +#[derive(Clone, Debug, Default)] +pub struct FinderBuilder { + config: SearcherConfig, +} + +impl FinderBuilder { + /// Create a new finder builder with default settings. + pub fn new() -> FinderBuilder { + FinderBuilder::default() + } + + /// Build a forward finder using the given needle from the current + /// settings. + pub fn build_forward<'n, B: ?Sized + AsRef<[u8]>>( + &self, + needle: &'n B, + ) -> Finder<'n> { + Finder { searcher: Searcher::new(self.config, needle.as_ref()) } + } + + /// Build a reverse finder using the given needle from the current + /// settings. + pub fn build_reverse<'n, B: ?Sized + AsRef<[u8]>>( + &self, + needle: &'n B, + ) -> FinderRev<'n> { + FinderRev { searcher: SearcherRev::new(needle.as_ref()) } + } + + /// Configure the prefilter setting for the finder. + /// + /// See the documentation for [`Prefilter`] for more discussion on why + /// you might want to configure this. + pub fn prefilter(&mut self, prefilter: Prefilter) -> &mut FinderBuilder { + self.config.prefilter = prefilter; + self + } +} + +/// The internal implementation of a forward substring searcher. +/// +/// The reality is that this is a "meta" searcher. Namely, depending on a +/// variety of parameters (CPU support, target, needle size, haystack size and +/// even dynamic properties such as prefilter effectiveness), the actual +/// algorithm employed to do substring search may change. +#[derive(Clone, Debug)] +struct Searcher<'n> { + /// The actual needle we're searching for. + /// + /// A CowBytes is like a Cow<[u8]>, except in no_std environments, it is + /// specialized to a single variant (the borrowed form). + needle: CowBytes<'n>, + /// A collection of facts computed on the needle that are useful for more + /// than one substring search algorithm. + ninfo: NeedleInfo, + /// A prefilter function, if it was deemed appropriate. + /// + /// Some substring search implementations (like Two-Way) benefit greatly + /// if we can quickly find candidate starting positions for a match. + prefn: Option, + /// The actual substring implementation in use. + kind: SearcherKind, +} + +/// A collection of facts computed about a search needle. +/// +/// We group these things together because it's useful to be able to hand them +/// to prefilters or substring algorithms that want them. +#[derive(Clone, Copy, Debug)] +pub(crate) struct NeedleInfo { + /// The offsets of "rare" bytes detected in the needle. + /// + /// This is meant to be a heuristic in order to maximize the effectiveness + /// of vectorized code. Namely, vectorized code tends to focus on only + /// one or two bytes. If we pick bytes from the needle that occur + /// infrequently, then more time will be spent in the vectorized code and + /// will likely make the overall search (much) faster. + /// + /// Of course, this is only a heuristic based on a background frequency + /// distribution of bytes. But it tends to work very well in practice. + pub(crate) rarebytes: RareNeedleBytes, + /// A Rabin-Karp hash of the needle. + /// + /// This is store here instead of in a more specific Rabin-Karp search + /// since Rabin-Karp may be used even if another SearchKind corresponds + /// to some other search implementation. e.g., If measurements suggest RK + /// is faster in some cases or if a search implementation can't handle + /// particularly small haystack. (Moreover, we cannot use RK *generally*, + /// since its worst case time is multiplicative. Instead, we only use it + /// some small haystacks, where "small" is a constant.) + pub(crate) nhash: NeedleHash, +} + +/// Configuration for substring search. +#[derive(Clone, Copy, Debug, Default)] +struct SearcherConfig { + /// This permits changing the behavior of the prefilter, since it can have + /// a variable impact on performance. + prefilter: Prefilter, +} + +#[derive(Clone, Debug)] +enum SearcherKind { + /// A special case for empty needles. An empty needle always matches, even + /// in an empty haystack. + Empty, + /// This is used whenever the needle is a single byte. In this case, we + /// always use memchr. + OneByte(u8), + /// Two-Way is the generic work horse and is what provides our additive + /// linear time guarantee. In general, it's used when the needle is bigger + /// than 8 bytes or so. + TwoWay(twoway::Forward), + #[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))] + GenericSIMD128(x86::sse::Forward), + #[cfg(memchr_runtime_wasm128)] + GenericSIMD128(wasm::Forward), + #[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))] + GenericSIMD256(x86::avx::Forward), +} + +impl<'n> Searcher<'n> { + fn new(config: SearcherConfig, needle: &'n [u8]) -> Searcher<'n> { + use self::SearcherKind::*; + + let ninfo = NeedleInfo::new(needle); + let mk = |kind: SearcherKind| { + let prefn = prefilter::forward( + &config.prefilter, + &ninfo.rarebytes, + needle, + ); + Searcher { needle: CowBytes::new(needle), ninfo, prefn, kind } + }; + if needle.len() == 0 { + return mk(Empty); + } + if needle.len() == 1 { + return mk(OneByte(needle[0])); + } + #[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))] + { + if let Some(fwd) = x86::avx::Forward::new(&ninfo, needle) { + return mk(GenericSIMD256(fwd)); + } else if let Some(fwd) = x86::sse::Forward::new(&ninfo, needle) { + return mk(GenericSIMD128(fwd)); + } + } + #[cfg(all(target_arch = "wasm32", memchr_runtime_simd))] + { + if let Some(fwd) = wasm::Forward::new(&ninfo, needle) { + return mk(GenericSIMD128(fwd)); + } + } + + mk(TwoWay(twoway::Forward::new(needle))) + } + + /// Return a fresh prefilter state that can be used with this searcher. + /// A prefilter state is used to track the effectiveness of a searcher's + /// prefilter for speeding up searches. Therefore, the prefilter state + /// should generally be reused on subsequent searches (such as in an + /// iterator). For searches on a different haystack, then a new prefilter + /// state should be used. + /// + /// This always initializes a valid (but possibly inert) prefilter state + /// even if this searcher does not have a prefilter enabled. + fn prefilter_state(&self) -> PrefilterState { + if self.prefn.is_none() { + PrefilterState::inert() + } else { + PrefilterState::new() + } + } + + fn needle(&self) -> &[u8] { + self.needle.as_slice() + } + + fn as_ref(&self) -> Searcher<'_> { + use self::SearcherKind::*; + + let kind = match self.kind { + Empty => Empty, + OneByte(b) => OneByte(b), + TwoWay(tw) => TwoWay(tw), + #[cfg(all(not(miri), memchr_runtime_simd))] + GenericSIMD128(gs) => GenericSIMD128(gs), + #[cfg(all( + not(miri), + target_arch = "x86_64", + memchr_runtime_simd + ))] + GenericSIMD256(gs) => GenericSIMD256(gs), + }; + Searcher { + needle: CowBytes::new(self.needle()), + ninfo: self.ninfo, + prefn: self.prefn, + kind, + } + } + + #[cfg(feature = "std")] + fn into_owned(self) -> Searcher<'static> { + use self::SearcherKind::*; + + let kind = match self.kind { + Empty => Empty, + OneByte(b) => OneByte(b), + TwoWay(tw) => TwoWay(tw), + #[cfg(all(not(miri), memchr_runtime_simd))] + GenericSIMD128(gs) => GenericSIMD128(gs), + #[cfg(all( + not(miri), + target_arch = "x86_64", + memchr_runtime_simd + ))] + GenericSIMD256(gs) => GenericSIMD256(gs), + }; + Searcher { + needle: self.needle.into_owned(), + ninfo: self.ninfo, + prefn: self.prefn, + kind, + } + } + + /// Implements forward substring search by selecting the implementation + /// chosen at construction and executing it on the given haystack with the + /// prefilter's current state of effectiveness. + #[inline(always)] + fn find( + &self, + state: &mut PrefilterState, + haystack: &[u8], + ) -> Option { + use self::SearcherKind::*; + + let needle = self.needle(); + if haystack.len() < needle.len() { + return None; + } + match self.kind { + Empty => Some(0), + OneByte(b) => crate::memchr(b, haystack), + TwoWay(ref tw) => { + // For very short haystacks (e.g., where the prefilter probably + // can't run), it's faster to just run RK. + if rabinkarp::is_fast(haystack, needle) { + rabinkarp::find_with(&self.ninfo.nhash, haystack, needle) + } else { + self.find_tw(tw, state, haystack, needle) + } + } + #[cfg(all(not(miri), memchr_runtime_simd))] + GenericSIMD128(ref gs) => { + // The SIMD matcher can't handle particularly short haystacks, + // so we fall back to RK in these cases. + if haystack.len() < gs.min_haystack_len() { + rabinkarp::find_with(&self.ninfo.nhash, haystack, needle) + } else { + gs.find(haystack, needle) + } + } + #[cfg(all( + not(miri), + target_arch = "x86_64", + memchr_runtime_simd + ))] + GenericSIMD256(ref gs) => { + // The SIMD matcher can't handle particularly short haystacks, + // so we fall back to RK in these cases. + if haystack.len() < gs.min_haystack_len() { + rabinkarp::find_with(&self.ninfo.nhash, haystack, needle) + } else { + gs.find(haystack, needle) + } + } + } + } + + /// Calls Two-Way on the given haystack/needle. + /// + /// This is marked as unlineable since it seems to have a better overall + /// effect on benchmarks. However, this is one of those cases where + /// inlining it results an improvement in other benchmarks too, so I + /// suspect we just don't have enough data yet to make the right call here. + /// + /// I suspect the main problem is that this function contains two different + /// inlined copies of Two-Way: one with and one without prefilters enabled. + #[inline(never)] + fn find_tw( + &self, + tw: &twoway::Forward, + state: &mut PrefilterState, + haystack: &[u8], + needle: &[u8], + ) -> Option { + if let Some(prefn) = self.prefn { + // We used to look at the length of a haystack here. That is, if + // it was too small, then don't bother with the prefilter. But two + // things changed: the prefilter falls back to memchr for small + // haystacks, and, above, Rabin-Karp is employed for tiny haystacks + // anyway. + if state.is_effective() { + let mut pre = Pre { state, prefn, ninfo: &self.ninfo }; + return tw.find(Some(&mut pre), haystack, needle); + } + } + tw.find(None, haystack, needle) + } +} + +impl NeedleInfo { + pub(crate) fn new(needle: &[u8]) -> NeedleInfo { + NeedleInfo { + rarebytes: RareNeedleBytes::forward(needle), + nhash: NeedleHash::forward(needle), + } + } +} + +/// The internal implementation of a reverse substring searcher. +/// +/// See the forward searcher docs for more details. Currently, the reverse +/// searcher is considerably simpler since it lacks prefilter support. This +/// was done because it adds a lot of code, and more surface area to test. And +/// in particular, it's not clear whether a prefilter on reverse searching is +/// worth it. (If you have a compelling use case, please file an issue!) +#[derive(Clone, Debug)] +struct SearcherRev<'n> { + /// The actual needle we're searching for. + needle: CowBytes<'n>, + /// A Rabin-Karp hash of the needle. + nhash: NeedleHash, + /// The actual substring implementation in use. + kind: SearcherRevKind, +} + +#[derive(Clone, Debug)] +enum SearcherRevKind { + /// A special case for empty needles. An empty needle always matches, even + /// in an empty haystack. + Empty, + /// This is used whenever the needle is a single byte. In this case, we + /// always use memchr. + OneByte(u8), + /// Two-Way is the generic work horse and is what provides our additive + /// linear time guarantee. In general, it's used when the needle is bigger + /// than 8 bytes or so. + TwoWay(twoway::Reverse), +} + +impl<'n> SearcherRev<'n> { + fn new(needle: &'n [u8]) -> SearcherRev<'n> { + use self::SearcherRevKind::*; + + let kind = if needle.len() == 0 { + Empty + } else if needle.len() == 1 { + OneByte(needle[0]) + } else { + TwoWay(twoway::Reverse::new(needle)) + }; + SearcherRev { + needle: CowBytes::new(needle), + nhash: NeedleHash::reverse(needle), + kind, + } + } + + fn needle(&self) -> &[u8] { + self.needle.as_slice() + } + + fn as_ref(&self) -> SearcherRev<'_> { + use self::SearcherRevKind::*; + + let kind = match self.kind { + Empty => Empty, + OneByte(b) => OneByte(b), + TwoWay(tw) => TwoWay(tw), + }; + SearcherRev { + needle: CowBytes::new(self.needle()), + nhash: self.nhash, + kind, + } + } + + #[cfg(feature = "std")] + fn into_owned(self) -> SearcherRev<'static> { + use self::SearcherRevKind::*; + + let kind = match self.kind { + Empty => Empty, + OneByte(b) => OneByte(b), + TwoWay(tw) => TwoWay(tw), + }; + SearcherRev { + needle: self.needle.into_owned(), + nhash: self.nhash, + kind, + } + } + + /// Implements reverse substring search by selecting the implementation + /// chosen at construction and executing it on the given haystack with the + /// prefilter's current state of effectiveness. + #[inline(always)] + fn rfind(&self, haystack: &[u8]) -> Option { + use self::SearcherRevKind::*; + + let needle = self.needle(); + if haystack.len() < needle.len() { + return None; + } + match self.kind { + Empty => Some(haystack.len()), + OneByte(b) => crate::memrchr(b, haystack), + TwoWay(ref tw) => { + // For very short haystacks (e.g., where the prefilter probably + // can't run), it's faster to just run RK. + if rabinkarp::is_fast(haystack, needle) { + rabinkarp::rfind_with(&self.nhash, haystack, needle) + } else { + tw.rfind(haystack, needle) + } + } + } + } +} + +/// This module defines some generic quickcheck properties useful for testing +/// any substring search algorithm. It also runs those properties for the +/// top-level public API memmem routines. (The properties are also used to +/// test various substring search implementations more granularly elsewhere as +/// well.) +#[cfg(all(test, feature = "std", not(miri)))] +mod proptests { + // N.B. This defines the quickcheck tests using the properties defined + // below. Because of macro-visibility weirdness, the actual macro is + // defined at the top of this file. + define_memmem_quickcheck_tests!(super::find, super::rfind); + + /// Check that every prefix of the given byte string is a substring. + pub(crate) fn prefix_is_substring( + reverse: bool, + bs: &[u8], + mut search: impl FnMut(&[u8], &[u8]) -> Option, + ) -> bool { + if bs.is_empty() { + return true; + } + for i in 0..(bs.len() - 1) { + let prefix = &bs[..i]; + if reverse { + assert_eq!(naive_rfind(bs, prefix), search(bs, prefix)); + } else { + assert_eq!(naive_find(bs, prefix), search(bs, prefix)); + } + } + true + } + + /// Check that every suffix of the given byte string is a substring. + pub(crate) fn suffix_is_substring( + reverse: bool, + bs: &[u8], + mut search: impl FnMut(&[u8], &[u8]) -> Option, + ) -> bool { + if bs.is_empty() { + return true; + } + for i in 0..(bs.len() - 1) { + let suffix = &bs[i..]; + if reverse { + assert_eq!(naive_rfind(bs, suffix), search(bs, suffix)); + } else { + assert_eq!(naive_find(bs, suffix), search(bs, suffix)); + } + } + true + } + + /// Check that naive substring search matches the result of the given search + /// algorithm. + pub(crate) fn matches_naive( + reverse: bool, + haystack: &[u8], + needle: &[u8], + mut search: impl FnMut(&[u8], &[u8]) -> Option, + ) -> bool { + if reverse { + naive_rfind(haystack, needle) == search(haystack, needle) + } else { + naive_find(haystack, needle) == search(haystack, needle) + } + } + + /// Naively search forwards for the given needle in the given haystack. + fn naive_find(haystack: &[u8], needle: &[u8]) -> Option { + if needle.is_empty() { + return Some(0); + } else if haystack.len() < needle.len() { + return None; + } + for i in 0..(haystack.len() - needle.len() + 1) { + if needle == &haystack[i..i + needle.len()] { + return Some(i); + } + } + None + } + + /// Naively search in reverse for the given needle in the given haystack. + fn naive_rfind(haystack: &[u8], needle: &[u8]) -> Option { + if needle.is_empty() { + return Some(haystack.len()); + } else if haystack.len() < needle.len() { + return None; + } + for i in (0..(haystack.len() - needle.len() + 1)).rev() { + if needle == &haystack[i..i + needle.len()] { + return Some(i); + } + } + None + } +} + +/// This module defines some hand-written "simple" substring tests. It +/// also provides routines for easily running them on any substring search +/// implementation. +#[cfg(test)] +mod testsimples { + define_memmem_simple_tests!(super::find, super::rfind); + + /// Each test is a (needle, haystack, expected_fwd, expected_rev) tuple. + type SearchTest = + (&'static str, &'static str, Option, Option); + + const SEARCH_TESTS: &'static [SearchTest] = &[ + ("", "", Some(0), Some(0)), + ("", "a", Some(0), Some(1)), + ("", "ab", Some(0), Some(2)), + ("", "abc", Some(0), Some(3)), + ("a", "", None, None), + ("a", "a", Some(0), Some(0)), + ("a", "aa", Some(0), Some(1)), + ("a", "ba", Some(1), Some(1)), + ("a", "bba", Some(2), Some(2)), + ("a", "bbba", Some(3), Some(3)), + ("a", "bbbab", Some(3), Some(3)), + ("a", "bbbabb", Some(3), Some(3)), + ("a", "bbbabbb", Some(3), Some(3)), + ("a", "bbbbbb", None, None), + ("ab", "", None, None), + ("ab", "a", None, None), + ("ab", "b", None, None), + ("ab", "ab", Some(0), Some(0)), + ("ab", "aab", Some(1), Some(1)), + ("ab", "aaab", Some(2), Some(2)), + ("ab", "abaab", Some(0), Some(3)), + ("ab", "baaab", Some(3), Some(3)), + ("ab", "acb", None, None), + ("ab", "abba", Some(0), Some(0)), + ("abc", "ab", None, None), + ("abc", "abc", Some(0), Some(0)), + ("abc", "abcz", Some(0), Some(0)), + ("abc", "abczz", Some(0), Some(0)), + ("abc", "zabc", Some(1), Some(1)), + ("abc", "zzabc", Some(2), Some(2)), + ("abc", "azbc", None, None), + ("abc", "abzc", None, None), + ("abczdef", "abczdefzzzzzzzzzzzzzzzzzzzz", Some(0), Some(0)), + ("abczdef", "zzzzzzzzzzzzzzzzzzzzabczdef", Some(20), Some(20)), + ("xyz", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaxyz", Some(32), Some(32)), + // Failures caught by quickcheck. + ("\u{0}\u{15}", "\u{0}\u{15}\u{15}\u{0}", Some(0), Some(0)), + ("\u{0}\u{1e}", "\u{1e}\u{0}", None, None), + ]; + + /// Run the substring search tests. `search` should be a closure that + /// accepts a haystack and a needle and returns the starting position + /// of the first occurrence of needle in the haystack, or `None` if one + /// doesn't exist. + pub(crate) fn run_search_tests_fwd( + mut search: impl FnMut(&[u8], &[u8]) -> Option, + ) { + for &(needle, haystack, expected_fwd, _) in SEARCH_TESTS { + let (n, h) = (needle.as_bytes(), haystack.as_bytes()); + assert_eq!( + expected_fwd, + search(h, n), + "needle: {:?}, haystack: {:?}, expected: {:?}", + n, + h, + expected_fwd + ); + } + } + + /// Run the substring search tests. `search` should be a closure that + /// accepts a haystack and a needle and returns the starting position of + /// the last occurrence of needle in the haystack, or `None` if one doesn't + /// exist. + pub(crate) fn run_search_tests_rev( + mut search: impl FnMut(&[u8], &[u8]) -> Option, + ) { + for &(needle, haystack, _, expected_rev) in SEARCH_TESTS { + let (n, h) = (needle.as_bytes(), haystack.as_bytes()); + assert_eq!( + expected_rev, + search(h, n), + "needle: {:?}, haystack: {:?}, expected: {:?}", + n, + h, + expected_rev + ); + } + } +} diff --git a/third_party/rust/memchr/src/memmem/prefilter/fallback.rs b/third_party/rust/memchr/src/memmem/prefilter/fallback.rs new file mode 100644 index 0000000000..ae1bbccb32 --- /dev/null +++ b/third_party/rust/memchr/src/memmem/prefilter/fallback.rs @@ -0,0 +1,122 @@ +/* +This module implements a "fallback" prefilter that only relies on memchr to +function. While memchr works best when it's explicitly vectorized, its +fallback implementations are fast enough to make a prefilter like this +worthwhile. + +The essence of this implementation is to identify two rare bytes in a needle +based on a background frequency distribution of bytes. We then run memchr on the +rarer byte. For each match, we use the second rare byte as a guard to quickly +check if a match is possible. If the position passes the guard test, then we do +a naive memcmp to confirm the match. + +In practice, this formulation works amazingly well, primarily because of the +heuristic use of a background frequency distribution. However, it does have a +number of weaknesses where it can get quite slow when its background frequency +distribution doesn't line up with the haystack being searched. This is why we +have specialized vector routines that essentially take this idea and move the +guard check into vectorized code. (Those specialized vector routines do still +make use of the background frequency distribution of bytes though.) + +This fallback implementation was originally formulated in regex many moons ago: +https://github.com/rust-lang/regex/blob/3db8722d0b204a85380fe2a65e13d7065d7dd968/src/literal/imp.rs#L370-L501 +Prior to that, I'm not aware of anyone using this technique in any prominent +substring search implementation. Although, I'm sure folks have had this same +insight long before me. + +Another version of this also appeared in bstr: +https://github.com/BurntSushi/bstr/blob/a444256ca7407fe180ee32534688549655b7a38e/src/search/prefilter.rs#L83-L340 +*/ + +use crate::memmem::{ + prefilter::{PrefilterFnTy, PrefilterState}, + NeedleInfo, +}; + +// Check that the functions below satisfy the Prefilter function type. +const _: PrefilterFnTy = find; + +/// Look for a possible occurrence of needle. The position returned +/// corresponds to the beginning of the occurrence, if one exists. +/// +/// Callers may assume that this never returns false negatives (i.e., it +/// never misses an actual occurrence), but must check that the returned +/// position corresponds to a match. That is, it can return false +/// positives. +/// +/// This should only be used when Freqy is constructed for forward +/// searching. +pub(crate) fn find( + prestate: &mut PrefilterState, + ninfo: &NeedleInfo, + haystack: &[u8], + needle: &[u8], +) -> Option { + let mut i = 0; + let (rare1i, rare2i) = ninfo.rarebytes.as_rare_usize(); + let (rare1, rare2) = ninfo.rarebytes.as_rare_bytes(needle); + while prestate.is_effective() { + // Use a fast vectorized implementation to skip to the next + // occurrence of the rarest byte (heuristically chosen) in the + // needle. + let found = crate::memchr(rare1, &haystack[i..])?; + prestate.update(found); + i += found; + + // If we can't align our first match with the haystack, then a + // match is impossible. + if i < rare1i { + i += 1; + continue; + } + + // Align our rare2 byte with the haystack. A mismatch means that + // a match is impossible. + let aligned_rare2i = i - rare1i + rare2i; + if haystack.get(aligned_rare2i) != Some(&rare2) { + i += 1; + continue; + } + + // We've done what we can. There might be a match here. + return Some(i - rare1i); + } + // The only way we get here is if we believe our skipping heuristic + // has become ineffective. We're allowed to return false positives, + // so return the position at which we advanced to, aligned to the + // haystack. + Some(i.saturating_sub(rare1i)) +} + +#[cfg(all(test, feature = "std"))] +mod tests { + use super::*; + + fn freqy_find(haystack: &[u8], needle: &[u8]) -> Option { + let ninfo = NeedleInfo::new(needle); + let mut prestate = PrefilterState::new(); + find(&mut prestate, &ninfo, haystack, needle) + } + + #[test] + fn freqy_forward() { + assert_eq!(Some(0), freqy_find(b"BARFOO", b"BAR")); + assert_eq!(Some(3), freqy_find(b"FOOBAR", b"BAR")); + assert_eq!(Some(0), freqy_find(b"zyzz", b"zyzy")); + assert_eq!(Some(2), freqy_find(b"zzzy", b"zyzy")); + assert_eq!(None, freqy_find(b"zazb", b"zyzy")); + assert_eq!(Some(0), freqy_find(b"yzyy", b"yzyz")); + assert_eq!(Some(2), freqy_find(b"yyyz", b"yzyz")); + assert_eq!(None, freqy_find(b"yayb", b"yzyz")); + } + + #[test] + #[cfg(not(miri))] + fn prefilter_permutations() { + use crate::memmem::prefilter::tests::PrefilterTest; + + // SAFETY: super::find is safe to call for all inputs and on all + // platforms. + unsafe { PrefilterTest::run_all_tests(super::find) }; + } +} diff --git a/third_party/rust/memchr/src/memmem/prefilter/genericsimd.rs b/third_party/rust/memchr/src/memmem/prefilter/genericsimd.rs new file mode 100644 index 0000000000..1a6e387348 --- /dev/null +++ b/third_party/rust/memchr/src/memmem/prefilter/genericsimd.rs @@ -0,0 +1,207 @@ +use core::mem::size_of; + +use crate::memmem::{ + prefilter::{PrefilterFnTy, PrefilterState}, + vector::Vector, + NeedleInfo, +}; + +/// The implementation of the forward vector accelerated candidate finder. +/// +/// This is inspired by the "generic SIMD" algorithm described here: +/// http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd +/// +/// The main difference is that this is just a prefilter. That is, it reports +/// candidates once they are seen and doesn't attempt to confirm them. Also, +/// the bytes this routine uses to check for candidates are selected based on +/// an a priori background frequency distribution. This means that on most +/// haystacks, this will on average spend more time in vectorized code than you +/// would if you just selected the first and last bytes of the needle. +/// +/// Note that a non-prefilter variant of this algorithm can be found in the +/// parent module, but it only works on smaller needles. +/// +/// `prestate`, `ninfo`, `haystack` and `needle` are the four prefilter +/// function parameters. `fallback` is a prefilter that is used if the haystack +/// is too small to be handled with the given vector size. +/// +/// This routine is not safe because it is intended for callers to specialize +/// this with a particular vector (e.g., __m256i) and then call it with the +/// relevant target feature (e.g., avx2) enabled. +/// +/// # Panics +/// +/// If `needle.len() <= 1`, then this panics. +/// +/// # Safety +/// +/// Since this is meant to be used with vector functions, callers need to +/// specialize this inside of a function with a `target_feature` attribute. +/// Therefore, callers must ensure that whatever target feature is being used +/// supports the vector functions that this function is specialized for. (For +/// the specific vector functions used, see the Vector trait implementations.) +#[inline(always)] +pub(crate) unsafe fn find( + prestate: &mut PrefilterState, + ninfo: &NeedleInfo, + haystack: &[u8], + needle: &[u8], + fallback: PrefilterFnTy, +) -> Option { + assert!(needle.len() >= 2, "needle must be at least 2 bytes"); + let (rare1i, rare2i) = ninfo.rarebytes.as_rare_ordered_usize(); + let min_haystack_len = rare2i + size_of::(); + if haystack.len() < min_haystack_len { + return fallback(prestate, ninfo, haystack, needle); + } + + let start_ptr = haystack.as_ptr(); + let end_ptr = start_ptr.add(haystack.len()); + let max_ptr = end_ptr.sub(min_haystack_len); + let mut ptr = start_ptr; + + let rare1chunk = V::splat(needle[rare1i]); + let rare2chunk = V::splat(needle[rare2i]); + + // N.B. I did experiment with unrolling the loop to deal with size(V) + // bytes at a time and 2*size(V) bytes at a time. The double unroll + // was marginally faster while the quadruple unroll was unambiguously + // slower. In the end, I decided the complexity from unrolling wasn't + // worth it. I used the memmem/krate/prebuilt/huge-en/ benchmarks to + // compare. + while ptr <= max_ptr { + let m = find_in_chunk2(ptr, rare1i, rare2i, rare1chunk, rare2chunk); + if let Some(chunki) = m { + return Some(matched(prestate, start_ptr, ptr, chunki)); + } + ptr = ptr.add(size_of::()); + } + if ptr < end_ptr { + // This routine immediately quits if a candidate match is found. + // That means that if we're here, no candidate matches have been + // found at or before 'ptr'. Thus, we don't need to mask anything + // out even though we might technically search part of the haystack + // that we've already searched (because we know it can't match). + ptr = max_ptr; + let m = find_in_chunk2(ptr, rare1i, rare2i, rare1chunk, rare2chunk); + if let Some(chunki) = m { + return Some(matched(prestate, start_ptr, ptr, chunki)); + } + } + prestate.update(haystack.len()); + None +} + +// Below are two different techniques for checking whether a candidate +// match exists in a given chunk or not. find_in_chunk2 checks two bytes +// where as find_in_chunk3 checks three bytes. The idea behind checking +// three bytes is that while we do a bit more work per iteration, we +// decrease the chances of a false positive match being reported and thus +// make the search faster overall. This actually works out for the +// memmem/krate/prebuilt/huge-en/never-all-common-bytes benchmark, where +// using find_in_chunk3 is about 25% faster than find_in_chunk2. However, +// it turns out that find_in_chunk2 is faster for all other benchmarks, so +// perhaps the extra check isn't worth it in practice. +// +// For now, we go with find_in_chunk2, but we leave find_in_chunk3 around +// to make it easy to switch to and benchmark when possible. + +/// Search for an occurrence of two rare bytes from the needle in the current +/// chunk pointed to by ptr. +/// +/// rare1chunk and rare2chunk correspond to vectors with the rare1 and rare2 +/// bytes repeated in each 8-bit lane, respectively. +/// +/// # Safety +/// +/// It must be safe to do an unaligned read of size(V) bytes starting at both +/// (ptr + rare1i) and (ptr + rare2i). +#[inline(always)] +unsafe fn find_in_chunk2( + ptr: *const u8, + rare1i: usize, + rare2i: usize, + rare1chunk: V, + rare2chunk: V, +) -> Option { + let chunk0 = V::load_unaligned(ptr.add(rare1i)); + let chunk1 = V::load_unaligned(ptr.add(rare2i)); + + let eq0 = chunk0.cmpeq(rare1chunk); + let eq1 = chunk1.cmpeq(rare2chunk); + + let match_offsets = eq0.and(eq1).movemask(); + if match_offsets == 0 { + return None; + } + Some(match_offsets.trailing_zeros() as usize) +} + +/// Search for an occurrence of two rare bytes and the first byte (even if one +/// of the rare bytes is equivalent to the first byte) from the needle in the +/// current chunk pointed to by ptr. +/// +/// firstchunk, rare1chunk and rare2chunk correspond to vectors with the first, +/// rare1 and rare2 bytes repeated in each 8-bit lane, respectively. +/// +/// # Safety +/// +/// It must be safe to do an unaligned read of size(V) bytes starting at ptr, +/// (ptr + rare1i) and (ptr + rare2i). +#[allow(dead_code)] +#[inline(always)] +unsafe fn find_in_chunk3( + ptr: *const u8, + rare1i: usize, + rare2i: usize, + firstchunk: V, + rare1chunk: V, + rare2chunk: V, +) -> Option { + let chunk0 = V::load_unaligned(ptr); + let chunk1 = V::load_unaligned(ptr.add(rare1i)); + let chunk2 = V::load_unaligned(ptr.add(rare2i)); + + let eq0 = chunk0.cmpeq(firstchunk); + let eq1 = chunk1.cmpeq(rare1chunk); + let eq2 = chunk2.cmpeq(rare2chunk); + + let match_offsets = eq0.and(eq1).and(eq2).movemask(); + if match_offsets == 0 { + return None; + } + Some(match_offsets.trailing_zeros() as usize) +} + +/// Accepts a chunk-relative offset and returns a haystack relative offset +/// after updating the prefilter state. +/// +/// Why do we use this unlineable function when a search completes? Well, +/// I don't know. Really. Obviously this function was not here initially. +/// When doing profiling, the codegen for the inner loop here looked bad and +/// I didn't know why. There were a couple extra 'add' instructions and an +/// extra 'lea' instruction that I couldn't explain. I hypothesized that the +/// optimizer was having trouble untangling the hot code in the loop from the +/// code that deals with a candidate match. By putting the latter into an +/// unlineable function, it kind of forces the issue and it had the intended +/// effect: codegen improved measurably. It's good for a ~10% improvement +/// across the board on the memmem/krate/prebuilt/huge-en/ benchmarks. +#[cold] +#[inline(never)] +fn matched( + prestate: &mut PrefilterState, + start_ptr: *const u8, + ptr: *const u8, + chunki: usize, +) -> usize { + let found = diff(ptr, start_ptr) + chunki; + prestate.update(found); + found +} + +/// Subtract `b` from `a` and return the difference. `a` must be greater than +/// or equal to `b`. +fn diff(a: *const u8, b: *const u8) -> usize { + debug_assert!(a >= b); + (a as usize) - (b as usize) +} diff --git a/third_party/rust/memchr/src/memmem/prefilter/mod.rs b/third_party/rust/memchr/src/memmem/prefilter/mod.rs new file mode 100644 index 0000000000..015d3b27af --- /dev/null +++ b/third_party/rust/memchr/src/memmem/prefilter/mod.rs @@ -0,0 +1,570 @@ +use crate::memmem::{rarebytes::RareNeedleBytes, NeedleInfo}; + +mod fallback; +#[cfg(memchr_runtime_simd)] +mod genericsimd; +#[cfg(all(not(miri), target_arch = "wasm32", memchr_runtime_simd))] +mod wasm; +#[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))] +mod x86; + +/// The maximum frequency rank permitted for the fallback prefilter. If the +/// rarest byte in the needle has a frequency rank above this value, then no +/// prefilter is used if the fallback prefilter would otherwise be selected. +const MAX_FALLBACK_RANK: usize = 250; + +/// A combination of prefilter effectiveness state, the prefilter function and +/// the needle info required to run a prefilter. +/// +/// For the most part, these are grouped into a single type for convenience, +/// instead of needing to pass around all three as distinct function +/// parameters. +pub(crate) struct Pre<'a> { + /// State that tracks the effectiveness of a prefilter. + pub(crate) state: &'a mut PrefilterState, + /// The actual prefilter function. + pub(crate) prefn: PrefilterFn, + /// Information about a needle, such as its RK hash and rare byte offsets. + pub(crate) ninfo: &'a NeedleInfo, +} + +impl<'a> Pre<'a> { + /// Call this prefilter on the given haystack with the given needle. + #[inline(always)] + pub(crate) fn call( + &mut self, + haystack: &[u8], + needle: &[u8], + ) -> Option { + self.prefn.call(self.state, self.ninfo, haystack, needle) + } + + /// Return true if and only if this prefilter should be used. + #[inline(always)] + pub(crate) fn should_call(&mut self) -> bool { + self.state.is_effective() + } +} + +/// A prefilter function. +/// +/// A prefilter function describes both forward and reverse searches. +/// (Although, we don't currently implement prefilters for reverse searching.) +/// In the case of a forward search, the position returned corresponds to +/// the starting offset of a match (confirmed or possible). Its minimum +/// value is `0`, and its maximum value is `haystack.len() - 1`. In the case +/// of a reverse search, the position returned corresponds to the position +/// immediately after a match (confirmed or possible). Its minimum value is `1` +/// and its maximum value is `haystack.len()`. +/// +/// In both cases, the position returned is the starting (or ending) point of a +/// _possible_ match. That is, returning a false positive is okay. A prefilter, +/// however, must never return any false negatives. That is, if a match exists +/// at a particular position `i`, then a prefilter _must_ return that position. +/// It cannot skip past it. +/// +/// # Safety +/// +/// A prefilter function is not safe to create, since not all prefilters are +/// safe to call in all contexts. (e.g., A prefilter that uses AVX instructions +/// may only be called on x86_64 CPUs with the relevant AVX feature enabled.) +/// Thus, callers must ensure that when a prefilter function is created that it +/// is safe to call for the current environment. +#[derive(Clone, Copy)] +pub(crate) struct PrefilterFn(PrefilterFnTy); + +/// The type of a prefilter function. All prefilters must satisfy this +/// signature. +/// +/// Using a function pointer like this does inhibit inlining, but it does +/// eliminate branching and the extra costs associated with copying a larger +/// enum. Note also, that using Box can't really work +/// here, since we want to work in contexts that don't have dynamic memory +/// allocation. Moreover, in the default configuration of this crate on x86_64 +/// CPUs released in the past ~decade, we will use an AVX2-optimized prefilter, +/// which generally won't be inlineable into the surrounding code anyway. +/// (Unless AVX2 is enabled at compile time, but this is typically rare, since +/// it produces a non-portable binary.) +pub(crate) type PrefilterFnTy = unsafe fn( + prestate: &mut PrefilterState, + ninfo: &NeedleInfo, + haystack: &[u8], + needle: &[u8], +) -> Option; + +// If the haystack is too small for SSE2, then just run memchr on the +// rarest byte and be done with it. (It is likely that this code path is +// rarely exercised, since a higher level routine will probably dispatch to +// Rabin-Karp for such a small haystack.) +#[cfg(memchr_runtime_simd)] +fn simple_memchr_fallback( + _prestate: &mut PrefilterState, + ninfo: &NeedleInfo, + haystack: &[u8], + needle: &[u8], +) -> Option { + let (rare, _) = ninfo.rarebytes.as_rare_ordered_usize(); + crate::memchr(needle[rare], haystack).map(|i| i.saturating_sub(rare)) +} + +impl PrefilterFn { + /// Create a new prefilter function from the function pointer given. + /// + /// # Safety + /// + /// Callers must ensure that the given prefilter function is safe to call + /// for all inputs in the current environment. For example, if the given + /// prefilter function uses AVX instructions, then the caller must ensure + /// that the appropriate AVX CPU features are enabled. + pub(crate) unsafe fn new(prefn: PrefilterFnTy) -> PrefilterFn { + PrefilterFn(prefn) + } + + /// Call the underlying prefilter function with the given arguments. + pub fn call( + self, + prestate: &mut PrefilterState, + ninfo: &NeedleInfo, + haystack: &[u8], + needle: &[u8], + ) -> Option { + // SAFETY: Callers have the burden of ensuring that a prefilter + // function is safe to call for all inputs in the current environment. + unsafe { (self.0)(prestate, ninfo, haystack, needle) } + } +} + +impl core::fmt::Debug for PrefilterFn { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + "".fmt(f) + } +} + +/// Prefilter controls whether heuristics are used to accelerate searching. +/// +/// A prefilter refers to the idea of detecting candidate matches very quickly, +/// and then confirming whether those candidates are full matches. This +/// idea can be quite effective since it's often the case that looking for +/// candidates can be a lot faster than running a complete substring search +/// over the entire input. Namely, looking for candidates can be done with +/// extremely fast vectorized code. +/// +/// The downside of a prefilter is that it assumes false positives (which are +/// candidates generated by a prefilter that aren't matches) are somewhat rare +/// relative to the frequency of full matches. That is, if a lot of false +/// positives are generated, then it's possible for search time to be worse +/// than if the prefilter wasn't enabled in the first place. +/// +/// Another downside of a prefilter is that it can result in highly variable +/// performance, where some cases are extraordinarily fast and others aren't. +/// Typically, variable performance isn't a problem, but it may be for your use +/// case. +/// +/// The use of prefilters in this implementation does use a heuristic to detect +/// when a prefilter might not be carrying its weight, and will dynamically +/// disable its use. Nevertheless, this configuration option gives callers +/// the ability to disable prefilters if you have knowledge that they won't be +/// useful. +#[derive(Clone, Copy, Debug)] +#[non_exhaustive] +pub enum Prefilter { + /// Never used a prefilter in substring search. + None, + /// Automatically detect whether a heuristic prefilter should be used. If + /// it is used, then heuristics will be used to dynamically disable the + /// prefilter if it is believed to not be carrying its weight. + Auto, +} + +impl Default for Prefilter { + fn default() -> Prefilter { + Prefilter::Auto + } +} + +impl Prefilter { + pub(crate) fn is_none(&self) -> bool { + match *self { + Prefilter::None => true, + _ => false, + } + } +} + +/// PrefilterState tracks state associated with the effectiveness of a +/// prefilter. It is used to track how many bytes, on average, are skipped by +/// the prefilter. If this average dips below a certain threshold over time, +/// then the state renders the prefilter inert and stops using it. +/// +/// A prefilter state should be created for each search. (Where creating an +/// iterator is treated as a single search.) A prefilter state should only be +/// created from a `Freqy`. e.g., An inert `Freqy` will produce an inert +/// `PrefilterState`. +#[derive(Clone, Debug)] +pub(crate) struct PrefilterState { + /// The number of skips that has been executed. This is always 1 greater + /// than the actual number of skips. The special sentinel value of 0 + /// indicates that the prefilter is inert. This is useful to avoid + /// additional checks to determine whether the prefilter is still + /// "effective." Once a prefilter becomes inert, it should no longer be + /// used (according to our heuristics). + skips: u32, + /// The total number of bytes that have been skipped. + skipped: u32, +} + +impl PrefilterState { + /// The minimum number of skip attempts to try before considering whether + /// a prefilter is effective or not. + const MIN_SKIPS: u32 = 50; + + /// The minimum amount of bytes that skipping must average. + /// + /// This value was chosen based on varying it and checking + /// the microbenchmarks. In particular, this can impact the + /// pathological/repeated-{huge,small} benchmarks quite a bit if it's set + /// too low. + const MIN_SKIP_BYTES: u32 = 8; + + /// Create a fresh prefilter state. + pub(crate) fn new() -> PrefilterState { + PrefilterState { skips: 1, skipped: 0 } + } + + /// Create a fresh prefilter state that is always inert. + pub(crate) fn inert() -> PrefilterState { + PrefilterState { skips: 0, skipped: 0 } + } + + /// Update this state with the number of bytes skipped on the last + /// invocation of the prefilter. + #[inline] + pub(crate) fn update(&mut self, skipped: usize) { + self.skips = self.skips.saturating_add(1); + // We need to do this dance since it's technically possible for + // `skipped` to overflow a `u32`. (And we use a `u32` to reduce the + // size of a prefilter state.) + if skipped > core::u32::MAX as usize { + self.skipped = core::u32::MAX; + } else { + self.skipped = self.skipped.saturating_add(skipped as u32); + } + } + + /// Return true if and only if this state indicates that a prefilter is + /// still effective. + #[inline] + pub(crate) fn is_effective(&mut self) -> bool { + if self.is_inert() { + return false; + } + if self.skips() < PrefilterState::MIN_SKIPS { + return true; + } + if self.skipped >= PrefilterState::MIN_SKIP_BYTES * self.skips() { + return true; + } + + // We're inert. + self.skips = 0; + false + } + + #[inline] + fn is_inert(&self) -> bool { + self.skips == 0 + } + + #[inline] + fn skips(&self) -> u32 { + self.skips.saturating_sub(1) + } +} + +/// Determine which prefilter function, if any, to use. +/// +/// This only applies to x86_64 when runtime SIMD detection is enabled (which +/// is the default). In general, we try to use an AVX prefilter, followed by +/// SSE and then followed by a generic one based on memchr. +#[inline(always)] +pub(crate) fn forward( + config: &Prefilter, + rare: &RareNeedleBytes, + needle: &[u8], +) -> Option { + if config.is_none() || needle.len() <= 1 { + return None; + } + + #[cfg(all(not(miri), target_arch = "x86_64", memchr_runtime_simd))] + { + #[cfg(feature = "std")] + { + if cfg!(memchr_runtime_avx) { + if is_x86_feature_detected!("avx2") { + // SAFETY: x86::avx::find only requires the avx2 feature, + // which we've just checked above. + return unsafe { Some(PrefilterFn::new(x86::avx::find)) }; + } + } + } + if cfg!(memchr_runtime_sse2) { + // SAFETY: x86::sse::find only requires the sse2 feature, which is + // guaranteed to be available on x86_64. + return unsafe { Some(PrefilterFn::new(x86::sse::find)) }; + } + } + #[cfg(all(not(miri), target_arch = "wasm32", memchr_runtime_simd))] + { + // SAFETY: `wasm::find` is actually a safe function + // + // Also note that the `if true` is here to prevent, on wasm with simd, + // rustc warning about the code below being dead code. + if true { + return unsafe { Some(PrefilterFn::new(wasm::find)) }; + } + } + // Check that our rarest byte has a reasonably low rank. The main issue + // here is that the fallback prefilter can perform pretty poorly if it's + // given common bytes. So we try to avoid the worst cases here. + let (rare1_rank, _) = rare.as_ranks(needle); + if rare1_rank <= MAX_FALLBACK_RANK { + // SAFETY: fallback::find is safe to call in all environments. + return unsafe { Some(PrefilterFn::new(fallback::find)) }; + } + None +} + +/// Return the minimum length of the haystack in which a prefilter should be +/// used. If the haystack is below this length, then it's probably not worth +/// the overhead of running the prefilter. +/// +/// We used to look at the length of a haystack here. That is, if it was too +/// small, then don't bother with the prefilter. But two things changed: +/// the prefilter falls back to memchr for small haystacks, and, at the +/// meta-searcher level, Rabin-Karp is employed for tiny haystacks anyway. +/// +/// We keep it around for now in case we want to bring it back. +#[allow(dead_code)] +pub(crate) fn minimum_len(_haystack: &[u8], needle: &[u8]) -> usize { + // If the haystack length isn't greater than needle.len() * FACTOR, then + // no prefilter will be used. The presumption here is that since there + // are so few bytes to check, it's not worth running the prefilter since + // there will need to be a validation step anyway. Thus, the prefilter is + // largely redundant work. + // + // Increase the factor noticeably hurts the + // memmem/krate/prebuilt/teeny-*/never-john-watson benchmarks. + const PREFILTER_LENGTH_FACTOR: usize = 2; + const VECTOR_MIN_LENGTH: usize = 16; + let min = core::cmp::max( + VECTOR_MIN_LENGTH, + PREFILTER_LENGTH_FACTOR * needle.len(), + ); + // For haystacks with length==min, we still want to avoid the prefilter, + // so add 1. + min + 1 +} + +#[cfg(all(test, feature = "std", not(miri)))] +pub(crate) mod tests { + use std::convert::{TryFrom, TryInto}; + + use super::*; + use crate::memmem::{ + prefilter::PrefilterFnTy, rabinkarp, rarebytes::RareNeedleBytes, + }; + + // Below is a small jig that generates prefilter tests. The main purpose + // of this jig is to generate tests of varying needle/haystack lengths + // in order to try and exercise all code paths in our prefilters. And in + // particular, this is especially important for vectorized prefilters where + // certain code paths might only be exercised at certain lengths. + + /// A test that represents the input and expected output to a prefilter + /// function. The test should be able to run with any prefilter function + /// and get the expected output. + pub(crate) struct PrefilterTest { + // These fields represent the inputs and expected output of a forwards + // prefilter function. + pub(crate) ninfo: NeedleInfo, + pub(crate) haystack: Vec, + pub(crate) needle: Vec, + pub(crate) output: Option, + } + + impl PrefilterTest { + /// Run all generated forward prefilter tests on the given prefn. + /// + /// # Safety + /// + /// Callers must ensure that the given prefilter function pointer is + /// safe to call for all inputs in the current environment. + pub(crate) unsafe fn run_all_tests(prefn: PrefilterFnTy) { + PrefilterTest::run_all_tests_filter(prefn, |_| true) + } + + /// Run all generated forward prefilter tests that pass the given + /// predicate on the given prefn. + /// + /// # Safety + /// + /// Callers must ensure that the given prefilter function pointer is + /// safe to call for all inputs in the current environment. + pub(crate) unsafe fn run_all_tests_filter( + prefn: PrefilterFnTy, + mut predicate: impl FnMut(&PrefilterTest) -> bool, + ) { + for seed in PREFILTER_TEST_SEEDS { + for test in seed.generate() { + if predicate(&test) { + test.run(prefn); + } + } + } + } + + /// Create a new prefilter test from a seed and some chose offsets to + /// rare bytes in the seed's needle. + /// + /// If a valid test could not be constructed, then None is returned. + /// (Currently, we take the approach of massaging tests to be valid + /// instead of rejecting them outright.) + fn new( + seed: PrefilterTestSeed, + rare1i: usize, + rare2i: usize, + haystack_len: usize, + needle_len: usize, + output: Option, + ) -> Option { + let mut rare1i: u8 = rare1i.try_into().unwrap(); + let mut rare2i: u8 = rare2i.try_into().unwrap(); + // The '#' byte is never used in a haystack (unless we're expecting + // a match), while the '@' byte is never used in a needle. + let mut haystack = vec![b'@'; haystack_len]; + let mut needle = vec![b'#'; needle_len]; + needle[0] = seed.first; + needle[rare1i as usize] = seed.rare1; + needle[rare2i as usize] = seed.rare2; + // If we're expecting a match, then make sure the needle occurs + // in the haystack at the expected position. + if let Some(i) = output { + haystack[i..i + needle.len()].copy_from_slice(&needle); + } + // If the operations above lead to rare offsets pointing to the + // non-first occurrence of a byte, then adjust it. This might lead + // to redundant tests, but it's simpler than trying to change the + // generation process I think. + if let Some(i) = crate::memchr(seed.rare1, &needle) { + rare1i = u8::try_from(i).unwrap(); + } + if let Some(i) = crate::memchr(seed.rare2, &needle) { + rare2i = u8::try_from(i).unwrap(); + } + let ninfo = NeedleInfo { + rarebytes: RareNeedleBytes::new(rare1i, rare2i), + nhash: rabinkarp::NeedleHash::forward(&needle), + }; + Some(PrefilterTest { ninfo, haystack, needle, output }) + } + + /// Run this specific test on the given prefilter function. If the + /// outputs do no match, then this routine panics with a failure + /// message. + /// + /// # Safety + /// + /// Callers must ensure that the given prefilter function pointer is + /// safe to call for all inputs in the current environment. + unsafe fn run(&self, prefn: PrefilterFnTy) { + let mut prestate = PrefilterState::new(); + assert_eq!( + self.output, + prefn( + &mut prestate, + &self.ninfo, + &self.haystack, + &self.needle + ), + "ninfo: {:?}, haystack(len={}): {:?}, needle(len={}): {:?}", + self.ninfo, + self.haystack.len(), + std::str::from_utf8(&self.haystack).unwrap(), + self.needle.len(), + std::str::from_utf8(&self.needle).unwrap(), + ); + } + } + + /// A set of prefilter test seeds. Each seed serves as the base for the + /// generation of many other tests. In essence, the seed captures the + /// "rare" and first bytes among our needle. The tests generated from each + /// seed essentially vary the length of the needle and haystack, while + /// using the rare/first byte configuration from the seed. + /// + /// The purpose of this is to test many different needle/haystack lengths. + /// In particular, some of the vector optimizations might only have bugs + /// in haystacks of a certain size. + const PREFILTER_TEST_SEEDS: &[PrefilterTestSeed] = &[ + PrefilterTestSeed { first: b'x', rare1: b'y', rare2: b'z' }, + PrefilterTestSeed { first: b'x', rare1: b'x', rare2: b'z' }, + PrefilterTestSeed { first: b'x', rare1: b'y', rare2: b'x' }, + PrefilterTestSeed { first: b'x', rare1: b'x', rare2: b'x' }, + PrefilterTestSeed { first: b'x', rare1: b'y', rare2: b'y' }, + ]; + + /// Data that describes a single prefilter test seed. + #[derive(Clone, Copy)] + struct PrefilterTestSeed { + first: u8, + rare1: u8, + rare2: u8, + } + + impl PrefilterTestSeed { + /// Generate a series of prefilter tests from this seed. + fn generate(self) -> impl Iterator { + let len_start = 2; + // The iterator below generates *a lot* of tests. The number of + // tests was chosen somewhat empirically to be "bearable" when + // running the test suite. + // + // We use an iterator here because the collective haystacks of all + // these test cases add up to enough memory to OOM a conservative + // sandbox or a small laptop. + (len_start..=40).flat_map(move |needle_len| { + let rare_start = len_start - 1; + (rare_start..needle_len).flat_map(move |rare1i| { + (rare1i..needle_len).flat_map(move |rare2i| { + (needle_len..=66).flat_map(move |haystack_len| { + PrefilterTest::new( + self, + rare1i, + rare2i, + haystack_len, + needle_len, + None, + ) + .into_iter() + .chain( + (0..=(haystack_len - needle_len)).flat_map( + move |output| { + PrefilterTest::new( + self, + rare1i, + rare2i, + haystack_len, + needle_len, + Some(output), + ) + }, + ), + ) + }) + }) + }) + }) + } + } +} diff --git a/third_party/rust/memchr/src/memmem/prefilter/wasm.rs b/third_party/rust/memchr/src/memmem/prefilter/wasm.rs new file mode 100644 index 0000000000..5470c922a3 --- /dev/null +++ b/third_party/rust/memchr/src/memmem/prefilter/wasm.rs @@ -0,0 +1,39 @@ +use core::arch::wasm32::v128; + +use crate::memmem::{ + prefilter::{PrefilterFnTy, PrefilterState}, + NeedleInfo, +}; + +// Check that the functions below satisfy the Prefilter function type. +const _: PrefilterFnTy = find; + +/// A `v128`-accelerated candidate finder for single-substring search. +#[target_feature(enable = "simd128")] +pub(crate) fn find( + prestate: &mut PrefilterState, + ninfo: &NeedleInfo, + haystack: &[u8], + needle: &[u8], +) -> Option { + unsafe { + super::genericsimd::find::( + prestate, + ninfo, + haystack, + needle, + super::simple_memchr_fallback, + ) + } +} + +#[cfg(all(test, feature = "std"))] +mod tests { + #[test] + #[cfg(not(miri))] + fn prefilter_permutations() { + use crate::memmem::prefilter::tests::PrefilterTest; + // SAFETY: super::find is safe to call for all inputs on x86. + unsafe { PrefilterTest::run_all_tests(super::find) }; + } +} diff --git a/third_party/rust/memchr/src/memmem/prefilter/x86/avx.rs b/third_party/rust/memchr/src/memmem/prefilter/x86/avx.rs new file mode 100644 index 0000000000..fb11f335ba --- /dev/null +++ b/third_party/rust/memchr/src/memmem/prefilter/x86/avx.rs @@ -0,0 +1,46 @@ +use core::arch::x86_64::__m256i; + +use crate::memmem::{ + prefilter::{PrefilterFnTy, PrefilterState}, + NeedleInfo, +}; + +// Check that the functions below satisfy the Prefilter function type. +const _: PrefilterFnTy = find; + +/// An AVX2 accelerated candidate finder for single-substring search. +/// +/// # Safety +/// +/// Callers must ensure that the avx2 CPU feature is enabled in the current +/// environment. +#[target_feature(enable = "avx2")] +pub(crate) unsafe fn find( + prestate: &mut PrefilterState, + ninfo: &NeedleInfo, + haystack: &[u8], + needle: &[u8], +) -> Option { + super::super::genericsimd::find::<__m256i>( + prestate, + ninfo, + haystack, + needle, + super::sse::find, + ) +} + +#[cfg(test)] +mod tests { + #[test] + #[cfg(not(miri))] + fn prefilter_permutations() { + use crate::memmem::prefilter::tests::PrefilterTest; + if !is_x86_feature_detected!("avx2") { + return; + } + // SAFETY: The safety of super::find only requires that the current + // CPU support AVX2, which we checked above. + unsafe { PrefilterTest::run_all_tests(super::find) }; + } +} diff --git a/third_party/rust/memchr/src/memmem/prefilter/x86/mod.rs b/third_party/rust/memchr/src/memmem/prefilter/x86/mod.rs new file mode 100644 index 0000000000..91381e5162 --- /dev/null +++ b/third_party/rust/memchr/src/memmem/prefilter/x86/mod.rs @@ -0,0 +1,5 @@ +// We only use AVX when we can detect at runtime whether it's available, which +// requires std. +#[cfg(feature = "std")] +pub(crate) mod avx; +pub(crate) mod sse; diff --git a/third_party/rust/memchr/src/memmem/prefilter/x86/sse.rs b/third_party/rust/memchr/src/memmem/prefilter/x86/sse.rs new file mode 100644 index 0000000000..b1c48e1e1c --- /dev/null +++ b/third_party/rust/memchr/src/memmem/prefilter/x86/sse.rs @@ -0,0 +1,42 @@ +use core::arch::x86_64::__m128i; + +use crate::memmem::{ + prefilter::{PrefilterFnTy, PrefilterState}, + NeedleInfo, +}; + +// Check that the functions below satisfy the Prefilter function type. +const _: PrefilterFnTy = find; + +/// An SSE2 accelerated candidate finder for single-substring search. +/// +/// # Safety +/// +/// Callers must ensure that the sse2 CPU feature is enabled in the current +/// environment. This feature should be enabled in all x86_64 targets. +#[target_feature(enable = "sse2")] +pub(crate) unsafe fn find( + prestate: &mut PrefilterState, + ninfo: &NeedleInfo, + haystack: &[u8], + needle: &[u8], +) -> Option { + super::super::genericsimd::find::<__m128i>( + prestate, + ninfo, + haystack, + needle, + super::super::simple_memchr_fallback, + ) +} + +#[cfg(all(test, feature = "std"))] +mod tests { + #[test] + #[cfg(not(miri))] + fn prefilter_permutations() { + use crate::memmem::prefilter::tests::PrefilterTest; + // SAFETY: super::find is safe to call for all inputs on x86. + unsafe { PrefilterTest::run_all_tests(super::find) }; + } +} diff --git a/third_party/rust/memchr/src/memmem/rabinkarp.rs b/third_party/rust/memchr/src/memmem/rabinkarp.rs new file mode 100644 index 0000000000..daa4015d5f --- /dev/null +++ b/third_party/rust/memchr/src/memmem/rabinkarp.rs @@ -0,0 +1,233 @@ +/* +This module implements the classical Rabin-Karp substring search algorithm, +with no extra frills. While its use would seem to break our time complexity +guarantee of O(m+n) (RK's time complexity is O(mn)), we are careful to only +ever use RK on a constant subset of haystacks. The main point here is that +RK has good latency properties for small needles/haystacks. It's very quick +to compute a needle hash and zip through the haystack when compared to +initializing Two-Way, for example. And this is especially useful for cases +where the haystack is just too short for vector instructions to do much good. + +The hashing function used here is the same one recommended by ESMAJ. + +Another choice instead of Rabin-Karp would be Shift-Or. But its latency +isn't quite as good since its preprocessing time is a bit more expensive +(both in practice and in theory). However, perhaps Shift-Or has a place +somewhere else for short patterns. I think the main problem is that it +requires space proportional to the alphabet and the needle. If we, for +example, supported needles up to length 16, then the total table size would be +len(alphabet)*size_of::()==512 bytes. Which isn't exactly small, and it's +probably bad to put that on the stack. So ideally, we'd throw it on the heap, +but we'd really like to write as much code without using alloc/std as possible. +But maybe it's worth the special casing. It's a TODO to benchmark. + +Wikipedia has a decent explanation, if a bit heavy on the theory: +https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm + +But ESMAJ provides something a bit more concrete: +http://www-igm.univ-mlv.fr/~lecroq/string/node5.html + +Finally, aho-corasick uses Rabin-Karp for multiple pattern match in some cases: +https://github.com/BurntSushi/aho-corasick/blob/3852632f10587db0ff72ef29e88d58bf305a0946/src/packed/rabinkarp.rs +*/ + +/// Whether RK is believed to be very fast for the given needle/haystack. +pub(crate) fn is_fast(haystack: &[u8], _needle: &[u8]) -> bool { + haystack.len() < 16 +} + +/// Search for the first occurrence of needle in haystack using Rabin-Karp. +pub(crate) fn find(haystack: &[u8], needle: &[u8]) -> Option { + find_with(&NeedleHash::forward(needle), haystack, needle) +} + +/// Search for the first occurrence of needle in haystack using Rabin-Karp with +/// a pre-computed needle hash. +pub(crate) fn find_with( + nhash: &NeedleHash, + mut haystack: &[u8], + needle: &[u8], +) -> Option { + if haystack.len() < needle.len() { + return None; + } + let start = haystack.as_ptr() as usize; + let mut hash = Hash::from_bytes_fwd(&haystack[..needle.len()]); + // N.B. I've experimented with unrolling this loop, but couldn't realize + // any obvious gains. + loop { + if nhash.eq(hash) && is_prefix(haystack, needle) { + return Some(haystack.as_ptr() as usize - start); + } + if needle.len() >= haystack.len() { + return None; + } + hash.roll(&nhash, haystack[0], haystack[needle.len()]); + haystack = &haystack[1..]; + } +} + +/// Search for the last occurrence of needle in haystack using Rabin-Karp. +pub(crate) fn rfind(haystack: &[u8], needle: &[u8]) -> Option { + rfind_with(&NeedleHash::reverse(needle), haystack, needle) +} + +/// Search for the last occurrence of needle in haystack using Rabin-Karp with +/// a pre-computed needle hash. +pub(crate) fn rfind_with( + nhash: &NeedleHash, + mut haystack: &[u8], + needle: &[u8], +) -> Option { + if haystack.len() < needle.len() { + return None; + } + let mut hash = + Hash::from_bytes_rev(&haystack[haystack.len() - needle.len()..]); + loop { + if nhash.eq(hash) && is_suffix(haystack, needle) { + return Some(haystack.len() - needle.len()); + } + if needle.len() >= haystack.len() { + return None; + } + hash.roll( + &nhash, + haystack[haystack.len() - 1], + haystack[haystack.len() - needle.len() - 1], + ); + haystack = &haystack[..haystack.len() - 1]; + } +} + +/// A hash derived from a needle. +#[derive(Clone, Copy, Debug, Default)] +pub(crate) struct NeedleHash { + /// The actual hash. + hash: Hash, + /// The factor needed to multiply a byte by in order to subtract it from + /// the hash. It is defined to be 2^(n-1) (using wrapping exponentiation), + /// where n is the length of the needle. This is how we "remove" a byte + /// from the hash once the hash window rolls past it. + hash_2pow: u32, +} + +impl NeedleHash { + /// Create a new Rabin-Karp hash for the given needle for use in forward + /// searching. + pub(crate) fn forward(needle: &[u8]) -> NeedleHash { + let mut nh = NeedleHash { hash: Hash::new(), hash_2pow: 1 }; + if needle.is_empty() { + return nh; + } + nh.hash.add(needle[0]); + for &b in needle.iter().skip(1) { + nh.hash.add(b); + nh.hash_2pow = nh.hash_2pow.wrapping_shl(1); + } + nh + } + + /// Create a new Rabin-Karp hash for the given needle for use in reverse + /// searching. + pub(crate) fn reverse(needle: &[u8]) -> NeedleHash { + let mut nh = NeedleHash { hash: Hash::new(), hash_2pow: 1 }; + if needle.is_empty() { + return nh; + } + nh.hash.add(needle[needle.len() - 1]); + for &b in needle.iter().rev().skip(1) { + nh.hash.add(b); + nh.hash_2pow = nh.hash_2pow.wrapping_shl(1); + } + nh + } + + /// Return true if the hashes are equivalent. + fn eq(&self, hash: Hash) -> bool { + self.hash == hash + } +} + +/// A Rabin-Karp hash. This might represent the hash of a needle, or the hash +/// of a rolling window in the haystack. +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub(crate) struct Hash(u32); + +impl Hash { + /// Create a new hash that represents the empty string. + pub(crate) fn new() -> Hash { + Hash(0) + } + + /// Create a new hash from the bytes given for use in forward searches. + pub(crate) fn from_bytes_fwd(bytes: &[u8]) -> Hash { + let mut hash = Hash::new(); + for &b in bytes { + hash.add(b); + } + hash + } + + /// Create a new hash from the bytes given for use in reverse searches. + fn from_bytes_rev(bytes: &[u8]) -> Hash { + let mut hash = Hash::new(); + for &b in bytes.iter().rev() { + hash.add(b); + } + hash + } + + /// Add 'new' and remove 'old' from this hash. The given needle hash should + /// correspond to the hash computed for the needle being searched for. + /// + /// This is meant to be used when the rolling window of the haystack is + /// advanced. + fn roll(&mut self, nhash: &NeedleHash, old: u8, new: u8) { + self.del(nhash, old); + self.add(new); + } + + /// Add a byte to this hash. + fn add(&mut self, byte: u8) { + self.0 = self.0.wrapping_shl(1).wrapping_add(byte as u32); + } + + /// Remove a byte from this hash. The given needle hash should correspond + /// to the hash computed for the needle being searched for. + fn del(&mut self, nhash: &NeedleHash, byte: u8) { + let factor = nhash.hash_2pow; + self.0 = self.0.wrapping_sub((byte as u32).wrapping_mul(factor)); + } +} + +/// Returns true if the given needle is a prefix of the given haystack. +/// +/// We forcefully don't inline the is_prefix call and hint at the compiler that +/// it is unlikely to be called. This causes the inner rabinkarp loop above +/// to be a bit tighter and leads to some performance improvement. See the +/// memmem/krate/prebuilt/sliceslice-words/words benchmark. +#[cold] +#[inline(never)] +fn is_prefix(haystack: &[u8], needle: &[u8]) -> bool { + crate::memmem::util::is_prefix(haystack, needle) +} + +/// Returns true if the given needle is a suffix of the given haystack. +/// +/// See is_prefix for why this is forcefully not inlined. +#[cold] +#[inline(never)] +fn is_suffix(haystack: &[u8], needle: &[u8]) -> bool { + crate::memmem::util::is_suffix(haystack, needle) +} + +#[cfg(test)] +mod simpletests { + define_memmem_simple_tests!(super::find, super::rfind); +} + +#[cfg(all(test, feature = "std", not(miri)))] +mod proptests { + define_memmem_quickcheck_tests!(super::find, super::rfind); +} diff --git a/third_party/rust/memchr/src/memmem/rarebytes.rs b/third_party/rust/memchr/src/memmem/rarebytes.rs new file mode 100644 index 0000000000..fb33f68945 --- /dev/null +++ b/third_party/rust/memchr/src/memmem/rarebytes.rs @@ -0,0 +1,136 @@ +/// A heuristic frequency based detection of rare bytes for substring search. +/// +/// This detector attempts to pick out two bytes in a needle that are predicted +/// to occur least frequently. The purpose is to use these bytes to implement +/// fast candidate search using vectorized code. +/// +/// A set of offsets is only computed for needles of length 2 or greater. +/// Smaller needles should be special cased by the substring search algorithm +/// in use. (e.g., Use memchr for single byte needles.) +/// +/// Note that we use `u8` to represent the offsets of the rare bytes in a +/// needle to reduce space usage. This means that rare byte occurring after the +/// first 255 bytes in a needle will never be used. +#[derive(Clone, Copy, Debug, Default)] +pub(crate) struct RareNeedleBytes { + /// The leftmost offset of the rarest byte in the needle, according to + /// pre-computed frequency analysis. The "leftmost offset" means that + /// rare1i <= i for all i where needle[i] == needle[rare1i]. + rare1i: u8, + /// The leftmost offset of the second rarest byte in the needle, according + /// to pre-computed frequency analysis. The "leftmost offset" means that + /// rare2i <= i for all i where needle[i] == needle[rare2i]. + /// + /// The second rarest byte is used as a type of guard for quickly detecting + /// a mismatch if the first byte matches. This is a hedge against + /// pathological cases where the pre-computed frequency analysis may be + /// off. (But of course, does not prevent *all* pathological cases.) + /// + /// In general, rare1i != rare2i by construction, although there is no hard + /// requirement that they be different. However, since the case of a single + /// byte needle is handled specially by memchr itself, rare2i generally + /// always should be different from rare1i since it would otherwise be + /// ineffective as a guard. + rare2i: u8, +} + +impl RareNeedleBytes { + /// Create a new pair of rare needle bytes with the given offsets. This is + /// only used in tests for generating input data. + #[cfg(all(test, feature = "std"))] + pub(crate) fn new(rare1i: u8, rare2i: u8) -> RareNeedleBytes { + RareNeedleBytes { rare1i, rare2i } + } + + /// Detect the leftmost offsets of the two rarest bytes in the given + /// needle. + pub(crate) fn forward(needle: &[u8]) -> RareNeedleBytes { + if needle.len() <= 1 || needle.len() > core::u8::MAX as usize { + // For needles bigger than u8::MAX, our offsets aren't big enough. + // (We make our offsets small to reduce stack copying.) + // If you have a use case for it, please file an issue. In that + // case, we should probably just adjust the routine below to pick + // some rare bytes from the first 255 bytes of the needle. + // + // Also note that for needles of size 0 or 1, they are special + // cased in Two-Way. + // + // TODO: Benchmar this. + return RareNeedleBytes { rare1i: 0, rare2i: 0 }; + } + + // Find the rarest two bytes. We make them distinct by construction. + let (mut rare1, mut rare1i) = (needle[0], 0); + let (mut rare2, mut rare2i) = (needle[1], 1); + if rank(rare2) < rank(rare1) { + core::mem::swap(&mut rare1, &mut rare2); + core::mem::swap(&mut rare1i, &mut rare2i); + } + for (i, &b) in needle.iter().enumerate().skip(2) { + if rank(b) < rank(rare1) { + rare2 = rare1; + rare2i = rare1i; + rare1 = b; + rare1i = i as u8; + } else if b != rare1 && rank(b) < rank(rare2) { + rare2 = b; + rare2i = i as u8; + } + } + // While not strictly required, we really don't want these to be + // equivalent. If they were, it would reduce the effectiveness of + // candidate searching using these rare bytes by increasing the rate of + // false positives. + assert_ne!(rare1i, rare2i); + RareNeedleBytes { rare1i, rare2i } + } + + /// Return the rare bytes in the given needle in the forward direction. + /// The needle given must be the same one given to the RareNeedleBytes + /// constructor. + pub(crate) fn as_rare_bytes(&self, needle: &[u8]) -> (u8, u8) { + (needle[self.rare1i as usize], needle[self.rare2i as usize]) + } + + /// Return the rare offsets such that the first offset is always <= to the + /// second offset. This is useful when the caller doesn't care whether + /// rare1 is rarer than rare2, but just wants to ensure that they are + /// ordered with respect to one another. + #[cfg(memchr_runtime_simd)] + pub(crate) fn as_rare_ordered_usize(&self) -> (usize, usize) { + let (rare1i, rare2i) = self.as_rare_ordered_u8(); + (rare1i as usize, rare2i as usize) + } + + /// Like as_rare_ordered_usize, but returns the offsets as their native + /// u8 values. + #[cfg(memchr_runtime_simd)] + pub(crate) fn as_rare_ordered_u8(&self) -> (u8, u8) { + if self.rare1i <= self.rare2i { + (self.rare1i, self.rare2i) + } else { + (self.rare2i, self.rare1i) + } + } + + /// Return the rare offsets as usize values in the order in which they were + /// constructed. rare1, for example, is constructed as the "rarer" byte, + /// and thus, callers may want to treat it differently from rare2. + pub(crate) fn as_rare_usize(&self) -> (usize, usize) { + (self.rare1i as usize, self.rare2i as usize) + } + + /// Return the byte frequency rank of each byte. The higher the rank, the + /// more frequency the byte is predicted to be. The needle given must be + /// the same one given to the RareNeedleBytes constructor. + pub(crate) fn as_ranks(&self, needle: &[u8]) -> (usize, usize) { + let (b1, b2) = self.as_rare_bytes(needle); + (rank(b1), rank(b2)) + } +} + +/// Return the heuristical frequency rank of the given byte. A lower rank +/// means the byte is believed to occur less frequently. +fn rank(b: u8) -> usize { + crate::memmem::byte_frequencies::BYTE_FREQUENCIES[b as usize] as usize +} diff --git a/third_party/rust/memchr/src/memmem/twoway.rs b/third_party/rust/memchr/src/memmem/twoway.rs new file mode 100644 index 0000000000..7f82ed15d9 --- /dev/null +++ b/third_party/rust/memchr/src/memmem/twoway.rs @@ -0,0 +1,878 @@ +use core::cmp; + +use crate::memmem::{prefilter::Pre, util}; + +/// Two-Way search in the forward direction. +#[derive(Clone, Copy, Debug)] +pub(crate) struct Forward(TwoWay); + +/// Two-Way search in the reverse direction. +#[derive(Clone, Copy, Debug)] +pub(crate) struct Reverse(TwoWay); + +/// An implementation of the TwoWay substring search algorithm, with heuristics +/// for accelerating search based on frequency analysis. +/// +/// This searcher supports forward and reverse search, although not +/// simultaneously. It runs in O(n + m) time and O(1) space, where +/// `n ~ len(needle)` and `m ~ len(haystack)`. +/// +/// The implementation here roughly matches that which was developed by +/// Crochemore and Perrin in their 1991 paper "Two-way string-matching." The +/// changes in this implementation are 1) the use of zero-based indices, 2) a +/// heuristic skip table based on the last byte (borrowed from Rust's standard +/// library) and 3) the addition of heuristics for a fast skip loop. That is, +/// (3) this will detect bytes that are believed to be rare in the needle and +/// use fast vectorized instructions to find their occurrences quickly. The +/// Two-Way algorithm is then used to confirm whether a match at that location +/// occurred. +/// +/// The heuristic for fast skipping is automatically shut off if it's +/// detected to be ineffective at search time. Generally, this only occurs in +/// pathological cases. But this is generally necessary in order to preserve +/// a `O(n + m)` time bound. +/// +/// The code below is fairly complex and not obviously correct at all. It's +/// likely necessary to read the Two-Way paper cited above in order to fully +/// grok this code. The essence of it is: +/// +/// 1) Do something to detect a "critical" position in the needle. +/// 2) For the current position in the haystack, look if needle[critical..] +/// matches at that position. +/// 3) If so, look if needle[..critical] matches. +/// 4) If a mismatch occurs, shift the search by some amount based on the +/// critical position and a pre-computed shift. +/// +/// This type is wrapped in Forward and Reverse types that expose consistent +/// forward or reverse APIs. +#[derive(Clone, Copy, Debug)] +struct TwoWay { + /// A small bitset used as a quick prefilter (in addition to the faster + /// SIMD based prefilter). Namely, a bit 'i' is set if and only if b%64==i + /// for any b in the needle. + /// + /// When used as a prefilter, if the last byte at the current candidate + /// position is NOT in this set, then we can skip that entire candidate + /// position (the length of the needle). This is essentially the shift + /// trick found in Boyer-Moore, but only applied to bytes that don't appear + /// in the needle. + /// + /// N.B. This trick was inspired by something similar in std's + /// implementation of Two-Way. + byteset: ApproximateByteSet, + /// A critical position in needle. Specifically, this position corresponds + /// to beginning of either the minimal or maximal suffix in needle. (N.B. + /// See SuffixType below for why "minimal" isn't quite the correct word + /// here.) + /// + /// This is the position at which every search begins. Namely, search + /// starts by scanning text to the right of this position, and only if + /// there's a match does the text to the left of this position get scanned. + critical_pos: usize, + /// The amount we shift by in the Two-Way search algorithm. This + /// corresponds to the "small period" and "large period" cases. + shift: Shift, +} + +impl Forward { + /// Create a searcher that uses the Two-Way algorithm by searching forwards + /// through any haystack. + pub(crate) fn new(needle: &[u8]) -> Forward { + if needle.is_empty() { + return Forward(TwoWay::empty()); + } + + let byteset = ApproximateByteSet::new(needle); + let min_suffix = Suffix::forward(needle, SuffixKind::Minimal); + let max_suffix = Suffix::forward(needle, SuffixKind::Maximal); + let (period_lower_bound, critical_pos) = + if min_suffix.pos > max_suffix.pos { + (min_suffix.period, min_suffix.pos) + } else { + (max_suffix.period, max_suffix.pos) + }; + let shift = Shift::forward(needle, period_lower_bound, critical_pos); + Forward(TwoWay { byteset, critical_pos, shift }) + } + + /// Find the position of the first occurrence of this searcher's needle in + /// the given haystack. If one does not exist, then return None. + /// + /// This accepts prefilter state that is useful when using the same + /// searcher multiple times, such as in an iterator. + /// + /// Callers must guarantee that the needle is non-empty and its length is + /// <= the haystack's length. + #[inline(always)] + pub(crate) fn find( + &self, + pre: Option<&mut Pre<'_>>, + haystack: &[u8], + needle: &[u8], + ) -> Option { + debug_assert!(!needle.is_empty(), "needle should not be empty"); + debug_assert!(needle.len() <= haystack.len(), "haystack too short"); + + match self.0.shift { + Shift::Small { period } => { + self.find_small_imp(pre, haystack, needle, period) + } + Shift::Large { shift } => { + self.find_large_imp(pre, haystack, needle, shift) + } + } + } + + /// Like find, but handles the degenerate substring test cases. This is + /// only useful for conveniently testing this substring implementation in + /// isolation. + #[cfg(test)] + fn find_general( + &self, + pre: Option<&mut Pre<'_>>, + haystack: &[u8], + needle: &[u8], + ) -> Option { + if needle.is_empty() { + Some(0) + } else if haystack.len() < needle.len() { + None + } else { + self.find(pre, haystack, needle) + } + } + + // Each of the two search implementations below can be accelerated by a + // prefilter, but it is not always enabled. To avoid its overhead when + // its disabled, we explicitly inline each search implementation based on + // whether a prefilter will be used or not. The decision on which to use + // is made in the parent meta searcher. + + #[inline(always)] + fn find_small_imp( + &self, + mut pre: Option<&mut Pre<'_>>, + haystack: &[u8], + needle: &[u8], + period: usize, + ) -> Option { + let last_byte = needle.len() - 1; + let mut pos = 0; + let mut shift = 0; + while pos + needle.len() <= haystack.len() { + let mut i = cmp::max(self.0.critical_pos, shift); + if let Some(pre) = pre.as_mut() { + if pre.should_call() { + pos += pre.call(&haystack[pos..], needle)?; + shift = 0; + i = self.0.critical_pos; + if pos + needle.len() > haystack.len() { + return None; + } + } + } + if !self.0.byteset.contains(haystack[pos + last_byte]) { + pos += needle.len(); + shift = 0; + continue; + } + while i < needle.len() && needle[i] == haystack[pos + i] { + i += 1; + } + if i < needle.len() { + pos += i - self.0.critical_pos + 1; + shift = 0; + } else { + let mut j = self.0.critical_pos; + while j > shift && needle[j] == haystack[pos + j] { + j -= 1; + } + if j <= shift && needle[shift] == haystack[pos + shift] { + return Some(pos); + } + pos += period; + shift = needle.len() - period; + } + } + None + } + + #[inline(always)] + fn find_large_imp( + &self, + mut pre: Option<&mut Pre<'_>>, + haystack: &[u8], + needle: &[u8], + shift: usize, + ) -> Option { + let last_byte = needle.len() - 1; + let mut pos = 0; + 'outer: while pos + needle.len() <= haystack.len() { + if let Some(pre) = pre.as_mut() { + if pre.should_call() { + pos += pre.call(&haystack[pos..], needle)?; + if pos + needle.len() > haystack.len() { + return None; + } + } + } + + if !self.0.byteset.contains(haystack[pos + last_byte]) { + pos += needle.len(); + continue; + } + let mut i = self.0.critical_pos; + while i < needle.len() && needle[i] == haystack[pos + i] { + i += 1; + } + if i < needle.len() { + pos += i - self.0.critical_pos + 1; + } else { + for j in (0..self.0.critical_pos).rev() { + if needle[j] != haystack[pos + j] { + pos += shift; + continue 'outer; + } + } + return Some(pos); + } + } + None + } +} + +impl Reverse { + /// Create a searcher that uses the Two-Way algorithm by searching in + /// reverse through any haystack. + pub(crate) fn new(needle: &[u8]) -> Reverse { + if needle.is_empty() { + return Reverse(TwoWay::empty()); + } + + let byteset = ApproximateByteSet::new(needle); + let min_suffix = Suffix::reverse(needle, SuffixKind::Minimal); + let max_suffix = Suffix::reverse(needle, SuffixKind::Maximal); + let (period_lower_bound, critical_pos) = + if min_suffix.pos < max_suffix.pos { + (min_suffix.period, min_suffix.pos) + } else { + (max_suffix.period, max_suffix.pos) + }; + // let critical_pos = needle.len() - critical_pos; + let shift = Shift::reverse(needle, period_lower_bound, critical_pos); + Reverse(TwoWay { byteset, critical_pos, shift }) + } + + /// Find the position of the last occurrence of this searcher's needle + /// in the given haystack. If one does not exist, then return None. + /// + /// This will automatically initialize prefilter state. This should only + /// be used for one-off searches. + /// + /// Callers must guarantee that the needle is non-empty and its length is + /// <= the haystack's length. + #[inline(always)] + pub(crate) fn rfind( + &self, + haystack: &[u8], + needle: &[u8], + ) -> Option { + debug_assert!(!needle.is_empty(), "needle should not be empty"); + debug_assert!(needle.len() <= haystack.len(), "haystack too short"); + // For the reverse case, we don't use a prefilter. It's plausible that + // perhaps we should, but it's a lot of additional code to do it, and + // it's not clear that it's actually worth it. If you have a really + // compelling use case for this, please file an issue. + match self.0.shift { + Shift::Small { period } => { + self.rfind_small_imp(haystack, needle, period) + } + Shift::Large { shift } => { + self.rfind_large_imp(haystack, needle, shift) + } + } + } + + /// Like rfind, but handles the degenerate substring test cases. This is + /// only useful for conveniently testing this substring implementation in + /// isolation. + #[cfg(test)] + fn rfind_general(&self, haystack: &[u8], needle: &[u8]) -> Option { + if needle.is_empty() { + Some(haystack.len()) + } else if haystack.len() < needle.len() { + None + } else { + self.rfind(haystack, needle) + } + } + + #[inline(always)] + fn rfind_small_imp( + &self, + haystack: &[u8], + needle: &[u8], + period: usize, + ) -> Option { + let nlen = needle.len(); + let mut pos = haystack.len(); + let mut shift = nlen; + while pos >= nlen { + if !self.0.byteset.contains(haystack[pos - nlen]) { + pos -= nlen; + shift = nlen; + continue; + } + let mut i = cmp::min(self.0.critical_pos, shift); + while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] { + i -= 1; + } + if i > 0 || needle[0] != haystack[pos - nlen] { + pos -= self.0.critical_pos - i + 1; + shift = nlen; + } else { + let mut j = self.0.critical_pos; + while j < shift && needle[j] == haystack[pos - nlen + j] { + j += 1; + } + if j >= shift { + return Some(pos - nlen); + } + pos -= period; + shift = period; + } + } + None + } + + #[inline(always)] + fn rfind_large_imp( + &self, + haystack: &[u8], + needle: &[u8], + shift: usize, + ) -> Option { + let nlen = needle.len(); + let mut pos = haystack.len(); + while pos >= nlen { + if !self.0.byteset.contains(haystack[pos - nlen]) { + pos -= nlen; + continue; + } + let mut i = self.0.critical_pos; + while i > 0 && needle[i - 1] == haystack[pos - nlen + i - 1] { + i -= 1; + } + if i > 0 || needle[0] != haystack[pos - nlen] { + pos -= self.0.critical_pos - i + 1; + } else { + let mut j = self.0.critical_pos; + while j < nlen && needle[j] == haystack[pos - nlen + j] { + j += 1; + } + if j == nlen { + return Some(pos - nlen); + } + pos -= shift; + } + } + None + } +} + +impl TwoWay { + fn empty() -> TwoWay { + TwoWay { + byteset: ApproximateByteSet::new(b""), + critical_pos: 0, + shift: Shift::Large { shift: 0 }, + } + } +} + +/// A representation of the amount we're allowed to shift by during Two-Way +/// search. +/// +/// When computing a critical factorization of the needle, we find the position +/// of the critical factorization by finding the needle's maximal (or minimal) +/// suffix, along with the period of that suffix. It turns out that the period +/// of that suffix is a lower bound on the period of the needle itself. +/// +/// This lower bound is equivalent to the actual period of the needle in +/// some cases. To describe that case, we denote the needle as `x` where +/// `x = uv` and `v` is the lexicographic maximal suffix of `v`. The lower +/// bound given here is always the period of `v`, which is `<= period(x)`. The +/// case where `period(v) == period(x)` occurs when `len(u) < (len(x) / 2)` and +/// where `u` is a suffix of `v[0..period(v)]`. +/// +/// This case is important because the search algorithm for when the +/// periods are equivalent is slightly different than the search algorithm +/// for when the periods are not equivalent. In particular, when they aren't +/// equivalent, we know that the period of the needle is no less than half its +/// length. In this case, we shift by an amount less than or equal to the +/// period of the needle (determined by the maximum length of the components +/// of the critical factorization of `x`, i.e., `max(len(u), len(v))`).. +/// +/// The above two cases are represented by the variants below. Each entails +/// a different instantiation of the Two-Way search algorithm. +/// +/// N.B. If we could find a way to compute the exact period in all cases, +/// then we could collapse this case analysis and simplify the algorithm. The +/// Two-Way paper suggests this is possible, but more reading is required to +/// grok why the authors didn't pursue that path. +#[derive(Clone, Copy, Debug)] +enum Shift { + Small { period: usize }, + Large { shift: usize }, +} + +impl Shift { + /// Compute the shift for a given needle in the forward direction. + /// + /// This requires a lower bound on the period and a critical position. + /// These can be computed by extracting both the minimal and maximal + /// lexicographic suffixes, and choosing the right-most starting position. + /// The lower bound on the period is then the period of the chosen suffix. + fn forward( + needle: &[u8], + period_lower_bound: usize, + critical_pos: usize, + ) -> Shift { + let large = cmp::max(critical_pos, needle.len() - critical_pos); + if critical_pos * 2 >= needle.len() { + return Shift::Large { shift: large }; + } + + let (u, v) = needle.split_at(critical_pos); + if !util::is_suffix(&v[..period_lower_bound], u) { + return Shift::Large { shift: large }; + } + Shift::Small { period: period_lower_bound } + } + + /// Compute the shift for a given needle in the reverse direction. + /// + /// This requires a lower bound on the period and a critical position. + /// These can be computed by extracting both the minimal and maximal + /// lexicographic suffixes, and choosing the left-most starting position. + /// The lower bound on the period is then the period of the chosen suffix. + fn reverse( + needle: &[u8], + period_lower_bound: usize, + critical_pos: usize, + ) -> Shift { + let large = cmp::max(critical_pos, needle.len() - critical_pos); + if (needle.len() - critical_pos) * 2 >= needle.len() { + return Shift::Large { shift: large }; + } + + let (v, u) = needle.split_at(critical_pos); + if !util::is_prefix(&v[v.len() - period_lower_bound..], u) { + return Shift::Large { shift: large }; + } + Shift::Small { period: period_lower_bound } + } +} + +/// A suffix extracted from a needle along with its period. +#[derive(Debug)] +struct Suffix { + /// The starting position of this suffix. + /// + /// If this is a forward suffix, then `&bytes[pos..]` can be used. If this + /// is a reverse suffix, then `&bytes[..pos]` can be used. That is, for + /// forward suffixes, this is an inclusive starting position, where as for + /// reverse suffixes, this is an exclusive ending position. + pos: usize, + /// The period of this suffix. + /// + /// Note that this is NOT necessarily the period of the string from which + /// this suffix comes from. (It is always less than or equal to the period + /// of the original string.) + period: usize, +} + +impl Suffix { + fn forward(needle: &[u8], kind: SuffixKind) -> Suffix { + debug_assert!(!needle.is_empty()); + + // suffix represents our maximal (or minimal) suffix, along with + // its period. + let mut suffix = Suffix { pos: 0, period: 1 }; + // The start of a suffix in `needle` that we are considering as a + // more maximal (or minimal) suffix than what's in `suffix`. + let mut candidate_start = 1; + // The current offset of our suffixes that we're comparing. + // + // When the characters at this offset are the same, then we mush on + // to the next position since no decision is possible. When the + // candidate's character is greater (or lesser) than the corresponding + // character than our current maximal (or minimal) suffix, then the + // current suffix is changed over to the candidate and we restart our + // search. Otherwise, the candidate suffix is no good and we restart + // our search on the next candidate. + // + // The three cases above correspond to the three cases in the loop + // below. + let mut offset = 0; + + while candidate_start + offset < needle.len() { + let current = needle[suffix.pos + offset]; + let candidate = needle[candidate_start + offset]; + match kind.cmp(current, candidate) { + SuffixOrdering::Accept => { + suffix = Suffix { pos: candidate_start, period: 1 }; + candidate_start += 1; + offset = 0; + } + SuffixOrdering::Skip => { + candidate_start += offset + 1; + offset = 0; + suffix.period = candidate_start - suffix.pos; + } + SuffixOrdering::Push => { + if offset + 1 == suffix.period { + candidate_start += suffix.period; + offset = 0; + } else { + offset += 1; + } + } + } + } + suffix + } + + fn reverse(needle: &[u8], kind: SuffixKind) -> Suffix { + debug_assert!(!needle.is_empty()); + + // See the comments in `forward` for how this works. + let mut suffix = Suffix { pos: needle.len(), period: 1 }; + if needle.len() == 1 { + return suffix; + } + let mut candidate_start = needle.len() - 1; + let mut offset = 0; + + while offset < candidate_start { + let current = needle[suffix.pos - offset - 1]; + let candidate = needle[candidate_start - offset - 1]; + match kind.cmp(current, candidate) { + SuffixOrdering::Accept => { + suffix = Suffix { pos: candidate_start, period: 1 }; + candidate_start -= 1; + offset = 0; + } + SuffixOrdering::Skip => { + candidate_start -= offset + 1; + offset = 0; + suffix.period = suffix.pos - candidate_start; + } + SuffixOrdering::Push => { + if offset + 1 == suffix.period { + candidate_start -= suffix.period; + offset = 0; + } else { + offset += 1; + } + } + } + } + suffix + } +} + +/// The kind of suffix to extract. +#[derive(Clone, Copy, Debug)] +enum SuffixKind { + /// Extract the smallest lexicographic suffix from a string. + /// + /// Technically, this doesn't actually pick the smallest lexicographic + /// suffix. e.g., Given the choice between `a` and `aa`, this will choose + /// the latter over the former, even though `a < aa`. The reasoning for + /// this isn't clear from the paper, but it still smells like a minimal + /// suffix. + Minimal, + /// Extract the largest lexicographic suffix from a string. + /// + /// Unlike `Minimal`, this really does pick the maximum suffix. e.g., Given + /// the choice between `z` and `zz`, this will choose the latter over the + /// former. + Maximal, +} + +/// The result of comparing corresponding bytes between two suffixes. +#[derive(Clone, Copy, Debug)] +enum SuffixOrdering { + /// This occurs when the given candidate byte indicates that the candidate + /// suffix is better than the current maximal (or minimal) suffix. That is, + /// the current candidate suffix should supplant the current maximal (or + /// minimal) suffix. + Accept, + /// This occurs when the given candidate byte excludes the candidate suffix + /// from being better than the current maximal (or minimal) suffix. That + /// is, the current candidate suffix should be dropped and the next one + /// should be considered. + Skip, + /// This occurs when no decision to accept or skip the candidate suffix + /// can be made, e.g., when corresponding bytes are equivalent. In this + /// case, the next corresponding bytes should be compared. + Push, +} + +impl SuffixKind { + /// Returns true if and only if the given candidate byte indicates that + /// it should replace the current suffix as the maximal (or minimal) + /// suffix. + fn cmp(self, current: u8, candidate: u8) -> SuffixOrdering { + use self::SuffixOrdering::*; + + match self { + SuffixKind::Minimal if candidate < current => Accept, + SuffixKind::Minimal if candidate > current => Skip, + SuffixKind::Minimal => Push, + SuffixKind::Maximal if candidate > current => Accept, + SuffixKind::Maximal if candidate < current => Skip, + SuffixKind::Maximal => Push, + } + } +} + +/// A bitset used to track whether a particular byte exists in a needle or not. +/// +/// Namely, bit 'i' is set if and only if byte%64==i for any byte in the +/// needle. If a particular byte in the haystack is NOT in this set, then one +/// can conclude that it is also not in the needle, and thus, one can advance +/// in the haystack by needle.len() bytes. +#[derive(Clone, Copy, Debug)] +struct ApproximateByteSet(u64); + +impl ApproximateByteSet { + /// Create a new set from the given needle. + fn new(needle: &[u8]) -> ApproximateByteSet { + let mut bits = 0; + for &b in needle { + bits |= 1 << (b % 64); + } + ApproximateByteSet(bits) + } + + /// Return true if and only if the given byte might be in this set. This + /// may return a false positive, but will never return a false negative. + #[inline(always)] + fn contains(&self, byte: u8) -> bool { + self.0 & (1 << (byte % 64)) != 0 + } +} + +#[cfg(all(test, feature = "std", not(miri)))] +mod tests { + use quickcheck::quickcheck; + + use super::*; + + define_memmem_quickcheck_tests!( + super::simpletests::twoway_find, + super::simpletests::twoway_rfind + ); + + /// Convenience wrapper for computing the suffix as a byte string. + fn get_suffix_forward(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) { + let s = Suffix::forward(needle, kind); + (&needle[s.pos..], s.period) + } + + /// Convenience wrapper for computing the reverse suffix as a byte string. + fn get_suffix_reverse(needle: &[u8], kind: SuffixKind) -> (&[u8], usize) { + let s = Suffix::reverse(needle, kind); + (&needle[..s.pos], s.period) + } + + /// Return all of the non-empty suffixes in the given byte string. + fn suffixes(bytes: &[u8]) -> Vec<&[u8]> { + (0..bytes.len()).map(|i| &bytes[i..]).collect() + } + + /// Return the lexicographically maximal suffix of the given byte string. + fn naive_maximal_suffix_forward(needle: &[u8]) -> &[u8] { + let mut sufs = suffixes(needle); + sufs.sort(); + sufs.pop().unwrap() + } + + /// Return the lexicographically maximal suffix of the reverse of the given + /// byte string. + fn naive_maximal_suffix_reverse(needle: &[u8]) -> Vec { + let mut reversed = needle.to_vec(); + reversed.reverse(); + let mut got = naive_maximal_suffix_forward(&reversed).to_vec(); + got.reverse(); + got + } + + #[test] + fn suffix_forward() { + macro_rules! assert_suffix_min { + ($given:expr, $expected:expr, $period:expr) => { + let (got_suffix, got_period) = + get_suffix_forward($given.as_bytes(), SuffixKind::Minimal); + let got_suffix = std::str::from_utf8(got_suffix).unwrap(); + assert_eq!(($expected, $period), (got_suffix, got_period)); + }; + } + + macro_rules! assert_suffix_max { + ($given:expr, $expected:expr, $period:expr) => { + let (got_suffix, got_period) = + get_suffix_forward($given.as_bytes(), SuffixKind::Maximal); + let got_suffix = std::str::from_utf8(got_suffix).unwrap(); + assert_eq!(($expected, $period), (got_suffix, got_period)); + }; + } + + assert_suffix_min!("a", "a", 1); + assert_suffix_max!("a", "a", 1); + + assert_suffix_min!("ab", "ab", 2); + assert_suffix_max!("ab", "b", 1); + + assert_suffix_min!("ba", "a", 1); + assert_suffix_max!("ba", "ba", 2); + + assert_suffix_min!("abc", "abc", 3); + assert_suffix_max!("abc", "c", 1); + + assert_suffix_min!("acb", "acb", 3); + assert_suffix_max!("acb", "cb", 2); + + assert_suffix_min!("cba", "a", 1); + assert_suffix_max!("cba", "cba", 3); + + assert_suffix_min!("abcabc", "abcabc", 3); + assert_suffix_max!("abcabc", "cabc", 3); + + assert_suffix_min!("abcabcabc", "abcabcabc", 3); + assert_suffix_max!("abcabcabc", "cabcabc", 3); + + assert_suffix_min!("abczz", "abczz", 5); + assert_suffix_max!("abczz", "zz", 1); + + assert_suffix_min!("zzabc", "abc", 3); + assert_suffix_max!("zzabc", "zzabc", 5); + + assert_suffix_min!("aaa", "aaa", 1); + assert_suffix_max!("aaa", "aaa", 1); + + assert_suffix_min!("foobar", "ar", 2); + assert_suffix_max!("foobar", "r", 1); + } + + #[test] + fn suffix_reverse() { + macro_rules! assert_suffix_min { + ($given:expr, $expected:expr, $period:expr) => { + let (got_suffix, got_period) = + get_suffix_reverse($given.as_bytes(), SuffixKind::Minimal); + let got_suffix = std::str::from_utf8(got_suffix).unwrap(); + assert_eq!(($expected, $period), (got_suffix, got_period)); + }; + } + + macro_rules! assert_suffix_max { + ($given:expr, $expected:expr, $period:expr) => { + let (got_suffix, got_period) = + get_suffix_reverse($given.as_bytes(), SuffixKind::Maximal); + let got_suffix = std::str::from_utf8(got_suffix).unwrap(); + assert_eq!(($expected, $period), (got_suffix, got_period)); + }; + } + + assert_suffix_min!("a", "a", 1); + assert_suffix_max!("a", "a", 1); + + assert_suffix_min!("ab", "a", 1); + assert_suffix_max!("ab", "ab", 2); + + assert_suffix_min!("ba", "ba", 2); + assert_suffix_max!("ba", "b", 1); + + assert_suffix_min!("abc", "a", 1); + assert_suffix_max!("abc", "abc", 3); + + assert_suffix_min!("acb", "a", 1); + assert_suffix_max!("acb", "ac", 2); + + assert_suffix_min!("cba", "cba", 3); + assert_suffix_max!("cba", "c", 1); + + assert_suffix_min!("abcabc", "abca", 3); + assert_suffix_max!("abcabc", "abcabc", 3); + + assert_suffix_min!("abcabcabc", "abcabca", 3); + assert_suffix_max!("abcabcabc", "abcabcabc", 3); + + assert_suffix_min!("abczz", "a", 1); + assert_suffix_max!("abczz", "abczz", 5); + + assert_suffix_min!("zzabc", "zza", 3); + assert_suffix_max!("zzabc", "zz", 1); + + assert_suffix_min!("aaa", "aaa", 1); + assert_suffix_max!("aaa", "aaa", 1); + } + + quickcheck! { + fn qc_suffix_forward_maximal(bytes: Vec) -> bool { + if bytes.is_empty() { + return true; + } + + let (got, _) = get_suffix_forward(&bytes, SuffixKind::Maximal); + let expected = naive_maximal_suffix_forward(&bytes); + got == expected + } + + fn qc_suffix_reverse_maximal(bytes: Vec) -> bool { + if bytes.is_empty() { + return true; + } + + let (got, _) = get_suffix_reverse(&bytes, SuffixKind::Maximal); + let expected = naive_maximal_suffix_reverse(&bytes); + expected == got + } + } +} + +#[cfg(test)] +mod simpletests { + use super::*; + + pub(crate) fn twoway_find( + haystack: &[u8], + needle: &[u8], + ) -> Option { + Forward::new(needle).find_general(None, haystack, needle) + } + + pub(crate) fn twoway_rfind( + haystack: &[u8], + needle: &[u8], + ) -> Option { + Reverse::new(needle).rfind_general(haystack, needle) + } + + define_memmem_simple_tests!(twoway_find, twoway_rfind); + + // This is a regression test caught by quickcheck that exercised a bug in + // the reverse small period handling. The bug was that we were using 'if j + // == shift' to determine if a match occurred, but the correct guard is 'if + // j >= shift', which matches the corresponding guard in the forward impl. + #[test] + fn regression_rev_small_period() { + let rfind = super::simpletests::twoway_rfind; + let haystack = "ababaz"; + let needle = "abab"; + assert_eq!(Some(0), rfind(haystack.as_bytes(), needle.as_bytes())); + } +} diff --git a/third_party/rust/memchr/src/memmem/util.rs b/third_party/rust/memchr/src/memmem/util.rs new file mode 100644 index 0000000000..de0e385e18 --- /dev/null +++ b/third_party/rust/memchr/src/memmem/util.rs @@ -0,0 +1,88 @@ +// These routines are meant to be optimized specifically for low latency as +// compared to the equivalent routines offered by std. (Which may invoke the +// dynamic linker and call out to libc, which introduces a bit more latency +// than we'd like.) + +/// Returns true if and only if needle is a prefix of haystack. +#[inline(always)] +pub(crate) fn is_prefix(haystack: &[u8], needle: &[u8]) -> bool { + needle.len() <= haystack.len() && memcmp(&haystack[..needle.len()], needle) +} + +/// Returns true if and only if needle is a suffix of haystack. +#[inline(always)] +pub(crate) fn is_suffix(haystack: &[u8], needle: &[u8]) -> bool { + needle.len() <= haystack.len() + && memcmp(&haystack[haystack.len() - needle.len()..], needle) +} + +/// Return true if and only if x.len() == y.len() && x[i] == y[i] for all +/// 0 <= i < x.len(). +/// +/// Why not just use actual memcmp for this? Well, memcmp requires calling out +/// to libc, and this routine is called in fairly hot code paths. Other than +/// just calling out to libc, it also seems to result in worse codegen. By +/// rolling our own memcmp in pure Rust, it seems to appear more friendly to +/// the optimizer. +/// +/// We mark this as inline always, although, some callers may not want it +/// inlined for better codegen (like Rabin-Karp). In that case, callers are +/// advised to create a non-inlineable wrapper routine that calls memcmp. +#[inline(always)] +pub(crate) fn memcmp(x: &[u8], y: &[u8]) -> bool { + if x.len() != y.len() { + return false; + } + // If we don't have enough bytes to do 4-byte at a time loads, then + // fall back to the naive slow version. + // + // TODO: We could do a copy_nonoverlapping combined with a mask instead + // of a loop. Benchmark it. + if x.len() < 4 { + for (&b1, &b2) in x.iter().zip(y) { + if b1 != b2 { + return false; + } + } + return true; + } + // When we have 4 or more bytes to compare, then proceed in chunks of 4 at + // a time using unaligned loads. + // + // Also, why do 4 byte loads instead of, say, 8 byte loads? The reason is + // that this particular version of memcmp is likely to be called with tiny + // needles. That means that if we do 8 byte loads, then a higher proportion + // of memcmp calls will use the slower variant above. With that said, this + // is a hypothesis and is only loosely supported by benchmarks. There's + // likely some improvement that could be made here. The main thing here + // though is to optimize for latency, not throughput. + + // SAFETY: Via the conditional above, we know that both `px` and `py` + // have the same length, so `px < pxend` implies that `py < pyend`. + // Thus, derefencing both `px` and `py` in the loop below is safe. + // + // Moreover, we set `pxend` and `pyend` to be 4 bytes before the actual + // end of of `px` and `py`. Thus, the final dereference outside of the + // loop is guaranteed to be valid. (The final comparison will overlap with + // the last comparison done in the loop for lengths that aren't multiples + // of four.) + // + // Finally, we needn't worry about alignment here, since we do unaligned + // loads. + unsafe { + let (mut px, mut py) = (x.as_ptr(), y.as_ptr()); + let (pxend, pyend) = (px.add(x.len() - 4), py.add(y.len() - 4)); + while px < pxend { + let vx = (px as *const u32).read_unaligned(); + let vy = (py as *const u32).read_unaligned(); + if vx != vy { + return false; + } + px = px.add(4); + py = py.add(4); + } + let vx = (pxend as *const u32).read_unaligned(); + let vy = (pyend as *const u32).read_unaligned(); + vx == vy + } +} diff --git a/third_party/rust/memchr/src/memmem/vector.rs b/third_party/rust/memchr/src/memmem/vector.rs new file mode 100644 index 0000000000..b81165f8bc --- /dev/null +++ b/third_party/rust/memchr/src/memmem/vector.rs @@ -0,0 +1,131 @@ +/// A trait for describing vector operations used by vectorized searchers. +/// +/// The trait is highly constrained to low level vector operations needed. In +/// general, it was invented mostly to be generic over x86's __m128i and +/// __m256i types. It's likely that once std::simd becomes a thing, we can +/// migrate to that since the operations required are quite simple. +/// +/// TODO: Consider moving this trait up a level and using it to implement +/// memchr as well. The trait might need to grow one or two methods, but +/// otherwise should be close to sufficient already. +/// +/// # Safety +/// +/// All methods are not safe since they are intended to be implemented using +/// vendor intrinsics, which are also not safe. Callers must ensure that the +/// appropriate target features are enabled in the calling function, and that +/// the current CPU supports them. All implementations should avoid marking the +/// routines with #[target_feature] and instead mark them as #[inline(always)] +/// to ensure they get appropriately inlined. (inline(always) cannot be used +/// with target_feature.) +pub(crate) trait Vector: Copy + core::fmt::Debug { + /// _mm_set1_epi8 or _mm256_set1_epi8 + unsafe fn splat(byte: u8) -> Self; + /// _mm_loadu_si128 or _mm256_loadu_si256 + unsafe fn load_unaligned(data: *const u8) -> Self; + /// _mm_movemask_epi8 or _mm256_movemask_epi8 + unsafe fn movemask(self) -> u32; + /// _mm_cmpeq_epi8 or _mm256_cmpeq_epi8 + unsafe fn cmpeq(self, vector2: Self) -> Self; + /// _mm_and_si128 or _mm256_and_si256 + unsafe fn and(self, vector2: Self) -> Self; +} + +#[cfg(target_arch = "x86_64")] +mod x86sse { + use super::Vector; + use core::arch::x86_64::*; + + impl Vector for __m128i { + #[inline(always)] + unsafe fn splat(byte: u8) -> __m128i { + _mm_set1_epi8(byte as i8) + } + + #[inline(always)] + unsafe fn load_unaligned(data: *const u8) -> __m128i { + _mm_loadu_si128(data as *const __m128i) + } + + #[inline(always)] + unsafe fn movemask(self) -> u32 { + _mm_movemask_epi8(self) as u32 + } + + #[inline(always)] + unsafe fn cmpeq(self, vector2: Self) -> __m128i { + _mm_cmpeq_epi8(self, vector2) + } + + #[inline(always)] + unsafe fn and(self, vector2: Self) -> __m128i { + _mm_and_si128(self, vector2) + } + } +} + +#[cfg(all(feature = "std", target_arch = "x86_64"))] +mod x86avx { + use super::Vector; + use core::arch::x86_64::*; + + impl Vector for __m256i { + #[inline(always)] + unsafe fn splat(byte: u8) -> __m256i { + _mm256_set1_epi8(byte as i8) + } + + #[inline(always)] + unsafe fn load_unaligned(data: *const u8) -> __m256i { + _mm256_loadu_si256(data as *const __m256i) + } + + #[inline(always)] + unsafe fn movemask(self) -> u32 { + _mm256_movemask_epi8(self) as u32 + } + + #[inline(always)] + unsafe fn cmpeq(self, vector2: Self) -> __m256i { + _mm256_cmpeq_epi8(self, vector2) + } + + #[inline(always)] + unsafe fn and(self, vector2: Self) -> __m256i { + _mm256_and_si256(self, vector2) + } + } +} + +#[cfg(target_arch = "wasm32")] +mod wasm_simd128 { + use super::Vector; + use core::arch::wasm32::*; + + impl Vector for v128 { + #[inline(always)] + unsafe fn splat(byte: u8) -> v128 { + u8x16_splat(byte) + } + + #[inline(always)] + unsafe fn load_unaligned(data: *const u8) -> v128 { + v128_load(data.cast()) + } + + #[inline(always)] + unsafe fn movemask(self) -> u32 { + u8x16_bitmask(self).into() + } + + #[inline(always)] + unsafe fn cmpeq(self, vector2: Self) -> v128 { + u8x16_eq(self, vector2) + } + + #[inline(always)] + unsafe fn and(self, vector2: Self) -> v128 { + v128_and(self, vector2) + } + } +} diff --git a/third_party/rust/memchr/src/memmem/wasm.rs b/third_party/rust/memchr/src/memmem/wasm.rs new file mode 100644 index 0000000000..4e3ea985c1 --- /dev/null +++ b/third_party/rust/memchr/src/memmem/wasm.rs @@ -0,0 +1,75 @@ +use core::arch::wasm32::v128; + +use crate::memmem::{genericsimd, NeedleInfo}; + +/// A `v128` accelerated vectorized substring search routine that only works on +/// small needles. +#[derive(Clone, Copy, Debug)] +pub(crate) struct Forward(genericsimd::Forward); + +impl Forward { + /// Create a new "generic simd" forward searcher. If one could not be + /// created from the given inputs, then None is returned. + pub(crate) fn new(ninfo: &NeedleInfo, needle: &[u8]) -> Option { + if !cfg!(memchr_runtime_simd) { + return None; + } + genericsimd::Forward::new(ninfo, needle).map(Forward) + } + + /// Returns the minimum length of haystack that is needed for this searcher + /// to work. Passing a haystack with a length smaller than this will cause + /// `find` to panic. + #[inline(always)] + pub(crate) fn min_haystack_len(&self) -> usize { + self.0.min_haystack_len::() + } + + #[inline(always)] + pub(crate) fn find( + &self, + haystack: &[u8], + needle: &[u8], + ) -> Option { + self.find_impl(haystack, needle) + } + + /// The implementation of find marked with the appropriate target feature. + #[target_feature(enable = "simd128")] + fn find_impl(&self, haystack: &[u8], needle: &[u8]) -> Option { + unsafe { genericsimd::fwd_find::(&self.0, haystack, needle) } + } +} + +#[cfg(all(test, feature = "std", not(miri)))] +mod tests { + use crate::memmem::{prefilter::PrefilterState, NeedleInfo}; + + fn find( + _: &mut PrefilterState, + ninfo: &NeedleInfo, + haystack: &[u8], + needle: &[u8], + ) -> Option { + super::Forward::new(ninfo, needle).unwrap().find(haystack, needle) + } + + #[test] + fn prefilter_permutations() { + use crate::memmem::prefilter::tests::PrefilterTest; + + unsafe { + PrefilterTest::run_all_tests_filter(find, |t| { + // This substring searcher only works on certain configs, so + // filter our tests such that Forward::new will be guaranteed + // to succeed. (And also remove tests with a haystack that is + // too small.) + let fwd = match super::Forward::new(&t.ninfo, &t.needle) { + None => return false, + Some(fwd) => fwd, + }; + t.haystack.len() >= fwd.min_haystack_len() + }) + } + } +} diff --git a/third_party/rust/memchr/src/memmem/x86/avx.rs b/third_party/rust/memchr/src/memmem/x86/avx.rs new file mode 100644 index 0000000000..ce168dd377 --- /dev/null +++ b/third_party/rust/memchr/src/memmem/x86/avx.rs @@ -0,0 +1,139 @@ +#[cfg(not(feature = "std"))] +pub(crate) use self::nostd::Forward; +#[cfg(feature = "std")] +pub(crate) use self::std::Forward; + +#[cfg(feature = "std")] +mod std { + use core::arch::x86_64::{__m128i, __m256i}; + + use crate::memmem::{genericsimd, NeedleInfo}; + + /// An AVX accelerated vectorized substring search routine that only works + /// on small needles. + #[derive(Clone, Copy, Debug)] + pub(crate) struct Forward(genericsimd::Forward); + + impl Forward { + /// Create a new "generic simd" forward searcher. If one could not be + /// created from the given inputs, then None is returned. + pub(crate) fn new( + ninfo: &NeedleInfo, + needle: &[u8], + ) -> Option { + if !cfg!(memchr_runtime_avx) || !is_x86_feature_detected!("avx2") { + return None; + } + genericsimd::Forward::new(ninfo, needle).map(Forward) + } + + /// Returns the minimum length of haystack that is needed for this + /// searcher to work. Passing a haystack with a length smaller than + /// this will cause `find` to panic. + #[inline(always)] + pub(crate) fn min_haystack_len(&self) -> usize { + self.0.min_haystack_len::<__m128i>() + } + + #[inline(always)] + pub(crate) fn find( + &self, + haystack: &[u8], + needle: &[u8], + ) -> Option { + // SAFETY: The only way a Forward value can exist is if the avx2 + // target feature is enabled. This is the only safety requirement + // for calling the genericsimd searcher. + unsafe { self.find_impl(haystack, needle) } + } + + /// The implementation of find marked with the appropriate target + /// feature. + /// + /// # Safety + /// + /// Callers must ensure that the avx2 CPU feature is enabled in the + /// current environment. + #[target_feature(enable = "avx2")] + unsafe fn find_impl( + &self, + haystack: &[u8], + needle: &[u8], + ) -> Option { + if haystack.len() < self.0.min_haystack_len::<__m256i>() { + genericsimd::fwd_find::<__m128i>(&self.0, haystack, needle) + } else { + genericsimd::fwd_find::<__m256i>(&self.0, haystack, needle) + } + } + } +} + +// We still define the avx "forward" type on nostd to make caller code a bit +// simpler. This avoids needing a lot more conditional compilation. +#[cfg(not(feature = "std"))] +mod nostd { + use crate::memmem::NeedleInfo; + + #[derive(Clone, Copy, Debug)] + pub(crate) struct Forward(()); + + impl Forward { + pub(crate) fn new( + ninfo: &NeedleInfo, + needle: &[u8], + ) -> Option { + None + } + + pub(crate) fn min_haystack_len(&self) -> usize { + unreachable!() + } + + pub(crate) fn find( + &self, + haystack: &[u8], + needle: &[u8], + ) -> Option { + unreachable!() + } + } +} + +#[cfg(all(test, feature = "std", not(miri)))] +mod tests { + use crate::memmem::{prefilter::PrefilterState, NeedleInfo}; + + fn find( + _: &mut PrefilterState, + ninfo: &NeedleInfo, + haystack: &[u8], + needle: &[u8], + ) -> Option { + super::Forward::new(ninfo, needle).unwrap().find(haystack, needle) + } + + #[test] + fn prefilter_permutations() { + use crate::memmem::prefilter::tests::PrefilterTest; + + if !is_x86_feature_detected!("avx2") { + return; + } + // SAFETY: The safety of find only requires that the current CPU + // support AVX2, which we checked above. + unsafe { + PrefilterTest::run_all_tests_filter(find, |t| { + // This substring searcher only works on certain configs, so + // filter our tests such that Forward::new will be guaranteed + // to succeed. (And also remove tests with a haystack that is + // too small.) + let fwd = match super::Forward::new(&t.ninfo, &t.needle) { + None => return false, + Some(fwd) => fwd, + }; + t.haystack.len() >= fwd.min_haystack_len() + }) + } + } +} diff --git a/third_party/rust/memchr/src/memmem/x86/mod.rs b/third_party/rust/memchr/src/memmem/x86/mod.rs new file mode 100644 index 0000000000..c1cc73feea --- /dev/null +++ b/third_party/rust/memchr/src/memmem/x86/mod.rs @@ -0,0 +1,2 @@ +pub(crate) mod avx; +pub(crate) mod sse; diff --git a/third_party/rust/memchr/src/memmem/x86/sse.rs b/third_party/rust/memchr/src/memmem/x86/sse.rs new file mode 100644 index 0000000000..22e7d9933a --- /dev/null +++ b/third_party/rust/memchr/src/memmem/x86/sse.rs @@ -0,0 +1,89 @@ +use core::arch::x86_64::__m128i; + +use crate::memmem::{genericsimd, NeedleInfo}; + +/// An SSE accelerated vectorized substring search routine that only works on +/// small needles. +#[derive(Clone, Copy, Debug)] +pub(crate) struct Forward(genericsimd::Forward); + +impl Forward { + /// Create a new "generic simd" forward searcher. If one could not be + /// created from the given inputs, then None is returned. + pub(crate) fn new(ninfo: &NeedleInfo, needle: &[u8]) -> Option { + if !cfg!(memchr_runtime_sse2) { + return None; + } + genericsimd::Forward::new(ninfo, needle).map(Forward) + } + + /// Returns the minimum length of haystack that is needed for this searcher + /// to work. Passing a haystack with a length smaller than this will cause + /// `find` to panic. + #[inline(always)] + pub(crate) fn min_haystack_len(&self) -> usize { + self.0.min_haystack_len::<__m128i>() + } + + #[inline(always)] + pub(crate) fn find( + &self, + haystack: &[u8], + needle: &[u8], + ) -> Option { + // SAFETY: sse2 is enabled on all x86_64 targets, so this is always + // safe to call. + unsafe { self.find_impl(haystack, needle) } + } + + /// The implementation of find marked with the appropriate target feature. + /// + /// # Safety + /// + /// This is safe to call in all cases since sse2 is guaranteed to be part + /// of x86_64. It is marked as unsafe because of the target feature + /// attribute. + #[target_feature(enable = "sse2")] + unsafe fn find_impl( + &self, + haystack: &[u8], + needle: &[u8], + ) -> Option { + genericsimd::fwd_find::<__m128i>(&self.0, haystack, needle) + } +} + +#[cfg(all(test, feature = "std", not(miri)))] +mod tests { + use crate::memmem::{prefilter::PrefilterState, NeedleInfo}; + + fn find( + _: &mut PrefilterState, + ninfo: &NeedleInfo, + haystack: &[u8], + needle: &[u8], + ) -> Option { + super::Forward::new(ninfo, needle).unwrap().find(haystack, needle) + } + + #[test] + fn prefilter_permutations() { + use crate::memmem::prefilter::tests::PrefilterTest; + + // SAFETY: sse2 is enabled on all x86_64 targets, so this is always + // safe to call. + unsafe { + PrefilterTest::run_all_tests_filter(find, |t| { + // This substring searcher only works on certain configs, so + // filter our tests such that Forward::new will be guaranteed + // to succeed. (And also remove tests with a haystack that is + // too small.) + let fwd = match super::Forward::new(&t.ninfo, &t.needle) { + None => return false, + Some(fwd) => fwd, + }; + t.haystack.len() >= fwd.min_haystack_len() + }) + } + } +} diff --git a/third_party/rust/memchr/src/tests/memchr/iter.rs b/third_party/rust/memchr/src/tests/memchr/iter.rs new file mode 100644 index 0000000000..80ea5c279d --- /dev/null +++ b/third_party/rust/memchr/src/tests/memchr/iter.rs @@ -0,0 +1,230 @@ +use quickcheck::quickcheck; + +use crate::{tests::memchr::testdata::memchr_tests, Memchr, Memchr2, Memchr3}; + +#[test] +fn memchr1_iter() { + for test in memchr_tests() { + test.iter_one(false, Memchr::new); + } +} + +#[test] +fn memchr2_iter() { + for test in memchr_tests() { + test.iter_two(false, Memchr2::new); + } +} + +#[test] +fn memchr3_iter() { + for test in memchr_tests() { + test.iter_three(false, Memchr3::new); + } +} + +#[test] +fn memrchr1_iter() { + for test in memchr_tests() { + test.iter_one(true, |n1, corpus| Memchr::new(n1, corpus).rev()); + } +} + +#[test] +fn memrchr2_iter() { + for test in memchr_tests() { + test.iter_two(true, |n1, n2, corpus| { + Memchr2::new(n1, n2, corpus).rev() + }) + } +} + +#[test] +fn memrchr3_iter() { + for test in memchr_tests() { + test.iter_three(true, |n1, n2, n3, corpus| { + Memchr3::new(n1, n2, n3, corpus).rev() + }) + } +} + +quickcheck! { + fn qc_memchr_double_ended_iter( + needle: u8, data: Vec, take_side: Vec + ) -> bool { + // make nonempty + let mut take_side = take_side; + if take_side.is_empty() { take_side.push(true) }; + + let iter = Memchr::new(needle, &data); + let all_found = double_ended_take( + iter, take_side.iter().cycle().cloned()); + + all_found.iter().cloned().eq(positions1(needle, &data)) + } + + fn qc_memchr2_double_ended_iter( + needle1: u8, needle2: u8, data: Vec, take_side: Vec + ) -> bool { + // make nonempty + let mut take_side = take_side; + if take_side.is_empty() { take_side.push(true) }; + + let iter = Memchr2::new(needle1, needle2, &data); + let all_found = double_ended_take( + iter, take_side.iter().cycle().cloned()); + + all_found.iter().cloned().eq(positions2(needle1, needle2, &data)) + } + + fn qc_memchr3_double_ended_iter( + needle1: u8, needle2: u8, needle3: u8, + data: Vec, take_side: Vec + ) -> bool { + // make nonempty + let mut take_side = take_side; + if take_side.is_empty() { take_side.push(true) }; + + let iter = Memchr3::new(needle1, needle2, needle3, &data); + let all_found = double_ended_take( + iter, take_side.iter().cycle().cloned()); + + all_found + .iter() + .cloned() + .eq(positions3(needle1, needle2, needle3, &data)) + } + + fn qc_memchr1_iter(data: Vec) -> bool { + let needle = 0; + let answer = positions1(needle, &data); + answer.eq(Memchr::new(needle, &data)) + } + + fn qc_memchr1_rev_iter(data: Vec) -> bool { + let needle = 0; + let answer = positions1(needle, &data); + answer.rev().eq(Memchr::new(needle, &data).rev()) + } + + fn qc_memchr2_iter(data: Vec) -> bool { + let needle1 = 0; + let needle2 = 1; + let answer = positions2(needle1, needle2, &data); + answer.eq(Memchr2::new(needle1, needle2, &data)) + } + + fn qc_memchr2_rev_iter(data: Vec) -> bool { + let needle1 = 0; + let needle2 = 1; + let answer = positions2(needle1, needle2, &data); + answer.rev().eq(Memchr2::new(needle1, needle2, &data).rev()) + } + + fn qc_memchr3_iter(data: Vec) -> bool { + let needle1 = 0; + let needle2 = 1; + let needle3 = 2; + let answer = positions3(needle1, needle2, needle3, &data); + answer.eq(Memchr3::new(needle1, needle2, needle3, &data)) + } + + fn qc_memchr3_rev_iter(data: Vec) -> bool { + let needle1 = 0; + let needle2 = 1; + let needle3 = 2; + let answer = positions3(needle1, needle2, needle3, &data); + answer.rev().eq(Memchr3::new(needle1, needle2, needle3, &data).rev()) + } + + fn qc_memchr1_iter_size_hint(data: Vec) -> bool { + // test that the size hint is within reasonable bounds + let needle = 0; + let mut iter = Memchr::new(needle, &data); + let mut real_count = data + .iter() + .filter(|&&elt| elt == needle) + .count(); + + while let Some(index) = iter.next() { + real_count -= 1; + let (lower, upper) = iter.size_hint(); + assert!(lower <= real_count); + assert!(upper.unwrap() >= real_count); + assert!(upper.unwrap() <= data.len() - index); + } + true + } +} + +// take items from a DEI, taking front for each true and back for each false. +// Return a vector with the concatenation of the fronts and the reverse of the +// backs. +fn double_ended_take(mut iter: I, take_side: J) -> Vec +where + I: DoubleEndedIterator, + J: Iterator, +{ + let mut found_front = Vec::new(); + let mut found_back = Vec::new(); + + for take_front in take_side { + if take_front { + if let Some(pos) = iter.next() { + found_front.push(pos); + } else { + break; + } + } else { + if let Some(pos) = iter.next_back() { + found_back.push(pos); + } else { + break; + } + }; + } + + let mut all_found = found_front; + all_found.extend(found_back.into_iter().rev()); + all_found +} + +// return an iterator of the 0-based indices of haystack that match the needle +fn positions1<'a>( + n1: u8, + haystack: &'a [u8], +) -> Box + 'a> { + let it = haystack + .iter() + .enumerate() + .filter(move |&(_, &b)| b == n1) + .map(|t| t.0); + Box::new(it) +} + +fn positions2<'a>( + n1: u8, + n2: u8, + haystack: &'a [u8], +) -> Box + 'a> { + let it = haystack + .iter() + .enumerate() + .filter(move |&(_, &b)| b == n1 || b == n2) + .map(|t| t.0); + Box::new(it) +} + +fn positions3<'a>( + n1: u8, + n2: u8, + n3: u8, + haystack: &'a [u8], +) -> Box + 'a> { + let it = haystack + .iter() + .enumerate() + .filter(move |&(_, &b)| b == n1 || b == n2 || b == n3) + .map(|t| t.0); + Box::new(it) +} diff --git a/third_party/rust/memchr/src/tests/memchr/memchr.rs b/third_party/rust/memchr/src/tests/memchr/memchr.rs new file mode 100644 index 0000000000..ac955ed68b --- /dev/null +++ b/third_party/rust/memchr/src/tests/memchr/memchr.rs @@ -0,0 +1,134 @@ +use quickcheck::quickcheck; + +use crate::{ + memchr, + memchr::{fallback, naive}, + memchr2, memchr3, memrchr, memrchr2, memrchr3, + tests::memchr::testdata::memchr_tests, +}; + +#[test] +fn memchr1_find() { + for test in memchr_tests() { + test.one(false, memchr); + } +} + +#[test] +fn memchr1_fallback_find() { + for test in memchr_tests() { + test.one(false, fallback::memchr); + } +} + +#[test] +fn memchr2_find() { + for test in memchr_tests() { + test.two(false, memchr2); + } +} + +#[test] +fn memchr2_fallback_find() { + for test in memchr_tests() { + test.two(false, fallback::memchr2); + } +} + +#[test] +fn memchr3_find() { + for test in memchr_tests() { + test.three(false, memchr3); + } +} + +#[test] +fn memchr3_fallback_find() { + for test in memchr_tests() { + test.three(false, fallback::memchr3); + } +} + +#[test] +fn memrchr1_find() { + for test in memchr_tests() { + test.one(true, memrchr); + } +} + +#[test] +fn memrchr1_fallback_find() { + for test in memchr_tests() { + test.one(true, fallback::memrchr); + } +} + +#[test] +fn memrchr2_find() { + for test in memchr_tests() { + test.two(true, memrchr2); + } +} + +#[test] +fn memrchr2_fallback_find() { + for test in memchr_tests() { + test.two(true, fallback::memrchr2); + } +} + +#[test] +fn memrchr3_find() { + for test in memchr_tests() { + test.three(true, memrchr3); + } +} + +#[test] +fn memrchr3_fallback_find() { + for test in memchr_tests() { + test.three(true, fallback::memrchr3); + } +} + +quickcheck! { + fn qc_memchr1_matches_naive(n1: u8, corpus: Vec) -> bool { + memchr(n1, &corpus) == naive::memchr(n1, &corpus) + } +} + +quickcheck! { + fn qc_memchr2_matches_naive(n1: u8, n2: u8, corpus: Vec) -> bool { + memchr2(n1, n2, &corpus) == naive::memchr2(n1, n2, &corpus) + } +} + +quickcheck! { + fn qc_memchr3_matches_naive( + n1: u8, n2: u8, n3: u8, + corpus: Vec + ) -> bool { + memchr3(n1, n2, n3, &corpus) == naive::memchr3(n1, n2, n3, &corpus) + } +} + +quickcheck! { + fn qc_memrchr1_matches_naive(n1: u8, corpus: Vec) -> bool { + memrchr(n1, &corpus) == naive::memrchr(n1, &corpus) + } +} + +quickcheck! { + fn qc_memrchr2_matches_naive(n1: u8, n2: u8, corpus: Vec) -> bool { + memrchr2(n1, n2, &corpus) == naive::memrchr2(n1, n2, &corpus) + } +} + +quickcheck! { + fn qc_memrchr3_matches_naive( + n1: u8, n2: u8, n3: u8, + corpus: Vec + ) -> bool { + memrchr3(n1, n2, n3, &corpus) == naive::memrchr3(n1, n2, n3, &corpus) + } +} diff --git a/third_party/rust/memchr/src/tests/memchr/mod.rs b/third_party/rust/memchr/src/tests/memchr/mod.rs new file mode 100644 index 0000000000..79f94ab56a --- /dev/null +++ b/third_party/rust/memchr/src/tests/memchr/mod.rs @@ -0,0 +1,7 @@ +#[cfg(all(feature = "std", not(miri)))] +mod iter; +#[cfg(all(feature = "std", not(miri)))] +mod memchr; +mod simple; +#[cfg(all(feature = "std", not(miri)))] +mod testdata; diff --git a/third_party/rust/memchr/src/tests/memchr/simple.rs b/third_party/rust/memchr/src/tests/memchr/simple.rs new file mode 100644 index 0000000000..bed5b48631 --- /dev/null +++ b/third_party/rust/memchr/src/tests/memchr/simple.rs @@ -0,0 +1,23 @@ +// Simple tests using MIRI. These are intended only to be a simple exercise of +// memchr when tests are run under miri. These are mostly necessary because the +// other tests are far more extensive and take too long to run under miri. +// +// These tests are also run when the 'std' feature is not enabled. + +use crate::{memchr, memchr2, memchr3, memrchr, memrchr2, memrchr3}; + +#[test] +fn simple() { + assert_eq!(memchr(b'a', b"abcda"), Some(0)); + assert_eq!(memchr(b'z', b"abcda"), None); + assert_eq!(memchr2(b'a', b'z', b"abcda"), Some(0)); + assert_eq!(memchr2(b'z', b'y', b"abcda"), None); + assert_eq!(memchr3(b'a', b'z', b'b', b"abcda"), Some(0)); + assert_eq!(memchr3(b'z', b'y', b'x', b"abcda"), None); + assert_eq!(memrchr(b'a', b"abcda"), Some(4)); + assert_eq!(memrchr(b'z', b"abcda"), None); + assert_eq!(memrchr2(b'a', b'z', b"abcda"), Some(4)); + assert_eq!(memrchr2(b'z', b'y', b"abcda"), None); + assert_eq!(memrchr3(b'a', b'z', b'b', b"abcda"), Some(4)); + assert_eq!(memrchr3(b'z', b'y', b'x', b"abcda"), None); +} diff --git a/third_party/rust/memchr/src/tests/memchr/testdata.rs b/third_party/rust/memchr/src/tests/memchr/testdata.rs new file mode 100644 index 0000000000..6dda5246fd --- /dev/null +++ b/third_party/rust/memchr/src/tests/memchr/testdata.rs @@ -0,0 +1,351 @@ +use std::iter::repeat; + +/// Create a sequence of tests that should be run by memchr implementations. +pub fn memchr_tests() -> Vec { + let mut tests = Vec::new(); + for statict in MEMCHR_TESTS { + assert!(!statict.corpus.contains("%"), "% is not allowed in corpora"); + assert!(!statict.corpus.contains("#"), "# is not allowed in corpora"); + assert!(!statict.needles.contains(&b'%'), "% is an invalid needle"); + assert!(!statict.needles.contains(&b'#'), "# is an invalid needle"); + + let t = MemchrTest { + corpus: statict.corpus.to_string(), + needles: statict.needles.to_vec(), + positions: statict.positions.to_vec(), + }; + tests.push(t.clone()); + tests.extend(t.expand()); + } + tests +} + +/// A set of tests for memchr-like functions. +/// +/// These tests mostly try to cover the short string cases. We cover the longer +/// string cases via the benchmarks (which are tests themselves), via +/// quickcheck tests and via automatic expansion of each test case (by +/// increasing the corpus size). Finally, we cover different alignment cases +/// in the tests by varying the starting point of the slice. +const MEMCHR_TESTS: &[MemchrTestStatic] = &[ + // one needle (applied to memchr + memchr2 + memchr3) + MemchrTestStatic { corpus: "a", needles: &[b'a'], positions: &[0] }, + MemchrTestStatic { corpus: "aa", needles: &[b'a'], positions: &[0, 1] }, + MemchrTestStatic { + corpus: "aaa", + needles: &[b'a'], + positions: &[0, 1, 2], + }, + MemchrTestStatic { corpus: "", needles: &[b'a'], positions: &[] }, + MemchrTestStatic { corpus: "z", needles: &[b'a'], positions: &[] }, + MemchrTestStatic { corpus: "zz", needles: &[b'a'], positions: &[] }, + MemchrTestStatic { corpus: "zza", needles: &[b'a'], positions: &[2] }, + MemchrTestStatic { corpus: "zaza", needles: &[b'a'], positions: &[1, 3] }, + MemchrTestStatic { corpus: "zzza", needles: &[b'a'], positions: &[3] }, + MemchrTestStatic { corpus: "\x00a", needles: &[b'a'], positions: &[1] }, + MemchrTestStatic { corpus: "\x00", needles: &[b'\x00'], positions: &[0] }, + MemchrTestStatic { + corpus: "\x00\x00", + needles: &[b'\x00'], + positions: &[0, 1], + }, + MemchrTestStatic { + corpus: "\x00a\x00", + needles: &[b'\x00'], + positions: &[0, 2], + }, + MemchrTestStatic { + corpus: "zzzzzzzzzzzzzzzza", + needles: &[b'a'], + positions: &[16], + }, + MemchrTestStatic { + corpus: "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzza", + needles: &[b'a'], + positions: &[32], + }, + // two needles (applied to memchr2 + memchr3) + MemchrTestStatic { + corpus: "az", + needles: &[b'a', b'z'], + positions: &[0, 1], + }, + MemchrTestStatic { + corpus: "az", + needles: &[b'a', b'z'], + positions: &[0, 1], + }, + MemchrTestStatic { corpus: "az", needles: &[b'x', b'y'], positions: &[] }, + MemchrTestStatic { corpus: "az", needles: &[b'a', b'y'], positions: &[0] }, + MemchrTestStatic { corpus: "az", needles: &[b'x', b'z'], positions: &[1] }, + MemchrTestStatic { + corpus: "yyyyaz", + needles: &[b'a', b'z'], + positions: &[4, 5], + }, + MemchrTestStatic { + corpus: "yyyyaz", + needles: &[b'z', b'a'], + positions: &[4, 5], + }, + // three needles (applied to memchr3) + MemchrTestStatic { + corpus: "xyz", + needles: &[b'x', b'y', b'z'], + positions: &[0, 1, 2], + }, + MemchrTestStatic { + corpus: "zxy", + needles: &[b'x', b'y', b'z'], + positions: &[0, 1, 2], + }, + MemchrTestStatic { + corpus: "zxy", + needles: &[b'x', b'a', b'z'], + positions: &[0, 1], + }, + MemchrTestStatic { + corpus: "zxy", + needles: &[b't', b'a', b'z'], + positions: &[0], + }, + MemchrTestStatic { + corpus: "yxz", + needles: &[b't', b'a', b'z'], + positions: &[2], + }, +]; + +/// A description of a test on a memchr like function. +#[derive(Clone, Debug)] +pub struct MemchrTest { + /// The thing to search. We use `&str` instead of `&[u8]` because they + /// are nicer to write in tests, and we don't miss much since memchr + /// doesn't care about UTF-8. + /// + /// Corpora cannot contain either '%' or '#'. We use these bytes when + /// expanding test cases into many test cases, and we assume they are not + /// used. If they are used, `memchr_tests` will panic. + corpus: String, + /// The needles to search for. This is intended to be an "alternation" of + /// needles. The number of needles may cause this test to be skipped for + /// some memchr variants. For example, a test with 2 needles cannot be used + /// to test `memchr`, but can be used to test `memchr2` and `memchr3`. + /// However, a test with only 1 needle can be used to test all of `memchr`, + /// `memchr2` and `memchr3`. We achieve this by filling in the needles with + /// bytes that we never used in the corpus (such as '#'). + needles: Vec, + /// The positions expected to match for all of the needles. + positions: Vec, +} + +/// Like MemchrTest, but easier to define as a constant. +#[derive(Clone, Debug)] +pub struct MemchrTestStatic { + corpus: &'static str, + needles: &'static [u8], + positions: &'static [usize], +} + +impl MemchrTest { + pub fn one Option>(&self, reverse: bool, f: F) { + let needles = match self.needles(1) { + None => return, + Some(needles) => needles, + }; + // We test different alignments here. Since some implementations use + // AVX2, which can read 32 bytes at a time, we test at least that. + // Moreover, with loop unrolling, we sometimes process 64 (sse2) or 128 + // (avx) bytes at a time, so we include that in our offsets as well. + // + // You might think this would cause most needles to not be found, but + // we actually expand our tests to include corpus sizes all the way up + // to >500 bytes, so we should exercise most branches. + for align in 0..130 { + let corpus = self.corpus(align); + assert_eq!( + self.positions(align, reverse).get(0).cloned(), + f(needles[0], corpus.as_bytes()), + "search for {:?} failed in: {:?} (len: {}, alignment: {})", + needles[0] as char, + corpus, + corpus.len(), + align + ); + } + } + + pub fn two Option>( + &self, + reverse: bool, + f: F, + ) { + let needles = match self.needles(2) { + None => return, + Some(needles) => needles, + }; + for align in 0..130 { + let corpus = self.corpus(align); + assert_eq!( + self.positions(align, reverse).get(0).cloned(), + f(needles[0], needles[1], corpus.as_bytes()), + "search for {:?}|{:?} failed in: {:?} \ + (len: {}, alignment: {})", + needles[0] as char, + needles[1] as char, + corpus, + corpus.len(), + align + ); + } + } + + pub fn three Option>( + &self, + reverse: bool, + f: F, + ) { + let needles = match self.needles(3) { + None => return, + Some(needles) => needles, + }; + for align in 0..130 { + let corpus = self.corpus(align); + assert_eq!( + self.positions(align, reverse).get(0).cloned(), + f(needles[0], needles[1], needles[2], corpus.as_bytes()), + "search for {:?}|{:?}|{:?} failed in: {:?} \ + (len: {}, alignment: {})", + needles[0] as char, + needles[1] as char, + needles[2] as char, + corpus, + corpus.len(), + align + ); + } + } + + pub fn iter_one<'a, I, F>(&'a self, reverse: bool, f: F) + where + F: FnOnce(u8, &'a [u8]) -> I, + I: Iterator, + { + if let Some(ns) = self.needles(1) { + self.iter(reverse, f(ns[0], self.corpus.as_bytes())); + } + } + + pub fn iter_two<'a, I, F>(&'a self, reverse: bool, f: F) + where + F: FnOnce(u8, u8, &'a [u8]) -> I, + I: Iterator, + { + if let Some(ns) = self.needles(2) { + self.iter(reverse, f(ns[0], ns[1], self.corpus.as_bytes())); + } + } + + pub fn iter_three<'a, I, F>(&'a self, reverse: bool, f: F) + where + F: FnOnce(u8, u8, u8, &'a [u8]) -> I, + I: Iterator, + { + if let Some(ns) = self.needles(3) { + self.iter(reverse, f(ns[0], ns[1], ns[2], self.corpus.as_bytes())); + } + } + + /// Test that the positions yielded by the given iterator match the + /// positions in this test. If reverse is true, then reverse the positions + /// before comparing them. + fn iter>(&self, reverse: bool, it: I) { + assert_eq!( + self.positions(0, reverse), + it.collect::>(), + r"search for {:?} failed in: {:?}", + self.needles.iter().map(|&b| b as char).collect::>(), + self.corpus + ); + } + + /// Expand this test into many variations of the same test. + /// + /// In particular, this will generate more tests with larger corpus sizes. + /// The expected positions are updated to maintain the integrity of the + /// test. + /// + /// This is important in testing a memchr implementation, because there are + /// often different cases depending on the length of the corpus. + /// + /// Note that we extend the corpus by adding `%` bytes, which we + /// don't otherwise use as a needle. + fn expand(&self) -> Vec { + let mut more = Vec::new(); + + // Add bytes to the start of the corpus. + for i in 1..515 { + let mut t = self.clone(); + let mut new_corpus: String = repeat('%').take(i).collect(); + new_corpus.push_str(&t.corpus); + t.corpus = new_corpus; + t.positions = t.positions.into_iter().map(|p| p + i).collect(); + more.push(t); + } + // Add bytes to the end of the corpus. + for i in 1..515 { + let mut t = self.clone(); + let padding: String = repeat('%').take(i).collect(); + t.corpus.push_str(&padding); + more.push(t); + } + + more + } + + /// Return the corpus at the given alignment. + /// + /// If the alignment exceeds the length of the corpus, then this returns + /// an empty slice. + fn corpus(&self, align: usize) -> &str { + self.corpus.get(align..).unwrap_or("") + } + + /// Return exactly `count` needles from this test. If this test has less + /// than `count` needles, then add `#` until the number of needles + /// matches `count`. If this test has more than `count` needles, then + /// return `None` (because there is no way to use this test data for a + /// search using fewer needles). + fn needles(&self, count: usize) -> Option> { + if self.needles.len() > count { + return None; + } + + let mut needles = self.needles.to_vec(); + for _ in needles.len()..count { + // we assume # is never used in tests. + needles.push(b'#'); + } + Some(needles) + } + + /// Return the positions in this test, reversed if `reverse` is true. + /// + /// If alignment is given, then all positions greater than or equal to that + /// alignment are offset by the alignment. Positions less than the + /// alignment are dropped. + fn positions(&self, align: usize, reverse: bool) -> Vec { + let positions = if reverse { + let mut positions = self.positions.to_vec(); + positions.reverse(); + positions + } else { + self.positions.to_vec() + }; + positions + .into_iter() + .filter(|&p| p >= align) + .map(|p| p - align) + .collect() + } +} diff --git a/third_party/rust/memchr/src/tests/mod.rs b/third_party/rust/memchr/src/tests/mod.rs new file mode 100644 index 0000000000..f4d406cd1e --- /dev/null +++ b/third_party/rust/memchr/src/tests/mod.rs @@ -0,0 +1,15 @@ +mod memchr; + +// For debugging, particularly in CI, print out the byte order of the current +// target. +#[cfg(all(feature = "std", target_endian = "little"))] +#[test] +fn byte_order() { + eprintln!("LITTLE ENDIAN"); +} + +#[cfg(all(feature = "std", target_endian = "big"))] +#[test] +fn byte_order() { + eprintln!("BIG ENDIAN"); +} diff --git a/third_party/rust/memchr/src/tests/x86_64-soft_float.json b/third_party/rust/memchr/src/tests/x86_64-soft_float.json new file mode 100644 index 0000000000..b77649ef92 --- /dev/null +++ b/third_party/rust/memchr/src/tests/x86_64-soft_float.json @@ -0,0 +1,15 @@ +{ + "llvm-target": "x86_64-unknown-none", + "target-endian": "little", + "target-pointer-width": "64", + "target-c-int-width": "32", + "os": "none", + "arch": "x86_64", + "data-layout": "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", + "linker-flavor": "ld.lld", + "linker": "rust-lld", + "features": "-mmx,-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-3dnow,-3dnowa,-avx,-avx2,+soft-float", + "executables": true, + "disable-redzone": true, + "panic-strategy": "abort" +} -- cgit v1.2.3