Diffstat (limited to 'third_party/rust/memchr/src/x86/sse2.rs')
-rw-r--r-- | third_party/rust/memchr/src/x86/sse2.rs | 793
1 file changed, 793 insertions, 0 deletions
diff --git a/third_party/rust/memchr/src/x86/sse2.rs b/third_party/rust/memchr/src/x86/sse2.rs
new file mode 100644
index 0000000000..76f5a78c34
--- /dev/null
+++ b/third_party/rust/memchr/src/x86/sse2.rs
@@ -0,0 +1,793 @@
+use core::arch::x86_64::*;
+use core::cmp;
+use core::mem::size_of;
+
+const VECTOR_SIZE: usize = size_of::<__m128i>();
+const VECTOR_ALIGN: usize = VECTOR_SIZE - 1;
+
+// The number of bytes to loop at in one iteration of memchr/memrchr.
+const LOOP_SIZE: usize = 4 * VECTOR_SIZE;
+
+// The number of bytes to loop at in one iteration of memchr2/memrchr2 and
+// memchr3/memrchr3. There was no observable difference between 64 and 32 bytes
+// in benchmarks. memchr3 in particular only gets a very slight speed up from
+// the loop unrolling.
+const LOOP_SIZE2: usize = 2 * VECTOR_SIZE;
+
+#[target_feature(enable = "sse2")]
+pub unsafe fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
+    // What follows is a fast SSE2-only algorithm to detect the position of
+    // `n1` in `haystack` if it exists. From what I know, this is the "classic"
+    // algorithm. I believe it can be found in places like glibc and Go's
+    // standard library. It appears to be well known and is elaborated on in
+    // more detail here: https://gms.tf/stdfind-and-memchr-optimizations.html
+    //
+    // While this routine is very long, the basic idea is actually very simple
+    // and can be expressed straight-forwardly in pseudo code:
+    //
+    //     needle = (n1 << 15) | (n1 << 14) | ... | (n1 << 1) | n1
+    //     // Note: shift amount in bytes
+    //
+    //     while i <= haystack.len() - 16:
+    //       // A 16 byte vector. Each byte in chunk corresponds to a byte in
+    //       // the haystack.
+    //       chunk = haystack[i:i+16]
+    //       // Compare bytes in needle with bytes in chunk. The result is a 16
+    //       // byte chunk where each byte is 0xFF if the corresponding bytes
+    //       // in needle and chunk were equal, or 0x00 otherwise.
+    //       eqs = cmpeq(needle, chunk)
+    //       // Return a 32 bit integer where the most significant 16 bits
+    //       // are always 0 and the lower 16 bits correspond to whether the
+    //       // most significant bit in the correspond byte in `eqs` is set.
+    //       // In other words, `mask as u16` has bit i set if and only if
+    //       // needle[i] == chunk[i].
+    //       mask = movemask(eqs)
+    //
+    //       // Mask is 0 if there is no match, and non-zero otherwise.
+    //       if mask != 0:
+    //         // trailing_zeros tells us the position of the least significant
+    //         // bit that is set.
+    //         return i + trailing_zeros(mask)
+    //
+    //     // haystack length may not be a multiple of 16, so search the rest.
+    //     while i < haystack.len():
+    //       if haystack[i] == n1:
+    //         return i
+    //
+    //     // No match found.
+    //     return NULL
+    //
+    // In fact, we could loosely translate the above code to Rust line-for-line
+    // and it would be a pretty fast algorithm. But, we pull out all the stops
+    // to go as fast as possible:
+    //
+    // 1. We use aligned loads. That is, we do some finagling to make sure our
+    //    primary loop not only proceeds in increments of 16 bytes, but that
+    //    the address of haystack's pointer that we dereference is aligned to
+    //    16 bytes. 16 is a magic number here because it is the size of SSE2
+    //    128-bit vector. (For the AVX2 algorithm, 32 is the magic number.)
+    //    Therefore, to get aligned loads, our pointer's address must be evenly
+    //    divisible by 16.
+    // 2. Our primary loop proceeds 64 bytes at a time instead of 16. It's
+    //    kind of like loop unrolling, but we combine the equality comparisons
+    //    using a vector OR such that we only need to extract a single mask to
+    //    determine whether a match exists or not. If so, then we do some
+    //    book-keeping to determine the precise location but otherwise mush on.
+    // 3. We use our "chunk" comparison routine in as many places as possible,
+    //    even if it means using unaligned loads. In particular, if haystack
+    //    starts with an unaligned address, then we do an unaligned load to
+    //    search the first 16 bytes. We then start our primary loop at the
+    //    smallest subsequent aligned address, which will actually overlap with
+    //    previously searched bytes. But we're OK with that. We do a similar
+    //    dance at the end of our primary loop. Finally, to avoid a
+    //    byte-at-a-time loop at the end, we do a final 16 byte unaligned load
+    //    that may overlap with a previous load. This is OK because it converts
+    //    a loop into a small number of very fast vector instructions.
+    //
+    // The primary downside of this algorithm is that it's effectively
+    // completely unsafe. Therefore, we have to be super careful to avoid
+    // undefined behavior:
+    //
+    // 1. We use raw pointers everywhere. Not only does dereferencing a pointer
+    //    require the pointer to be valid, but we actually can't even store the
+    //    address of an invalid pointer (unless it's 1 past the end of
+    //    haystack) without sacrificing performance.
+    // 2. _mm_loadu_si128 is used when you don't care about alignment, and
+    //    _mm_load_si128 is used when you do care. You cannot use the latter
+    //    on unaligned pointers.
+    // 3. We make liberal use of debug_assert! to check assumptions.
+    // 4. We make a concerted effort to stick with pointers instead of indices.
+    //    Indices are nicer because there's less to worry about with them (see
+    //    above about pointer offsets), but I could not get the compiler to
+    //    produce as good of code as what the below produces. In any case,
+    //    pointers are what we really care about here, and alignment is
+    //    expressed a bit more naturally with them.
+    //
+    // In general, most of the algorithms in this crate have a similar
+    // structure to what you see below, so this comment applies fairly well to
+    // all of them.
+
+    let vn1 = _mm_set1_epi8(n1 as i8);
+    let len = haystack.len();
+    let loop_size = cmp::min(LOOP_SIZE, len);
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = start_ptr;
+
+    if haystack.len() < VECTOR_SIZE {
+        while ptr < end_ptr {
+            if *ptr == n1 {
+                return Some(sub(ptr, start_ptr));
+            }
+            ptr = ptr.offset(1);
+        }
+        return None;
+    }
+
+    if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) {
+        return Some(i);
+    }
+
+    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
+    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
+    while loop_size == LOOP_SIZE && ptr <= end_ptr.sub(loop_size) {
+        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);
+
+        let a = _mm_load_si128(ptr as *const __m128i);
+        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
+        let c = _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i);
+        let d = _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i);
+        let eqa = _mm_cmpeq_epi8(vn1, a);
+        let eqb = _mm_cmpeq_epi8(vn1, b);
+        let eqc = _mm_cmpeq_epi8(vn1, c);
+        let eqd = _mm_cmpeq_epi8(vn1, d);
+        let or1 = _mm_or_si128(eqa, eqb);
+        let or2 = _mm_or_si128(eqc, eqd);
+        let or3 = _mm_or_si128(or1, or2);
+        if _mm_movemask_epi8(or3) != 0 {
+            let mut at = sub(ptr, start_ptr);
+            let mask = _mm_movemask_epi8(eqa);
+            if mask != 0 {
+                return Some(at + forward_pos(mask));
+            }
+
+            at += VECTOR_SIZE;
+            let mask = _mm_movemask_epi8(eqb);
+            if mask != 0 {
+                return Some(at + forward_pos(mask));
+            }
+
+            at += VECTOR_SIZE;
+            let mask = _mm_movemask_epi8(eqc);
+            if mask != 0 {
+                return Some(at + forward_pos(mask));
+            }
+
+            at += VECTOR_SIZE;
+            let mask = _mm_movemask_epi8(eqd);
+            debug_assert!(mask != 0);
+            return Some(at + forward_pos(mask));
+        }
+        ptr = ptr.add(loop_size);
+    }
+    while ptr <= end_ptr.sub(VECTOR_SIZE) {
+        debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE);
+
+        if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) {
+            return Some(i);
+        }
+        ptr = ptr.add(VECTOR_SIZE);
+    }
+    if ptr < end_ptr {
+        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
+        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
+        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);
+
+        return forward_search1(start_ptr, end_ptr, ptr, vn1);
+    }
+    None
+}
+
+#[target_feature(enable = "sse2")]
+pub unsafe fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+    let vn1 = _mm_set1_epi8(n1 as i8);
+    let vn2 = _mm_set1_epi8(n2 as i8);
+    let len = haystack.len();
+    let loop_size = cmp::min(LOOP_SIZE2, len);
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = start_ptr;
+
+    if haystack.len() < VECTOR_SIZE {
+        while ptr < end_ptr {
+            if *ptr == n1 || *ptr == n2 {
+                return Some(sub(ptr, start_ptr));
+            }
+            ptr = ptr.offset(1);
+        }
+        return None;
+    }
+
+    if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
+        return Some(i);
+    }
+
+    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
+    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
+    while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) {
+        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);
+
+        let a = _mm_load_si128(ptr as *const __m128i);
+        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
+        let eqa1 = _mm_cmpeq_epi8(vn1, a);
+        let eqb1 = _mm_cmpeq_epi8(vn1, b);
+        let eqa2 = _mm_cmpeq_epi8(vn2, a);
+        let eqb2 = _mm_cmpeq_epi8(vn2, b);
+        let or1 = _mm_or_si128(eqa1, eqb1);
+        let or2 = _mm_or_si128(eqa2, eqb2);
+        let or3 = _mm_or_si128(or1, or2);
+        if _mm_movemask_epi8(or3) != 0 {
+            let mut at = sub(ptr, start_ptr);
+            let mask1 = _mm_movemask_epi8(eqa1);
+            let mask2 = _mm_movemask_epi8(eqa2);
+            if mask1 != 0 || mask2 != 0 {
+                return Some(at + forward_pos2(mask1, mask2));
+            }
+
+            at += VECTOR_SIZE;
+            let mask1 = _mm_movemask_epi8(eqb1);
+            let mask2 = _mm_movemask_epi8(eqb2);
+            return Some(at + forward_pos2(mask1, mask2));
+        }
+        ptr = ptr.add(loop_size);
+    }
+    while ptr <= end_ptr.sub(VECTOR_SIZE) {
+        if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
+            return Some(i);
+        }
+        ptr = ptr.add(VECTOR_SIZE);
+    }
+    if ptr < end_ptr {
+        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
+        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
+        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);
+
+        return forward_search2(start_ptr, end_ptr, ptr, vn1, vn2);
+    }
+    None
+}
+
+#[target_feature(enable = "sse2")]
+pub unsafe fn memchr3(
+    n1: u8,
+    n2: u8,
+    n3: u8,
+    haystack: &[u8],
+) -> Option<usize> {
+    let vn1 = _mm_set1_epi8(n1 as i8);
+    let vn2 = _mm_set1_epi8(n2 as i8);
+    let vn3 = _mm_set1_epi8(n3 as i8);
+    let len = haystack.len();
+    let loop_size = cmp::min(LOOP_SIZE2, len);
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = start_ptr;
+
+    if haystack.len() < VECTOR_SIZE {
+        while ptr < end_ptr {
+            if *ptr == n1 || *ptr == n2 || *ptr == n3 {
+                return Some(sub(ptr, start_ptr));
+            }
+            ptr = ptr.offset(1);
+        }
+        return None;
+    }
+
+    if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
+        return Some(i);
+    }
+
+    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
+    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
+    while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) {
+        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);
+
+        let a = _mm_load_si128(ptr as *const __m128i);
+        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
+        let eqa1 = _mm_cmpeq_epi8(vn1, a);
+        let eqb1 = _mm_cmpeq_epi8(vn1, b);
+        let eqa2 = _mm_cmpeq_epi8(vn2, a);
+        let eqb2 = _mm_cmpeq_epi8(vn2, b);
+        let eqa3 = _mm_cmpeq_epi8(vn3, a);
+        let eqb3 = _mm_cmpeq_epi8(vn3, b);
+        let or1 = _mm_or_si128(eqa1, eqb1);
+        let or2 = _mm_or_si128(eqa2, eqb2);
+        let or3 = _mm_or_si128(eqa3, eqb3);
+        let or4 = _mm_or_si128(or1, or2);
+        let or5 = _mm_or_si128(or3, or4);
+        if _mm_movemask_epi8(or5) != 0 {
+            let mut at = sub(ptr, start_ptr);
+            let mask1 = _mm_movemask_epi8(eqa1);
+            let mask2 = _mm_movemask_epi8(eqa2);
+            let mask3 = _mm_movemask_epi8(eqa3);
+            if mask1 != 0 || mask2 != 0 || mask3 != 0 {
+                return Some(at + forward_pos3(mask1, mask2, mask3));
+            }
+
+            at += VECTOR_SIZE;
+            let mask1 = _mm_movemask_epi8(eqb1);
+            let mask2 = _mm_movemask_epi8(eqb2);
+            let mask3 = _mm_movemask_epi8(eqb3);
+            return Some(at + forward_pos3(mask1, mask2, mask3));
+        }
+        ptr = ptr.add(loop_size);
+    }
+    while ptr <= end_ptr.sub(VECTOR_SIZE) {
+        if let Some(i) =
+            forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3)
+        {
+            return Some(i);
+        }
+        ptr = ptr.add(VECTOR_SIZE);
+    }
+    if ptr < end_ptr {
+        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
+        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
+        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);
+
+        return forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3);
+    }
+    None
+}
+
+#[target_feature(enable = "sse2")]
+pub unsafe fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
+    let vn1 = _mm_set1_epi8(n1 as i8);
+    let len = haystack.len();
+    let loop_size = cmp::min(LOOP_SIZE, len);
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = end_ptr;
+
+    if haystack.len() < VECTOR_SIZE {
+        while ptr > start_ptr {
+            ptr = ptr.offset(-1);
+            if *ptr == n1 {
+                return Some(sub(ptr, start_ptr));
+            }
+        }
+        return None;
+    }
+
+    ptr = ptr.sub(VECTOR_SIZE);
+    if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) {
+        return Some(i);
+    }
+
+    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
+    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
+    while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) {
+        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);
+
+        ptr = ptr.sub(loop_size);
+        let a = _mm_load_si128(ptr as *const __m128i);
+        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
+        let c = _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i);
+        let d = _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i);
+        let eqa = _mm_cmpeq_epi8(vn1, a);
+        let eqb = _mm_cmpeq_epi8(vn1, b);
+        let eqc = _mm_cmpeq_epi8(vn1, c);
+        let eqd = _mm_cmpeq_epi8(vn1, d);
+        let or1 = _mm_or_si128(eqa, eqb);
+        let or2 = _mm_or_si128(eqc, eqd);
+        let or3 = _mm_or_si128(or1, or2);
+        if _mm_movemask_epi8(or3) != 0 {
+            let mut at = sub(ptr.add(3 * VECTOR_SIZE), start_ptr);
+            let mask = _mm_movemask_epi8(eqd);
+            if mask != 0 {
+                return Some(at + reverse_pos(mask));
+            }
+
+            at -= VECTOR_SIZE;
+            let mask = _mm_movemask_epi8(eqc);
+            if mask != 0 {
+                return Some(at + reverse_pos(mask));
+            }
+
+            at -= VECTOR_SIZE;
+            let mask = _mm_movemask_epi8(eqb);
+            if mask != 0 {
+                return Some(at + reverse_pos(mask));
+            }
+
+            at -= VECTOR_SIZE;
+            let mask = _mm_movemask_epi8(eqa);
+            debug_assert!(mask != 0);
+            return Some(at + reverse_pos(mask));
+        }
+    }
+    while ptr >= start_ptr.add(VECTOR_SIZE) {
+        ptr = ptr.sub(VECTOR_SIZE);
+        if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) {
+            return Some(i);
+        }
+    }
+    if ptr > start_ptr {
+        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
+        return reverse_search1(start_ptr, end_ptr, start_ptr, vn1);
+    }
+    None
+}
+
+#[target_feature(enable = "sse2")]
+pub unsafe fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
+    let vn1 = _mm_set1_epi8(n1 as i8);
+    let vn2 = _mm_set1_epi8(n2 as i8);
+    let len = haystack.len();
+    let loop_size = cmp::min(LOOP_SIZE2, len);
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = end_ptr;
+
+    if haystack.len() < VECTOR_SIZE {
+        while ptr > start_ptr {
+            ptr = ptr.offset(-1);
+            if *ptr == n1 || *ptr == n2 {
+                return Some(sub(ptr, start_ptr));
+            }
+        }
+        return None;
+    }
+
+    ptr = ptr.sub(VECTOR_SIZE);
+    if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
+        return Some(i);
+    }
+
+    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
+    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
+    while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) {
+        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);
+
+        ptr = ptr.sub(loop_size);
+        let a = _mm_load_si128(ptr as *const __m128i);
+        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
+        let eqa1 = _mm_cmpeq_epi8(vn1, a);
+        let eqb1 = _mm_cmpeq_epi8(vn1, b);
+        let eqa2 = _mm_cmpeq_epi8(vn2, a);
+        let eqb2 = _mm_cmpeq_epi8(vn2, b);
+        let or1 = _mm_or_si128(eqa1, eqb1);
+        let or2 = _mm_or_si128(eqa2, eqb2);
+        let or3 = _mm_or_si128(or1, or2);
+        if _mm_movemask_epi8(or3) != 0 {
+            let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr);
+            let mask1 = _mm_movemask_epi8(eqb1);
+            let mask2 = _mm_movemask_epi8(eqb2);
+            if mask1 != 0 || mask2 != 0 {
+                return Some(at + reverse_pos2(mask1, mask2));
+            }
+
+            at -= VECTOR_SIZE;
+            let mask1 = _mm_movemask_epi8(eqa1);
+            let mask2 = _mm_movemask_epi8(eqa2);
+            return Some(at + reverse_pos2(mask1, mask2));
+        }
+    }
+    while ptr >= start_ptr.add(VECTOR_SIZE) {
+        ptr = ptr.sub(VECTOR_SIZE);
+        if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
+            return Some(i);
+        }
+    }
+    if ptr > start_ptr {
+        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
+        return reverse_search2(start_ptr, end_ptr, start_ptr, vn1, vn2);
+    }
+    None
+}
+
+#[target_feature(enable = "sse2")]
+pub unsafe fn memrchr3(
+    n1: u8,
+    n2: u8,
+    n3: u8,
+    haystack: &[u8],
+) -> Option<usize> {
+    let vn1 = _mm_set1_epi8(n1 as i8);
+    let vn2 = _mm_set1_epi8(n2 as i8);
+    let vn3 = _mm_set1_epi8(n3 as i8);
+    let len = haystack.len();
+    let loop_size = cmp::min(LOOP_SIZE2, len);
+    let start_ptr = haystack.as_ptr();
+    let end_ptr = haystack[haystack.len()..].as_ptr();
+    let mut ptr = end_ptr;
+
+    if haystack.len() < VECTOR_SIZE {
+        while ptr > start_ptr {
+            ptr = ptr.offset(-1);
+            if *ptr == n1 || *ptr == n2 || *ptr == n3 {
+                return Some(sub(ptr, start_ptr));
+            }
+        }
+        return None;
+    }
+
+    ptr = ptr.sub(VECTOR_SIZE);
+    if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
+        return Some(i);
+    }
+
+    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
+    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
+    while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) {
+        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);
+
+        ptr = ptr.sub(loop_size);
+        let a = _mm_load_si128(ptr as *const __m128i);
+        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
+        let eqa1 = _mm_cmpeq_epi8(vn1, a);
+        let eqb1 = _mm_cmpeq_epi8(vn1, b);
+        let eqa2 = _mm_cmpeq_epi8(vn2, a);
+        let eqb2 = _mm_cmpeq_epi8(vn2, b);
+        let eqa3 = _mm_cmpeq_epi8(vn3, a);
+        let eqb3 = _mm_cmpeq_epi8(vn3, b);
+        let or1 = _mm_or_si128(eqa1, eqb1);
+        let or2 = _mm_or_si128(eqa2, eqb2);
+        let or3 = _mm_or_si128(eqa3, eqb3);
+        let or4 = _mm_or_si128(or1, or2);
+        let or5 = _mm_or_si128(or3, or4);
+        if _mm_movemask_epi8(or5) != 0 {
+            let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr);
+            let mask1 = _mm_movemask_epi8(eqb1);
+            let mask2 = _mm_movemask_epi8(eqb2);
+            let mask3 = _mm_movemask_epi8(eqb3);
+            if mask1 != 0 || mask2 != 0 || mask3 != 0 {
+                return Some(at + reverse_pos3(mask1, mask2, mask3));
+            }
+
+            at -= VECTOR_SIZE;
+            let mask1 = _mm_movemask_epi8(eqa1);
+            let mask2 = _mm_movemask_epi8(eqa2);
+            let mask3 = _mm_movemask_epi8(eqa3);
+            return Some(at + reverse_pos3(mask1, mask2, mask3));
+        }
+    }
+    while ptr >= start_ptr.add(VECTOR_SIZE) {
+        ptr = ptr.sub(VECTOR_SIZE);
+        if let Some(i) =
+            reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3)
+        {
+            return Some(i);
+        }
+    }
+    if ptr > start_ptr {
+        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
+        return reverse_search3(start_ptr, end_ptr, start_ptr, vn1, vn2, vn3);
+    }
+    None
+}
+
+#[target_feature(enable = "sse2")]
+pub unsafe fn forward_search1(
+    start_ptr: *const u8,
+    end_ptr: *const u8,
+    ptr: *const u8,
+    vn1: __m128i,
+) -> Option<usize> {
+    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
+    debug_assert!(start_ptr <= ptr);
+    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
+
+    let chunk = _mm_loadu_si128(ptr as *const __m128i);
+    let mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, vn1));
+    if mask != 0 {
+        Some(sub(ptr, start_ptr) + forward_pos(mask))
+    } else {
+        None
+    }
+}
+
+#[target_feature(enable = "sse2")]
+unsafe fn forward_search2(
+    start_ptr: *const u8,
+    end_ptr: *const u8,
+    ptr: *const u8,
+    vn1: __m128i,
+    vn2: __m128i,
+) -> Option<usize> {
+    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
+    debug_assert!(start_ptr <= ptr);
+    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
+
+    let chunk = _mm_loadu_si128(ptr as *const __m128i);
+    let eq1 = _mm_cmpeq_epi8(chunk, vn1);
+    let eq2 = _mm_cmpeq_epi8(chunk, vn2);
+    if _mm_movemask_epi8(_mm_or_si128(eq1, eq2)) != 0 {
+        let mask1 = _mm_movemask_epi8(eq1);
+        let mask2 = _mm_movemask_epi8(eq2);
+        Some(sub(ptr, start_ptr) + forward_pos2(mask1, mask2))
+    } else {
+        None
+    }
+}
+
+#[target_feature(enable = "sse2")]
+pub unsafe fn forward_search3(
+    start_ptr: *const u8,
+    end_ptr: *const u8,
+    ptr: *const u8,
+    vn1: __m128i,
+    vn2: __m128i,
+    vn3: __m128i,
+) -> Option<usize> {
+    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
+    debug_assert!(start_ptr <= ptr);
+    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
+
+    let chunk = _mm_loadu_si128(ptr as *const __m128i);
+    let eq1 = _mm_cmpeq_epi8(chunk, vn1);
+    let eq2 = _mm_cmpeq_epi8(chunk, vn2);
+    let eq3 = _mm_cmpeq_epi8(chunk, vn3);
+    let or = _mm_or_si128(eq1, eq2);
+    if _mm_movemask_epi8(_mm_or_si128(or, eq3)) != 0 {
+        let mask1 = _mm_movemask_epi8(eq1);
+        let mask2 = _mm_movemask_epi8(eq2);
+        let mask3 = _mm_movemask_epi8(eq3);
+        Some(sub(ptr, start_ptr) + forward_pos3(mask1, mask2, mask3))
+    } else {
+        None
+    }
+}
+
+#[target_feature(enable = "sse2")]
+unsafe fn reverse_search1(
+    start_ptr: *const u8,
+    end_ptr: *const u8,
+    ptr: *const u8,
+    vn1: __m128i,
+) -> Option<usize> {
+    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
+    debug_assert!(start_ptr <= ptr);
+    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
+
+    let chunk = _mm_loadu_si128(ptr as *const __m128i);
+    let mask = _mm_movemask_epi8(_mm_cmpeq_epi8(vn1, chunk));
+    if mask != 0 {
+        Some(sub(ptr, start_ptr) + reverse_pos(mask))
+    } else {
+        None
+    }
+}
+
+#[target_feature(enable = "sse2")]
+unsafe fn reverse_search2(
+    start_ptr: *const u8,
+    end_ptr: *const u8,
+    ptr: *const u8,
+    vn1: __m128i,
+    vn2: __m128i,
+) -> Option<usize> {
+    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
+    debug_assert!(start_ptr <= ptr);
+    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
+
+    let chunk = _mm_loadu_si128(ptr as *const __m128i);
+    let eq1 = _mm_cmpeq_epi8(chunk, vn1);
+    let eq2 = _mm_cmpeq_epi8(chunk, vn2);
+    if _mm_movemask_epi8(_mm_or_si128(eq1, eq2)) != 0 {
+        let mask1 = _mm_movemask_epi8(eq1);
+        let mask2 = _mm_movemask_epi8(eq2);
+        Some(sub(ptr, start_ptr) + reverse_pos2(mask1, mask2))
+    } else {
+        None
+    }
+}
+
+#[target_feature(enable = "sse2")]
+unsafe fn reverse_search3(
+    start_ptr: *const u8,
+    end_ptr: *const u8,
+    ptr: *const u8,
+    vn1: __m128i,
+    vn2: __m128i,
+    vn3: __m128i,
+) -> Option<usize> {
+    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
+    debug_assert!(start_ptr <= ptr);
+    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));
+
+    let chunk = _mm_loadu_si128(ptr as *const __m128i);
+    let eq1 = _mm_cmpeq_epi8(chunk, vn1);
+    let eq2 = _mm_cmpeq_epi8(chunk, vn2);
+    let eq3 = _mm_cmpeq_epi8(chunk, vn3);
+    let or = _mm_or_si128(eq1, eq2);
+    if _mm_movemask_epi8(_mm_or_si128(or, eq3)) != 0 {
+        let mask1 = _mm_movemask_epi8(eq1);
+        let mask2 = _mm_movemask_epi8(eq2);
+        let mask3 = _mm_movemask_epi8(eq3);
+        Some(sub(ptr, start_ptr) + reverse_pos3(mask1, mask2, mask3))
+    } else {
+        None
+    }
+}
+
+/// Compute the position of the first matching byte from the given mask. The
+/// position returned is always in the range [0, 15].
+///
+/// The mask given is expected to be the result of _mm_movemask_epi8.
+fn forward_pos(mask: i32) -> usize {
+    // We are dealing with little endian here, where the most significant byte
+    // is at a higher address. That means the least significant bit that is set
+    // corresponds to the position of our first matching byte. That position
+    // corresponds to the number of zeros after the least significant bit.
+    mask.trailing_zeros() as usize
+}
+
+/// Compute the position of the first matching byte from the given masks. The
+/// position returned is always in the range [0, 15]. Each mask corresponds to
+/// the equality comparison of a single byte.
+///
+/// The masks given are expected to be the result of _mm_movemask_epi8, where
+/// at least one of the masks is non-zero (i.e., indicates a match).
+fn forward_pos2(mask1: i32, mask2: i32) -> usize {
+    debug_assert!(mask1 != 0 || mask2 != 0);
+
+    forward_pos(mask1 | mask2)
+}
+
+/// Compute the position of the first matching byte from the given masks. The
+/// position returned is always in the range [0, 15]. Each mask corresponds to
+/// the equality comparison of a single byte.
+///
+/// The masks given are expected to be the result of _mm_movemask_epi8, where
+/// at least one of the masks is non-zero (i.e., indicates a match).
+fn forward_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize {
+    debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);
+
+    forward_pos(mask1 | mask2 | mask3)
+}
+
+/// Compute the position of the last matching byte from the given mask. The
+/// position returned is always in the range [0, 15].
+///
+/// The mask given is expected to be the result of _mm_movemask_epi8.
+fn reverse_pos(mask: i32) -> usize {
+    // We are dealing with little endian here, where the most significant byte
+    // is at a higher address. That means the most significant bit that is set
+    // corresponds to the position of our last matching byte. The position from
+    // the end of the mask is therefore the number of leading zeros in a 16
+    // bit integer, and the position from the start of the mask is therefore
+    // 16 - (leading zeros) - 1.
+    VECTOR_SIZE - (mask as u16).leading_zeros() as usize - 1
+}
+
+/// Compute the position of the last matching byte from the given masks. The
+/// position returned is always in the range [0, 15]. Each mask corresponds to
+/// the equality comparison of a single byte.
+///
+/// The masks given are expected to be the result of _mm_movemask_epi8, where
+/// at least one of the masks is non-zero (i.e., indicates a match).
+fn reverse_pos2(mask1: i32, mask2: i32) -> usize {
+    debug_assert!(mask1 != 0 || mask2 != 0);
+
+    reverse_pos(mask1 | mask2)
+}
+
+/// Compute the position of the last matching byte from the given masks. The
+/// position returned is always in the range [0, 15]. Each mask corresponds to
+/// the equality comparison of a single byte.
+///
+/// The masks given are expected to be the result of _mm_movemask_epi8, where
+/// at least one of the masks is non-zero (i.e., indicates a match).
+fn reverse_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize {
+    debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);
+
+    reverse_pos(mask1 | mask2 | mask3)
+}
+
+/// Subtract `b` from `a` and return the difference. `a` should be greater than
+/// or equal to `b`.
+fn sub(a: *const u8, b: *const u8) -> usize {
+    debug_assert!(a >= b);
+    (a as usize) - (b as usize)
+}
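
The forward_pos/reverse_pos helpers near the end of the file turn a _mm_movemask_epi8 result into a byte offset using trailing_zeros and leading_zeros. The following is a small, self-contained check of that arithmetic; the free functions below merely mirror the helpers above, with VECTOR_SIZE fixed at 16, and are not part of the vendored file.

fn forward_pos(mask: i32) -> usize {
    // Lowest set bit = first matching byte in the 16-byte chunk.
    mask.trailing_zeros() as usize
}

fn reverse_pos(mask: i32) -> usize {
    // Highest set bit = last matching byte in the 16-byte chunk.
    16 - (mask as u16).leading_zeros() as usize - 1
}

fn main() {
    // If the needle matched at byte positions 3 and 9 of a chunk, then
    // _mm_movemask_epi8 yields a mask with exactly those two bits set.
    let mask = (1 << 3) | (1 << 9);
    assert_eq!(forward_pos(mask), 3);
    assert_eq!(reverse_pos(mask), 9);
}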
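
The long comment at the top of memchr describes the classic movemask-based SSE2 scan and also explains why the routine is marked #[target_feature(enable = "sse2")] and unsafe: it may only be called once SSE2 availability has been established. The sketch below shows the usual runtime-dispatch pattern around such a routine. It is a minimal illustration under stated assumptions, not part of the vendored file: it assumes this file is compiled as a module named `sse2`, and the `find_byte` wrapper name is hypothetical.

#[cfg(target_arch = "x86_64")]
fn find_byte(needle: u8, haystack: &[u8]) -> Option<usize> {
    if is_x86_feature_detected!("sse2") {
        // Safety: the runtime check above guarantees SSE2 is present, which is
        // the only precondition imposed by the target_feature attribute.
        unsafe { sse2::memchr(needle, haystack) }
    } else {
        // Portable fallback: a plain byte-at-a-time scan.
        haystack.iter().position(|&b| b == needle)
    }
}

On x86_64 the SSE2 check always succeeds, since SSE2 is part of the baseline ABI; the same dispatch pattern matters more for the AVX2 variant mentioned in the comment.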