Diffstat (limited to 'vendor/aho-corasick/src/packed/vector.rs')
-rw-r--r-- | vendor/aho-corasick/src/packed/vector.rs | 181 |
1 file changed, 181 insertions, 0 deletions
diff --git a/vendor/aho-corasick/src/packed/vector.rs b/vendor/aho-corasick/src/packed/vector.rs
new file mode 100644
index 000000000..ca6c2b0f9
--- /dev/null
+++ b/vendor/aho-corasick/src/packed/vector.rs
@@ -0,0 +1,181 @@
+// This file contains a set of fairly generic utility functions for working
+// with SIMD vectors.
+//
+// SAFETY: All of the routines below are unsafe to call because they assume
+// the necessary CPU target features in order to use particular vendor
+// intrinsics. Calling these routines when the underlying CPU does not support
+// the appropriate target features is NOT safe. Callers must ensure this
+// themselves.
+//
+// Note that it may not look like this safety invariant is being upheld when
+// these routines are called. Namely, the CPU feature check is typically quite
+// far away from where these routines are used. Instead, we rely on the fact
+// that certain types serve as a guaranteed receipt that the pertinent target
+// features are enabled. For example, the only way TeddySlim3Mask256 can be
+// constructed is if the AVX2 CPU feature is available. Thus, any code running
+// inside of TeddySlim3Mask256 can use any of the functions below without any
+// additional checks: its very existence *is* the check.
+
+use std::arch::x86_64::*;
+
+/// Shift `a` to the left by two bytes (removing its two most significant
+/// bytes), and concatenate it with the two most significant bytes of `b`.
+#[target_feature(enable = "avx2")]
+pub unsafe fn alignr256_14(a: __m256i, b: __m256i) -> __m256i {
+    // Credit goes to jneem for figuring this out:
+    // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184
+    //
+    // TL;DR avx2's PALIGNR instruction is actually just two 128-bit PALIGNR
+    // instructions, which is not what we want, so we need to do some extra
+    // shuffling.
+
+    // This permute gives us the low 16 bytes of a concatenated with the high
+    // 16 bytes of b, in order of most significant to least significant. So
+    // `v = a[15:0] b[31:16]`.
+    let v = _mm256_permute2x128_si256(b, a, 0x21);
+    // This effectively does the following (where we deal in terms of
+    // byte-indexing and byte-shifting, and use inclusive ranges):
+    //
+    //   ret[15:0]  := ((a[15:0] << 16) | v[15:0]) >> 14
+    //              =  ((a[15:0] << 16) | b[31:16]) >> 14
+    //   ret[31:16] := ((a[31:16] << 16) | v[31:16]) >> 14
+    //              =  ((a[31:16] << 16) | a[15:0]) >> 14
+    //
+    // Which therefore results in:
+    //
+    //   ret[31:0] := a[29:16] a[15:14] a[13:0] b[31:30]
+    //
+    // The end result is that we've effectively done this:
+    //
+    //   (a << 2) | (b >> 30)
+    //
+    // When `A` and `B` are strings---where the beginning of the string is in
+    // the least significant bits---this is equivalent to the semantic
+    // operation:
+    //
+    //   (A >> 2) | (B << 30)
+    //
+    // The reversal is due to the fact that x86 is little-endian.
+    _mm256_alignr_epi8(a, v, 14)
+}
+
+/// Shift `a` to the left by one byte (removing its most significant byte),
+/// and concatenate it with the most significant byte of `b`.
+#[target_feature(enable = "avx2")]
+pub unsafe fn alignr256_15(a: __m256i, b: __m256i) -> __m256i {
+    // For explanation, see alignr256_14.
+    let v = _mm256_permute2x128_si256(b, a, 0x21);
+    _mm256_alignr_epi8(a, v, 15)
+}
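+// Editorial sketch, not part of the original commit: a concrete check of the
+// byte movement described above, under the assumption that AVX2 is detected
+// at runtime. Treating index 0 as the least significant byte, the result of
+// `alignr256_14(a, b)` should be the two most significant bytes of `b`
+// followed by the low 30 bytes of `a`. The test name is hypothetical.
+#[test]
+fn sketch_alignr256_14() {
+    if !is_x86_feature_detected!("avx2") {
+        return;
+    }
+    let (mut a_bytes, mut b_bytes) = ([0u8; 32], [0u8; 32]);
+    for i in 0..32 {
+        a_bytes[i] = i as u8;
+        b_bytes[i] = 32 + i as u8;
+    }
+    unsafe {
+        let a = _mm256_loadu_si256(a_bytes.as_ptr() as *const __m256i);
+        let b = _mm256_loadu_si256(b_bytes.as_ptr() as *const __m256i);
+        let mut got = [0u8; 32];
+        _mm256_storeu_si256(got.as_mut_ptr() as *mut __m256i, alignr256_14(a, b));
+        // b's two most significant bytes (62, 63), then a[0..30].
+        assert_eq!(&got[..2], [62, 63]);
+        assert_eq!(&got[2..], &a_bytes[..30]);
+    }
+}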
+#[target_feature(enable = "ssse3")] +pub unsafe fn unpack64x128(a: __m128i) -> [u64; 2] { + [ + _mm_cvtsi128_si64(a) as u64, + _mm_cvtsi128_si64(_mm_srli_si128(a, 8)) as u64, + ] +} + +/// Unpack the given 256-bit vector into its 64-bit components. The first +/// element of the array returned corresponds to the least significant 64-bit +/// lane in `a`. +#[target_feature(enable = "avx2")] +pub unsafe fn unpack64x256(a: __m256i) -> [u64; 4] { + // Using transmute here is precisely equivalent, but actually slower. It's + // not quite clear why. + let lo = _mm256_extracti128_si256(a, 0); + let hi = _mm256_extracti128_si256(a, 1); + [ + _mm_cvtsi128_si64(lo) as u64, + _mm_cvtsi128_si64(_mm_srli_si128(lo, 8)) as u64, + _mm_cvtsi128_si64(hi) as u64, + _mm_cvtsi128_si64(_mm_srli_si128(hi, 8)) as u64, + ] +} + +/// Unpack the low 128-bits of `a` and `b`, and return them as 4 64-bit +/// integers. +/// +/// More precisely, if a = a4 a3 a2 a1 and b = b4 b3 b2 b1, where each element +/// is a 64-bit integer and a1/b1 correspond to the least significant 64 bits, +/// then the return value is `b2 b1 a2 a1`. +#[target_feature(enable = "avx2")] +pub unsafe fn unpacklo64x256(a: __m256i, b: __m256i) -> [u64; 4] { + let lo = _mm256_castsi256_si128(a); + let hi = _mm256_castsi256_si128(b); + [ + _mm_cvtsi128_si64(lo) as u64, + _mm_cvtsi128_si64(_mm_srli_si128(lo, 8)) as u64, + _mm_cvtsi128_si64(hi) as u64, + _mm_cvtsi128_si64(_mm_srli_si128(hi, 8)) as u64, + ] +} + +/// Returns true if and only if all bits in the given 128-bit vector are 0. +#[target_feature(enable = "ssse3")] +pub unsafe fn is_all_zeroes128(a: __m128i) -> bool { + let cmp = _mm_cmpeq_epi8(a, zeroes128()); + _mm_movemask_epi8(cmp) as u32 == 0xFFFF +} + +/// Returns true if and only if all bits in the given 256-bit vector are 0. +#[target_feature(enable = "avx2")] +pub unsafe fn is_all_zeroes256(a: __m256i) -> bool { + let cmp = _mm256_cmpeq_epi8(a, zeroes256()); + _mm256_movemask_epi8(cmp) as u32 == 0xFFFFFFFF +} + +/// Load a 128-bit vector from slice at the given position. The slice does +/// not need to be unaligned. +/// +/// Since this code assumes little-endian (there is no big-endian x86), the +/// bytes starting in `slice[at..]` will be at the least significant bits of +/// the returned vector. This is important for the surrounding code, since for +/// example, shifting the resulting vector right is equivalent to logically +/// shifting the bytes in `slice` left. +#[target_feature(enable = "sse2")] +pub unsafe fn loadu128(slice: &[u8], at: usize) -> __m128i { + let ptr = slice.get_unchecked(at..).as_ptr(); + _mm_loadu_si128(ptr as *const u8 as *const __m128i) +} + +/// Load a 256-bit vector from slice at the given position. The slice does +/// not need to be unaligned. +/// +/// Since this code assumes little-endian (there is no big-endian x86), the +/// bytes starting in `slice[at..]` will be at the least significant bits of +/// the returned vector. This is important for the surrounding code, since for +/// example, shifting the resulting vector right is equivalent to logically +/// shifting the bytes in `slice` left. +#[target_feature(enable = "avx2")] +pub unsafe fn loadu256(slice: &[u8], at: usize) -> __m256i { + let ptr = slice.get_unchecked(at..).as_ptr(); + _mm256_loadu_si256(ptr as *const u8 as *const __m256i) +} + +/// Returns a 128-bit vector with all bits set to 0. +#[target_feature(enable = "sse2")] +pub unsafe fn zeroes128() -> __m128i { + _mm_set1_epi8(0) +} + +/// Returns a 256-bit vector with all bits set to 0. 
+#[target_feature(enable = "avx2")] +pub unsafe fn zeroes256() -> __m256i { + _mm256_set1_epi8(0) +} + +/// Returns a 128-bit vector with all bits set to 1. +#[target_feature(enable = "sse2")] +pub unsafe fn ones128() -> __m128i { + _mm_set1_epi8(0xFF as u8 as i8) +} + +/// Returns a 256-bit vector with all bits set to 1. +#[target_feature(enable = "avx2")] +pub unsafe fn ones256() -> __m256i { + _mm256_set1_epi8(0xFF as u8 as i8) +} |