diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:00:34 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 18:00:34 +0000 |
commit | 3f619478f796eddbba6e39502fe941b285dd97b1 (patch) | |
tree | e2c7b5777f728320e5b5542b6213fd3591ba51e2 /libmariadb/external/zlib/crc32_simd.c | |
parent | Initial commit. (diff) | |
download | mariadb-upstream.tar.xz mariadb-upstream.zip |
Adding upstream version 1:10.11.6.upstream/1%10.11.6upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'libmariadb/external/zlib/crc32_simd.c')
-rw-r--r-- | libmariadb/external/zlib/crc32_simd.c | 244 |
1 files changed, 244 insertions, 0 deletions
diff --git a/libmariadb/external/zlib/crc32_simd.c b/libmariadb/external/zlib/crc32_simd.c new file mode 100644 index 00000000..cc68d4fc --- /dev/null +++ b/libmariadb/external/zlib/crc32_simd.c @@ -0,0 +1,244 @@ +/* crc32_simd.c + * + * Copyright 2017 The Chromium Authors. All rights reserved. + * Use of this source code is governed by a BSD-style license that can be + * found in the Chromium source repository LICENSE file. + */ + + +#include "crc32_simd.h" + +#if defined(CRC32_SIMD_SSE42_PCLMUL) + +/* + * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer + * length must be at least 64, and a multiple of 16. Based on: + * + * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" + * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0 + */ + +#include <emmintrin.h> +#include <smmintrin.h> +#include <wmmintrin.h> + +uint32_t ZLIB_INTERNAL crc32_sse42_simd_( /* SSE4.2+PCLMUL */ + const unsigned char *buf, + z_size_t len, + uint32_t crc) +{ + /* + * Definitions of the bit-reflected domain constants k1,k2,k3, etc and + * the CRC32+Barrett polynomials given at the end of the paper. + */ + static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 }; + static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e }; + static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 }; + static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 }; + + __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; + + /* + * There's at least one block of 64. + */ + x1 = _mm_loadu_si128((__m128i *)(buf + 0x00)); + x2 = _mm_loadu_si128((__m128i *)(buf + 0x10)); + x3 = _mm_loadu_si128((__m128i *)(buf + 0x20)); + x4 = _mm_loadu_si128((__m128i *)(buf + 0x30)); + + x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc)); + + x0 = _mm_load_si128((__m128i *)k1k2); + + buf += 64; + len -= 64; + + /* + * Parallel fold blocks of 64, if any. + */ + while (len >= 64) + { + x5 = _mm_clmulepi64_si128(x1, x0, 0x00); + x6 = _mm_clmulepi64_si128(x2, x0, 0x00); + x7 = _mm_clmulepi64_si128(x3, x0, 0x00); + x8 = _mm_clmulepi64_si128(x4, x0, 0x00); + + x1 = _mm_clmulepi64_si128(x1, x0, 0x11); + x2 = _mm_clmulepi64_si128(x2, x0, 0x11); + x3 = _mm_clmulepi64_si128(x3, x0, 0x11); + x4 = _mm_clmulepi64_si128(x4, x0, 0x11); + + y5 = _mm_loadu_si128((__m128i *)(buf + 0x00)); + y6 = _mm_loadu_si128((__m128i *)(buf + 0x10)); + y7 = _mm_loadu_si128((__m128i *)(buf + 0x20)); + y8 = _mm_loadu_si128((__m128i *)(buf + 0x30)); + + x1 = _mm_xor_si128(x1, x5); + x2 = _mm_xor_si128(x2, x6); + x3 = _mm_xor_si128(x3, x7); + x4 = _mm_xor_si128(x4, x8); + + x1 = _mm_xor_si128(x1, y5); + x2 = _mm_xor_si128(x2, y6); + x3 = _mm_xor_si128(x3, y7); + x4 = _mm_xor_si128(x4, y8); + + buf += 64; + len -= 64; + } + + /* + * Fold into 128-bits. + */ + x0 = _mm_load_si128((__m128i *)k3k4); + + x5 = _mm_clmulepi64_si128(x1, x0, 0x00); + x1 = _mm_clmulepi64_si128(x1, x0, 0x11); + x1 = _mm_xor_si128(x1, x2); + x1 = _mm_xor_si128(x1, x5); + + x5 = _mm_clmulepi64_si128(x1, x0, 0x00); + x1 = _mm_clmulepi64_si128(x1, x0, 0x11); + x1 = _mm_xor_si128(x1, x3); + x1 = _mm_xor_si128(x1, x5); + + x5 = _mm_clmulepi64_si128(x1, x0, 0x00); + x1 = _mm_clmulepi64_si128(x1, x0, 0x11); + x1 = _mm_xor_si128(x1, x4); + x1 = _mm_xor_si128(x1, x5); + + /* + * Single fold blocks of 16, if any. + */ + while (len >= 16) + { + x2 = _mm_loadu_si128((__m128i *)buf); + + x5 = _mm_clmulepi64_si128(x1, x0, 0x00); + x1 = _mm_clmulepi64_si128(x1, x0, 0x11); + x1 = _mm_xor_si128(x1, x2); + x1 = _mm_xor_si128(x1, x5); + + buf += 16; + len -= 16; + } + + /* + * Fold 128-bits to 64-bits. + */ + x2 = _mm_clmulepi64_si128(x1, x0, 0x10); + x3 = _mm_setr_epi32(~0, 0, ~0, 0); + x1 = _mm_srli_si128(x1, 8); + x1 = _mm_xor_si128(x1, x2); + + x0 = _mm_loadl_epi64((__m128i*)k5k0); + + x2 = _mm_srli_si128(x1, 4); + x1 = _mm_and_si128(x1, x3); + x1 = _mm_clmulepi64_si128(x1, x0, 0x00); + x1 = _mm_xor_si128(x1, x2); + + /* + * Barret reduce to 32-bits. + */ + x0 = _mm_load_si128((__m128i*)poly); + + x2 = _mm_and_si128(x1, x3); + x2 = _mm_clmulepi64_si128(x2, x0, 0x10); + x2 = _mm_and_si128(x2, x3); + x2 = _mm_clmulepi64_si128(x2, x0, 0x00); + x1 = _mm_xor_si128(x1, x2); + + /* + * Return the crc32. + */ + return _mm_extract_epi32(x1, 1); +} + +#elif defined(CRC32_ARMV8_CRC32) + +/* CRC32 checksums using ARMv8-a crypto instructions. + * + * TODO: implement a version using the PMULL instruction. + */ + +#if defined(__clang__) +/* CRC32 intrinsics are #ifdef'ed out of arm_acle.h unless we build with an + * armv8 target, which is incompatible with ThinLTO optimizations on Android. + * (Namely, mixing and matching different module-level targets makes ThinLTO + * warn, and Android defaults to armv7-a. This restriction does not apply to + * function-level `target`s, however.) + * + * Since we only need four crc intrinsics, and since clang's implementation of + * those are just wrappers around compiler builtins, it's simplest to #define + * those builtins directly. If this #define list grows too much (or we depend on + * an intrinsic that isn't a trivial wrapper), we may have to find a better way + * to go about this. + * + * NOTE: clang currently complains that "'+soft-float-abi' is not a recognized + * feature for this target (ignoring feature)." This appears to be a harmless + * bug in clang. + */ +#define __crc32b __builtin_arm_crc32b +#define __crc32d __builtin_arm_crc32d +#define __crc32w __builtin_arm_crc32w +#define __crc32cw __builtin_arm_crc32cw + +#if defined(__aarch64__) +#define TARGET_ARMV8_WITH_CRC __attribute__((target("crc"))) +#else // !defined(__aarch64__) +#define TARGET_ARMV8_WITH_CRC __attribute__((target("armv8-a,crc"))) +#endif // defined(__aarch64__) + +#elif defined(__GNUC__) +/* For GCC, we are setting CRC extensions at module level, so ThinLTO is not + * allowed. We can just include arm_acle.h. + */ +#include <arm_acle.h> +#define TARGET_ARMV8_WITH_CRC +#else // !defined(__GNUC__) && !defined(_aarch64__) +#error ARM CRC32 SIMD extensions only supported for Clang and GCC +#endif + +TARGET_ARMV8_WITH_CRC +uint32_t ZLIB_INTERNAL armv8_crc32_little(unsigned long crc, + const unsigned char *buf, + z_size_t len) +{ + uint32_t c = (uint32_t) ~crc; + + while (len && ((uintptr_t)buf & 7)) { + c = __crc32b(c, *buf++); + --len; + } + + const uint64_t *buf8 = (const uint64_t *)buf; + + while (len >= 64) { + c = __crc32d(c, *buf8++); + c = __crc32d(c, *buf8++); + c = __crc32d(c, *buf8++); + c = __crc32d(c, *buf8++); + + c = __crc32d(c, *buf8++); + c = __crc32d(c, *buf8++); + c = __crc32d(c, *buf8++); + c = __crc32d(c, *buf8++); + len -= 64; + } + + while (len >= 8) { + c = __crc32d(c, *buf8++); + len -= 8; + } + + buf = (const unsigned char *)buf8; + + while (len--) { + c = __crc32b(c, *buf++); + } + + return ~c; +} + +#endif
\ No newline at end of file |