diff options
Diffstat (limited to 'mysys/crc32')
-rw-r--r-- | mysys/crc32/crc32_arm64.c | 338 | ||||
-rw-r--r-- | mysys/crc32/crc32_x86.c | 16 | ||||
-rw-r--r-- | mysys/crc32/crc32c.cc | 261 | ||||
-rw-r--r-- | mysys/crc32/crc32c_amd64.cc | 8 | ||||
-rw-r--r-- | mysys/crc32/crc32c_ppc.h | 3 | ||||
-rw-r--r-- | mysys/crc32/crc32c_x86.cc | 457 | ||||
-rw-r--r-- | mysys/crc32/crc_ppc64.h | 7 |
7 files changed, 770 insertions, 320 deletions
diff --git a/mysys/crc32/crc32_arm64.c b/mysys/crc32/crc32_arm64.c index 0e70c218..6588606a 100644 --- a/mysys/crc32/crc32_arm64.c +++ b/mysys/crc32/crc32_arm64.c @@ -1,13 +1,18 @@ #include <my_global.h> #include <string.h> #include <stdint.h> +#include <stddef.h> -static int pmull_supported; +typedef unsigned (*my_crc32_t)(unsigned, const void *, size_t); -#if defined(HAVE_ARMV8_CRC) +#ifdef HAVE_ARMV8_CRC -#if defined(__APPLE__) -#include <sys/sysctl.h> +# ifdef HAVE_ARMV8_CRYPTO +static unsigned crc32c_aarch64_pmull(unsigned, const void *, size_t); +# endif + +# ifdef __APPLE__ +# include <sys/sysctl.h> int crc32_aarch64_available(void) { @@ -18,17 +23,17 @@ int crc32_aarch64_available(void) return ret; } -const char *crc32c_aarch64_available(void) +my_crc32_t crc32c_aarch64_available(void) { - if (crc32_aarch64_available() == 0) - return NULL; - pmull_supported = 1; - return "Using ARMv8 crc32 + pmull instructions"; +# ifdef HAVE_ARMV8_CRYPTO + if (crc32_aarch64_available()) + return crc32c_aarch64_pmull; +# endif + return NULL; } - -#else -#include <sys/auxv.h> -#if defined(__FreeBSD__) +# else +# include <sys/auxv.h> +# ifdef __FreeBSD__ static unsigned long getauxval(unsigned int key) { unsigned long val; @@ -36,17 +41,17 @@ static unsigned long getauxval(unsigned int key) return 0ul; return val; } -#else -# include <asm/hwcap.h> -#endif +# else +# include <asm/hwcap.h> +# endif -#ifndef HWCAP_CRC32 -# define HWCAP_CRC32 (1 << 7) -#endif +# ifndef HWCAP_CRC32 +# define HWCAP_CRC32 (1 << 7) +# endif -#ifndef HWCAP_PMULL -# define HWCAP_PMULL (1 << 4) -#endif +# ifndef HWCAP_PMULL +# define HWCAP_PMULL (1 << 4) +# endif /* ARM made crc32 default from ARMv8.1 but optional in ARMv8A * Runtime check API. @@ -56,22 +61,37 @@ int crc32_aarch64_available(void) unsigned long auxv= getauxval(AT_HWCAP); return (auxv & HWCAP_CRC32) != 0; } +# endif + +# ifndef __APPLE__ +static unsigned crc32c_aarch64(unsigned, const void *, size_t); -const char *crc32c_aarch64_available(void) +my_crc32_t crc32c_aarch64_available(void) { unsigned long auxv= getauxval(AT_HWCAP); - if (!(auxv & HWCAP_CRC32)) return NULL; +# ifdef HAVE_ARMV8_CRYPTO + /* Raspberry Pi 4 supports crc32 but doesn't support pmull (MDEV-23030). */ + if (auxv & HWCAP_PMULL) + return crc32c_aarch64_pmull; +# endif + return crc32c_aarch64; +} +# endif - pmull_supported= (auxv & HWCAP_PMULL) != 0; - if (pmull_supported) +const char *crc32c_aarch64_impl(my_crc32_t c) +{ +# ifdef HAVE_ARMV8_CRYPTO + if (c == crc32c_aarch64_pmull) return "Using ARMv8 crc32 + pmull instructions"; - else +# endif +# ifndef __APPLE__ + if (c == crc32c_aarch64) return "Using ARMv8 crc32 instructions"; +# endif + return NULL; } - -#endif /* __APPLE__ */ #endif /* HAVE_ARMV8_CRC */ #ifndef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS @@ -157,131 +177,14 @@ asm(".arch_extension crypto"); PREF4X64L2(buffer,(PREF_OFFSET), 8) \ PREF4X64L2(buffer,(PREF_OFFSET), 12) -uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len) +#ifndef __APPLE__ +static unsigned crc32c_aarch64(unsigned crc, const void *buf, size_t len) { - uint32_t crc0, crc1, crc2; int64_t length= (int64_t)len; + const unsigned char *buffer= buf; crc^= 0xffffffff; - /* Pmull runtime check here. - * Raspberry Pi 4 supports crc32 but doesn't support pmull (MDEV-23030). - * - * Consider the condition that the target platform does support hardware crc32 - * but not support PMULL. In this condition, it should leverage the aarch64 - * crc32 instruction (__crc32c) and just only skip parallel computation (pmull/vmull) - * rather than skip all hardware crc32 instruction of computation. - */ - if (pmull_supported) - { -/* The following Macro (HAVE_ARMV8_CRYPTO) is used for compiling check */ -#ifdef HAVE_ARMV8_CRYPTO - -/* Crypto extension Support - * Parallel computation with 1024 Bytes (per block) - * Intrinsics Support - */ -# ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS - const poly64_t k1= 0xe417f38a, k2= 0x8f158014; - uint64_t t0, t1; - - /* Process per block size of 1024 Bytes - * A block size = 8 + 42*3*sizeof(uint64_t) + 8 - */ - while ((length-= 1024) >= 0) - { - /* Prefetch 3*1024 data for avoiding L2 cache miss */ - PREF1KL2(buffer, 1024*3); - /* Do first 8 bytes here for better pipelining */ - crc0= __crc32cd(crc, *(const uint64_t *)buffer); - crc1= 0; - crc2= 0; - buffer+= sizeof(uint64_t); - - /* Process block inline - * Process crc0 last to avoid dependency with above - */ - CRC32C7X3X8(buffer, 0); - CRC32C7X3X8(buffer, 1); - CRC32C7X3X8(buffer, 2); - CRC32C7X3X8(buffer, 3); - CRC32C7X3X8(buffer, 4); - CRC32C7X3X8(buffer, 5); - - buffer+= 42*3*sizeof(uint64_t); - /* Prefetch data for following block to avoid L1 cache miss */ - PREF1KL1(buffer, 1024); - - /* Last 8 bytes - * Merge crc0 and crc1 into crc2 - * crc1 multiply by K2 - * crc0 multiply by K1 - */ - t1= (uint64_t)vmull_p64(crc1, k2); - t0= (uint64_t)vmull_p64(crc0, k1); - crc= __crc32cd(crc2, *(const uint64_t *)buffer); - crc1= __crc32cd(0, t1); - crc^= crc1; - crc0= __crc32cd(0, t0); - crc^= crc0; - - buffer+= sizeof(uint64_t); - } - -# else /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */ - - /*No intrinsics*/ - __asm__("mov x16, #0xf38a \n\t" - "movk x16, #0xe417, lsl 16 \n\t" - "mov v1.2d[0], x16 \n\t" - "mov x16, #0x8014 \n\t" - "movk x16, #0x8f15, lsl 16 \n\t" - "mov v0.2d[0], x16 \n\t" - :::"x16"); - - while ((length-= 1024) >= 0) - { - PREF1KL2(buffer, 1024*3); - __asm__("crc32cx %w[c0], %w[c], %x[v]\n\t" - :[c0]"=r"(crc0):[c]"r"(crc), [v]"r"(*(const uint64_t *)buffer):); - crc1= 0; - crc2= 0; - buffer+= sizeof(uint64_t); - - CRC32C7X3X8(buffer, 0); - CRC32C7X3X8(buffer, 1); - CRC32C7X3X8(buffer, 2); - CRC32C7X3X8(buffer, 3); - CRC32C7X3X8(buffer, 4); - CRC32C7X3X8(buffer, 5); - - buffer+= 42*3*sizeof(uint64_t); - PREF1KL1(buffer, 1024); - __asm__("mov v2.2d[0], %x[c1] \n\t" - "pmull v2.1q, v2.1d, v0.1d \n\t" - "mov v3.2d[0], %x[c0] \n\t" - "pmull v3.1q, v3.1d, v1.1d \n\t" - "crc32cx %w[c], %w[c2], %x[v] \n\t" - "mov %x[c1], v2.2d[0] \n\t" - "crc32cx %w[c1], wzr, %x[c1] \n\t" - "eor %w[c], %w[c], %w[c1] \n\t" - "mov %x[c0], v3.2d[0] \n\t" - "crc32cx %w[c0], wzr, %x[c0] \n\t" - "eor %w[c], %w[c], %w[c0] \n\t" - :[c1]"+r"(crc1), [c0]"+r"(crc0), [c2]"+r"(crc2), [c]"+r"(crc) - :[v]"r"(*((const uint64_t *)buffer))); - buffer+= sizeof(uint64_t); - } -# endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */ - - /* Done if Input data size is aligned with 1024 */ - if (!(length+= 1024)) - return ~crc; - -#endif /* HAVE_ARMV8_CRYPTO */ - - } // end if pmull_supported - while ((length-= sizeof(uint64_t)) >= 0) { CRC32CX(crc, *(uint64_t *)buffer); @@ -306,6 +209,143 @@ uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len) return ~crc; } +#endif + +#ifdef HAVE_ARMV8_CRYPTO +static unsigned crc32c_aarch64_pmull(unsigned crc, const void *buf, size_t len) +{ + int64_t length= (int64_t)len; + const unsigned char *buffer= buf; + + crc^= 0xffffffff; + + /* Crypto extension Support + * Parallel computation with 1024 Bytes (per block) + * Intrinsics Support + */ +# ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS + /* Process per block size of 1024 Bytes + * A block size = 8 + 42*3*sizeof(uint64_t) + 8 + */ + for (const poly64_t k1= 0xe417f38a, k2= 0x8f158014; (length-= 1024) >= 0; ) + { + uint32_t crc0, crc1, crc2; + uint64_t t0, t1; + /* Prefetch 3*1024 data for avoiding L2 cache miss */ + PREF1KL2(buffer, 1024*3); + /* Do first 8 bytes here for better pipelining */ + crc0= __crc32cd(crc, *(const uint64_t *)buffer); + crc1= 0; + crc2= 0; + buffer+= sizeof(uint64_t); + + /* Process block inline + * Process crc0 last to avoid dependency with above + */ + CRC32C7X3X8(buffer, 0); + CRC32C7X3X8(buffer, 1); + CRC32C7X3X8(buffer, 2); + CRC32C7X3X8(buffer, 3); + CRC32C7X3X8(buffer, 4); + CRC32C7X3X8(buffer, 5); + + buffer+= 42*3*sizeof(uint64_t); + /* Prefetch data for following block to avoid L1 cache miss */ + PREF1KL1(buffer, 1024); + + /* Last 8 bytes + * Merge crc0 and crc1 into crc2 + * crc1 multiply by K2 + * crc0 multiply by K1 + */ + t1= (uint64_t)vmull_p64(crc1, k2); + t0= (uint64_t)vmull_p64(crc0, k1); + crc= __crc32cd(crc2, *(const uint64_t *)buffer); + crc1= __crc32cd(0, t1); + crc^= crc1; + crc0= __crc32cd(0, t0); + crc^= crc0; + + buffer+= sizeof(uint64_t); + } + +# else /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */ + /*No intrinsics*/ + __asm__("mov x16, #0xf38a \n\t" + "movk x16, #0xe417, lsl 16 \n\t" + "mov v1.2d[0], x16 \n\t" + "mov x16, #0x8014 \n\t" + "movk x16, #0x8f15, lsl 16 \n\t" + "mov v0.2d[0], x16 \n\t" + :::"x16"); + + while ((length-= 1024) >= 0) + { + uint32_t crc0, crc1, crc2; + + PREF1KL2(buffer, 1024*3); + __asm__("crc32cx %w[c0], %w[c], %x[v]\n\t" + :[c0]"=r"(crc0):[c]"r"(crc), [v]"r"(*(const uint64_t *)buffer):); + crc1= 0; + crc2= 0; + buffer+= sizeof(uint64_t); + + CRC32C7X3X8(buffer, 0); + CRC32C7X3X8(buffer, 1); + CRC32C7X3X8(buffer, 2); + CRC32C7X3X8(buffer, 3); + CRC32C7X3X8(buffer, 4); + CRC32C7X3X8(buffer, 5); + + buffer+= 42*3*sizeof(uint64_t); + PREF1KL1(buffer, 1024); + __asm__("mov v2.2d[0], %x[c1] \n\t" + "pmull v2.1q, v2.1d, v0.1d \n\t" + "mov v3.2d[0], %x[c0] \n\t" + "pmull v3.1q, v3.1d, v1.1d \n\t" + "crc32cx %w[c], %w[c2], %x[v] \n\t" + "mov %x[c1], v2.2d[0] \n\t" + "crc32cx %w[c1], wzr, %x[c1] \n\t" + "eor %w[c], %w[c], %w[c1] \n\t" + "mov %x[c0], v3.2d[0] \n\t" + "crc32cx %w[c0], wzr, %x[c0] \n\t" + "eor %w[c], %w[c], %w[c0] \n\t" + :[c1]"+r"(crc1), [c0]"+r"(crc0), [c2]"+r"(crc2), [c]"+r"(crc) + :[v]"r"(*((const uint64_t *)buffer))); + buffer+= sizeof(uint64_t); + } +# endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */ + + /* Done if Input data size is aligned with 1024 */ + length+= 1024; + if (length) + { + while ((length-= sizeof(uint64_t)) >= 0) + { + CRC32CX(crc, *(uint64_t *)buffer); + buffer+= sizeof(uint64_t); + } + + /* The following is more efficient than the straight loop */ + if (length & sizeof(uint32_t)) + { + CRC32CW(crc, *(uint32_t *)buffer); + buffer+= sizeof(uint32_t); + } + + if (length & sizeof(uint16_t)) + { + CRC32CH(crc, *(uint16_t *)buffer); + buffer+= sizeof(uint16_t); + } + + if (length & sizeof(uint8_t)) + CRC32CB(crc, *buffer); + } + + return ~crc; +} +#endif /* HAVE_ARMV8_CRYPTO */ /* There are multiple approaches to calculate crc. Approach-1: Process 8 bytes then 4 bytes then 2 bytes and then 1 bytes diff --git a/mysys/crc32/crc32_x86.c b/mysys/crc32/crc32_x86.c index f077399c..ab2522d6 100644 --- a/mysys/crc32/crc32_x86.c +++ b/mysys/crc32/crc32_x86.c @@ -56,11 +56,16 @@ #include <stddef.h> #ifdef __GNUC__ -#include <x86intrin.h> +# include <emmintrin.h> +# include <smmintrin.h> +# include <tmmintrin.h> +# include <wmmintrin.h> +# define USE_PCLMUL __attribute__((target("sse4.2,pclmul"))) #elif defined(_MSC_VER) -#include <intrin.h> +# include <intrin.h> +# define USE_PCLMUL /* nothing */ #else -#error "unknown compiler" +# error "unknown compiler" #endif /** @@ -71,6 +76,7 @@ * * @return \a reg << (\a num * 8) */ +USE_PCLMUL static inline __m128i xmm_shift_left(__m128i reg, const unsigned int num) { static const MY_ALIGNED(16) uint8_t crc_xmm_shift_tab[48]= { @@ -111,6 +117,7 @@ struct crcr_pclmulqdq_ctx * * @return New 16 byte folded data */ +USE_PCLMUL static inline __m128i crcr32_folding_round(const __m128i data_block, const __m128i precomp, const __m128i fold) { @@ -128,6 +135,7 @@ static inline __m128i crcr32_folding_round(const __m128i data_block, * * @return data reduced to 64 bits */ +USE_PCLMUL static inline __m128i crcr32_reduce_128_to_64(__m128i data128, const __m128i precomp) { __m128i tmp0, tmp1, tmp2; @@ -152,6 +160,7 @@ static inline __m128i crcr32_reduce_128_to_64(__m128i data128, const __m128i pre * * @return data reduced to 32 bits */ +USE_PCLMUL static inline uint32_t crcr32_reduce_64_to_32(__m128i data64, const __m128i precomp) { static const MY_ALIGNED(16) uint32_t mask1[4]= { @@ -188,6 +197,7 @@ static inline uint32_t crcr32_reduce_64_to_32(__m128i data64, const __m128i prec * * @return CRC for given \a data block (32 bits wide). */ +USE_PCLMUL static inline uint32_t crcr32_calc_pclmulqdq(const uint8_t *data, uint32_t data_len, uint32_t crc, const struct crcr_pclmulqdq_ctx *params) diff --git a/mysys/crc32/crc32c.cc b/mysys/crc32/crc32c.cc index 2bec041e..32a45478 100644 --- a/mysys/crc32/crc32c.cc +++ b/mysys/crc32/crc32c.cc @@ -19,52 +19,23 @@ #include <stddef.h> #include <stdint.h> #include <my_global.h> -#include <my_byteorder.h> -static inline uint32_t DecodeFixed32(const char *ptr) -{ - return uint4korr(ptr); -} - -#include <stdint.h> -#ifdef _MSC_VER -#include <intrin.h> -#endif - -#ifdef HAVE_SSE42 -# ifdef __GNUC__ -# include <cpuid.h> -# if __GNUC__ < 5 && !defined __clang__ -/* the headers do not really work in GCC before version 5 */ -# define _mm_crc32_u8(crc,data) __builtin_ia32_crc32qi(crc,data) -# define _mm_crc32_u32(crc,data) __builtin_ia32_crc32si(crc,data) -# define _mm_crc32_u64(crc,data) __builtin_ia32_crc32di(crc,data) -# else -# include <nmmintrin.h> -# endif -# define USE_SSE42 __attribute__((target("sse4.2"))) -# else -# define USE_SSE42 /* nothing */ -# endif -#endif - #ifdef __powerpc64__ -#include "crc32c_ppc.h" - -#if __linux__ -#include <sys/auxv.h> +# include "crc32c_ppc.h" +# ifdef __linux__ +# include <sys/auxv.h> -#ifndef PPC_FEATURE2_VEC_CRYPTO -#define PPC_FEATURE2_VEC_CRYPTO 0x02000000 -#endif +# ifndef PPC_FEATURE2_VEC_CRYPTO +# define PPC_FEATURE2_VEC_CRYPTO 0x02000000 +# endif -#ifndef AT_HWCAP2 -#define AT_HWCAP2 26 +# ifndef AT_HWCAP2 +# define AT_HWCAP2 26 +# endif +# endif #endif -#endif /* __linux__ */ - -#endif +typedef unsigned (*my_crc32_t)(unsigned, const void *, size_t); namespace mysys_namespace { namespace crc32c { @@ -75,6 +46,7 @@ static int arch_ppc_crc32 = 0; #endif /* __powerpc64__ */ #endif +alignas(CPU_LEVEL1_DCACHE_LINESIZE) static const uint32_t table0_[256] = { 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb, @@ -341,8 +313,9 @@ static const uint32_t table3_[256] = { }; // Used to fetch a naturally-aligned 32-bit word in little endian byte-order -static inline uint32_t LE_LOAD32(const uint8_t *p) { - return DecodeFixed32(reinterpret_cast<const char*>(p)); +static inline uint32_t LE_LOAD32(const uint8_t *p) +{ + return uint4korr(reinterpret_cast<const char*>(p)); } static inline void Slow_CRC32(uint64_t* l, uint8_t const **p) @@ -362,10 +335,7 @@ static inline void Slow_CRC32(uint64_t* l, uint8_t const **p) table0_[c >> 24]; } -#ifdef ALIGN #undef ALIGN -#endif - // Align n to (1 << m) byte boundary #define ALIGN(n, m) ((n + ((1 << m) - 1)) & ~((1 << m) - 1)) @@ -374,70 +344,30 @@ static inline void Slow_CRC32(uint64_t* l, uint8_t const **p) l = table0_[c] ^ (l >> 8); \ } while (0) -static uint32_t crc32c_slow(uint32_t crc, const char* buf, size_t size) -{ - const uint8_t *p = reinterpret_cast<const uint8_t *>(buf); - const uint8_t *e = p + size; - uint64_t l = crc ^ 0xffffffffu; - - // Point x at first 16-byte aligned byte in string. This might be - // just past the end of the string. - const uintptr_t pval = reinterpret_cast<uintptr_t>(p); - const uint8_t* x = reinterpret_cast<const uint8_t*>(ALIGN(pval, 4)); - if (x <= e) - // Process bytes until finished or p is 16-byte aligned - while (p != x) - STEP1; - // Process bytes 16 at a time - while ((e-p) >= 16) - { - Slow_CRC32(&l, &p); - Slow_CRC32(&l, &p); - } - // Process bytes 8 at a time - while ((e-p) >= 8) - Slow_CRC32(&l, &p); - // Process the last few bytes - while (p != e) - STEP1; - return static_cast<uint32_t>(l ^ 0xffffffffu); -} - -#if defined HAVE_POWER8 -#elif defined HAVE_ARMV8_CRC -#elif defined HAVE_SSE42 -constexpr uint32_t cpuid_ecx_SSE42= 1U << 20; -constexpr uint32_t cpuid_ecx_SSE42_AND_PCLMUL= cpuid_ecx_SSE42 | 1U<<1; - -static uint32_t cpuid_ecx() -{ -#ifdef __GNUC__ - uint32_t reax= 0, rebx= 0, recx= 0, redx= 0; - __cpuid(1, reax, rebx, recx, redx); - return recx; -#elif defined _MSC_VER - int regs[4]; - __cpuid(regs, 1); - return regs[2]; -#else -# error "unknown compiler" +#undef USE_SSE42 +#if defined _MSC_VER && (defined _M_X64 || defined _M_IX86) +# include <intrin.h> +# include <immintrin.h> +# define USE_SSE42 /* nothing */ +#elif defined __GNUC__ && (defined __i386__||defined __x86_64__) +# if __GNUC__ < 5 && !defined __clang_major__ +/* the headers do not really work in GCC before version 5 */ +# define _mm_crc32_u8(crc,data) __builtin_ia32_crc32qi(crc,data) +# define _mm_crc32_u32(crc,data) __builtin_ia32_crc32si(crc,data) +# define _mm_crc32_u64(crc,data) __builtin_ia32_crc32di(crc,data) +# else +# include <nmmintrin.h> +# endif +# define USE_SSE42 __attribute__((target("sse4.2"))) #endif -} - -extern "C" int crc32_pclmul_enabled(void) -{ - return !(~cpuid_ecx() & cpuid_ecx_SSE42_AND_PCLMUL); -} - -#if SIZEOF_SIZE_T == 8 -extern "C" uint32_t crc32c_3way(uint32_t crc, const char *buf, size_t len); -USE_SSE42 +#ifdef USE_SSE42 +# if SIZEOF_SIZE_T == 8 static inline uint64_t LE_LOAD64(const uint8_t *ptr) { return uint8korr(reinterpret_cast<const char*>(ptr)); } -#endif +# endif USE_SSE42 static inline void Fast_CRC32(uint64_t* l, uint8_t const **p) @@ -453,10 +383,11 @@ static inline void Fast_CRC32(uint64_t* l, uint8_t const **p) # endif } +extern "C" USE_SSE42 -static uint32_t crc32c_sse42(uint32_t crc, const char* buf, size_t size) +unsigned crc32c_sse42(unsigned crc, const void* buf, size_t size) { - const uint8_t *p = reinterpret_cast<const uint8_t *>(buf); + const uint8_t *p = static_cast<const uint8_t *>(buf); const uint8_t *e = p + size; uint64_t l = crc ^ 0xffffffffu; @@ -484,107 +415,111 @@ static uint32_t crc32c_sse42(uint32_t crc, const char* buf, size_t size) } #endif -typedef uint32_t (*Function)(uint32_t, const char*, size_t); +static unsigned crc32c_slow(unsigned crc, const void* buf, size_t size) +{ + const uint8_t *p = static_cast<const uint8_t *>(buf); + const uint8_t *e = p + size; + uint64_t l = crc ^ 0xffffffffu; -#if defined(HAVE_POWER8) && defined(HAS_ALTIVEC) -uint32_t ExtendPPCImpl(uint32_t crc, const char *buf, size_t size) { - return crc32c_ppc(crc, (const unsigned char *)buf, size); + // Point x at first 16-byte aligned byte in string. This might be + // just past the end of the string. + const uintptr_t pval = reinterpret_cast<uintptr_t>(p); + const uint8_t* x = reinterpret_cast<const uint8_t*>(ALIGN(pval, 4)); + if (x <= e) + // Process bytes until finished or p is 16-byte aligned + while (p != x) + STEP1; + // Process bytes 16 at a time + while ((e-p) >= 16) + { + Slow_CRC32(&l, &p); + Slow_CRC32(&l, &p); + } + // Process bytes 8 at a time + while ((e-p) >= 8) + Slow_CRC32(&l, &p); + // Process the last few bytes + while (p != e) + STEP1; + return static_cast<uint32_t>(l ^ 0xffffffffu); } -#if __linux__ +#if defined(HAVE_POWER8) && defined(HAS_ALTIVEC) +# ifdef __linux__ static int arch_ppc_probe(void) { arch_ppc_crc32 = 0; -#if defined(__powerpc64__) +# if defined(__powerpc64__) if (getauxval(AT_HWCAP2) & PPC_FEATURE2_VEC_CRYPTO) arch_ppc_crc32 = 1; -#endif /* __powerpc64__ */ +# endif /* __powerpc64__ */ return arch_ppc_crc32; } -#elif __FreeBSD_version >= 1200000 -#include <machine/cpu.h> -#include <sys/auxv.h> -#include <sys/elf_common.h> +# elif defined __FreeBSD_version && __FreeBSD_version >= 1200000 +# include <machine/cpu.h> +# include <sys/auxv.h> +# include <sys/elf_common.h> static int arch_ppc_probe(void) { unsigned long cpufeatures; arch_ppc_crc32 = 0; -#if defined(__powerpc64__) +# if defined(__powerpc64__) elf_aux_info(AT_HWCAP2, &cpufeatures, sizeof(cpufeatures)); if (cpufeatures & PPC_FEATURE2_HAS_VEC_CRYPTO) arch_ppc_crc32 = 1; -#endif /* __powerpc64__ */ +# endif /* __powerpc64__ */ return arch_ppc_crc32; } -#elif defined(_AIX) || defined(__OpenBSD__) +# elif defined(_AIX) || defined(__OpenBSD__) static int arch_ppc_probe(void) { arch_ppc_crc32 = 0; -#if defined(__powerpc64__) +# if defined(__powerpc64__) // AIX 7.1+/OpenBSD has vector crypto features on all POWER 8+ arch_ppc_crc32 = 1; -#endif /* __powerpc64__ */ +# endif /* __powerpc64__ */ return arch_ppc_crc32; } -#endif // __linux__ +# endif #endif #if defined(HAVE_ARMV8_CRC) -extern "C" const char *crc32c_aarch64_available(void); -extern "C" uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len); - -static uint32_t ExtendARMImpl(uint32_t crc, const char *buf, size_t size) { - return crc32c_aarch64(crc, (const unsigned char *)buf, (size_t) size); -} +extern "C" my_crc32_t crc32c_aarch64_available(void); +extern "C" const char *crc32c_aarch64_impl(my_crc32_t); +#elif defined __i386__||defined __x86_64__||defined _M_X64||defined _M_IX86 +extern "C" my_crc32_t crc32c_x86_available(void); +extern "C" const char *crc32c_x86_impl(my_crc32_t); #endif -static inline Function Choose_Extend() +static inline my_crc32_t Choose_Extend() { #if defined HAVE_POWER8 && defined HAS_ALTIVEC if (arch_ppc_probe()) - return ExtendPPCImpl; -#elif defined(HAVE_ARMV8_CRC) - if (crc32c_aarch64_available()) - return ExtendARMImpl; -#elif HAVE_SSE42 -# if defined HAVE_PCLMUL && SIZEOF_SIZE_T == 8 - switch (cpuid_ecx() & cpuid_ecx_SSE42_AND_PCLMUL) { - case cpuid_ecx_SSE42_AND_PCLMUL: - return crc32c_3way; - case cpuid_ecx_SSE42: - return crc32c_sse42; - } -# else - if (cpuid_ecx() & cpuid_ecx_SSE42) - return crc32c_sse42; -# endif + return crc32c_ppc; +#elif defined HAVE_ARMV8_CRC + if (my_crc32_t crc= crc32c_aarch64_available()) + return crc; +#elif defined __i386__||defined __x86_64__||defined _M_X64||defined _M_IX86 + if (my_crc32_t crc= crc32c_x86_available()) + return crc; #endif return crc32c_slow; } -static const Function ChosenExtend= Choose_Extend(); - -static inline uint32_t Extend(uint32_t crc, const char* buf, size_t size) -{ - return ChosenExtend(crc, buf, size); -} +static const my_crc32_t ChosenExtend= Choose_Extend(); extern "C" const char *my_crc32c_implementation() { -#if defined(HAVE_POWER8) && defined(HAS_ALTIVEC) - if (ChosenExtend == ExtendPPCImpl) +#if defined HAVE_POWER8 && defined HAS_ALTIVEC + if (ChosenExtend == crc32c_ppc) return "Using POWER8 crc32 instructions"; -#elif defined(HAVE_ARMV8_CRC) - if (const char *ret= crc32c_aarch64_available()) +#elif defined HAVE_ARMV8_CRC + if (const char *ret= crc32c_aarch64_impl(ChosenExtend)) + return ret; +#elif defined __i386__||defined __x86_64__||defined _M_X64||defined _M_IX86 + if (const char *ret= crc32c_x86_impl(ChosenExtend)) return ret; -#elif HAVE_SSE42 -# if defined HAVE_PCLMUL && SIZEOF_SIZE_T == 8 - if (ChosenExtend == crc32c_3way) - return "Using crc32 + pclmulqdq instructions"; -# endif - if (ChosenExtend == crc32c_sse42) - return "Using SSE4.2 crc32 instructions"; #endif return "Using generic crc32 instructions"; } @@ -593,5 +528,5 @@ extern "C" const char *my_crc32c_implementation() extern "C" unsigned my_crc32c(unsigned int crc, const char *buf, size_t size) { - return mysys_namespace::crc32c::Extend(crc,buf, size); + return mysys_namespace::crc32c::ChosenExtend(crc,buf, size); } diff --git a/mysys/crc32/crc32c_amd64.cc b/mysys/crc32/crc32c_amd64.cc index 22c492b4..147c0cca 100644 --- a/mysys/crc32/crc32c_amd64.cc +++ b/mysys/crc32/crc32c_amd64.cc @@ -47,6 +47,11 @@ #include <nmmintrin.h> #include <wmmintrin.h> +#ifdef _MSC_VER +# define USE_PCLMUL /* nothing */ +#else +# define USE_PCLMUL __attribute__((target("sse4.2,pclmul"))) +#endif #define CRCtriplet(crc, buf, offset) \ crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \ @@ -131,6 +136,7 @@ static const uint64_t clmul_constants alignas(16) [] = { }; // Compute the crc32c value for buffer smaller than 8 +USE_PCLMUL static inline void align_to_8( size_t len, uint64_t& crc0, // crc so far, updated on return @@ -155,6 +161,7 @@ static inline void align_to_8( // CombineCRC performs pclmulqdq multiplication of 2 partial CRC's and a well // chosen constant and xor's these with the remaining CRC. // +USE_PCLMUL static inline uint64_t CombineCRC( size_t block_size, uint64_t crc0, @@ -176,6 +183,7 @@ static inline uint64_t CombineCRC( // Compute CRC-32C using the Intel hardware instruction. extern "C" +USE_PCLMUL uint32_t crc32c_3way(uint32_t crc, const char *buf, size_t len) { const unsigned char* next = (const unsigned char*)buf; diff --git a/mysys/crc32/crc32c_ppc.h b/mysys/crc32/crc32c_ppc.h index c359061c..797e849b 100644 --- a/mysys/crc32/crc32c_ppc.h +++ b/mysys/crc32/crc32c_ppc.h @@ -11,8 +11,7 @@ extern "C" { #endif -extern uint32_t crc32c_ppc(uint32_t crc, unsigned char const *buffer, - unsigned len); +extern unsigned crc32c_ppc(unsigned crc, const void *buffer, size_t len); #ifdef __cplusplus } diff --git a/mysys/crc32/crc32c_x86.cc b/mysys/crc32/crc32c_x86.cc new file mode 100644 index 00000000..02dbf292 --- /dev/null +++ b/mysys/crc32/crc32c_x86.cc @@ -0,0 +1,457 @@ +/* Copyright (c) 2024, MariaDB plc + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */ + +#include <my_global.h> +#include <cstddef> +#include <cstdint> + +#ifdef _MSC_VER +# include <intrin.h> +# if 0 /* So far, we have no environment where this could be tested. */ +# define USE_VPCLMULQDQ /* nothing */ +# endif +#else +# include <cpuid.h> +# if __GNUC__ >= 11 || (defined __clang_major__ && __clang_major__ >= 8) +# define TARGET "pclmul,avx512f,avx512dq,avx512bw,avx512vl,vpclmulqdq" +# define USE_VPCLMULQDQ __attribute__((target(TARGET))) +# endif +#endif + +extern "C" unsigned crc32c_sse42(unsigned crc, const void* buf, size_t size); + +constexpr uint32_t cpuid_ecx_SSE42= 1U << 20; +constexpr uint32_t cpuid_ecx_SSE42_AND_PCLMUL= cpuid_ecx_SSE42 | 1U << 1; + +static uint32_t cpuid_ecx() +{ +#ifdef __GNUC__ + uint32_t reax= 0, rebx= 0, recx= 0, redx= 0; + __cpuid(1, reax, rebx, recx, redx); + return recx; +#elif defined _MSC_VER + int regs[4]; + __cpuid(regs, 1); + return regs[2]; +#else +# error "unknown compiler" +#endif +} + +typedef unsigned (*my_crc32_t)(unsigned, const void *, size_t); +extern "C" unsigned int crc32_pclmul(unsigned int, const void *, size_t); +extern "C" unsigned int crc32c_3way(unsigned int, const void *, size_t); + +#ifdef USE_VPCLMULQDQ +# include <immintrin.h> + +# ifdef _MSC_VER +/* MSVC does not seem to define this intrinsic for vmovdqa */ +# define _mm_load_epi32(x) *reinterpret_cast<const __m128i*>(x) +# endif + +/* + This implementation is based on + crc32_by16_vclmul_avx512 and crc32_refl_by16_vclmul_avx512 + in https://github.com/intel/intel-ipsec-mb/ with some optimizations. + The // comments in crc32_avx512() correspond to assembler labels. +*/ + +/** table of constants corresponding to a CRC polynomial up to degree 32 */ +struct alignas(64) crc32_tab +{ + const uint64_t b2048[2], b1024[2]; + alignas(64) const uint64_t b896[6]; /* includes b786, b640 */ + const uint64_t b512[2]; + const uint64_t b384[2], b256[2], b128[2], zeropad_for_b384[2]; + const uint64_t b64[2], b32[2]; +}; + +/** ISO 3309 CRC-32 (reflected polynomial 0x04C11DB7); zlib crc32() */ +static const crc32_tab refl32 = { + { 0x00000000e95c1271, 0x00000000ce3371cb }, + { 0x00000000910eeec1, 0x0000000033fff533 }, + { 0x000000000cbec0ed, 0x0000000031f8303f, + 0x0000000057c54819, 0x00000000df068dc2, + 0x00000000ae0b5394, 0x000000001c279815 }, + { 0x000000001d9513d7, 0x000000008f352d95 }, + { 0x00000000af449247, 0x000000003db1ecdc }, + { 0x0000000081256527, 0x00000000f1da05aa }, + { 0x00000000ccaa009e, 0x00000000ae689191 }, + { 0, 0 }, + { 0x00000000ccaa009e, 0x00000000b8bc6765 }, + { 0x00000001f7011640, 0x00000001db710640 } +}; + +/** Castagnoli CRC-32C (reflected polynomial 0x1EDC6F41) */ +static const crc32_tab refl32c = { + { 0x00000000b9e02b86, 0x00000000dcb17aa4 }, + { 0x000000000d3b6092, 0x000000006992cea2 }, + { 0x0000000047db8317, 0x000000002ad91c30, + 0x000000000715ce53, 0x00000000c49f4f67, + 0x0000000039d3b296, 0x00000000083a6eec }, + { 0x000000009e4addf8, 0x00000000740eef02 }, + { 0x00000000ddc0152b, 0x000000001c291d04 }, + { 0x00000000ba4fc28e, 0x000000003da6d0cb }, + { 0x00000000493c7d27, 0x00000000f20c0dfe }, + { 0, 0 }, + { 0x00000000493c7d27, 0x00000000dd45aab8 }, + { 0x00000000dea713f0, 0x0000000105ec76f0 } +}; + +/** Some ternary functions */ +class ternary +{ + static constexpr uint8_t A = 0b11110000; + static constexpr uint8_t B = 0b11001100; + static constexpr uint8_t C = 0b10101010; +public: + static constexpr uint8_t XOR3 = A ^ B ^ C; + static constexpr uint8_t XNOR3 = uint8_t(~(A ^ B ^ C)); + static constexpr uint8_t XOR2_AND = (A ^ B) & C; +}; + +USE_VPCLMULQDQ +/** @return a^b^c */ +static inline __m128i xor3_128(__m128i a, __m128i b, __m128i c) +{ + return _mm_ternarylogic_epi64(a, b, c, ternary::XOR3); +} + +USE_VPCLMULQDQ +/** @return ~(a^b^c) */ +static inline __m128i xnor3_128(__m128i a, __m128i b, __m128i c) +{ + return _mm_ternarylogic_epi64(a, b, c, ternary::XNOR3); +} + +USE_VPCLMULQDQ +/** @return a^b^c */ +static inline __m512i xor3_512(__m512i a, __m512i b, __m512i c) +{ + return _mm512_ternarylogic_epi64(a, b, c, ternary::XOR3); +} + +USE_VPCLMULQDQ +/** @return (a^b)&c */ +static inline __m128i xor2_and_128(__m128i a, __m128i b, __m128i c) +{ + return _mm_ternarylogic_epi64(a, b, c, ternary::XOR2_AND); +} + +USE_VPCLMULQDQ +/** Load 64 bytes */ +static inline __m512i load512(const char *b) { return _mm512_loadu_epi8(b); } + +USE_VPCLMULQDQ +/** Load 16 bytes */ +static inline __m128i load128(const char *b) { return _mm_loadu_epi64(b); } + +/** Combine 512 data bits with CRC */ +USE_VPCLMULQDQ +static inline __m512i combine512(__m512i a, __m512i tab, __m512i b) +{ + return xor3_512(b, _mm512_clmulepi64_epi128(a, tab, 0x01), + _mm512_clmulepi64_epi128(a, tab, 0x10)); +} + +# define xor512(a, b) _mm512_xor_epi64(a, b) +# define xor256(a, b) _mm256_xor_epi64(a, b) +# define xor128(a, b) _mm_xor_epi64(a, b) +# define and128(a, b) _mm_and_si128(a, b) + +template<uint8_t bits> USE_VPCLMULQDQ +/** Pick a 128-bit component of a 512-bit vector */ +static inline __m512i extract512_128(__m512i a) +{ + static_assert(bits <= 3, "usage"); +# if defined __GNUC__ && __GNUC__ >= 11 + /* While technically incorrect, this would seem to translate into a + vextracti32x4 instruction, which actually outputs a ZMM register + (anything above the XMM range is cleared). */ + return _mm512_castsi128_si512(_mm512_extracti64x2_epi64(a, bits)); +# else + /* On clang, this is needed in order to get a correct result. */ + return _mm512_maskz_shuffle_i64x2(3, a, a, bits); +# endif +} + +alignas(16) static const uint64_t shuffle128[4] = { + 0x8786858483828100, 0x8f8e8d8c8b8a8988, + 0x0706050403020100, 0x000e0d0c0b0a0908 +}; + +static const __mmask16 size_mask[16] = { + 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff, + 0x01ff, 0x03ff, 0x07ff, 0x0fff, 0x1fff, 0x3fff, 0x7fff, 0xffff +}; + +alignas(16) static const uint64_t shift128[4] = { + 0x8786858483828100, 0x8f8e8d8c8b8a8988, + 0x0706050403020100, 0x000e0d0c0b0a0908 +}; + +static const char shift_1_to_3_reflect[7 + 11] = { + -1, -1, -1, -1, -1, -1, -1, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 +}; + +USE_VPCLMULQDQ +static unsigned crc32_avx512(unsigned crc, const char *buf, size_t size, + const crc32_tab &tab) +{ + const __m512i crc_in = _mm512_castsi128_si512(_mm_cvtsi32_si128(~crc)), + b512 = _mm512_broadcast_i32x4(_mm_load_epi32(tab.b512)); + __m128i crc_out; + __m512i lo; + + if (size >= 256) { + lo = xor512(load512(buf), crc_in); + __m512i l1 = load512(buf + 64); + + const __m512i b1024 = _mm512_broadcast_i32x4(_mm_load_epi32(&tab.b1024)); + size -= 256; + if (size >= 256) { + __m512i h0 = load512(buf + 128), + hi = load512(buf + 192); + const __m512i b2048 = _mm512_broadcast_i32x4(_mm_load_epi32(&tab.b2048)); + size -= 256; + do { + buf += 256; + lo = combine512(lo, b2048, load512(buf)); + l1 = combine512(l1, b2048, load512(buf + 64)); + h0 = combine512(h0, b2048, load512(buf + 128)); + hi = combine512(hi, b2048, load512(buf + 192)); + size -= 256; + } while (ssize_t(size) >= 0); + + buf += 256; + lo = combine512(lo, b1024, h0); + l1 = combine512(l1, b1024, hi); + size += 128; + } else { + do { + buf += 128; + lo = combine512(lo, b1024, load512(buf)); + l1 = combine512(l1, b1024, load512(buf + 64)); + size -= 128; + } while (ssize_t(size) >= 0); + + buf += 128; + } + + if (ssize_t(size) >= -64) { + size += 128; + lo = combine512(lo, b512, l1); + goto fold_64_B_loop; + } + + const __m512i + b896 = _mm512_load_epi32(&tab.b896), + b384 = _mm512_load_epi32(&tab.b384); + + __m512i c4 = xor3_512(_mm512_clmulepi64_epi128(lo, b896, 1), + _mm512_clmulepi64_epi128(lo, b896, 0x10), + _mm512_clmulepi64_epi128(l1, b384, 1)); + c4 = xor3_512(c4, _mm512_clmulepi64_epi128(l1, b384, 0x10), + extract512_128<3>(l1)); + + __m256i c2 = _mm512_castsi512_si256(_mm512_shuffle_i64x2(c4, c4, 0b01001110)); + c2 = xor256(c2, _mm512_castsi512_si256(c4)); + crc_out = xor128(_mm256_extracti64x2_epi64(c2, 1), + _mm256_castsi256_si128(c2)); + size += 128 - 16; + goto final_reduction; + } + + __m128i b; + + // less_than_256 + if (size >= 32) { + if (size >= 64) { + lo = xor512(load512(buf), crc_in); + + while (buf += 64, (size -= 64) >= 64) + fold_64_B_loop: + lo = combine512(lo, b512, load512(buf)); + + // reduce_64B + const __m512i b384 = _mm512_load_epi32(&tab.b384); + __m512i crc512 = + xor3_512(_mm512_clmulepi64_epi128(lo, b384, 1), + _mm512_clmulepi64_epi128(lo, b384, 0x10), + extract512_128<3>(lo)); + crc512 = xor512(crc512, _mm512_shuffle_i64x2(crc512, crc512, 0b01001110)); + const __m256i crc256 = _mm512_castsi512_si256(crc512); + crc_out = xor128(_mm256_extracti64x2_epi64(crc256, 1), + _mm256_castsi256_si128(crc256)); + size -= 16; + } else { + // less_than_64 + crc_out = xor128(load128(buf), + _mm512_castsi512_si128(crc_in)); + buf += 16; + size -= 32; + } + + final_reduction: + b = _mm_load_epi32(&tab.b128); + + while (ssize_t(size) >= 0) { + // reduction_loop_16B + crc_out = xor3_128(load128(buf), + _mm_clmulepi64_si128(crc_out, b, 1), + _mm_clmulepi64_si128(crc_out, b, 0x10)); + buf += 16; + size -= 16; + } + // final_reduction_for_128 + + size += 16; + if (size) { + get_last_two_xmms: + const __m128i crc2 = crc_out, d = load128(buf + (size - 16)); + __m128i S = load128(reinterpret_cast<const char*>(shuffle128) + size); + crc_out = _mm_shuffle_epi8(crc_out, S); + S = xor128(S, _mm_set1_epi32(0x80808080)); + crc_out = xor3_128(_mm_blendv_epi8(_mm_shuffle_epi8(crc2, S), d, S), + _mm_clmulepi64_si128(crc_out, b, 1), + _mm_clmulepi64_si128(crc_out, b, 0x10)); + } + + done_128: + __m128i crc_tmp; + b = _mm_load_epi32(&tab.b64); + crc_tmp = xor128(_mm_clmulepi64_si128(crc_out, b, 0x00), + _mm_srli_si128(crc_out, 8)); + crc_out = _mm_slli_si128(crc_tmp, 4); + crc_out = _mm_clmulepi64_si128(crc_out, b, 0x10); + crc_out = xor128(crc_out, crc_tmp); + + barrett: + b = _mm_load_epi32(&tab.b32); + crc_tmp = crc_out; + crc_out = and128(crc_out, _mm_set_epi64x(~0ULL, ~0xFFFFFFFFULL)); + crc_out = _mm_clmulepi64_si128(crc_out, b, 0); + crc_out = xor2_and_128(crc_out, crc_tmp, _mm_set_epi64x(0, ~0ULL)); + crc_out = xnor3_128(crc_out, crc_tmp, + _mm_clmulepi64_si128(crc_out, b, 0x10)); + return _mm_extract_epi32(crc_out, 2); + } else { + // less_than_32 + if (size > 0) { + if (size > 16) { + crc_out = xor128(load128(buf), + _mm512_castsi512_si128(crc_in)); + buf += 16; + size -= 16; + b = _mm_load_epi32(&tab.b128); + goto get_last_two_xmms; + } else if (size < 16) { + crc_out = _mm_maskz_loadu_epi8(size_mask[size - 1], buf); + crc_out = xor128(crc_out, _mm512_castsi512_si128(crc_in)); + + if (size >= 4) { + crc_out = _mm_shuffle_epi8 + (crc_out, + load128(reinterpret_cast<const char*>(shift128) + size)); + goto done_128; + } else { + // only_less_than_4 + /* Shift, zero-filling 5 to 7 of the 8-byte crc_out */ + crc_out = _mm_shuffle_epi8(crc_out, + load128(shift_1_to_3_reflect + size - 1)); + goto barrett; + } + } else { + crc_out = xor128(load128(buf), _mm512_castsi512_si128(crc_in)); + goto done_128; + } + } else + return crc; + } +} + +static ATTRIBUTE_NOINLINE int have_vpclmulqdq() +{ +# ifdef _MSC_VER + int regs[4]; + __cpuidex(regs, 7, 0); + uint32_t ebx = regs[1], ecx = regs[2]; +# else + uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0; + __cpuid_count(7, 0, eax, ebx, ecx, edx); +# endif + return ecx & 1U<<10/*VPCLMULQDQ*/ && + !(~ebx & ((1U<<16/*AVX512F*/ | 1U<<17/*AVX512DQ*/ | + 1U<<30/*AVX512BW*/ | 1U<<31/*AVX512VL*/))); +} + +static unsigned crc32_vpclmulqdq(unsigned crc, const void *buf, size_t size) +{ + return crc32_avx512(crc, static_cast<const char*>(buf), size, refl32); +} + +static unsigned crc32c_vpclmulqdq(unsigned crc, const void *buf, size_t size) +{ + return crc32_avx512(crc, static_cast<const char*>(buf), size, refl32c); +} +#endif + +extern "C" my_crc32_t crc32_pclmul_enabled(void) +{ + if (~cpuid_ecx() & cpuid_ecx_SSE42_AND_PCLMUL) + return nullptr; +#ifdef USE_VPCLMULQDQ + if (have_vpclmulqdq()) + return crc32_vpclmulqdq; +#endif + return crc32_pclmul; +} + +extern "C" my_crc32_t crc32c_x86_available(void) +{ +#ifdef USE_VPCLMULQDQ + if (have_vpclmulqdq()) + return crc32c_vpclmulqdq; +#endif +#if SIZEOF_SIZE_T == 8 + switch (cpuid_ecx() & cpuid_ecx_SSE42_AND_PCLMUL) { + case cpuid_ecx_SSE42_AND_PCLMUL: + return crc32c_3way; + case cpuid_ecx_SSE42: + return crc32c_sse42; + } +#else + if (cpuid_ecx() & cpuid_ecx_SSE42) + return crc32c_sse42; +#endif + return nullptr; +} + +extern "C" const char *crc32c_x86_impl(my_crc32_t c) +{ +#ifdef USE_VPCLMULQDQ + if (c == crc32c_vpclmulqdq) + return "Using AVX512 instructions"; +#endif +#if SIZEOF_SIZE_T == 8 + if (c == crc32c_3way) + return "Using crc32 + pclmulqdq instructions"; +#endif + if (c == crc32c_sse42) + return "Using SSE4.2 crc32 instructions"; + return nullptr; +} diff --git a/mysys/crc32/crc_ppc64.h b/mysys/crc32/crc_ppc64.h index eb9379ab..81bbc16d 100644 --- a/mysys/crc32/crc_ppc64.h +++ b/mysys/crc32/crc_ppc64.h @@ -28,7 +28,7 @@ * any later version, or * b) the Apache License, Version 2.0 */ - +#include <stddef.h> #include <altivec.h> @@ -57,12 +57,13 @@ static unsigned int __attribute__ ((aligned (32))) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len); -unsigned int CRC32_FUNCTION(unsigned int crc, const unsigned char *p, - unsigned long len) +unsigned CRC32_FUNCTION(unsigned crc, const void *buffer, size_t len) { unsigned int prealign; unsigned int tail; + const unsigned char *p = buffer; + #ifdef CRC_XOR crc ^= 0xffffffff; #endif |