/* Copyright (c) 2024, MariaDB plc This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */ #include #include #include #ifdef _MSC_VER # include # if 0 /* So far, we have no environment where this could be tested. */ # define USE_VPCLMULQDQ /* nothing */ # endif #else # include # if __GNUC__ >= 11 || (defined __clang_major__ && __clang_major__ >= 8) # define TARGET "pclmul,avx512f,avx512dq,avx512bw,avx512vl,vpclmulqdq" # define USE_VPCLMULQDQ __attribute__((target(TARGET))) # endif #endif extern "C" unsigned crc32c_sse42(unsigned crc, const void* buf, size_t size); constexpr uint32_t cpuid_ecx_SSE42= 1U << 20; constexpr uint32_t cpuid_ecx_SSE42_AND_PCLMUL= cpuid_ecx_SSE42 | 1U << 1; static uint32_t cpuid_ecx() { #ifdef __GNUC__ uint32_t reax= 0, rebx= 0, recx= 0, redx= 0; __cpuid(1, reax, rebx, recx, redx); return recx; #elif defined _MSC_VER int regs[4]; __cpuid(regs, 1); return regs[2]; #else # error "unknown compiler" #endif } typedef unsigned (*my_crc32_t)(unsigned, const void *, size_t); extern "C" unsigned int crc32_pclmul(unsigned int, const void *, size_t); extern "C" unsigned int crc32c_3way(unsigned int, const void *, size_t); #ifdef USE_VPCLMULQDQ # include # ifdef _MSC_VER /* MSVC does not seem to define this intrinsic for vmovdqa */ # define _mm_load_epi32(x) *reinterpret_cast(x) # endif /* This implementation is based on crc32_by16_vclmul_avx512 and crc32_refl_by16_vclmul_avx512 in https://github.com/intel/intel-ipsec-mb/ with some optimizations. The // comments in crc32_avx512() correspond to assembler labels. */ /** table of constants corresponding to a CRC polynomial up to degree 32 */ struct alignas(64) crc32_tab { const uint64_t b2048[2], b1024[2]; alignas(64) const uint64_t b896[6]; /* includes b786, b640 */ const uint64_t b512[2]; const uint64_t b384[2], b256[2], b128[2], zeropad_for_b384[2]; const uint64_t b64[2], b32[2]; }; /** ISO 3309 CRC-32 (reflected polynomial 0x04C11DB7); zlib crc32() */ static const crc32_tab refl32 = { { 0x00000000e95c1271, 0x00000000ce3371cb }, { 0x00000000910eeec1, 0x0000000033fff533 }, { 0x000000000cbec0ed, 0x0000000031f8303f, 0x0000000057c54819, 0x00000000df068dc2, 0x00000000ae0b5394, 0x000000001c279815 }, { 0x000000001d9513d7, 0x000000008f352d95 }, { 0x00000000af449247, 0x000000003db1ecdc }, { 0x0000000081256527, 0x00000000f1da05aa }, { 0x00000000ccaa009e, 0x00000000ae689191 }, { 0, 0 }, { 0x00000000ccaa009e, 0x00000000b8bc6765 }, { 0x00000001f7011640, 0x00000001db710640 } }; /** Castagnoli CRC-32C (reflected polynomial 0x1EDC6F41) */ static const crc32_tab refl32c = { { 0x00000000b9e02b86, 0x00000000dcb17aa4 }, { 0x000000000d3b6092, 0x000000006992cea2 }, { 0x0000000047db8317, 0x000000002ad91c30, 0x000000000715ce53, 0x00000000c49f4f67, 0x0000000039d3b296, 0x00000000083a6eec }, { 0x000000009e4addf8, 0x00000000740eef02 }, { 0x00000000ddc0152b, 0x000000001c291d04 }, { 0x00000000ba4fc28e, 0x000000003da6d0cb }, { 0x00000000493c7d27, 0x00000000f20c0dfe }, { 0, 0 }, { 0x00000000493c7d27, 0x00000000dd45aab8 }, { 0x00000000dea713f0, 0x0000000105ec76f0 } }; /** Some ternary functions */ class ternary { static constexpr uint8_t A = 0b11110000; static constexpr uint8_t B = 0b11001100; static constexpr uint8_t C = 0b10101010; public: static constexpr uint8_t XOR3 = A ^ B ^ C; static constexpr uint8_t XNOR3 = uint8_t(~(A ^ B ^ C)); static constexpr uint8_t XOR2_AND = (A ^ B) & C; }; USE_VPCLMULQDQ /** @return a^b^c */ static inline __m128i xor3_128(__m128i a, __m128i b, __m128i c) { return _mm_ternarylogic_epi64(a, b, c, ternary::XOR3); } USE_VPCLMULQDQ /** @return ~(a^b^c) */ static inline __m128i xnor3_128(__m128i a, __m128i b, __m128i c) { return _mm_ternarylogic_epi64(a, b, c, ternary::XNOR3); } USE_VPCLMULQDQ /** @return a^b^c */ static inline __m512i xor3_512(__m512i a, __m512i b, __m512i c) { return _mm512_ternarylogic_epi64(a, b, c, ternary::XOR3); } USE_VPCLMULQDQ /** @return (a^b)&c */ static inline __m128i xor2_and_128(__m128i a, __m128i b, __m128i c) { return _mm_ternarylogic_epi64(a, b, c, ternary::XOR2_AND); } USE_VPCLMULQDQ /** Load 64 bytes */ static inline __m512i load512(const char *b) { return _mm512_loadu_epi8(b); } USE_VPCLMULQDQ /** Load 16 bytes */ static inline __m128i load128(const char *b) { return _mm_loadu_epi64(b); } /** Combine 512 data bits with CRC */ USE_VPCLMULQDQ static inline __m512i combine512(__m512i a, __m512i tab, __m512i b) { return xor3_512(b, _mm512_clmulepi64_epi128(a, tab, 0x01), _mm512_clmulepi64_epi128(a, tab, 0x10)); } # define xor512(a, b) _mm512_xor_epi64(a, b) # define xor256(a, b) _mm256_xor_epi64(a, b) # define xor128(a, b) _mm_xor_epi64(a, b) # define and128(a, b) _mm_and_si128(a, b) template USE_VPCLMULQDQ /** Pick a 128-bit component of a 512-bit vector */ static inline __m512i extract512_128(__m512i a) { static_assert(bits <= 3, "usage"); # if defined __GNUC__ && __GNUC__ >= 11 /* While technically incorrect, this would seem to translate into a vextracti32x4 instruction, which actually outputs a ZMM register (anything above the XMM range is cleared). */ return _mm512_castsi128_si512(_mm512_extracti64x2_epi64(a, bits)); # else /* On clang, this is needed in order to get a correct result. */ return _mm512_maskz_shuffle_i64x2(3, a, a, bits); # endif } alignas(16) static const uint64_t shuffle128[4] = { 0x8786858483828100, 0x8f8e8d8c8b8a8988, 0x0706050403020100, 0x000e0d0c0b0a0908 }; static const __mmask16 size_mask[16] = { 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff, 0x01ff, 0x03ff, 0x07ff, 0x0fff, 0x1fff, 0x3fff, 0x7fff, 0xffff }; alignas(16) static const uint64_t shift128[4] = { 0x8786858483828100, 0x8f8e8d8c8b8a8988, 0x0706050403020100, 0x000e0d0c0b0a0908 }; static const char shift_1_to_3_reflect[7 + 11] = { -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }; USE_VPCLMULQDQ static unsigned crc32_avx512(unsigned crc, const char *buf, size_t size, const crc32_tab &tab) { const __m512i crc_in = _mm512_castsi128_si512(_mm_cvtsi32_si128(~crc)), b512 = _mm512_broadcast_i32x4(_mm_load_epi32(tab.b512)); __m128i crc_out; __m512i lo; if (size >= 256) { lo = xor512(load512(buf), crc_in); __m512i l1 = load512(buf + 64); const __m512i b1024 = _mm512_broadcast_i32x4(_mm_load_epi32(&tab.b1024)); size -= 256; if (size >= 256) { __m512i h0 = load512(buf + 128), hi = load512(buf + 192); const __m512i b2048 = _mm512_broadcast_i32x4(_mm_load_epi32(&tab.b2048)); size -= 256; do { buf += 256; lo = combine512(lo, b2048, load512(buf)); l1 = combine512(l1, b2048, load512(buf + 64)); h0 = combine512(h0, b2048, load512(buf + 128)); hi = combine512(hi, b2048, load512(buf + 192)); size -= 256; } while (ssize_t(size) >= 0); buf += 256; lo = combine512(lo, b1024, h0); l1 = combine512(l1, b1024, hi); size += 128; } else { do { buf += 128; lo = combine512(lo, b1024, load512(buf)); l1 = combine512(l1, b1024, load512(buf + 64)); size -= 128; } while (ssize_t(size) >= 0); buf += 128; } if (ssize_t(size) >= -64) { size += 128; lo = combine512(lo, b512, l1); goto fold_64_B_loop; } const __m512i b896 = _mm512_load_epi32(&tab.b896), b384 = _mm512_load_epi32(&tab.b384); __m512i c4 = xor3_512(_mm512_clmulepi64_epi128(lo, b896, 1), _mm512_clmulepi64_epi128(lo, b896, 0x10), _mm512_clmulepi64_epi128(l1, b384, 1)); c4 = xor3_512(c4, _mm512_clmulepi64_epi128(l1, b384, 0x10), extract512_128<3>(l1)); __m256i c2 = _mm512_castsi512_si256(_mm512_shuffle_i64x2(c4, c4, 0b01001110)); c2 = xor256(c2, _mm512_castsi512_si256(c4)); crc_out = xor128(_mm256_extracti64x2_epi64(c2, 1), _mm256_castsi256_si128(c2)); size += 128 - 16; goto final_reduction; } __m128i b; // less_than_256 if (size >= 32) { if (size >= 64) { lo = xor512(load512(buf), crc_in); while (buf += 64, (size -= 64) >= 64) fold_64_B_loop: lo = combine512(lo, b512, load512(buf)); // reduce_64B const __m512i b384 = _mm512_load_epi32(&tab.b384); __m512i crc512 = xor3_512(_mm512_clmulepi64_epi128(lo, b384, 1), _mm512_clmulepi64_epi128(lo, b384, 0x10), extract512_128<3>(lo)); crc512 = xor512(crc512, _mm512_shuffle_i64x2(crc512, crc512, 0b01001110)); const __m256i crc256 = _mm512_castsi512_si256(crc512); crc_out = xor128(_mm256_extracti64x2_epi64(crc256, 1), _mm256_castsi256_si128(crc256)); size -= 16; } else { // less_than_64 crc_out = xor128(load128(buf), _mm512_castsi512_si128(crc_in)); buf += 16; size -= 32; } final_reduction: b = _mm_load_epi32(&tab.b128); while (ssize_t(size) >= 0) { // reduction_loop_16B crc_out = xor3_128(load128(buf), _mm_clmulepi64_si128(crc_out, b, 1), _mm_clmulepi64_si128(crc_out, b, 0x10)); buf += 16; size -= 16; } // final_reduction_for_128 size += 16; if (size) { get_last_two_xmms: const __m128i crc2 = crc_out, d = load128(buf + (size - 16)); __m128i S = load128(reinterpret_cast(shuffle128) + size); crc_out = _mm_shuffle_epi8(crc_out, S); S = xor128(S, _mm_set1_epi32(0x80808080)); crc_out = xor3_128(_mm_blendv_epi8(_mm_shuffle_epi8(crc2, S), d, S), _mm_clmulepi64_si128(crc_out, b, 1), _mm_clmulepi64_si128(crc_out, b, 0x10)); } done_128: __m128i crc_tmp; b = _mm_load_epi32(&tab.b64); crc_tmp = xor128(_mm_clmulepi64_si128(crc_out, b, 0x00), _mm_srli_si128(crc_out, 8)); crc_out = _mm_slli_si128(crc_tmp, 4); crc_out = _mm_clmulepi64_si128(crc_out, b, 0x10); crc_out = xor128(crc_out, crc_tmp); barrett: b = _mm_load_epi32(&tab.b32); crc_tmp = crc_out; crc_out = and128(crc_out, _mm_set_epi64x(~0ULL, ~0xFFFFFFFFULL)); crc_out = _mm_clmulepi64_si128(crc_out, b, 0); crc_out = xor2_and_128(crc_out, crc_tmp, _mm_set_epi64x(0, ~0ULL)); crc_out = xnor3_128(crc_out, crc_tmp, _mm_clmulepi64_si128(crc_out, b, 0x10)); return _mm_extract_epi32(crc_out, 2); } else { // less_than_32 if (size > 0) { if (size > 16) { crc_out = xor128(load128(buf), _mm512_castsi512_si128(crc_in)); buf += 16; size -= 16; b = _mm_load_epi32(&tab.b128); goto get_last_two_xmms; } else if (size < 16) { crc_out = _mm_maskz_loadu_epi8(size_mask[size - 1], buf); crc_out = xor128(crc_out, _mm512_castsi512_si128(crc_in)); if (size >= 4) { crc_out = _mm_shuffle_epi8 (crc_out, load128(reinterpret_cast(shift128) + size)); goto done_128; } else { // only_less_than_4 /* Shift, zero-filling 5 to 7 of the 8-byte crc_out */ crc_out = _mm_shuffle_epi8(crc_out, load128(shift_1_to_3_reflect + size - 1)); goto barrett; } } else { crc_out = xor128(load128(buf), _mm512_castsi512_si128(crc_in)); goto done_128; } } else return crc; } } static ATTRIBUTE_NOINLINE int have_vpclmulqdq() { # ifdef _MSC_VER int regs[4]; __cpuidex(regs, 7, 0); uint32_t ebx = regs[1], ecx = regs[2]; # else uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0; __cpuid_count(7, 0, eax, ebx, ecx, edx); # endif return ecx & 1U<<10/*VPCLMULQDQ*/ && !(~ebx & ((1U<<16/*AVX512F*/ | 1U<<17/*AVX512DQ*/ | 1U<<30/*AVX512BW*/ | 1U<<31/*AVX512VL*/))); } static unsigned crc32_vpclmulqdq(unsigned crc, const void *buf, size_t size) { return crc32_avx512(crc, static_cast(buf), size, refl32); } static unsigned crc32c_vpclmulqdq(unsigned crc, const void *buf, size_t size) { return crc32_avx512(crc, static_cast(buf), size, refl32c); } #endif extern "C" my_crc32_t crc32_pclmul_enabled(void) { if (~cpuid_ecx() & cpuid_ecx_SSE42_AND_PCLMUL) return nullptr; #ifdef USE_VPCLMULQDQ if (have_vpclmulqdq()) return crc32_vpclmulqdq; #endif return crc32_pclmul; } extern "C" my_crc32_t crc32c_x86_available(void) { #ifdef USE_VPCLMULQDQ if (have_vpclmulqdq()) return crc32c_vpclmulqdq; #endif #if SIZEOF_SIZE_T == 8 switch (cpuid_ecx() & cpuid_ecx_SSE42_AND_PCLMUL) { case cpuid_ecx_SSE42_AND_PCLMUL: return crc32c_3way; case cpuid_ecx_SSE42: return crc32c_sse42; } #else if (cpuid_ecx() & cpuid_ecx_SSE42) return crc32c_sse42; #endif return nullptr; } extern "C" const char *crc32c_x86_impl(my_crc32_t c) { #ifdef USE_VPCLMULQDQ if (c == crc32c_vpclmulqdq) return "Using AVX512 instructions"; #endif #if SIZEOF_SIZE_T == 8 if (c == crc32c_3way) return "Using crc32 + pclmulqdq instructions"; #endif if (c == crc32c_sse42) return "Using SSE4.2 crc32 instructions"; return nullptr; }