/* crc-x86_64-pclmul.c -- CRC32 implementation for x86_64 using SSE/AVX1
   Copyright (C) 2024-2025 Free Software Foundation, Inc.

   This file is free software: you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as
   published by the Free Software Foundation, either version 3 of the
   License, or (at your option) any later version.

   This file is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */

/* Written by Sam Russell.  */

#include <config.h>

/* Specification.  */
#include "crc-x86_64.h"

#include <string.h>
#include <x86intrin.h>

#include "crc.h"

#if defined __GNUC__ || defined __clang__
__attribute__ ((__target__ ("pclmul,avx")))
#endif
uint32_t
crc32_update_no_xor_pclmul (uint32_t crc, const void *buf, size_t len)
{
  const __m128i_u *data = buf;
  __m128i *datarw;
  size_t bytes_remaining = len;
  __m128i in256[4] = { 0 };
  __m128i xor_crc;
  __m128i in1 = _mm_setzero_si128 ();
  __m128i in2 = _mm_setzero_si128 ();
  __m128i in3 = _mm_setzero_si128 ();
  __m128i in4 = _mm_setzero_si128 ();
  __m128i in5 = _mm_setzero_si128 ();
  __m128i in6 = _mm_setzero_si128 ();
  __m128i in7 = _mm_setzero_si128 ();
  __m128i in8 = _mm_setzero_si128 ();
  __m128i final_buf[12] = { 0 };
  __m128i fold_high;
  __m128i fold_low;

  /* These constants are calculated as x^(T-1) mod P(x):
     0x8F352D95 = x^(544-1) mod P(x), etc.  (The -1 compensates for the
     one-bit offset of bit-reflected carry-less products; in each pair the
     low qword holds the larger shift.)  This method is described in "Fast
     CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
     (Gopal et al.).  */
  __m128i shift544_shift480 = _mm_set_epi64x (0x1D9513D7, 0x8F352D95);
  __m128i shift160_shift96 = _mm_set_epi64x (0xCCAA009E, 0xAE689191);
  __m128i shift96_shift64 = _mm_set_epi64x (0xB8BC6765, 0xCCAA009E);
  /* Mu is defined as floor (x^64 / P(x)), where P(x) is the generator
     polynomial of this CRC32; the high qword of mu_poly holds P(x)
     itself.  */
  __m128i mu_poly = _mm_set_epi64x (0x1DB710641, 0x1F7011641);
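
  /* How the folds below work (explanatory note, ours): a chunk that sits
     T bits before the point it is folded onto contributes chunk * x^T mod
     P(x) to the CRC, so folding reduces to carry-less multiplication by
     precomputed powers of x.  The two 64-bit halves of a chunk are folded
     separately; the low qword holds the earlier (higher-degree) message
     bytes, so its constant's exponent is 64 larger than the high qword's,
     with each pair straddling the real fold distance (544/480 around 512
     bits, 160/96 around 128 bits).  */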

  if (bytes_remaining >= 128)
    {
      /* Here we fold 4x 128-bit words forward by 512 bits per pass.  */
      /* First load up our initial state.  */
      in1 = _mm_loadu_si128 (data);
      in2 = _mm_loadu_si128 (data + 1);
      in3 = _mm_loadu_si128 (data + 2);
      in4 = _mm_loadu_si128 (data + 3);

      /* Initialise with the incoming CRC.  */
      xor_crc = _mm_set_epi32 (0, 0, 0, crc);
      in1 = _mm_xor_si128 (in1, xor_crc);

      while (bytes_remaining >= 128)
        {
          /* Load up the second set of 512 bits.  */
          in5 = _mm_loadu_si128 (data + 4);
          in6 = _mm_loadu_si128 (data + 5);
          in7 = _mm_loadu_si128 (data + 6);
          in8 = _mm_loadu_si128 (data + 7);

          /* Fold each chunk forward by 512 bits: the low QWORD is
             multiplied by the shift544 constant, the high QWORD by the
             shift480 constant.  */
          fold_high = _mm_clmulepi64_si128 (in1, shift544_shift480, 0x11);
          fold_low = _mm_clmulepi64_si128 (in1, shift544_shift480, 0x00);
          in1 = _mm_xor_si128 (in5, fold_high);
          in1 = _mm_xor_si128 (in1, fold_low);
          fold_high = _mm_clmulepi64_si128 (in2, shift544_shift480, 0x11);
          fold_low = _mm_clmulepi64_si128 (in2, shift544_shift480, 0x00);
          in2 = _mm_xor_si128 (in6, fold_high);
          in2 = _mm_xor_si128 (in2, fold_low);
          fold_high = _mm_clmulepi64_si128 (in3, shift544_shift480, 0x11);
          fold_low = _mm_clmulepi64_si128 (in3, shift544_shift480, 0x00);
          in3 = _mm_xor_si128 (in7, fold_high);
          in3 = _mm_xor_si128 (in3, fold_low);
          fold_high = _mm_clmulepi64_si128 (in4, shift544_shift480, 0x11);
          fold_low = _mm_clmulepi64_si128 (in4, shift544_shift480, 0x00);
          in4 = _mm_xor_si128 (in8, fold_high);
          in4 = _mm_xor_si128 (in4, fold_low);

          /* 64 bytes consumed; in1..in4 still hold the next 64 bytes.  */
          bytes_remaining -= 64;
          data += 4;
        }

      _mm_storeu_si128 (final_buf, in1);
      _mm_storeu_si128 (final_buf + 1, in2);
      _mm_storeu_si128 (final_buf + 2, in3);
      _mm_storeu_si128 (final_buf + 3, in4);

      /* Move everything to final_buf because it is RW; in1..in4 already
         cover the 64 bytes at data..data+3, so only the tail beyond them
         needs copying.  */
      memcpy (final_buf + 4, data + 4, bytes_remaining - 64);
      datarw = final_buf;
    }
  else
    {
      /* Move everything to final_buf because it is RW,
         then XOR in the previous CRC value.  */
      memcpy (final_buf, data, bytes_remaining);
      in1 = _mm_loadu_si128 (final_buf);
      xor_crc = _mm_set_epi32 (0, 0, 0, crc);
      in1 = _mm_xor_si128 (in1, xor_crc);
      _mm_storeu_si128 (final_buf, in1);
      datarw = final_buf;
    }

  while (bytes_remaining >= 32)
    {
      /* Do 128-bit folds as above.  */
      in1 = _mm_loadu_si128 (datarw);
      in2 = _mm_loadu_si128 (datarw + 1);

      /* Fold forward by 128 bits: the low QWORD is multiplied by the
         shift160 constant, the high QWORD by the shift96 constant.  */
      fold_high = _mm_clmulepi64_si128 (in1, shift160_shift96, 0x11);
      fold_low = _mm_clmulepi64_si128 (in1, shift160_shift96, 0x00);
      in2 = _mm_xor_si128 (in2, fold_high);
      in2 = _mm_xor_si128 (in2, fold_low);

      _mm_storeu_si128 (datarw + 1, in2);
      bytes_remaining -= 16;
      datarw += 1;
    }

  /* We have 16-31 bytes here, or fewer than 16 if the input was short.
     For 17-31 bytes we do one more special-case 128-bit fold.  The padding
     trick works because we are effectively padding zeros onto the front of
     the message; with little-endian loads that adds zero coefficients at
     the high-degree end of the polynomial, leaving the CRC unchanged.
     This step also picks up the < 16 byte case.  */
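
  /* Worked example (ours): with bytes_remaining == 20, the memcpy below
     copies the data to offsets 12..31 of the zeroed 32-byte buffer.
     Offsets 0..11 stay zero, i.e. the message gains twelve leading zero
     bytes; zeros prepended at the high-degree end of a polynomial leave it
     unchanged, so the padded buffer folds to the same CRC.  */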

  if (bytes_remaining != 16)
    {
      /* Pad the remainder and fold 128 bits.
         We are reading in up to 32 bytes = 256 bits here.  This is
         inefficient, so we only want to hit it at the actual end of the
         data; if each update feeds a multiple of 32 bytes through the loop
         above, this path is never taken mid-stream.  */

      /* Read in at an offset so we get the shift for free.  */
      memcpy (((char *) in256) + (32 - bytes_remaining), datarw,
              bytes_remaining);
      in1 = _mm_loadu_si128 (in256);
      in2 = _mm_loadu_si128 (in256 + 1);

      /* Now we fold in1 onto in2.  Note 0x1751997D0 is the shift160
         constant 0xAE689191 with one copy of P(x) = 0x1DB710641 XORed in;
         adding a multiple of P(x) does not change the result mod P(x).  */
      shift160_shift96 = _mm_set_epi64x (0x0ccaa009e, 0x1751997d0);
      fold_high = _mm_clmulepi64_si128 (in1, shift160_shift96, 0x11);
      fold_low = _mm_clmulepi64_si128 (in1, shift160_shift96, 0x00);
      in2 = _mm_xor_si128 (in2, fold_high);
      in1 = _mm_xor_si128 (in2, fold_low);
    }
  else
    in1 = _mm_loadu_si128 (datarw);

  /* We now have 16 bytes and fold as normal.  */
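
  /* Final reduction plan (explanatory note, ours): split the 128-bit value
     into its low qword (in1), bits 64-95 (in2) and bits 96-127 (in3), fold
     the low qword forward 96 bits, then fold the resulting low 32 bits
     forward another 64 bits.  That leaves a 64-bit value congruent mod
     P(x) to the original 128 bits, ready for the Barrett step below.  */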
  in2 =
    _mm_and_si128 (_mm_srli_si128 (in1, 8), _mm_set_epi64x (0, 0xffffffff));
  in3 =
    _mm_and_si128 (_mm_srli_si128 (in1, 12), _mm_set_epi64x (0, 0xffffffff));
  in1 = _mm_and_si128 (in1, _mm_set_epi64x (0, 0xffffffffffffffff));
  /* Multiply the first 64 bits by the shift96 constant.  */
  in1 = _mm_clmulepi64_si128 (shift96_shift64, in1, 0x00);
  /* The low 32 bits of the product go on in2.  */
  in2 =
    _mm_xor_si128 (in2, _mm_and_si128 (in1, _mm_set_epi64x (0, 0xffffffff)));
  /* The next 64 bits go on in3.  */
  in3 = _mm_xor_si128 (in3, _mm_srli_si128 (in1, 4));
  /* Then fold the remaining 32 bits forward 64 bits from here.  */
  in1 = _mm_clmulepi64_si128 (shift96_shift64, in2, 0x01);
  in1 = _mm_xor_si128 (in1, in3);

  /* This is the Barrett reduction.  */
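  /* In equations (explanatory note, ours), with R the 64-bit value in in1
     and mu = floor (x^64 / P(x)):
         T1 = floor (R / x^32) * mu
         T2 = floor (T1 / x^32) * P(x)
         CRC = (R xor T2) mod x^32
     In the bit-reflected register layout, floor (... / x^32) is the bottom
     32 bits and mod x^32 is bits 32-63, hence the masks and shifts below.  */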
  /* Take the bottom 32 bits.  */
  in2 = _mm_and_si128 (in1, _mm_set_epi64x (0, 0xffffffff));
  /* Multiply by mu.  */
  in2 = _mm_clmulepi64_si128 (mu_poly, in2, 0x00);
  /* Take the bottom 32 bits of the result.  */
  in2 = _mm_and_si128 (in2, _mm_set_epi64x (0, 0xffffffff));
  /* Multiply by P(x).  */
  in2 = _mm_clmulepi64_si128 (mu_poly, in2, 0x01);
  /* XOR against the input.  */
  in1 = _mm_xor_si128 (in1, in2);
  /* Take bits 32-63 as the CRC.  */
  in1 = _mm_srli_si128 (in1, 4);
  crc = _mm_cvtsi128_si32 (in1);

  return crc;
}
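
/* Usage sketch (ours, not part of the original file): the no-xor primitive
   above composes into a standard CRC-32 with the usual pre- and
   post-inversion, in the way the generic crc32_update () wrapper in the
   crc module does.  The demo name is hypothetical.  */
#if 0
static uint32_t
crc32_pclmul_demo (const void *buf, size_t len)
{
  return crc32_update_no_xor_pclmul (0xFFFFFFFF, buf, len) ^ 0xFFFFFFFF;
}
#endif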