Diffstat (limited to 'comm/third_party/libgcrypt/cipher/crc-intel-pclmul.c')
-rw-r--r--  comm/third_party/libgcrypt/cipher/crc-intel-pclmul.c  939
1 file changed, 939 insertions(+), 0 deletions(-)
diff --git a/comm/third_party/libgcrypt/cipher/crc-intel-pclmul.c b/comm/third_party/libgcrypt/cipher/crc-intel-pclmul.c
new file mode 100644
index 0000000000..8c8b1915ab
--- /dev/null
+++ b/comm/third_party/libgcrypt/cipher/crc-intel-pclmul.c
@@ -0,0 +1,939 @@
/* crc-intel-pclmul.c - Intel PCLMUL accelerated CRC implementation
 * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
 *
 */

#include <config.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "g10lib.h"

#include "bithelp.h"
#include "bufhelp.h"


#if defined(ENABLE_PCLMUL_SUPPORT) && defined(ENABLE_SSE41_SUPPORT) && \
    __GNUC__ >= 4 && \
    ((defined(__i386__) && SIZEOF_UNSIGNED_LONG == 4) || defined(__x86_64__))


#if _GCRY_GCC_VERSION >= 40400 /* 4.4 */
/* Prevent compiler from issuing SSE instructions between asm blocks. */
# pragma GCC target("no-sse")
#endif
#if __clang__
# pragma clang attribute push (__attribute__((target("no-sse"))), apply_to = function)
#endif


#define ALWAYS_INLINE inline __attribute__((always_inline))
#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))

#define ASM_FUNC_ATTR        NO_INSTRUMENT_FUNCTION
#define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE


#define ALIGNED_16 __attribute__ ((aligned (16)))


struct u16_unaligned_s
{
  u16 a;
} __attribute__((packed, aligned (1), may_alias));


/* Constants structure for generic reflected/non-reflected CRC32 CLMUL
 * functions. */
struct crc32_consts_s
{
  /* k: { x^(32*17), x^(32*15), x^(32*5), x^(32*3), x^(32*2), 0 } mod P(x) */
  u64 k[6];
  /* my_p: { floor(x^64 / P(x)), P(x) } */
  u64 my_p[2];
};


/* CLMUL constants for CRC32 and CRC32RFC1510. */
static const struct crc32_consts_s crc32_consts ALIGNED_16 =
{
  { /* k[6] = reverse_33bits( x^(32*y) mod P(x) ) */
    U64_C(0x154442bd4), U64_C(0x1c6e41596), /* y = { 17, 15 } */
    U64_C(0x1751997d0), U64_C(0x0ccaa009e), /* y = { 5, 3 } */
    U64_C(0x163cd6124), 0                   /* y = 2 */
  },
  { /* my_p[2] = reverse_33bits ( { floor(x^64 / P(x)), P(x) } ) */
    U64_C(0x1f7011641), U64_C(0x1db710641)
  }
};

/* CLMUL constants for CRC24RFC2440 (polynomial multiplied with x⁸). */
static const struct crc32_consts_s crc24rfc2440_consts ALIGNED_16 =
{
  { /* k[6] = x^(32*y) mod P(x) << 32 */
    U64_C(0x08289a00) << 32, U64_C(0x74b44a00) << 32, /* y = { 17, 15 } */
    U64_C(0xc4b14d00) << 32, U64_C(0xfd7e0c00) << 32, /* y = { 5, 3 } */
    U64_C(0xd9fe8c00) << 32, 0                        /* y = 2 */
  },
  { /* my_p[2] = { floor(x^64 / P(x)), P(x) } */
    U64_C(0x1f845fe24), U64_C(0x1864cfb00)
  }
};

/* Common constants for CRC32 algorithms. */
static const byte crc32_refl_shuf_shift[3 * 16] ALIGNED_16 =
  {
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  };
static const byte crc32_shuf_shift[3 * 16] ALIGNED_16 =
  {
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
    0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  };
static const byte *crc32_bswap_shuf = &crc32_shuf_shift[16];
static const byte crc32_partial_fold_input_mask[16 + 16] ALIGNED_16 =
  {
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
  };
static const u64 crc32_merge9to15_shuf[15 - 9 + 1][2] ALIGNED_16 =
  {
    { U64_C(0x0706050403020100), U64_C(0xffffffffffffff0f) }, /* 9 */
    { U64_C(0x0706050403020100), U64_C(0xffffffffffff0f0e) },
    { U64_C(0x0706050403020100), U64_C(0xffffffffff0f0e0d) },
    { U64_C(0x0706050403020100), U64_C(0xffffffff0f0e0d0c) },
    { U64_C(0x0706050403020100), U64_C(0xffffff0f0e0d0c0b) },
    { U64_C(0x0706050403020100), U64_C(0xffff0f0e0d0c0b0a) },
    { U64_C(0x0706050403020100), U64_C(0xff0f0e0d0c0b0a09) }, /* 15 */
  };
static const u64 crc32_merge5to7_shuf[7 - 5 + 1][2] ALIGNED_16 =
  {
    { U64_C(0xffffff0703020100), U64_C(0xffffffffffffffff) }, /* 5 */
    { U64_C(0xffff070603020100), U64_C(0xffffffffffffffff) },
    { U64_C(0xff07060503020100), U64_C(0xffffffffffffffff) }, /* 7 */
  };

/* PCLMUL functions for reflected CRC32. */
static ASM_FUNC_ATTR_INLINE void
crc32_reflected_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
                      const struct crc32_consts_s *consts)
{
  if (inlen >= 8 * 16)
    {
      asm volatile ("movd %[crc], %%xmm4\n\t"
                    "movdqu %[inbuf_0], %%xmm0\n\t"
                    "movdqu %[inbuf_1], %%xmm1\n\t"
                    "movdqu %[inbuf_2], %%xmm2\n\t"
                    "movdqu %[inbuf_3], %%xmm3\n\t"
                    "pxor %%xmm4, %%xmm0\n\t"
                    :
                    : [inbuf_0] "m" (inbuf[0 * 16]),
                      [inbuf_1] "m" (inbuf[1 * 16]),
                      [inbuf_2] "m" (inbuf[2 * 16]),
                      [inbuf_3] "m" (inbuf[3 * 16]),
                      [crc] "m" (*pcrc)
                    );

      inbuf += 4 * 16;
      inlen -= 4 * 16;

      asm volatile ("movdqa %[k1k2], %%xmm4\n\t"
                    :
                    : [k1k2] "m" (consts->k[1 - 1])
                    );

      /* Fold by 4. */
      while (inlen >= 4 * 16)
        {
          asm volatile ("movdqu %[inbuf_0], %%xmm5\n\t"
                        "movdqa %%xmm0, %%xmm6\n\t"
                        "pclmulqdq $0x00, %%xmm4, %%xmm0\n\t"
                        "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
                        "pxor %%xmm5, %%xmm0\n\t"
                        "pxor %%xmm6, %%xmm0\n\t"

                        "movdqu %[inbuf_1], %%xmm5\n\t"
                        "movdqa %%xmm1, %%xmm6\n\t"
                        "pclmulqdq $0x00, %%xmm4, %%xmm1\n\t"
                        "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
                        "pxor %%xmm5, %%xmm1\n\t"
                        "pxor %%xmm6, %%xmm1\n\t"

                        "movdqu %[inbuf_2], %%xmm5\n\t"
                        "movdqa %%xmm2, %%xmm6\n\t"
                        "pclmulqdq $0x00, %%xmm4, %%xmm2\n\t"
                        "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
                        "pxor %%xmm5, %%xmm2\n\t"
                        "pxor %%xmm6, %%xmm2\n\t"

                        "movdqu %[inbuf_3], %%xmm5\n\t"
                        "movdqa %%xmm3, %%xmm6\n\t"
                        "pclmulqdq $0x00, %%xmm4, %%xmm3\n\t"
                        "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
                        "pxor %%xmm5, %%xmm3\n\t"
                        "pxor %%xmm6, %%xmm3\n\t"
                        :
                        : [inbuf_0] "m" (inbuf[0 * 16]),
                          [inbuf_1] "m" (inbuf[1 * 16]),
                          [inbuf_2] "m" (inbuf[2 * 16]),
                          [inbuf_3] "m" (inbuf[3 * 16])
                        );

          inbuf += 4 * 16;
          inlen -= 4 * 16;
        }

      asm volatile ("movdqa %[k3k4], %%xmm6\n\t"
                    "movdqa %[my_p], %%xmm5\n\t"
                    :
                    : [k3k4] "m" (consts->k[3 - 1]),
                      [my_p] "m" (consts->my_p[0])
                    );

      /* Fold 4 to 1. */

      asm volatile ("movdqa %%xmm0, %%xmm4\n\t"
                    "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
                    "pclmulqdq $0x11, %%xmm6, %%xmm4\n\t"
                    "pxor %%xmm1, %%xmm0\n\t"
                    "pxor %%xmm4, %%xmm0\n\t"

                    "movdqa %%xmm0, %%xmm4\n\t"
                    "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
                    "pclmulqdq $0x11, %%xmm6, %%xmm4\n\t"
                    "pxor %%xmm2, %%xmm0\n\t"
                    "pxor %%xmm4, %%xmm0\n\t"

                    "movdqa %%xmm0, %%xmm4\n\t"
                    "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
                    "pclmulqdq $0x11, %%xmm6, %%xmm4\n\t"
                    "pxor %%xmm3, %%xmm0\n\t"
                    "pxor %%xmm4, %%xmm0\n\t"
                    :
                    :
                    );
    }
  else
    {
      asm volatile ("movd %[crc], %%xmm1\n\t"
                    "movdqu %[inbuf], %%xmm0\n\t"
                    "movdqa %[k3k4], %%xmm6\n\t"
                    "pxor %%xmm1, %%xmm0\n\t"
                    "movdqa %[my_p], %%xmm5\n\t"
                    :
                    : [inbuf] "m" (*inbuf),
                      [crc] "m" (*pcrc),
                      [k3k4] "m" (consts->k[3 - 1]),
                      [my_p] "m" (consts->my_p[0])
                    );

      inbuf += 16;
      inlen -= 16;
    }

  /* Fold by 1. */
  if (inlen >= 16)
    {
      while (inlen >= 16)
        {
          /* Load next block to XMM2. Fold XMM0 to XMM0:XMM1. */
          asm volatile ("movdqu %[inbuf], %%xmm2\n\t"
                        "movdqa %%xmm0, %%xmm1\n\t"
                        "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
                        "pclmulqdq $0x11, %%xmm6, %%xmm1\n\t"
                        "pxor %%xmm2, %%xmm0\n\t"
                        "pxor %%xmm1, %%xmm0\n\t"
                        :
                        : [inbuf] "m" (*inbuf)
                        );

          inbuf += 16;
          inlen -= 16;
        }
    }

  /* Partial fold. */
  if (inlen)
    {
      /* Load last input and add padding zeros. */
      asm volatile ("movdqu %[shr_shuf], %%xmm3\n\t"
                    "movdqu %[shl_shuf], %%xmm4\n\t"
                    "movdqu %[mask], %%xmm2\n\t"

                    "movdqa %%xmm0, %%xmm1\n\t"
                    "pshufb %%xmm4, %%xmm0\n\t"
                    "movdqu %[inbuf], %%xmm4\n\t"
                    "pshufb %%xmm3, %%xmm1\n\t"
                    "pand %%xmm4, %%xmm2\n\t"
                    "por %%xmm1, %%xmm2\n\t"

                    "movdqa %%xmm0, %%xmm1\n\t"
                    "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
                    "pclmulqdq $0x11, %%xmm6, %%xmm1\n\t"
                    "pxor %%xmm2, %%xmm0\n\t"
                    "pxor %%xmm1, %%xmm0\n\t"
                    :
                    : [inbuf] "m" (*(inbuf - 16 + inlen)),
                      [mask] "m" (crc32_partial_fold_input_mask[inlen]),
                      [shl_shuf] "m" (crc32_refl_shuf_shift[inlen]),
                      [shr_shuf] "m" (crc32_refl_shuf_shift[inlen + 16])
                    );

      inbuf += inlen;
      inlen -= inlen;
    }

  /* Final fold. */
  asm volatile (/* reduce 128-bits to 96-bits */
                "movdqa %%xmm0, %%xmm1\n\t"
                "pclmulqdq $0x10, %%xmm6, %%xmm0\n\t"
                "psrldq $8, %%xmm1\n\t"
                "pxor %%xmm1, %%xmm0\n\t"

                /* reduce 96-bits to 64-bits */
                "pshufd $0xfc, %%xmm0, %%xmm1\n\t"   /* [00][00][00][x] */
                "pshufd $0xf9, %%xmm0, %%xmm0\n\t"   /* [00][00][x>>64][x>>32] */
                "pclmulqdq $0x00, %[k5], %%xmm1\n\t" /* [00][00][xx][xx] */
                "pxor %%xmm1, %%xmm0\n\t"            /* top 64-bit are zero */

                /* barrett reduction */
                "pshufd $0xf3, %%xmm0, %%xmm1\n\t"   /* [00][00][x>>32][00] */
                "pslldq $4, %%xmm0\n\t"              /* [??][x>>32][??][??] */
                "pclmulqdq $0x00, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
                "pclmulqdq $0x10, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
                "pxor %%xmm1, %%xmm0\n\t"

                /* store CRC */
                "pextrd $2, %%xmm0, %[out]\n\t"
                : [out] "=m" (*pcrc)
                : [k5] "m" (consts->k[5 - 1])
                );
}

static ASM_FUNC_ATTR_INLINE void
crc32_reflected_less_than_16 (u32 *pcrc, const byte *inbuf, size_t inlen,
                              const struct crc32_consts_s *consts)
{
  if (inlen < 4)
    {
      u32 crc = *pcrc;
      u32 data;

      asm volatile ("movdqa %[my_p], %%xmm5\n\t"
                    :
                    : [my_p] "m" (consts->my_p[0])
                    );

      if (inlen == 1)
        {
          data = inbuf[0];
          data ^= crc;
          data <<= 24;
          crc >>= 8;
        }
      else if (inlen == 2)
        {
          data = ((const struct u16_unaligned_s *)inbuf)->a;
          data ^= crc;
          data <<= 16;
          crc >>= 16;
        }
      else
        {
          data = ((const struct u16_unaligned_s *)inbuf)->a;
          data |= inbuf[2] << 16;
          data ^= crc;
          data <<= 8;
          crc >>= 24;
        }

      /* Barrett reduction */
      asm volatile ("movd %[in], %%xmm0\n\t"
                    "movd %[crc], %%xmm1\n\t"

                    "pclmulqdq $0x00, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
                    "psllq $32, %%xmm1\n\t"
                    "pshufd $0xfc, %%xmm0, %%xmm0\n\t"    /* [00][00][00][x] */
                    "pclmulqdq $0x10, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
                    "pxor %%xmm1, %%xmm0\n\t"

                    "pextrd $1, %%xmm0, %[out]\n\t"
                    : [out] "=m" (*pcrc)
                    : [in] "rm" (data),
                      [crc] "rm" (crc)
                    );
    }
  else if (inlen == 4)
    {
      /* Barrett reduction */
      asm volatile ("movd %[crc], %%xmm1\n\t"
                    "movd %[in], %%xmm0\n\t"
                    "movdqa %[my_p], %%xmm5\n\t"
                    "pxor %%xmm1, %%xmm0\n\t"

                    "pclmulqdq $0x00, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
                    "pshufd $0xfc, %%xmm0, %%xmm0\n\t"    /* [00][00][00][x] */
                    "pclmulqdq $0x10, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */

                    "pextrd $1, %%xmm0, %[out]\n\t"
                    : [out] "=m" (*pcrc)
                    : [in] "m" (*inbuf),
                      [crc] "m" (*pcrc),
                      [my_p] "m" (consts->my_p[0])
                    );
    }
  else
    {
      asm volatile ("movdqu %[shuf], %%xmm4\n\t"
                    "movd %[crc], %%xmm1\n\t"
                    "movdqa %[my_p], %%xmm5\n\t"
                    "movdqa %[k3k4], %%xmm6\n\t"
                    :
                    : [shuf] "m" (crc32_refl_shuf_shift[inlen]),
                      [crc] "m" (*pcrc),
                      [my_p] "m" (consts->my_p[0]),
                      [k3k4] "m" (consts->k[3 - 1])
                    );

      if (inlen >= 8)
        {
          asm volatile ("movq %[inbuf], %%xmm0\n\t"
                        :
                        : [inbuf] "m" (*inbuf)
                        );
          if (inlen > 8)
            {
              asm volatile (/*"pinsrq $1, %[inbuf_tail], %%xmm0\n\t"*/
                            "movq %[inbuf_tail], %%xmm2\n\t"
                            "punpcklqdq %%xmm2, %%xmm0\n\t"
                            "pshufb %[merge_shuf], %%xmm0\n\t"
                            :
                            : [inbuf_tail] "m" (inbuf[inlen - 8]),
                              [merge_shuf] "m"
                              (*crc32_merge9to15_shuf[inlen - 9])
                            );
            }
        }
      else
        {
          asm volatile ("movd %[inbuf], %%xmm0\n\t"
                        "pinsrd $1, %[inbuf_tail], %%xmm0\n\t"
                        "pshufb %[merge_shuf], %%xmm0\n\t"
                        :
                        : [inbuf] "m" (*inbuf),
                          [inbuf_tail] "m" (inbuf[inlen - 4]),
                          [merge_shuf] "m"
                          (*crc32_merge5to7_shuf[inlen - 5])
                        );
        }

      /* Final fold. */
      asm volatile ("pxor %%xmm1, %%xmm0\n\t"
                    "pshufb %%xmm4, %%xmm0\n\t"

                    /* reduce 128-bits to 96-bits */
                    "movdqa %%xmm0, %%xmm1\n\t"
                    "pclmulqdq $0x10, %%xmm6, %%xmm0\n\t"
                    "psrldq $8, %%xmm1\n\t"
                    "pxor %%xmm1, %%xmm0\n\t"            /* top 32-bit are zero */

                    /* reduce 96-bits to 64-bits */
                    "pshufd $0xfc, %%xmm0, %%xmm1\n\t"   /* [00][00][00][x] */
                    "pshufd $0xf9, %%xmm0, %%xmm0\n\t"   /* [00][00][x>>64][x>>32] */
                    "pclmulqdq $0x00, %[k5], %%xmm1\n\t" /* [00][00][xx][xx] */
                    "pxor %%xmm1, %%xmm0\n\t"            /* top 64-bit are zero */

                    /* barrett reduction */
                    "pshufd $0xf3, %%xmm0, %%xmm1\n\t"   /* [00][00][x>>32][00] */
                    "pslldq $4, %%xmm0\n\t"              /* [??][x>>32][??][??] */
                    "pclmulqdq $0x00, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
                    "pclmulqdq $0x10, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
                    "pxor %%xmm1, %%xmm0\n\t"

                    /* store CRC */
                    "pextrd $2, %%xmm0, %[out]\n\t"
                    : [out] "=m" (*pcrc)
                    : [k5] "m" (consts->k[5 - 1])
                    );
    }
}

/* PCLMUL functions for non-reflected CRC32. */
static ASM_FUNC_ATTR_INLINE void
crc32_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
            const struct crc32_consts_s *consts)
{
  asm volatile ("movdqa %[bswap], %%xmm7\n\t"
                :
                : [bswap] "m" (*crc32_bswap_shuf)
                );

  if (inlen >= 8 * 16)
    {
      asm volatile ("movd %[crc], %%xmm4\n\t"
                    "movdqu %[inbuf_0], %%xmm0\n\t"
                    "movdqu %[inbuf_1], %%xmm1\n\t"
                    "movdqu %[inbuf_2], %%xmm2\n\t"
                    "pxor %%xmm4, %%xmm0\n\t"
                    "movdqu %[inbuf_3], %%xmm3\n\t"
                    "pshufb %%xmm7, %%xmm0\n\t"
                    "pshufb %%xmm7, %%xmm1\n\t"
                    "pshufb %%xmm7, %%xmm2\n\t"
                    "pshufb %%xmm7, %%xmm3\n\t"
                    :
                    : [inbuf_0] "m" (inbuf[0 * 16]),
                      [inbuf_1] "m" (inbuf[1 * 16]),
                      [inbuf_2] "m" (inbuf[2 * 16]),
                      [inbuf_3] "m" (inbuf[3 * 16]),
                      [crc] "m" (*pcrc)
                    );

      inbuf += 4 * 16;
      inlen -= 4 * 16;

      asm volatile ("movdqa %[k1k2], %%xmm4\n\t"
                    :
                    : [k1k2] "m" (consts->k[1 - 1])
                    );

      /* Fold by 4. */
      while (inlen >= 4 * 16)
        {
          asm volatile ("movdqu %[inbuf_0], %%xmm5\n\t"
                        "movdqa %%xmm0, %%xmm6\n\t"
                        "pshufb %%xmm7, %%xmm5\n\t"
                        "pclmulqdq $0x01, %%xmm4, %%xmm0\n\t"
                        "pclmulqdq $0x10, %%xmm4, %%xmm6\n\t"
                        "pxor %%xmm5, %%xmm0\n\t"
                        "pxor %%xmm6, %%xmm0\n\t"

                        "movdqu %[inbuf_1], %%xmm5\n\t"
                        "movdqa %%xmm1, %%xmm6\n\t"
                        "pshufb %%xmm7, %%xmm5\n\t"
                        "pclmulqdq $0x01, %%xmm4, %%xmm1\n\t"
                        "pclmulqdq $0x10, %%xmm4, %%xmm6\n\t"
                        "pxor %%xmm5, %%xmm1\n\t"
                        "pxor %%xmm6, %%xmm1\n\t"

                        "movdqu %[inbuf_2], %%xmm5\n\t"
                        "movdqa %%xmm2, %%xmm6\n\t"
                        "pshufb %%xmm7, %%xmm5\n\t"
                        "pclmulqdq $0x01, %%xmm4, %%xmm2\n\t"
                        "pclmulqdq $0x10, %%xmm4, %%xmm6\n\t"
                        "pxor %%xmm5, %%xmm2\n\t"
                        "pxor %%xmm6, %%xmm2\n\t"

                        "movdqu %[inbuf_3], %%xmm5\n\t"
                        "movdqa %%xmm3, %%xmm6\n\t"
                        "pshufb %%xmm7, %%xmm5\n\t"
                        "pclmulqdq $0x01, %%xmm4, %%xmm3\n\t"
                        "pclmulqdq $0x10, %%xmm4, %%xmm6\n\t"
                        "pxor %%xmm5, %%xmm3\n\t"
                        "pxor %%xmm6, %%xmm3\n\t"
                        :
                        : [inbuf_0] "m" (inbuf[0 * 16]),
                          [inbuf_1] "m" (inbuf[1 * 16]),
                          [inbuf_2] "m" (inbuf[2 * 16]),
                          [inbuf_3] "m" (inbuf[3 * 16])
                        );

          inbuf += 4 * 16;
          inlen -= 4 * 16;
        }

      asm volatile ("movdqa %[k3k4], %%xmm6\n\t"
                    "movdqa %[my_p], %%xmm5\n\t"
                    :
                    : [k3k4] "m" (consts->k[3 - 1]),
                      [my_p] "m" (consts->my_p[0])
                    );

      /* Fold 4 to 1. */

      asm volatile ("movdqa %%xmm0, %%xmm4\n\t"
                    "pclmulqdq $0x01, %%xmm6, %%xmm0\n\t"
                    "pclmulqdq $0x10, %%xmm6, %%xmm4\n\t"
                    "pxor %%xmm1, %%xmm0\n\t"
                    "pxor %%xmm4, %%xmm0\n\t"

                    "movdqa %%xmm0, %%xmm4\n\t"
                    "pclmulqdq $0x01, %%xmm6, %%xmm0\n\t"
                    "pclmulqdq $0x10, %%xmm6, %%xmm4\n\t"
                    "pxor %%xmm2, %%xmm0\n\t"
                    "pxor %%xmm4, %%xmm0\n\t"

                    "movdqa %%xmm0, %%xmm4\n\t"
                    "pclmulqdq $0x01, %%xmm6, %%xmm0\n\t"
                    "pclmulqdq $0x10, %%xmm6, %%xmm4\n\t"
                    "pxor %%xmm3, %%xmm0\n\t"
                    "pxor %%xmm4, %%xmm0\n\t"
                    :
                    :
                    );
    }
  else
    {
      asm volatile ("movd %[crc], %%xmm1\n\t"
                    "movdqu %[inbuf], %%xmm0\n\t"
                    "movdqa %[k3k4], %%xmm6\n\t"
                    "pxor %%xmm1, %%xmm0\n\t"
                    "movdqa %[my_p], %%xmm5\n\t"
                    "pshufb %%xmm7, %%xmm0\n\t"
                    :
                    : [inbuf] "m" (*inbuf),
                      [crc] "m" (*pcrc),
                      [k3k4] "m" (consts->k[3 - 1]),
                      [my_p] "m" (consts->my_p[0])
                    );

      inbuf += 16;
      inlen -= 16;
    }

  /* Fold by 1. */
  if (inlen >= 16)
    {
      while (inlen >= 16)
        {
          /* Load next block to XMM2. Fold XMM0 to XMM0:XMM1. */
          asm volatile ("movdqu %[inbuf], %%xmm2\n\t"
                        "movdqa %%xmm0, %%xmm1\n\t"
                        "pclmulqdq $0x01, %%xmm6, %%xmm0\n\t"
                        "pshufb %%xmm7, %%xmm2\n\t"
                        "pclmulqdq $0x10, %%xmm6, %%xmm1\n\t"
                        "pxor %%xmm2, %%xmm0\n\t"
                        "pxor %%xmm1, %%xmm0\n\t"
                        :
                        : [inbuf] "m" (*inbuf)
                        );

          inbuf += 16;
          inlen -= 16;
        }
    }

  /* Partial fold. */
  if (inlen)
    {
      /* Load last input and add padding zeros. */
      asm volatile ("movdqu %[shl_shuf], %%xmm4\n\t"
                    "movdqu %[shr_shuf], %%xmm3\n\t"
                    "movdqu %[mask], %%xmm2\n\t"

                    "movdqa %%xmm0, %%xmm1\n\t"
                    "pshufb %%xmm4, %%xmm0\n\t"
                    "movdqu %[inbuf], %%xmm4\n\t"
                    "pshufb %%xmm3, %%xmm1\n\t"
                    "pand %%xmm4, %%xmm2\n\t"
                    "por %%xmm1, %%xmm2\n\t"

                    "pshufb %%xmm7, %%xmm2\n\t"

                    "movdqa %%xmm0, %%xmm1\n\t"
                    "pclmulqdq $0x01, %%xmm6, %%xmm0\n\t"
                    "pclmulqdq $0x10, %%xmm6, %%xmm1\n\t"
                    "pxor %%xmm2, %%xmm0\n\t"
                    "pxor %%xmm1, %%xmm0\n\t"
                    :
                    : [inbuf] "m" (*(inbuf - 16 + inlen)),
                      [mask] "m" (crc32_partial_fold_input_mask[inlen]),
                      [shl_shuf] "m" (crc32_refl_shuf_shift[32 - inlen]),
                      [shr_shuf] "m" (crc32_shuf_shift[inlen + 16])
                    );

      inbuf += inlen;
      inlen -= inlen;
    }

  /* Final fold. */
  asm volatile (/* reduce 128-bits to 96-bits */
                "movdqa %%xmm0, %%xmm1\n\t"
                "pclmulqdq $0x11, %%xmm6, %%xmm0\n\t"
                "pslldq $8, %%xmm1\n\t"
                "pxor %%xmm1, %%xmm0\n\t"            /* bottom 32-bit are zero */

                /* reduce 96-bits to 64-bits */
                "pshufd $0x30, %%xmm0, %%xmm1\n\t"   /* [00][x>>96][00][00] */
                "pshufd $0x24, %%xmm0, %%xmm0\n\t"   /* [00][xx][xx][00] */
                "pclmulqdq $0x01, %[k5], %%xmm1\n\t" /* [00][xx][xx][00] */
                "pxor %%xmm1, %%xmm0\n\t"            /* top and bottom 32-bit are zero */

                /* barrett reduction */
                "pshufd $0x01, %%xmm0, %%xmm1\n\t"   /* [00][00][00][x>>32] */
                "pclmulqdq $0x01, %%xmm5, %%xmm0\n\t" /* [00][xx][xx][xx] */
                "psrldq $4, %%xmm0\n\t"              /* [00][00][xx][xx] */
                "pclmulqdq $0x10, %%xmm5, %%xmm0\n\t"
                "pxor %%xmm1, %%xmm0\n\t"

                /* store CRC in input endian */
                "movd %%xmm0, %%eax\n\t"
                "bswapl %%eax\n\t"
                "movl %%eax, %[out]\n\t"
                : [out] "=m" (*pcrc)
                : [k5] "m" (consts->k[5 - 1])
                : "eax" );
}

static ASM_FUNC_ATTR_INLINE void
crc32_less_than_16 (u32 *pcrc, const byte *inbuf, size_t inlen,
                    const struct crc32_consts_s *consts)
{
  if (inlen < 4)
    {
      u32 crc = *pcrc;
      u32 data;

      asm volatile ("movdqa %[my_p], %%xmm5\n\t"
                    :
                    : [my_p] "m" (consts->my_p[0])
                    );

      if (inlen == 1)
        {
          data = inbuf[0];
          data ^= crc;
          data = _gcry_bswap32(data << 24);
          crc = _gcry_bswap32(crc >> 8);
        }
      else if (inlen == 2)
        {
          data = ((const struct u16_unaligned_s *)inbuf)->a;
          data ^= crc;
          data = _gcry_bswap32(data << 16);
          crc = _gcry_bswap32(crc >> 16);
        }
      else
        {
          data = ((const struct u16_unaligned_s *)inbuf)->a;
          data |= inbuf[2] << 16;
          data ^= crc;
          data = _gcry_bswap32(data << 8);
          crc = _gcry_bswap32(crc >> 24);
        }

      /* Barrett reduction */
      asm volatile ("movd %[in], %%xmm0\n\t"
                    "psllq $32, %%xmm0\n\t"               /* [00][00][xx][00] */
                    "movd %[crc], %%xmm1\n\t"

                    "pclmulqdq $0x00, %%xmm5, %%xmm0\n\t" /* [00][xx][xx][00] */
                    "pclmulqdq $0x11, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
                    "pxor %%xmm1, %%xmm0\n\t"

                    /* store CRC in input endian */
                    "movd %%xmm0, %%eax\n\t"
                    "bswapl %%eax\n\t"
                    "movl %%eax, %[out]\n\t"
                    : [out] "=m" (*pcrc)
                    : [in] "r" (data),
                      [crc] "r" (crc)
                    : "eax" );
    }
  else if (inlen == 4)
    {
      /* Barrett reduction */
      asm volatile ("movd %[crc], %%xmm0\n\t"
                    "movd %[in], %%xmm1\n\t"
                    "movdqa %[my_p], %%xmm5\n\t"
                    :
                    : [in] "m" (*inbuf),
                      [crc] "m" (*pcrc),
                      [my_p] "m" (consts->my_p[0])
                    : "cc" );

      asm volatile ("pxor %%xmm1, %%xmm0\n\t"
                    "pshufb %[bswap], %%xmm0\n\t"         /* [xx][00][00][00] */

                    "pclmulqdq $0x01, %%xmm5, %%xmm0\n\t" /* [00][xx][xx][00] */
                    "pclmulqdq $0x11, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
                    :
                    : [bswap] "m" (*crc32_bswap_shuf)
                    : "cc" );

      asm volatile (/* store CRC in input endian */
                    "movd %%xmm0, %%eax\n\t"
                    "bswapl %%eax\n\t"
                    "movl %%eax, %[out]\n\t"
                    : [out] "=m" (*pcrc)
                    :
                    : "eax", "cc" );
    }
  else
    {
      asm volatile ("movdqu %[shuf], %%xmm7\n\t"
                    "movd %[crc], %%xmm1\n\t"
                    "movdqa %[my_p], %%xmm5\n\t"
                    "movdqa %[k3k4], %%xmm6\n\t"
                    :
                    : [shuf] "m" (crc32_shuf_shift[32 - inlen]),
                      [crc] "m" (*pcrc),
                      [my_p] "m" (consts->my_p[0]),
                      [k3k4] "m" (consts->k[3 - 1])
                    );

      if (inlen >= 8)
        {
          asm volatile ("movq %[inbuf], %%xmm0\n\t"
                        :
                        : [inbuf] "m" (*inbuf)
                        );
          if (inlen > 8)
            {
              asm volatile (/*"pinsrq $1, %[inbuf_tail], %%xmm0\n\t"*/
                            "movq %[inbuf_tail], %%xmm2\n\t"
                            "punpcklqdq %%xmm2, %%xmm0\n\t"
                            "pshufb %[merge_shuf], %%xmm0\n\t"
                            :
                            : [inbuf_tail] "m" (inbuf[inlen - 8]),
                              [merge_shuf] "m"
                              (*crc32_merge9to15_shuf[inlen - 9])
                            );
            }
        }
      else
        {
          asm volatile ("movd %[inbuf], %%xmm0\n\t"
                        "pinsrd $1, %[inbuf_tail], %%xmm0\n\t"
                        "pshufb %[merge_shuf], %%xmm0\n\t"
                        :
                        : [inbuf] "m" (*inbuf),
                          [inbuf_tail] "m" (inbuf[inlen - 4]),
                          [merge_shuf] "m"
                          (*crc32_merge5to7_shuf[inlen - 5])
                        );
        }

      /* Final fold. */
      asm volatile ("pxor %%xmm1, %%xmm0\n\t"
                    "pshufb %%xmm7, %%xmm0\n\t"

                    /* reduce 128-bits to 96-bits */
                    "movdqa %%xmm0, %%xmm1\n\t"
                    "pclmulqdq $0x11, %%xmm6, %%xmm0\n\t"
                    "pslldq $8, %%xmm1\n\t"
                    "pxor %%xmm1, %%xmm0\n\t"            /* bottom 32-bit are zero */

                    /* reduce 96-bits to 64-bits */
                    "pshufd $0x30, %%xmm0, %%xmm1\n\t"   /* [00][x>>96][00][00] */
                    "pshufd $0x24, %%xmm0, %%xmm0\n\t"   /* [00][xx][xx][00] */
                    "pclmulqdq $0x01, %[k5], %%xmm1\n\t" /* [00][xx][xx][00] */
                    "pxor %%xmm1, %%xmm0\n\t"            /* top and bottom 32-bit are zero */

                    /* barrett reduction */
                    "pshufd $0x01, %%xmm0, %%xmm1\n\t"   /* [00][00][00][x>>32] */
                    "pclmulqdq $0x01, %%xmm5, %%xmm0\n\t" /* [00][xx][xx][xx] */
                    "psrldq $4, %%xmm0\n\t"              /* [00][00][xx][xx] */
                    "pclmulqdq $0x10, %%xmm5, %%xmm0\n\t"
                    "pxor %%xmm1, %%xmm0\n\t"

                    /* store CRC in input endian */
                    "movd %%xmm0, %%eax\n\t"
                    "bswapl %%eax\n\t"
                    "movl %%eax, %[out]\n\t"
                    : [out] "=m" (*pcrc)
                    : [k5] "m" (consts->k[5 - 1])
                    : "eax" );
    }
}

void ASM_FUNC_ATTR
_gcry_crc32_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen)
{
  const struct crc32_consts_s *consts = &crc32_consts;
#if defined(__x86_64__) && defined(__WIN64__)
  char win64tmp[2 * 16];

  /* XMM6-XMM7 need to be restored after use. */
  asm volatile ("movdqu %%xmm6, 0*16(%0)\n\t"
                "movdqu %%xmm7, 1*16(%0)\n\t"
                :
                : "r" (win64tmp)
                : "memory");
#endif

  if (!inlen)
    return;

  if (inlen >= 16)
    crc32_reflected_bulk(pcrc, inbuf, inlen, consts);
  else
    crc32_reflected_less_than_16(pcrc, inbuf, inlen, consts);

#if defined(__x86_64__) && defined(__WIN64__)
  /* Restore used registers. */
  asm volatile("movdqu 0*16(%0), %%xmm6\n\t"
               "movdqu 1*16(%0), %%xmm7\n\t"
               :
               : "r" (win64tmp)
               : "memory");
#endif
}

void ASM_FUNC_ATTR
_gcry_crc24rfc2440_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen)
{
  const struct crc32_consts_s *consts = &crc24rfc2440_consts;
#if defined(__x86_64__) && defined(__WIN64__)
  char win64tmp[2 * 16];

  /* XMM6-XMM7 need to be restored after use. */
  asm volatile ("movdqu %%xmm6, 0*16(%0)\n\t"
                "movdqu %%xmm7, 1*16(%0)\n\t"
                :
                : "r" (win64tmp)
                : "memory");
#endif

  if (!inlen)
    return;

  /* Note: *pcrc in input endian. */

  if (inlen >= 16)
    crc32_bulk(pcrc, inbuf, inlen, consts);
  else
    crc32_less_than_16(pcrc, inbuf, inlen, consts);

#if defined(__x86_64__) && defined(__WIN64__)
  /* Restore used registers. */
  asm volatile("movdqu 0*16(%0), %%xmm6\n\t"
               "movdqu 1*16(%0), %%xmm7\n\t"
               :
               : "r" (win64tmp)
               : "memory");
#endif
}

#if __clang__
# pragma clang attribute pop
#endif

#endif /* ENABLE_PCLMUL_SUPPORT && ENABLE_SSE41_SUPPORT */
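
The reflected constants above encode the standard CRC-32 generator: my_p[1] = 0x1db710641 is the 33-bit P(x), whose bit-reflected 32-bit form is the familiar 0xedb88320. As a cross-check for the folded path, here is a minimal bitwise reference in plain C. It is an illustrative sketch, not part of the patch: crc32_ref is a name invented here, and the 0xffffffff init/final-XOR pair is the usual caller-side CRC-32 convention (in libgcrypt it is applied by the generic CRC code around this routine, not inside _gcry_crc32_intel_pclmul itself).

    /* crc32-ref-check.c - bitwise reference for the reflected CRC-32 above.
       Illustrative sketch only; crc32_ref is a name invented here. */
    #include <stdint.h>
    #include <stddef.h>
    #include <stdio.h>

    static uint32_t
    crc32_ref (uint32_t crc, const uint8_t *buf, size_t len)
    {
      size_t i;
      int bit;

      for (i = 0; i < len; i++)
        {
          crc ^= buf[i];
          for (bit = 0; bit < 8; bit++)
            /* 0xedb88320 is P(x) = 0x1db710641 from crc32_consts, bit-reflected. */
            crc = (crc >> 1) ^ (0xedb88320u & -(crc & 1u));
        }
      return crc;
    }

    int
    main (void)
    {
      static const uint8_t data[9] = "123456789";
      uint32_t crc = 0xffffffffu;   /* standard CRC-32 init */

      crc = crc32_ref (crc, data, sizeof data);
      crc ^= 0xffffffffu;           /* standard final XOR */
      printf ("crc32(\"123456789\") = 0x%08x\n", crc); /* expect 0xcbf43926 */
      return 0;
    }

On PCLMUL-capable hardware, substituting _gcry_crc32_intel_pclmul (&crc, data, sizeof data) for the crc32_ref call under the same init/final-XOR convention should leave the identical check value 0xcbf43926 in crc.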
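For the CRC24 path, the "polynomial multiplied with x⁸" note is the whole trick: the RFC 2440 (OpenPGP) generator 0x1864cfb is pre-shifted by eight bits (my_p[1] = 0x1864cfb00), so the 24-bit state rides in the top three bytes of each 32-bit lane and the 32-bit folding machinery is reused unchanged. The sketch below shows the same representation bitwise; it is illustrative only, crc24rfc2440_ref is a name invented here, and the init value 0xb704ce comes from RFC 2440.

    /* crc24-ref-check.c - bitwise sketch of the x^8-premultiplied CRC-24.
       Illustrative only; crc24rfc2440_ref is a name invented here. */
    #include <stdint.h>
    #include <stddef.h>
    #include <stdio.h>

    static uint32_t
    crc24rfc2440_ref (const uint8_t *buf, size_t len)
    {
      /* State = 24-bit CRC kept in the top bytes of a 32-bit word, i.e.
         multiplied by x^8 to match the shifted generator 0x864cfb00; the
         implicit x^24 bit falls off the top of the register on shift,
         just as the PCLMUL constants above assume. */
      uint32_t s = 0xb704ceu << 8;  /* RFC 2440 init value, pre-shifted */
      size_t i;
      int bit;

      for (i = 0; i < len; i++)
        {
          s ^= (uint32_t)buf[i] << 24;
          for (bit = 0; bit < 8; bit++)
            /* Bit 31 of the shifted state is the CRC's carry bit. */
            s = (s << 1) ^ (0x864cfb00u & -(uint32_t)(s >> 31));
        }
      return s >> 8;                /* undo the x^8 pre-multiplication */
    }

    int
    main (void)
    {
      static const uint8_t data[9] = "123456789";

      /* Expect the commonly cited CRC-24/OpenPGP check value 0x21cf02. */
      printf ("crc24(\"123456789\") = 0x%06x\n",
              crc24rfc2440_ref (data, sizeof data));
      return 0;
    }

Keeping the state pre-shifted is what lets bit 31 serve directly as the carry, so the very same pshufd/pclmulqdq reduction sequence works for both the 32-bit and the 24-bit polynomial.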