/* * Calculate the checksum of data that is 16 byte aligned and a multiple of * 16 bytes. * * The first step is to reduce it to 1024 bits. We do this in 8 parallel * chunks in order to mask the latency of the vpmsum instructions. If we * have more than 32 kB of data to checksum we repeat this step multiple * times, passing in the previous 1024 bits. * * The next step is to reduce the 1024 bits to 64 bits. This step adds * 32 bits of 0s to the end - this matches what a CRC does. We just * calculate constants that land the data in this 32 bits. * * We then use fixed point Barrett reduction to compute a mod n over GF(2) * for n = CRC using POWER8 instructions. We use x = 32. * * http://en.wikipedia.org/wiki/Barrett_reduction * * This code uses gcc vector builtins instead using assembly directly. * * Copyright (C) 2017 Rogerio Alves , IBM * * This program is free software; you can redistribute it and/or * modify it under the terms of either: * * a) the GNU General Public License as published by the Free Software * Foundation; either version 2 of the License, or (at your option) * any later version, or * b) the Apache License, Version 2.0 */ #include #define VMX_ALIGN 16 #define VMX_ALIGN_MASK (VMX_ALIGN-1) #ifdef REFLECT static unsigned int crc32_align(unsigned int crc, const unsigned char *p, unsigned long len) { while (len--) crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8); return crc; } #else static unsigned int crc32_align(unsigned int crc, const unsigned char *p, unsigned long len) { while (len--) crc = crc_table[((crc >> 24) ^ *p++) & 0xff] ^ (crc << 8); return crc; } #endif static unsigned int __attribute__ ((aligned (32))) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len); unsigned int CRC32_FUNCTION(unsigned int crc, const unsigned char *p, unsigned long len) { unsigned int prealign; unsigned int tail; #ifdef CRC_XOR crc ^= 0xffffffff; #endif if (len < VMX_ALIGN + VMX_ALIGN_MASK) { crc = crc32_align(crc, p, len); goto out; } if ((unsigned long)p & VMX_ALIGN_MASK) { prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK); crc = crc32_align(crc, p, prealign); len -= prealign; p += prealign; } crc = __crc32_vpmsum(crc, p, len & ~VMX_ALIGN_MASK); tail = len & VMX_ALIGN_MASK; if (tail) { p += len & ~VMX_ALIGN_MASK; crc = crc32_align(crc, p, tail); } out: #ifdef CRC_XOR crc ^= 0xffffffff; #endif return crc; } #if defined (__clang__) #include "clang_workaround.h" #else #define __builtin_pack_vector(a, b) __builtin_pack_vector_int128 ((a), (b)) #define __builtin_unpack_vector_0(a) __builtin_unpack_vector_int128 ((vector __int128_t)(a), 0) #define __builtin_unpack_vector_1(a) __builtin_unpack_vector_int128 ((vector __int128_t)(a), 1) #endif /* When we have a load-store in a single-dispatch group and address overlap * such that foward is not allowed (load-hit-store) the group must be flushed. * A group ending NOP prevents the flush. */ #define GROUP_ENDING_NOP asm("ori 2,2,0" ::: "memory") #if defined(__BIG_ENDIAN__) && defined (REFLECT) #define BYTESWAP_DATA #elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT) #define BYTESWAP_DATA #endif #ifdef BYTESWAP_DATA #define VEC_PERM(vr, va, vb, vc) vr = vec_perm(va, vb,\ (__vector unsigned char) vc) #if defined(__LITTLE_ENDIAN__) /* Byte reverse permute constant LE. */ static const __vector unsigned long long vperm_const __attribute__ ((aligned(16))) = { 0x08090A0B0C0D0E0FUL, 0x0001020304050607UL }; #else static const __vector unsigned long long vperm_const __attribute__ ((aligned(16))) = { 0x0F0E0D0C0B0A0908UL, 0X0706050403020100UL }; #endif #else #define VEC_PERM(vr, va, vb, vc) #endif static unsigned int __attribute__ ((aligned (32))) __crc32_vpmsum(unsigned int crc, const void* p, unsigned long len) { const __vector unsigned long long vzero = {0,0}; const __vector unsigned long long vones = {0xffffffffffffffffUL, 0xffffffffffffffffUL}; #ifdef REFLECT __vector unsigned char vsht_splat; const __vector unsigned long long vmask_32bit = (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 4); #endif const __vector unsigned long long vmask_64bit = (__vector unsigned long long)vec_sld((__vector unsigned char)vzero, (__vector unsigned char)vones, 8); __vector unsigned long long vcrc; __vector unsigned long long vconst1, vconst2; /* vdata0-vdata7 will contain our data (p). */ __vector unsigned long long vdata0, vdata1, vdata2, vdata3, vdata4, vdata5, vdata6, vdata7; /* v0-v7 will contain our checksums */ __vector unsigned long long v0 = {0,0}; __vector unsigned long long v1 = {0,0}; __vector unsigned long long v2 = {0,0}; __vector unsigned long long v3 = {0,0}; __vector unsigned long long v4 = {0,0}; __vector unsigned long long v5 = {0,0}; __vector unsigned long long v6 = {0,0}; __vector unsigned long long v7 = {0,0}; /* Vector auxiliary variables. */ __vector unsigned long long va0, va1, va2, va3, va4, va5, va6, va7; unsigned int result = 0; unsigned int offset; /* Constant table offset. */ unsigned long i; /* Counter. */ unsigned long chunks; unsigned long block_size; int next_block = 0; /* Align by 128 bits. The last 128 bit block will be processed at end. */ unsigned long length = len & 0xFFFFFFFFFFFFFF80UL; #ifdef REFLECT vcrc = (__vector unsigned long long)__builtin_pack_vector(0UL, crc); #else vcrc = (__vector unsigned long long)__builtin_pack_vector(crc, 0UL); /* Shift into top 32 bits */ vcrc = (__vector unsigned long long)vec_sld((__vector unsigned char)vcrc, (__vector unsigned char)vzero, 4); #endif /* Short version. */ if (len < 256) { /* Calculate where in the constant table we need to start. */ offset = 256 - len; vconst1 = vec_ld(offset, vcrc_short_const); vdata0 = vec_ld(0, (__vector unsigned long long*) p); VEC_PERM(vdata0, vdata0, vconst1, vperm_const); /* xor initial value*/ vdata0 = vec_xor(vdata0, vcrc); vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw ((__vector unsigned int)vdata0, (__vector unsigned int)vconst1); v0 = vec_xor(v0, vdata0); for (i = 16; i < len; i += 16) { vconst1 = vec_ld(offset + i, vcrc_short_const); vdata0 = vec_ld(i, (__vector unsigned long long*) p); VEC_PERM(vdata0, vdata0, vconst1, vperm_const); vdata0 = (__vector unsigned long long) __builtin_crypto_vpmsumw ((__vector unsigned int)vdata0, (__vector unsigned int)vconst1); v0 = vec_xor(v0, vdata0); } } else { /* Load initial values. */ vdata0 = vec_ld(0, (__vector unsigned long long*) p); vdata1 = vec_ld(16, (__vector unsigned long long*) p); VEC_PERM(vdata0, vdata0, vdata0, vperm_const); VEC_PERM(vdata1, vdata1, vdata1, vperm_const); vdata2 = vec_ld(32, (__vector unsigned long long*) p); vdata3 = vec_ld(48, (__vector unsigned long long*) p); VEC_PERM(vdata2, vdata2, vdata2, vperm_const); VEC_PERM(vdata3, vdata3, vdata3, vperm_const); vdata4 = vec_ld(64, (__vector unsigned long long*) p); vdata5 = vec_ld(80, (__vector unsigned long long*) p); VEC_PERM(vdata4, vdata4, vdata4, vperm_const); VEC_PERM(vdata5, vdata5, vdata5, vperm_const); vdata6 = vec_ld(96, (__vector unsigned long long*) p); vdata7 = vec_ld(112, (__vector unsigned long long*) p); VEC_PERM(vdata6, vdata6, vdata6, vperm_const); VEC_PERM(vdata7, vdata7, vdata7, vperm_const); /* xor in initial value */ vdata0 = vec_xor(vdata0, vcrc); p = (char *)p + 128; do { /* Checksum in blocks of MAX_SIZE. */ block_size = length; if (block_size > MAX_SIZE) { block_size = MAX_SIZE; } length = length - block_size; /* * Work out the offset into the constants table to start at. Each * constant is 16 bytes, and it is used against 128 bytes of input * data - 128 / 16 = 8 */ offset = (MAX_SIZE/8) - (block_size/8); /* We reduce our final 128 bytes in a separate step */ chunks = (block_size/128)-1; vconst1 = vec_ld(offset, vcrc_const); va0 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata0, (__vector unsigned long long)vconst1); va1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata1, (__vector unsigned long long)vconst1); va2 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata2, (__vector unsigned long long)vconst1); va3 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata3, (__vector unsigned long long)vconst1); va4 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata4, (__vector unsigned long long)vconst1); va5 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata5, (__vector unsigned long long)vconst1); va6 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata6, (__vector unsigned long long)vconst1); va7 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata7, (__vector unsigned long long)vconst1); if (chunks > 1) { offset += 16; vconst2 = vec_ld(offset, vcrc_const); GROUP_ENDING_NOP; vdata0 = vec_ld(0, (__vector unsigned long long*) p); VEC_PERM(vdata0, vdata0, vdata0, vperm_const); vdata1 = vec_ld(16, (__vector unsigned long long*) p); VEC_PERM(vdata1, vdata1, vdata1, vperm_const); vdata2 = vec_ld(32, (__vector unsigned long long*) p); VEC_PERM(vdata2, vdata2, vdata2, vperm_const); vdata3 = vec_ld(48, (__vector unsigned long long*) p); VEC_PERM(vdata3, vdata3, vdata3, vperm_const); vdata4 = vec_ld(64, (__vector unsigned long long*) p); VEC_PERM(vdata4, vdata4, vdata4, vperm_const); vdata5 = vec_ld(80, (__vector unsigned long long*) p); VEC_PERM(vdata5, vdata5, vdata5, vperm_const); vdata6 = vec_ld(96, (__vector unsigned long long*) p); VEC_PERM(vdata6, vdata6, vdata6, vperm_const); vdata7 = vec_ld(112, (__vector unsigned long long*) p); VEC_PERM(vdata7, vdata7, vdata7, vperm_const); p = (char *)p + 128; /* * main loop. We modulo schedule it such that it takes three * iterations to complete - first iteration load, second * iteration vpmsum, third iteration xor. */ for (i = 0; i < chunks-2; i++) { vconst1 = vec_ld(offset, vcrc_const); offset += 16; GROUP_ENDING_NOP; v0 = vec_xor(v0, va0); va0 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata0, (__vector unsigned long long)vconst2); vdata0 = vec_ld(0, (__vector unsigned long long*) p); VEC_PERM(vdata0, vdata0, vdata0, vperm_const); GROUP_ENDING_NOP; v1 = vec_xor(v1, va1); va1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata1, (__vector unsigned long long)vconst2); vdata1 = vec_ld(16, (__vector unsigned long long*) p); VEC_PERM(vdata1, vdata1, vdata1, vperm_const); GROUP_ENDING_NOP; v2 = vec_xor(v2, va2); va2 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata2, (__vector unsigned long long)vconst2); vdata2 = vec_ld(32, (__vector unsigned long long*) p); VEC_PERM(vdata2, vdata2, vdata2, vperm_const); GROUP_ENDING_NOP; v3 = vec_xor(v3, va3); va3 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata3, (__vector unsigned long long)vconst2); vdata3 = vec_ld(48, (__vector unsigned long long*) p); VEC_PERM(vdata3, vdata3, vdata3, vperm_const); vconst2 = vec_ld(offset, vcrc_const); GROUP_ENDING_NOP; v4 = vec_xor(v4, va4); va4 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata4, (__vector unsigned long long)vconst1); vdata4 = vec_ld(64, (__vector unsigned long long*) p); VEC_PERM(vdata4, vdata4, vdata4, vperm_const); GROUP_ENDING_NOP; v5 = vec_xor(v5, va5); va5 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata5, (__vector unsigned long long)vconst1); vdata5 = vec_ld(80, (__vector unsigned long long*) p); VEC_PERM(vdata5, vdata5, vdata5, vperm_const); GROUP_ENDING_NOP; v6 = vec_xor(v6, va6); va6 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata6, (__vector unsigned long long)vconst1); vdata6 = vec_ld(96, (__vector unsigned long long*) p); VEC_PERM(vdata6, vdata6, vdata6, vperm_const); GROUP_ENDING_NOP; v7 = vec_xor(v7, va7); va7 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata7, (__vector unsigned long long)vconst1); vdata7 = vec_ld(112, (__vector unsigned long long*) p); VEC_PERM(vdata7, vdata7, vdata7, vperm_const); p = (char *)p + 128; } /* First cool down*/ vconst1 = vec_ld(offset, vcrc_const); offset += 16; v0 = vec_xor(v0, va0); va0 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata0, (__vector unsigned long long)vconst1); GROUP_ENDING_NOP; v1 = vec_xor(v1, va1); va1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata1, (__vector unsigned long long)vconst1); GROUP_ENDING_NOP; v2 = vec_xor(v2, va2); va2 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata2, (__vector unsigned long long)vconst1); GROUP_ENDING_NOP; v3 = vec_xor(v3, va3); va3 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata3, (__vector unsigned long long)vconst1); GROUP_ENDING_NOP; v4 = vec_xor(v4, va4); va4 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata4, (__vector unsigned long long)vconst1); GROUP_ENDING_NOP; v5 = vec_xor(v5, va5); va5 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata5, (__vector unsigned long long)vconst1); GROUP_ENDING_NOP; v6 = vec_xor(v6, va6); va6 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata6, (__vector unsigned long long)vconst1); GROUP_ENDING_NOP; v7 = vec_xor(v7, va7); va7 = __builtin_crypto_vpmsumd ((__vector unsigned long long)vdata7, (__vector unsigned long long)vconst1); }/* else */ /* Second cool down. */ v0 = vec_xor(v0, va0); v1 = vec_xor(v1, va1); v2 = vec_xor(v2, va2); v3 = vec_xor(v3, va3); v4 = vec_xor(v4, va4); v5 = vec_xor(v5, va5); v6 = vec_xor(v6, va6); v7 = vec_xor(v7, va7); #ifdef REFLECT /* * vpmsumd produces a 96 bit result in the least significant bits * of the register. Since we are bit reflected we have to shift it * left 32 bits so it occupies the least significant bits in the * bit reflected domain. */ v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0, (__vector unsigned char)vzero, 4); v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v1, (__vector unsigned char)vzero, 4); v2 = (__vector unsigned long long)vec_sld((__vector unsigned char)v2, (__vector unsigned char)vzero, 4); v3 = (__vector unsigned long long)vec_sld((__vector unsigned char)v3, (__vector unsigned char)vzero, 4); v4 = (__vector unsigned long long)vec_sld((__vector unsigned char)v4, (__vector unsigned char)vzero, 4); v5 = (__vector unsigned long long)vec_sld((__vector unsigned char)v5, (__vector unsigned char)vzero, 4); v6 = (__vector unsigned long long)vec_sld((__vector unsigned char)v6, (__vector unsigned char)vzero, 4); v7 = (__vector unsigned long long)vec_sld((__vector unsigned char)v7, (__vector unsigned char)vzero, 4); #endif /* xor with the last 1024 bits. */ va0 = vec_ld(0, (__vector unsigned long long*) p); VEC_PERM(va0, va0, va0, vperm_const); va1 = vec_ld(16, (__vector unsigned long long*) p); VEC_PERM(va1, va1, va1, vperm_const); va2 = vec_ld(32, (__vector unsigned long long*) p); VEC_PERM(va2, va2, va2, vperm_const); va3 = vec_ld(48, (__vector unsigned long long*) p); VEC_PERM(va3, va3, va3, vperm_const); va4 = vec_ld(64, (__vector unsigned long long*) p); VEC_PERM(va4, va4, va4, vperm_const); va5 = vec_ld(80, (__vector unsigned long long*) p); VEC_PERM(va5, va5, va5, vperm_const); va6 = vec_ld(96, (__vector unsigned long long*) p); VEC_PERM(va6, va6, va6, vperm_const); va7 = vec_ld(112, (__vector unsigned long long*) p); VEC_PERM(va7, va7, va7, vperm_const); p = (char *)p + 128; vdata0 = vec_xor(v0, va0); vdata1 = vec_xor(v1, va1); vdata2 = vec_xor(v2, va2); vdata3 = vec_xor(v3, va3); vdata4 = vec_xor(v4, va4); vdata5 = vec_xor(v5, va5); vdata6 = vec_xor(v6, va6); vdata7 = vec_xor(v7, va7); /* Check if we have more blocks to process */ next_block = 0; if (length != 0) { next_block = 1; /* zero v0-v7 */ v0 = vec_xor(v0, v0); v1 = vec_xor(v1, v1); v2 = vec_xor(v2, v2); v3 = vec_xor(v3, v3); v4 = vec_xor(v4, v4); v5 = vec_xor(v5, v5); v6 = vec_xor(v6, v6); v7 = vec_xor(v7, v7); } length = length + 128; } while (next_block); /* Calculate how many bytes we have left. */ length = (len & 127); /* Calculate where in (short) constant table we need to start. */ offset = 128 - length; v0 = vec_ld(offset, vcrc_short_const); v1 = vec_ld(offset + 16, vcrc_short_const); v2 = vec_ld(offset + 32, vcrc_short_const); v3 = vec_ld(offset + 48, vcrc_short_const); v4 = vec_ld(offset + 64, vcrc_short_const); v5 = vec_ld(offset + 80, vcrc_short_const); v6 = vec_ld(offset + 96, vcrc_short_const); v7 = vec_ld(offset + 112, vcrc_short_const); offset += 128; v0 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( (__vector unsigned int)vdata0,(__vector unsigned int)v0); v1 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( (__vector unsigned int)vdata1,(__vector unsigned int)v1); v2 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( (__vector unsigned int)vdata2,(__vector unsigned int)v2); v3 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( (__vector unsigned int)vdata3,(__vector unsigned int)v3); v4 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( (__vector unsigned int)vdata4,(__vector unsigned int)v4); v5 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( (__vector unsigned int)vdata5,(__vector unsigned int)v5); v6 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( (__vector unsigned int)vdata6,(__vector unsigned int)v6); v7 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( (__vector unsigned int)vdata7,(__vector unsigned int)v7); /* Now reduce the tail (0-112 bytes). */ for (i = 0; i < length; i+=16) { vdata0 = vec_ld(i,(__vector unsigned long long*)p); VEC_PERM(vdata0, vdata0, vdata0, vperm_const); va0 = vec_ld(offset + i,vcrc_short_const); va0 = (__vector unsigned long long)__builtin_crypto_vpmsumw ( (__vector unsigned int)vdata0,(__vector unsigned int)va0); v0 = vec_xor(v0, va0); } /* xor all parallel chunks together. */ v0 = vec_xor(v0, v1); v2 = vec_xor(v2, v3); v4 = vec_xor(v4, v5); v6 = vec_xor(v6, v7); v0 = vec_xor(v0, v2); v4 = vec_xor(v4, v6); v0 = vec_xor(v0, v4); } /* Barrett Reduction */ vconst1 = vec_ld(0, v_Barrett_const); vconst2 = vec_ld(16, v_Barrett_const); v1 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0, (__vector unsigned char)v0, 8); v0 = vec_xor(v1,v0); #ifdef REFLECT /* shift left one bit */ vsht_splat = vec_splat_u8 (1); v0 = (__vector unsigned long long)vec_sll ((__vector unsigned char)v0, vsht_splat); #endif v0 = vec_and(v0, vmask_64bit); #ifndef REFLECT /* * Now for the actual algorithm. The idea is to calculate q, * the multiple of our polynomial that we need to subtract. By * doing the computation 2x bits higher (ie 64 bits) and shifting the * result back down 2x bits, we round down to the nearest multiple. */ /* ma */ v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v0, (__vector unsigned long long)vconst1); /* q = floor(ma/(2^64)) */ v1 = (__vector unsigned long long)vec_sld ((__vector unsigned char)vzero, (__vector unsigned char)v1, 8); /* qn */ v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v1, (__vector unsigned long long)vconst2); /* a - qn, subtraction is xor in GF(2) */ v0 = vec_xor (v0, v1); /* * Get the result into r3. We need to shift it left 8 bytes: * V0 [ 0 1 2 X ] * V0 [ 0 X 2 3 ] */ result = __builtin_unpack_vector_1 (v0); #else /* * The reflected version of Barrett reduction. Instead of bit * reflecting our data (which is expensive to do), we bit reflect our * constants and our algorithm, which means the intermediate data in * our vector registers goes from 0-63 instead of 63-0. We can reflect * the algorithm because we don't carry in mod 2 arithmetic. */ /* bottom 32 bits of a */ v1 = vec_and(v0, vmask_32bit); /* ma */ v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v1, (__vector unsigned long long)vconst1); /* bottom 32bits of ma */ v1 = vec_and(v1, vmask_32bit); /* qn */ v1 = __builtin_crypto_vpmsumd ((__vector unsigned long long)v1, (__vector unsigned long long)vconst2); /* a - qn, subtraction is xor in GF(2) */ v0 = vec_xor (v0, v1); /* * Since we are bit reflected, the result (ie the low 32 bits) is in * the high 32 bits. We just need to shift it left 4 bytes * V0 [ 0 1 X 3 ] * V0 [ 0 X 2 3 ] */ /* shift result into top 64 bits of */ v0 = (__vector unsigned long long)vec_sld((__vector unsigned char)v0, (__vector unsigned char)vzero, 4); result = __builtin_unpack_vector_0 (v0); #endif return result; }