diff options
Diffstat (limited to 'src/common/crc32c_ppc_fast_zero_asm.S')
-rw-r--r-- | src/common/crc32c_ppc_fast_zero_asm.S | 126 |
1 files changed, 126 insertions, 0 deletions
diff --git a/src/common/crc32c_ppc_fast_zero_asm.S b/src/common/crc32c_ppc_fast_zero_asm.S new file mode 100644 index 000000000..cff9cce7f --- /dev/null +++ b/src/common/crc32c_ppc_fast_zero_asm.S @@ -0,0 +1,126 @@ +/* + * Use the fixed point version of Barrett reduction to compute a mod n + * over GF(2) for n = 0x104c11db7 using POWER8 instructions. We use k = 32. + * + * http://en.wikipedia.org/wiki/Barrett_reduction + * + * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of either: + * + * a) the GNU General Public License as published by the Free Software + * Foundation; either version 2 of the License, or (at your option) + * any later version, or + * b) the Apache License, Version 2.0 + */ + +#if defined (__clang__) +#ifndef __ALTIVEC__ +#define __ALTIVEC__ +#endif +#include "ppc-asm.h" +#else +#include <ppc-asm.h> +#endif +#include "ppc-opcode.h" + + .section .data +.balign 16 +.constants: + /* Barrett constant m - (4^32)/n */ + .octa 0x00000000000000000000000104d101df + + /* Barrett constant n */ + .octa 0x00000000000000000000000104c11db7 + +.bit_reflected_constants: + /* 33 bit reflected Barrett constant m - (4^32)/n */ + .octa 0x000000000000000000000001f7011641 + + /* 33 bit reflected Barrett constant n */ + .octa 0x000000000000000000000001db710641 + + .text + +/* unsigned int barrett_reduction(unsigned long val) */ +FUNC_START(barrett_reduction) + lis r4,.constants@ha + la r4,.constants@l(r4) + + li r5,16 + vxor v1,v1,v1 /* zero v1 */ + + /* Get a into v0 */ + MTVRD(v0, r3) + vsldoi v0,v1,v0,8 /* shift into bottom 64 bits, this is a */ + + /* Load constants */ + lvx v2,0,r4 /* m */ + lvx v3,r5,r4 /* n */ + + /* + * Now for the actual algorithm. The idea is to calculate q, + * the multiple of our polynomial that we need to subtract. By + * doing the computation 2x bits higher (ie 64 bits) and shifting the + * result back down 2x bits, we round down to the nearest multiple. + */ + VPMSUMD(v4,v0,v2) /* ma */ + vsldoi v4,v1,v4,8 /* q = floor(ma/(2^64)) */ + VPMSUMD(v4,v4,v3) /* qn */ + vxor v0,v0,v4 /* a - qn, subtraction is xor in GF(2) */ + + /* + * Get the result into r3. We need to shift it left 8 bytes: + * V0 [ 0 1 2 X ] + * V0 [ 0 X 2 3 ] + */ + vsldoi v0,v0,v1,8 /* shift result into top 64 bits of v0 */ + MFVRD(r3, v0) + + blr +FUNC_END(barrett_reduction) + +/* unsigned int barrett_reduction_reflected(unsigned long val) */ +FUNC_START(barrett_reduction_reflected) + lis r4,.bit_reflected_constants@ha + la r4,.bit_reflected_constants@l(r4) + + li r5,16 + vxor v1,v1,v1 /* zero v1 */ + + /* Get a into v0 */ + MTVRD(v0, r3) + vsldoi v0,v1,v0,8 /* shift into bottom 64 bits, this is a */ + + /* Load constants */ + lvx v2,0,r4 /* m */ + lvx v3,r5,r4 /* n */ + + vspltisw v5,-1 /* all ones */ + vsldoi v6,v1,v5,4 /* bitmask with low 32 bits set */ + + /* + * Now for the Barrett reduction algorithm. Instead of bit reflecting + * our data (which is expensive to do), we bit reflect our constants + * and our algorithm, which means the intermediate data in our vector + * registers goes from 0-63 instead of 63-0. We can reflect the + * algorithm because we don't carry in mod 2 arithmetic. + */ + vand v4,v0,v6 /* bottom 32 bits of a */ + VPMSUMD(v4,v4,v2) /* ma */ + vand v4,v4,v6 /* bottom 32bits of ma */ + VPMSUMD(v4,v4,v3) /* qn */ + vxor v0,v0,v4 /* a - qn, subtraction is xor in GF(2) */ + + /* + * Since we are bit reflected, the result (ie the low 32 bits) is in the + * high 32 bits. We just need to shift it left 4 bytes + * V0 [ 0 1 X 3 ] + * V0 [ 0 X 2 3 ] + */ + vsldoi v0,v0,v1,4 /* shift result into top 64 bits of v0 */ + MFVRD(r3, v0) + + blr +FUNC_END(barrett_reduction_reflected) |