summary | refs | log | tree | commit | diff | stats
path: root/src/common/crc32c_ppc_fast_zero_asm.S
diff options
context:
space:
mode:
Diffstat (limited to 'src/common/crc32c_ppc_fast_zero_asm.S')
-rw-r--r--  src/common/crc32c_ppc_fast_zero_asm.S  126
1 file changed, 126 insertions, 0 deletions
diff --git a/src/common/crc32c_ppc_fast_zero_asm.S b/src/common/crc32c_ppc_fast_zero_asm.S
new file mode 100644
index 000000000..cff9cce7f
--- /dev/null
+++ b/src/common/crc32c_ppc_fast_zero_asm.S
@@ -0,0 +1,126 @@
+/*
+ * Use the fixed point version of Barrett reduction to compute a mod n
+ * over GF(2) for n = 0x104c11db7 using POWER8 instructions. We use k = 32.
+ *
+ * http://en.wikipedia.org/wiki/Barrett_reduction
+ *
+ * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of either:
+ *
+ * a) the GNU General Public License as published by the Free Software
+ * Foundation; either version 2 of the License, or (at your option)
+ * any later version, or
+ * b) the Apache License, Version 2.0
+ */
+
+#if defined (__clang__)
+#ifndef __ALTIVEC__
+#define __ALTIVEC__
+#endif
+#include "ppc-asm.h"
+#else
+#include <ppc-asm.h>
+#endif
+#include "ppc-opcode.h"
+
+	/*
+	 * Constant tables for the two Barrett reductions below.  Each
+	 * function loads two 128-bit constants with lvx at offsets 0 and
+	 * 16 from its label, hence the 16-byte alignment.  Only the low
+	 * bits of each .octa are significant; the rest are zero padding.
+	 *
+	 * NOTE(review): n = 0x104c11db7 is the CRC-32 (IEEE 802.3)
+	 * polynomial, not the Castagnoli (CRC-32C, 0x11edc6f41) polynomial
+	 * the filename suggests — presumably intentional for the fast-zero
+	 * math; confirm against the callers.
+	 * NOTE(review): this data is read-only; .rodata would be the more
+	 * conventional section than .data.
+	 */
+	.section	.data
+.balign 16
+.constants:
+	/* Barrett constant m - (4^32)/n */
+	.octa 0x00000000000000000000000104d101df
+
+	/* Barrett constant n */
+	.octa 0x00000000000000000000000104c11db7
+
+.bit_reflected_constants:
+	/* 33 bit reflected Barrett constant m - (4^32)/n */
+	.octa 0x000000000000000000000001f7011641
+
+	/* 33 bit reflected Barrett constant n */
+	.octa 0x000000000000000000000001db710641
+
+ .text
+
+/*
+ * unsigned int barrett_reduction(unsigned long val)
+ *
+ * Compute val mod n over GF(2) for n = 0x104c11db7 (non-reflected bit
+ * order) using fixed-point Barrett reduction with k = 32.
+ *
+ * In:       r3 = val, the 64-bit value to reduce
+ * Out:      r3 = val mod n (fits in 32 bits)
+ * Clobbers: r4, r5, v0-v4 (plus whatever scratch the MTVRD/MFVRD
+ *           macros from ppc-opcode.h expand to — TODO confirm)
+ * Requires: POWER8 (VPMSUMD) — per the file header above.
+ */
+FUNC_START(barrett_reduction)
+	/* r4 = &.constants; 32-bit absolute hi/lo address pair */
+	lis	r4,.constants@ha
+	la	r4,.constants@l(r4)
+
+	li	r5,16			/* offset of second constant (n) */
+	vxor	v1,v1,v1		/* zero v1; used as shift-in zeros */
+
+	/* Get a into v0 */
+	MTVRD(v0, r3)
+	vsldoi	v0,v1,v0,8		/* shift into bottom 64 bits, this is a */
+
+	/* Load constants */
+	lvx	v2,0,r4			/* m */
+	lvx	v3,r5,r4		/* n */
+
+	/*
+	 * Now for the actual algorithm. The idea is to calculate q,
+	 * the multiple of our polynomial that we need to subtract. By
+	 * doing the computation 2x bits higher (ie 64 bits) and shifting the
+	 * result back down 2x bits, we round down to the nearest multiple.
+	 */
+	VPMSUMD(v4,v0,v2)		/* ma: carryless multiply a * m */
+	vsldoi	v4,v1,v4,8		/* q = floor(ma/(2^64)) */
+	VPMSUMD(v4,v4,v3)		/* qn */
+	vxor	v0,v0,v4		/* a - qn, subtraction is xor in GF(2) */
+
+	/*
+	 * Get the result into r3. We need to shift it left 8 bytes:
+	 * V0 [ 0 1 2 X ]
+	 * V0 [ 0 X 2 3 ]
+	 */
+	vsldoi	v0,v0,v1,8		/* shift result into top 64 bits of v0 */
+	MFVRD(r3, v0)			/* r3 = high doubleword of v0 */
+
+	blr
+FUNC_END(barrett_reduction)
+
+/*
+ * unsigned int barrett_reduction_reflected(unsigned long val)
+ *
+ * Same Barrett reduction as above, but with bit-reflected constants so
+ * the input data does not have to be bit-reflected.  Uses the 33-bit
+ * reflected constants from .bit_reflected_constants.
+ *
+ * In:       r3 = val, the 64-bit value to reduce
+ * Out:      r3 = val mod n (fits in 32 bits), reflected bit order
+ * Clobbers: r4, r5, v0-v6 (plus whatever scratch the MTVRD/MFVRD
+ *           macros from ppc-opcode.h expand to — TODO confirm)
+ * Requires: POWER8 (VPMSUMD) — per the file header above.
+ */
+FUNC_START(barrett_reduction_reflected)
+	/* r4 = &.bit_reflected_constants; 32-bit absolute hi/lo pair */
+	lis	r4,.bit_reflected_constants@ha
+	la	r4,.bit_reflected_constants@l(r4)
+
+	li	r5,16			/* offset of second constant (n) */
+	vxor	v1,v1,v1		/* zero v1; used as shift-in zeros */
+
+	/* Get a into v0 */
+	MTVRD(v0, r3)
+	vsldoi	v0,v1,v0,8		/* shift into bottom 64 bits, this is a */
+
+	/* Load constants */
+	lvx	v2,0,r4			/* m */
+	lvx	v3,r5,r4		/* n */
+
+	vspltisw v5,-1			/* all ones */
+	vsldoi	v6,v1,v5,4		/* bitmask with low 32 bits set */
+
+	/*
+	 * Now for the Barrett reduction algorithm. Instead of bit reflecting
+	 * our data (which is expensive to do), we bit reflect our constants
+	 * and our algorithm, which means the intermediate data in our vector
+	 * registers goes from 0-63 instead of 63-0. We can reflect the
+	 * algorithm because we don't carry in mod 2 arithmetic.
+	 */
+	vand	v4,v0,v6		/* bottom 32 bits of a */
+	VPMSUMD(v4,v4,v2)		/* ma */
+	vand	v4,v4,v6		/* bottom 32bits of ma */
+	VPMSUMD(v4,v4,v3)		/* qn */
+	vxor	v0,v0,v4		/* a - qn, subtraction is xor in GF(2) */
+
+	/*
+	 * Since we are bit reflected, the result (ie the low 32 bits) is in the
+	 * high 32 bits. We just need to shift it left 4 bytes
+	 * V0 [ 0 1 X 3 ]
+	 * V0 [ 0 X 2 3 ]
+	 */
+	vsldoi	v0,v0,v1,4		/* shift result into top 64 bits of v0 */
+	MFVRD(r3, v0)			/* r3 = high doubleword of v0 */
+
+	blr
+FUNC_END(barrett_reduction_reflected)