summaryrefslogtreecommitdiffstats
path: root/src/hash/crc32/crc32_ppc64le.s
diff options
context:
space:
mode:
Diffstat (limited to 'src/hash/crc32/crc32_ppc64le.s')
-rw-r--r--src/hash/crc32/crc32_ppc64le.s705
1 files changed, 705 insertions, 0 deletions
diff --git a/src/hash/crc32/crc32_ppc64le.s b/src/hash/crc32/crc32_ppc64le.s
new file mode 100644
index 0000000..84ef213
--- /dev/null
+++ b/src/hash/crc32/crc32_ppc64le.s
@@ -0,0 +1,705 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// The vectorized implementation found below is a derived work
+// from code written by Anton Blanchard <anton@au.ibm.com> found
+// at https://github.com/antonblanchard/crc32-vpmsum. The original
+// is dual licensed under GPL and Apache 2. As the copyright holder
+// for the work, IBM has contributed this new work under
+// the golang license.
+
+// Changes include porting to Go assembler with modifications for
+// the Go ABI for ppc64le.
+
+#include "textflag.h"
+
+#define POWER8_OFFSET 132
+
+#define off16 R16
+#define off32 R17
+#define off48 R18
+#define off64 R19
+#define off80 R20
+#define off96 R21
+#define off112 R22
+
+#define const1 V24
+#define const2 V25
+
+#define byteswap V26
+#define mask_32bit V27
+#define mask_64bit V28
+#define zeroes V29
+
+#define MAX_SIZE 32*1024
+#define REFLECT
+
+TEXT ·ppc64SlicingUpdateBy8(SB), NOSPLIT|NOFRAME, $0-44
+ MOVWZ crc+0(FP), R3 // incoming crc
+ MOVD table8+8(FP), R4 // *Table
+ MOVD p+16(FP), R5
+ MOVD p_len+24(FP), R6 // p len
+
+ CMP $0,R6 // len == 0?
+ BNE start
+ MOVW R3,ret+40(FP) // return crc
+ RET
+
+start:
+ NOR R3,R3,R7 // ^crc
+ MOVWZ R7,R7 // 32 bits
+ CMP R6,$16
+ MOVD R6,CTR
+ BLT short
+ SRAD $3,R6,R8 // 8 byte chunks
+ MOVD R8,CTR
+
+loop:
+ MOVWZ 0(R5),R8 // 0-3 bytes of p ?Endian?
+ MOVWZ 4(R5),R9 // 4-7 bytes of p
+ MOVD R4,R10 // &tab[0]
+ XOR R7,R8,R7 // crc ^= byte[0:3]
+ RLDICL $40,R9,$56,R17 // p[7]
+ SLD $2,R17,R17 // p[7]*4
+ RLDICL $40,R7,$56,R8 // crc>>24
+ ADD R17,R10,R17 // &tab[0][p[7]]
+ SLD $2,R8,R8 // crc>>24*4
+ RLDICL $48,R9,$56,R18 // p[6]
+ SLD $2,R18,R18 // p[6]*4
+ ADD $1024,R10,R10 // tab[1]
+ MOVWZ 0(R17),R21 // tab[0][p[7]]
+ RLDICL $56,R9,$56,R19 // p[5]
+ ADD R10,R18,R18 // &tab[1][p[6]]
+ SLD $2,R19,R19 // p[5]*4:1
+ MOVWZ 0(R18),R22 // tab[1][p[6]]
+ ADD $1024,R10,R10 // tab[2]
+ XOR R21,R22,R21 // xor done R22
+ ADD R19,R10,R19 // &tab[2][p[5]]
+ ANDCC $255,R9,R20 // p[4] ??
+ SLD $2,R20,R20 // p[4]*4
+ MOVWZ 0(R19),R23 // tab[2][p[5]]
+ ADD $1024,R10,R10 // &tab[3]
+ ADD R20,R10,R20 // tab[3][p[4]]
+ XOR R21,R23,R21 // xor done R23
+ ADD $1024,R10,R10 // &tab[4]
+ MOVWZ 0(R20),R24 // tab[3][p[4]]
+ ADD R10,R8,R23 // &tab[4][crc>>24]
+ XOR R21,R24,R21 // xor done R24
+ MOVWZ 0(R23),R25 // tab[4][crc>>24]
+ RLDICL $48,R7,$56,R24 // crc>>16&0xFF
+ XOR R21,R25,R21 // xor done R25
+ ADD $1024,R10,R10 // &tab[5]
+ SLD $2,R24,R24 // crc>>16&0xFF*4
+ ADD R24,R10,R24 // &tab[5][crc>>16&0xFF]
+ MOVWZ 0(R24),R26 // tab[5][crc>>16&0xFF]
+ XOR R21,R26,R21 // xor done R26
+ RLDICL $56,R7,$56,R25 // crc>>8
+ ADD $1024,R10,R10 // &tab[6]
+ SLD $2,R25,R25 // crc>>8&FF*2
+ ADD R25,R10,R25 // &tab[6][crc>>8&0xFF]
+ MOVBZ R7,R26 // crc&0xFF
+ ADD $1024,R10,R10 // &tab[7]
+ MOVWZ 0(R25),R27 // tab[6][crc>>8&0xFF]
+ SLD $2,R26,R26 // crc&0xFF*2
+ XOR R21,R27,R21 // xor done R27
+ ADD R26,R10,R26 // &tab[7][crc&0xFF]
+ ADD $8,R5 // p = p[8:]
+ MOVWZ 0(R26),R28 // tab[7][crc&0xFF]
+ XOR R21,R28,R21 // xor done R28
+ MOVWZ R21,R7 // crc for next round
+ BC 16,0,loop // next 8 bytes
+ ANDCC $7,R6,R8 // any leftover bytes
+ BEQ done // none --> done
+ MOVD R8,CTR // byte count
+ PCALIGN $16 // align short loop
+short:
+ MOVBZ 0(R5),R8 // get v
+ MOVBZ R7,R9 // byte(crc) -> R8 BE vs LE?
+ SRD $8,R7,R14 // crc>>8
+ XOR R8,R9,R8 // byte(crc)^v -> R8
+ ADD $1,R5 // ptr to next v
+ SLD $2,R8 // convert index-> bytes
+ ADD R8,R4,R9 // &tab[byte(crc)^v]
+ MOVWZ 0(R9),R10 // tab[byte(crc)^v]
+ XOR R10,R14,R7 // loop crc in R7
+ BC 16,0,short
+done:
+ NOR R7,R7,R7 // ^crc
+ MOVW R7,ret+40(FP) // return crc
+ RET
+
+#ifdef BYTESWAP_DATA
+DATA ·byteswapcons+0(SB)/8,$0x0706050403020100
+DATA ·byteswapcons+8(SB)/8,$0x0f0e0d0c0b0a0908
+
+GLOBL ·byteswapcons+0(SB),RODATA,$16
+#endif
+
+TEXT ·vectorCrc32(SB), NOSPLIT|NOFRAME, $0-36
+ MOVWZ crc+0(FP), R3 // incoming crc
+ MOVWZ ctab+4(FP), R14 // crc poly id
+ MOVD p+8(FP), R4
+ MOVD p_len+16(FP), R5 // p len
+
+ // R3 = incoming crc
+ // R14 = constant table identifier
+ // R5 = address of bytes
+ // R6 = length of bytes
+
+ // defines for index loads
+
+ MOVD $16,off16
+ MOVD $32,off32
+ MOVD $48,off48
+ MOVD $64,off64
+ MOVD $80,off80
+ MOVD $96,off96
+ MOVD $112,off112
+ MOVD $0,R15
+
+ MOVD R3,R10 // save initial crc
+
+ NOR R3,R3,R3 // ^crc
+ MOVWZ R3,R3 // 32 bits
+ VXOR zeroes,zeroes,zeroes // clear the V reg
+ VSPLTISW $-1,V0
+ VSLDOI $4,V29,V0,mask_32bit
+ VSLDOI $8,V29,V0,mask_64bit
+
+ VXOR V8,V8,V8
+ MTVSRD R3,VS40 // crc initial value VS40 = V8
+
+#ifdef REFLECT
+ VSLDOI $8,zeroes,V8,V8 // or: VSLDOI V29,V8,V27,4 for top 32 bits?
+#else
+ VSLDOI $4,V8,zeroes,V8
+#endif
+
+#ifdef BYTESWAP_DATA
+ MOVD $·byteswapcons(SB),R3
+ LVX (R3),byteswap
+#endif
+
+ CMPU R5,$256 // length of bytes
+ BLT short
+
+ RLDICR $0,R5,$56,R6 // chunk to process
+
+ // First step for larger sizes
+l1: MOVD $32768,R7
+ MOVD R7,R9
+ CMP R6,R7 // compare R6, R7 (MAX SIZE)
+ BGT top // less than MAX, just do remainder
+ MOVD R6,R7
+top:
+ SUB R7,R6,R6
+
+ // mainloop does 128 bytes at a time
+ SRD $7,R7
+
+ // determine the offset into the constants table to start with.
+ // Each constant is 128 bytes, used against 16 bytes of data.
+ SLD $4,R7,R8
+ SRD $3,R9,R9
+ SUB R8,R9,R8
+
+ // The last iteration is reduced in a separate step
+ ADD $-1,R7
+ MOVD R7,CTR
+
+ // Determine which constant table (depends on poly)
+ CMP R14,$1
+ BNE castTable
+ MOVD $·IEEEConst(SB),R3
+ BR startConst
+castTable:
+ MOVD $·CastConst(SB),R3
+
+startConst:
+ ADD R3,R8,R3 // starting point in constants table
+
+ VXOR V0,V0,V0 // clear the V regs
+ VXOR V1,V1,V1
+ VXOR V2,V2,V2
+ VXOR V3,V3,V3
+ VXOR V4,V4,V4
+ VXOR V5,V5,V5
+ VXOR V6,V6,V6
+ VXOR V7,V7,V7
+
+ LVX (R3),const1 // loading constant values
+
+ CMP R15,$1 // Identify warm up pass
+ BEQ next
+
+ // First warm up pass: load the bytes to process
+ LVX (R4),V16
+ LVX (R4+off16),V17
+ LVX (R4+off32),V18
+ LVX (R4+off48),V19
+ LVX (R4+off64),V20
+ LVX (R4+off80),V21
+ LVX (R4+off96),V22
+ LVX (R4+off112),V23
+ ADD $128,R4 // bump up to next 128 bytes in buffer
+
+ VXOR V16,V8,V16 // xor in initial CRC in V8
+
+next:
+ BC 18,0,first_warm_up_done
+
+ ADD $16,R3 // bump up to next constants
+ LVX (R3),const2 // table values
+
+ VPMSUMD V16,const1,V8 // second warm up pass
+ LVX (R4),V16 // load from buffer
+ OR $0,R2,R2
+
+ VPMSUMD V17,const1,V9 // vpmsumd with constants
+ LVX (R4+off16),V17 // load next from buffer
+ OR $0,R2,R2
+
+ VPMSUMD V18,const1,V10 // vpmsumd with constants
+ LVX (R4+off32),V18 // load next from buffer
+ OR $0,R2,R2
+
+ VPMSUMD V19,const1,V11 // vpmsumd with constants
+ LVX (R4+off48),V19 // load next from buffer
+ OR $0,R2,R2
+
+ VPMSUMD V20,const1,V12 // vpmsumd with constants
+ LVX (R4+off64),V20 // load next from buffer
+ OR $0,R2,R2
+
+ VPMSUMD V21,const1,V13 // vpmsumd with constants
+ LVX (R4+off80),V21 // load next from buffer
+ OR $0,R2,R2
+
+ VPMSUMD V22,const1,V14 // vpmsumd with constants
+ LVX (R4+off96),V22 // load next from buffer
+ OR $0,R2,R2
+
+ VPMSUMD V23,const1,V15 // vpmsumd with constants
+ LVX (R4+off112),V23 // load next from buffer
+
+ ADD $128,R4 // bump up to next 128 bytes in buffer
+
+ BC 18,0,first_cool_down
+
+cool_top:
+ LVX (R3),const1 // constants
+ ADD $16,R3 // inc to next constants
+ OR $0,R2,R2
+
+ VXOR V0,V8,V0 // xor in previous vpmsumd
+ VPMSUMD V16,const2,V8 // vpmsumd with constants
+ LVX (R4),V16 // buffer
+ OR $0,R2,R2
+
+ VXOR V1,V9,V1 // xor in previous
+ VPMSUMD V17,const2,V9 // vpmsumd with constants
+ LVX (R4+off16),V17 // next in buffer
+ OR $0,R2,R2
+
+ VXOR V2,V10,V2 // xor in previous
+ VPMSUMD V18,const2,V10 // vpmsumd with constants
+ LVX (R4+off32),V18 // next in buffer
+ OR $0,R2,R2
+
+ VXOR V3,V11,V3 // xor in previous
+ VPMSUMD V19,const2,V11 // vpmsumd with constants
+ LVX (R4+off48),V19 // next in buffer
+ LVX (R3),const2 // get next constant
+ OR $0,R2,R2
+
+ VXOR V4,V12,V4 // xor in previous
+ VPMSUMD V20,const1,V12 // vpmsumd with constants
+ LVX (R4+off64),V20 // next in buffer
+ OR $0,R2,R2
+
+ VXOR V5,V13,V5 // xor in previous
+ VPMSUMD V21,const1,V13 // vpmsumd with constants
+ LVX (R4+off80),V21 // next in buffer
+ OR $0,R2,R2
+
+ VXOR V6,V14,V6 // xor in previous
+ VPMSUMD V22,const1,V14 // vpmsumd with constants
+ LVX (R4+off96),V22 // next in buffer
+ OR $0,R2,R2
+
+ VXOR V7,V15,V7 // xor in previous
+ VPMSUMD V23,const1,V15 // vpmsumd with constants
+ LVX (R4+off112),V23 // next in buffer
+
+ ADD $128,R4 // bump up buffer pointer
+ BC 16,0,cool_top // are we done?
+
+first_cool_down:
+
+ // load the constants
+ // xor in the previous value
+ // vpmsumd the result with constants
+
+ LVX (R3),const1
+ ADD $16,R3
+
+ VXOR V0,V8,V0
+ VPMSUMD V16,const1,V8
+ OR $0,R2,R2
+
+ VXOR V1,V9,V1
+ VPMSUMD V17,const1,V9
+ OR $0,R2,R2
+
+ VXOR V2,V10,V2
+ VPMSUMD V18,const1,V10
+ OR $0,R2,R2
+
+ VXOR V3,V11,V3
+ VPMSUMD V19,const1,V11
+ OR $0,R2,R2
+
+ VXOR V4,V12,V4
+ VPMSUMD V20,const1,V12
+ OR $0,R2,R2
+
+ VXOR V5,V13,V5
+ VPMSUMD V21,const1,V13
+ OR $0,R2,R2
+
+ VXOR V6,V14,V6
+ VPMSUMD V22,const1,V14
+ OR $0,R2,R2
+
+ VXOR V7,V15,V7
+ VPMSUMD V23,const1,V15
+ OR $0,R2,R2
+
+second_cool_down:
+
+ VXOR V0,V8,V0
+ VXOR V1,V9,V1
+ VXOR V2,V10,V2
+ VXOR V3,V11,V3
+ VXOR V4,V12,V4
+ VXOR V5,V13,V5
+ VXOR V6,V14,V6
+ VXOR V7,V15,V7
+
+#ifdef REFLECT
+ VSLDOI $4,V0,zeroes,V0
+ VSLDOI $4,V1,zeroes,V1
+ VSLDOI $4,V2,zeroes,V2
+ VSLDOI $4,V3,zeroes,V3
+ VSLDOI $4,V4,zeroes,V4
+ VSLDOI $4,V5,zeroes,V5
+ VSLDOI $4,V6,zeroes,V6
+ VSLDOI $4,V7,zeroes,V7
+#endif
+
+ LVX (R4),V8
+ LVX (R4+off16),V9
+ LVX (R4+off32),V10
+ LVX (R4+off48),V11
+ LVX (R4+off64),V12
+ LVX (R4+off80),V13
+ LVX (R4+off96),V14
+ LVX (R4+off112),V15
+
+ ADD $128,R4
+
+ VXOR V0,V8,V16
+ VXOR V1,V9,V17
+ VXOR V2,V10,V18
+ VXOR V3,V11,V19
+ VXOR V4,V12,V20
+ VXOR V5,V13,V21
+ VXOR V6,V14,V22
+ VXOR V7,V15,V23
+
+ MOVD $1,R15
+ CMP $0,R6
+ ADD $128,R6
+
+ BNE l1
+ ANDCC $127,R5
+ SUBC R5,$128,R6
+ ADD R3,R6,R3
+
+ SRD $4,R5,R7
+ MOVD R7,CTR
+ LVX (R3),V0
+ LVX (R3+off16),V1
+ LVX (R3+off32),V2
+ LVX (R3+off48),V3
+ LVX (R3+off64),V4
+ LVX (R3+off80),V5
+ LVX (R3+off96),V6
+ LVX (R3+off112),V7
+
+ ADD $128,R3
+
+ VPMSUMW V16,V0,V0
+ VPMSUMW V17,V1,V1
+ VPMSUMW V18,V2,V2
+ VPMSUMW V19,V3,V3
+ VPMSUMW V20,V4,V4
+ VPMSUMW V21,V5,V5
+ VPMSUMW V22,V6,V6
+ VPMSUMW V23,V7,V7
+
+ // now reduce the tail
+
+ CMP $0,R7
+ BEQ next1
+
+ LVX (R4),V16
+ LVX (R3),V17
+ VPMSUMW V16,V17,V16
+ VXOR V0,V16,V0
+ BC 18,0,next1
+
+ LVX (R4+off16),V16
+ LVX (R3+off16),V17
+ VPMSUMW V16,V17,V16
+ VXOR V0,V16,V0
+ BC 18,0,next1
+
+ LVX (R4+off32),V16
+ LVX (R3+off32),V17
+ VPMSUMW V16,V17,V16
+ VXOR V0,V16,V0
+ BC 18,0,next1
+
+ LVX (R4+off48),V16
+ LVX (R3+off48),V17
+ VPMSUMW V16,V17,V16
+ VXOR V0,V16,V0
+ BC 18,0,next1
+
+ LVX (R4+off64),V16
+ LVX (R3+off64),V17
+ VPMSUMW V16,V17,V16
+ VXOR V0,V16,V0
+ BC 18,0,next1
+
+ LVX (R4+off80),V16
+ LVX (R3+off80),V17
+ VPMSUMW V16,V17,V16
+ VXOR V0,V16,V0
+ BC 18,0,next1
+
+ LVX (R4+off96),V16
+ LVX (R3+off96),V17
+ VPMSUMW V16,V17,V16
+ VXOR V0,V16,V0
+
+next1:
+ VXOR V0,V1,V0
+ VXOR V2,V3,V2
+ VXOR V4,V5,V4
+ VXOR V6,V7,V6
+ VXOR V0,V2,V0
+ VXOR V4,V6,V4
+ VXOR V0,V4,V0
+
+barrett_reduction:
+
+ CMP R14,$1
+ BNE barcstTable
+ MOVD $·IEEEBarConst(SB),R3
+ BR startbarConst
+barcstTable:
+ MOVD $·CastBarConst(SB),R3
+
+startbarConst:
+ LVX (R3),const1
+ LVX (R3+off16),const2
+
+ VSLDOI $8,V0,V0,V1
+ VXOR V0,V1,V0
+
+#ifdef REFLECT
+ VSPLTISB $1,V1
+ VSL V0,V1,V0
+#endif
+
+ VAND V0,mask_64bit,V0
+
+#ifndef REFLECT
+
+ VPMSUMD V0,const1,V1
+ VSLDOI $8,zeroes,V1,V1
+ VPMSUMD V1,const2,V1
+ VXOR V0,V1,V0
+ VSLDOI $8,V0,zeroes,V0
+
+#else
+
+ VAND V0,mask_32bit,V1
+ VPMSUMD V1,const1,V1
+ VAND V1,mask_32bit,V1
+ VPMSUMD V1,const2,V1
+ VXOR V0,V1,V0
+ VSLDOI $4,V0,zeroes,V0
+
+#endif
+
+ MFVSRD VS32,R3 // VS32 = V0
+
+ NOR R3,R3,R3 // return ^crc
+ MOVW R3,ret+32(FP)
+ RET
+
+first_warm_up_done:
+
+ LVX (R3),const1
+ ADD $16,R3
+
+ VPMSUMD V16,const1,V8
+ VPMSUMD V17,const1,V9
+ VPMSUMD V18,const1,V10
+ VPMSUMD V19,const1,V11
+ VPMSUMD V20,const1,V12
+ VPMSUMD V21,const1,V13
+ VPMSUMD V22,const1,V14
+ VPMSUMD V23,const1,V15
+
+ BR second_cool_down
+
+short:
+ CMP $0,R5
+ BEQ zero
+
+ // compute short constants
+
+ CMP R14,$1
+ BNE castshTable
+ MOVD $·IEEEConst(SB),R3
+ ADD $4080,R3
+ BR startshConst
+castshTable:
+ MOVD $·CastConst(SB),R3
+ ADD $4080,R3
+
+startshConst:
+ SUBC R5,$256,R6 // sub from 256
+ ADD R3,R6,R3
+
+ // calculate where to start
+
+ SRD $4,R5,R7
+ MOVD R7,CTR
+
+ VXOR V19,V19,V19
+ VXOR V20,V20,V20
+
+ LVX (R4),V0
+ LVX (R3),V16
+ VXOR V0,V8,V0
+ VPMSUMW V0,V16,V0
+ BC 18,0,v0
+
+ LVX (R4+off16),V1
+ LVX (R3+off16),V17
+ VPMSUMW V1,V17,V1
+ BC 18,0,v1
+
+ LVX (R4+off32),V2
+ LVX (R3+off32),V16
+ VPMSUMW V2,V16,V2
+ BC 18,0,v2
+
+ LVX (R4+off48),V3
+ LVX (R3+off48),V17
+ VPMSUMW V3,V17,V3
+ BC 18,0,v3
+
+ LVX (R4+off64),V4
+ LVX (R3+off64),V16
+ VPMSUMW V4,V16,V4
+ BC 18,0,v4
+
+ LVX (R4+off80),V5
+ LVX (R3+off80),V17
+ VPMSUMW V5,V17,V5
+ BC 18,0,v5
+
+ LVX (R4+off96),V6
+ LVX (R3+off96),V16
+ VPMSUMW V6,V16,V6
+ BC 18,0,v6
+
+ LVX (R4+off112),V7
+ LVX (R3+off112),V17
+ VPMSUMW V7,V17,V7
+ BC 18,0,v7
+
+ ADD $128,R3
+ ADD $128,R4
+
+ LVX (R4),V8
+ LVX (R3),V16
+ VPMSUMW V8,V16,V8
+ BC 18,0,v8
+
+ LVX (R4+off16),V9
+ LVX (R3+off16),V17
+ VPMSUMW V9,V17,V9
+ BC 18,0,v9
+
+ LVX (R4+off32),V10
+ LVX (R3+off32),V16
+ VPMSUMW V10,V16,V10
+ BC 18,0,v10
+
+ LVX (R4+off48),V11
+ LVX (R3+off48),V17
+ VPMSUMW V11,V17,V11
+ BC 18,0,v11
+
+ LVX (R4+off64),V12
+ LVX (R3+off64),V16
+ VPMSUMW V12,V16,V12
+ BC 18,0,v12
+
+ LVX (R4+off80),V13
+ LVX (R3+off80),V17
+ VPMSUMW V13,V17,V13
+ BC 18,0,v13
+
+ LVX (R4+off96),V14
+ LVX (R3+off96),V16
+ VPMSUMW V14,V16,V14
+ BC 18,0,v14
+
+ LVX (R4+off112),V15
+ LVX (R3+off112),V17
+ VPMSUMW V15,V17,V15
+
+ VXOR V19,V15,V19
+v14: VXOR V20,V14,V20
+v13: VXOR V19,V13,V19
+v12: VXOR V20,V12,V20
+v11: VXOR V19,V11,V19
+v10: VXOR V20,V10,V20
+v9: VXOR V19,V9,V19
+v8: VXOR V20,V8,V20
+v7: VXOR V19,V7,V19
+v6: VXOR V20,V6,V20
+v5: VXOR V19,V5,V19
+v4: VXOR V20,V4,V20
+v3: VXOR V19,V3,V19
+v2: VXOR V20,V2,V20
+v1: VXOR V19,V1,V19
+v0: VXOR V20,V0,V20
+
+ VXOR V19,V20,V0
+
+ BR barrett_reduction
+
+zero:
+ // This case is the original crc, so just return it
+ MOVW R10,ret+32(FP)
+ RET