diff options
Diffstat (limited to 'src/hash/crc32/crc32_ppc64le.s')
-rw-r--r-- | src/hash/crc32/crc32_ppc64le.s | 707 |
1 files changed, 707 insertions, 0 deletions
diff --git a/src/hash/crc32/crc32_ppc64le.s b/src/hash/crc32/crc32_ppc64le.s new file mode 100644 index 0000000..10d5dd6 --- /dev/null +++ b/src/hash/crc32/crc32_ppc64le.s @@ -0,0 +1,707 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// The vectorized implementation found below is a derived work +// from code written by Anton Blanchard <anton@au.ibm.com> found +// at https://github.com/antonblanchard/crc32-vpmsum. The original +// is dual licensed under GPL and Apache 2. As the copyright holder +// for the work, IBM has contributed this new work under +// the golang license. + +// Changes include porting to Go assembler with modifications for +// the Go ABI for ppc64le. + +#include "textflag.h" + +#define POWER8_OFFSET 132 + +#define off16 R16 +#define off32 R17 +#define off48 R18 +#define off64 R19 +#define off80 R20 +#define off96 R21 +#define off112 R22 + +#define const1 V24 +#define const2 V25 + +#define byteswap V26 +#define mask_32bit V27 +#define mask_64bit V28 +#define zeroes V29 + +#define MAX_SIZE 32*1024 +#define REFLECT + +TEXT ·ppc64SlicingUpdateBy8(SB), NOSPLIT|NOFRAME, $0-44 + MOVWZ crc+0(FP), R3 // incoming crc + MOVD table8+8(FP), R4 // *Table + MOVD p+16(FP), R5 + MOVD p_len+24(FP), R6 // p len + + CMP $0,R6 // len == 0? + BNE start + MOVW R3,ret+40(FP) // return crc + RET + +start: + NOR R3,R3,R7 // ^crc + MOVWZ R7,R7 // 32 bits + CMP R6,$16 + MOVD R6,CTR + BLT short + SRAD $3,R6,R8 // 8 byte chunks + MOVD R8,CTR + +loop: + MOVWZ 0(R5),R8 // 0-3 bytes of p ?Endian? + MOVWZ 4(R5),R9 // 4-7 bytes of p + MOVD R4,R10 // &tab[0] + XOR R7,R8,R7 // crc ^= byte[0:3] + RLDICL $40,R9,$56,R17 // p[7] + SLD $2,R17,R17 // p[7]*4 + RLDICL $40,R7,$56,R8 // crc>>24 + ADD R17,R10,R17 // &tab[0][p[7]] + SLD $2,R8,R8 // crc>>24*4 + RLDICL $48,R9,$56,R18 // p[6] + SLD $2,R18,R18 // p[6]*4 + ADD $1024,R10,R10 // tab[1] + MOVWZ 0(R17),R21 // tab[0][p[7]] + RLDICL $56,R9,$56,R19 // p[5] + ADD R10,R18,R18 // &tab[1][p[6]] + SLD $2,R19,R19 // p[5]*4:1 + MOVWZ 0(R18),R22 // tab[1][p[6]] + ADD $1024,R10,R10 // tab[2] + XOR R21,R22,R21 // xor done R22 + ADD R19,R10,R19 // &tab[2][p[5]] + ANDCC $255,R9,R20 // p[4] ?? + SLD $2,R20,R20 // p[4]*4 + MOVWZ 0(R19),R23 // tab[2][p[5]] + ADD $1024,R10,R10 // &tab[3] + ADD R20,R10,R20 // tab[3][p[4]] + XOR R21,R23,R21 // xor done R23 + ADD $1024,R10,R10 // &tab[4] + MOVWZ 0(R20),R24 // tab[3][p[4]] + ADD R10,R8,R23 // &tab[4][crc>>24] + XOR R21,R24,R21 // xor done R24 + MOVWZ 0(R23),R25 // tab[4][crc>>24] + RLDICL $48,R7,$56,R24 // crc>>16&0xFF + XOR R21,R25,R21 // xor done R25 + ADD $1024,R10,R10 // &tab[5] + SLD $2,R24,R24 // crc>>16&0xFF*4 + ADD R24,R10,R24 // &tab[5][crc>>16&0xFF] + MOVWZ 0(R24),R26 // tab[5][crc>>16&0xFF] + XOR R21,R26,R21 // xor done R26 + RLDICL $56,R7,$56,R25 // crc>>8 + ADD $1024,R10,R10 // &tab[6] + SLD $2,R25,R25 // crc>>8&FF*2 + ADD R25,R10,R25 // &tab[6][crc>>8&0xFF] + MOVBZ R7,R26 // crc&0xFF + ADD $1024,R10,R10 // &tab[7] + MOVWZ 0(R25),R27 // tab[6][crc>>8&0xFF] + SLD $2,R26,R26 // crc&0xFF*2 + XOR R21,R27,R21 // xor done R27 + ADD R26,R10,R26 // &tab[7][crc&0xFF] + ADD $8,R5 // p = p[8:] + MOVWZ 0(R26),R28 // tab[7][crc&0xFF] + XOR R21,R28,R21 // xor done R28 + MOVWZ R21,R7 // crc for next round + BC 16,0,loop // next 8 bytes + ANDCC $7,R6,R8 // any leftover bytes + BEQ done // none --> done + MOVD R8,CTR // byte count + +short: + MOVBZ 0(R5),R8 // get v + MOVBZ R7,R9 // byte(crc) -> R8 BE vs LE? + MOVWZ R7,R14 + SRD $8,R14,R14 // crc>>8 + XOR R8,R9,R8 // byte(crc)^v -> R8 + ADD $1,R5 // ptr to next v + SLD $2,R8 // convert index-> bytes + ADD R8,R4,R9 // &tab[byte(crc)^v] + MOVWZ 0(R9),R10 // tab[byte(crc)^v] + XOR R10,R14,R7 // loop crc in R7 + MOVWZ R7,R7 // 32 bits + BC 16,0,short +done: + NOR R7,R7,R7 // ^crc + MOVW R7,ret+40(FP) // return crc + RET + +#ifdef BYTESWAP_DATA +DATA ·byteswapcons+0(SB)/8,$0x0706050403020100 +DATA ·byteswapcons+8(SB)/8,$0x0f0e0d0c0b0a0908 + +GLOBL ·byteswapcons+0(SB),RODATA,$16 +#endif + +TEXT ·vectorCrc32(SB), NOSPLIT|NOFRAME, $0-36 + MOVWZ crc+0(FP), R3 // incoming crc + MOVWZ ctab+4(FP), R14 // crc poly id + MOVD p+8(FP), R4 + MOVD p_len+16(FP), R5 // p len + + // R3 = incoming crc + // R14 = constant table identifier + // R5 = address of bytes + // R6 = length of bytes + + // defines for index loads + + MOVD $16,off16 + MOVD $32,off32 + MOVD $48,off48 + MOVD $64,off64 + MOVD $80,off80 + MOVD $96,off96 + MOVD $112,off112 + MOVD $0,R15 + + MOVD R3,R10 // save initial crc + + NOR R3,R3,R3 // ^crc + MOVWZ R3,R3 // 32 bits + VXOR zeroes,zeroes,zeroes // clear the V reg + VSPLTISW $-1,V0 + VSLDOI $4,V29,V0,mask_32bit + VSLDOI $8,V29,V0,mask_64bit + + VXOR V8,V8,V8 + MTVSRD R3,VS40 // crc initial value VS40 = V8 + +#ifdef REFLECT + VSLDOI $8,zeroes,V8,V8 // or: VSLDOI V29,V8,V27,4 for top 32 bits? +#else + VSLDOI $4,V8,zeroes,V8 +#endif + +#ifdef BYTESWAP_DATA + MOVD $·byteswapcons(SB),R3 + LVX (R3),byteswap +#endif + + CMPU R5,$256 // length of bytes + BLT short + + RLDICR $0,R5,$56,R6 // chunk to process + + // First step for larger sizes +l1: MOVD $32768,R7 + MOVD R7,R9 + CMP R6,R7 // compare R6, R7 (MAX SIZE) + BGT top // less than MAX, just do remainder + MOVD R6,R7 +top: + SUB R7,R6,R6 + + // mainloop does 128 bytes at a time + SRD $7,R7 + + // determine the offset into the constants table to start with. + // Each constant is 128 bytes, used against 16 bytes of data. + SLD $4,R7,R8 + SRD $3,R9,R9 + SUB R8,R9,R8 + + // The last iteration is reduced in a separate step + ADD $-1,R7 + MOVD R7,CTR + + // Determine which constant table (depends on poly) + CMP R14,$1 + BNE castTable + MOVD $·IEEEConst(SB),R3 + BR startConst +castTable: + MOVD $·CastConst(SB),R3 + +startConst: + ADD R3,R8,R3 // starting point in constants table + + VXOR V0,V0,V0 // clear the V regs + VXOR V1,V1,V1 + VXOR V2,V2,V2 + VXOR V3,V3,V3 + VXOR V4,V4,V4 + VXOR V5,V5,V5 + VXOR V6,V6,V6 + VXOR V7,V7,V7 + + LVX (R3),const1 // loading constant values + + CMP R15,$1 // Identify warm up pass + BEQ next + + // First warm up pass: load the bytes to process + LVX (R4),V16 + LVX (R4+off16),V17 + LVX (R4+off32),V18 + LVX (R4+off48),V19 + LVX (R4+off64),V20 + LVX (R4+off80),V21 + LVX (R4+off96),V22 + LVX (R4+off112),V23 + ADD $128,R4 // bump up to next 128 bytes in buffer + + VXOR V16,V8,V16 // xor in initial CRC in V8 + +next: + BC 18,0,first_warm_up_done + + ADD $16,R3 // bump up to next constants + LVX (R3),const2 // table values + + VPMSUMD V16,const1,V8 // second warm up pass + LVX (R4),V16 // load from buffer + OR $0,R2,R2 + + VPMSUMD V17,const1,V9 // vpmsumd with constants + LVX (R4+off16),V17 // load next from buffer + OR $0,R2,R2 + + VPMSUMD V18,const1,V10 // vpmsumd with constants + LVX (R4+off32),V18 // load next from buffer + OR $0,R2,R2 + + VPMSUMD V19,const1,V11 // vpmsumd with constants + LVX (R4+off48),V19 // load next from buffer + OR $0,R2,R2 + + VPMSUMD V20,const1,V12 // vpmsumd with constants + LVX (R4+off64),V20 // load next from buffer + OR $0,R2,R2 + + VPMSUMD V21,const1,V13 // vpmsumd with constants + LVX (R4+off80),V21 // load next from buffer + OR $0,R2,R2 + + VPMSUMD V22,const1,V14 // vpmsumd with constants + LVX (R4+off96),V22 // load next from buffer + OR $0,R2,R2 + + VPMSUMD V23,const1,V15 // vpmsumd with constants + LVX (R4+off112),V23 // load next from buffer + + ADD $128,R4 // bump up to next 128 bytes in buffer + + BC 18,0,first_cool_down + +cool_top: + LVX (R3),const1 // constants + ADD $16,R3 // inc to next constants + OR $0,R2,R2 + + VXOR V0,V8,V0 // xor in previous vpmsumd + VPMSUMD V16,const2,V8 // vpmsumd with constants + LVX (R4),V16 // buffer + OR $0,R2,R2 + + VXOR V1,V9,V1 // xor in previous + VPMSUMD V17,const2,V9 // vpmsumd with constants + LVX (R4+off16),V17 // next in buffer + OR $0,R2,R2 + + VXOR V2,V10,V2 // xor in previous + VPMSUMD V18,const2,V10 // vpmsumd with constants + LVX (R4+off32),V18 // next in buffer + OR $0,R2,R2 + + VXOR V3,V11,V3 // xor in previous + VPMSUMD V19,const2,V11 // vpmsumd with constants + LVX (R4+off48),V19 // next in buffer + LVX (R3),const2 // get next constant + OR $0,R2,R2 + + VXOR V4,V12,V4 // xor in previous + VPMSUMD V20,const1,V12 // vpmsumd with constants + LVX (R4+off64),V20 // next in buffer + OR $0,R2,R2 + + VXOR V5,V13,V5 // xor in previous + VPMSUMD V21,const1,V13 // vpmsumd with constants + LVX (R4+off80),V21 // next in buffer + OR $0,R2,R2 + + VXOR V6,V14,V6 // xor in previous + VPMSUMD V22,const1,V14 // vpmsumd with constants + LVX (R4+off96),V22 // next in buffer + OR $0,R2,R2 + + VXOR V7,V15,V7 // xor in previous + VPMSUMD V23,const1,V15 // vpmsumd with constants + LVX (R4+off112),V23 // next in buffer + + ADD $128,R4 // bump up buffer pointer + BC 16,0,cool_top // are we done? + +first_cool_down: + + // load the constants + // xor in the previous value + // vpmsumd the result with constants + + LVX (R3),const1 + ADD $16,R3 + + VXOR V0,V8,V0 + VPMSUMD V16,const1,V8 + OR $0,R2,R2 + + VXOR V1,V9,V1 + VPMSUMD V17,const1,V9 + OR $0,R2,R2 + + VXOR V2,V10,V2 + VPMSUMD V18,const1,V10 + OR $0,R2,R2 + + VXOR V3,V11,V3 + VPMSUMD V19,const1,V11 + OR $0,R2,R2 + + VXOR V4,V12,V4 + VPMSUMD V20,const1,V12 + OR $0,R2,R2 + + VXOR V5,V13,V5 + VPMSUMD V21,const1,V13 + OR $0,R2,R2 + + VXOR V6,V14,V6 + VPMSUMD V22,const1,V14 + OR $0,R2,R2 + + VXOR V7,V15,V7 + VPMSUMD V23,const1,V15 + OR $0,R2,R2 + +second_cool_down: + + VXOR V0,V8,V0 + VXOR V1,V9,V1 + VXOR V2,V10,V2 + VXOR V3,V11,V3 + VXOR V4,V12,V4 + VXOR V5,V13,V5 + VXOR V6,V14,V6 + VXOR V7,V15,V7 + +#ifdef REFLECT + VSLDOI $4,V0,zeroes,V0 + VSLDOI $4,V1,zeroes,V1 + VSLDOI $4,V2,zeroes,V2 + VSLDOI $4,V3,zeroes,V3 + VSLDOI $4,V4,zeroes,V4 + VSLDOI $4,V5,zeroes,V5 + VSLDOI $4,V6,zeroes,V6 + VSLDOI $4,V7,zeroes,V7 +#endif + + LVX (R4),V8 + LVX (R4+off16),V9 + LVX (R4+off32),V10 + LVX (R4+off48),V11 + LVX (R4+off64),V12 + LVX (R4+off80),V13 + LVX (R4+off96),V14 + LVX (R4+off112),V15 + + ADD $128,R4 + + VXOR V0,V8,V16 + VXOR V1,V9,V17 + VXOR V2,V10,V18 + VXOR V3,V11,V19 + VXOR V4,V12,V20 + VXOR V5,V13,V21 + VXOR V6,V14,V22 + VXOR V7,V15,V23 + + MOVD $1,R15 + CMP $0,R6 + ADD $128,R6 + + BNE l1 + ANDCC $127,R5 + SUBC R5,$128,R6 + ADD R3,R6,R3 + + SRD $4,R5,R7 + MOVD R7,CTR + LVX (R3),V0 + LVX (R3+off16),V1 + LVX (R3+off32),V2 + LVX (R3+off48),V3 + LVX (R3+off64),V4 + LVX (R3+off80),V5 + LVX (R3+off96),V6 + LVX (R3+off112),V7 + + ADD $128,R3 + + VPMSUMW V16,V0,V0 + VPMSUMW V17,V1,V1 + VPMSUMW V18,V2,V2 + VPMSUMW V19,V3,V3 + VPMSUMW V20,V4,V4 + VPMSUMW V21,V5,V5 + VPMSUMW V22,V6,V6 + VPMSUMW V23,V7,V7 + + // now reduce the tail + + CMP $0,R7 + BEQ next1 + + LVX (R4),V16 + LVX (R3),V17 + VPMSUMW V16,V17,V16 + VXOR V0,V16,V0 + BC 18,0,next1 + + LVX (R4+off16),V16 + LVX (R3+off16),V17 + VPMSUMW V16,V17,V16 + VXOR V0,V16,V0 + BC 18,0,next1 + + LVX (R4+off32),V16 + LVX (R3+off32),V17 + VPMSUMW V16,V17,V16 + VXOR V0,V16,V0 + BC 18,0,next1 + + LVX (R4+off48),V16 + LVX (R3+off48),V17 + VPMSUMW V16,V17,V16 + VXOR V0,V16,V0 + BC 18,0,next1 + + LVX (R4+off64),V16 + LVX (R3+off64),V17 + VPMSUMW V16,V17,V16 + VXOR V0,V16,V0 + BC 18,0,next1 + + LVX (R4+off80),V16 + LVX (R3+off80),V17 + VPMSUMW V16,V17,V16 + VXOR V0,V16,V0 + BC 18,0,next1 + + LVX (R4+off96),V16 + LVX (R3+off96),V17 + VPMSUMW V16,V17,V16 + VXOR V0,V16,V0 + +next1: + VXOR V0,V1,V0 + VXOR V2,V3,V2 + VXOR V4,V5,V4 + VXOR V6,V7,V6 + VXOR V0,V2,V0 + VXOR V4,V6,V4 + VXOR V0,V4,V0 + +barrett_reduction: + + CMP R14,$1 + BNE barcstTable + MOVD $·IEEEBarConst(SB),R3 + BR startbarConst +barcstTable: + MOVD $·CastBarConst(SB),R3 + +startbarConst: + LVX (R3),const1 + LVX (R3+off16),const2 + + VSLDOI $8,V0,V0,V1 + VXOR V0,V1,V0 + +#ifdef REFLECT + VSPLTISB $1,V1 + VSL V0,V1,V0 +#endif + + VAND V0,mask_64bit,V0 + +#ifndef REFLECT + + VPMSUMD V0,const1,V1 + VSLDOI $8,zeroes,V1,V1 + VPMSUMD V1,const2,V1 + VXOR V0,V1,V0 + VSLDOI $8,V0,zeroes,V0 + +#else + + VAND V0,mask_32bit,V1 + VPMSUMD V1,const1,V1 + VAND V1,mask_32bit,V1 + VPMSUMD V1,const2,V1 + VXOR V0,V1,V0 + VSLDOI $4,V0,zeroes,V0 + +#endif + + MFVSRD VS32,R3 // VS32 = V0 + + NOR R3,R3,R3 // return ^crc + MOVW R3,ret+32(FP) + RET + +first_warm_up_done: + + LVX (R3),const1 + ADD $16,R3 + + VPMSUMD V16,const1,V8 + VPMSUMD V17,const1,V9 + VPMSUMD V18,const1,V10 + VPMSUMD V19,const1,V11 + VPMSUMD V20,const1,V12 + VPMSUMD V21,const1,V13 + VPMSUMD V22,const1,V14 + VPMSUMD V23,const1,V15 + + BR second_cool_down + +short: + CMP $0,R5 + BEQ zero + + // compute short constants + + CMP R14,$1 + BNE castshTable + MOVD $·IEEEConst(SB),R3 + ADD $4080,R3 + BR startshConst +castshTable: + MOVD $·CastConst(SB),R3 + ADD $4080,R3 + +startshConst: + SUBC R5,$256,R6 // sub from 256 + ADD R3,R6,R3 + + // calculate where to start + + SRD $4,R5,R7 + MOVD R7,CTR + + VXOR V19,V19,V19 + VXOR V20,V20,V20 + + LVX (R4),V0 + LVX (R3),V16 + VXOR V0,V8,V0 + VPMSUMW V0,V16,V0 + BC 18,0,v0 + + LVX (R4+off16),V1 + LVX (R3+off16),V17 + VPMSUMW V1,V17,V1 + BC 18,0,v1 + + LVX (R4+off32),V2 + LVX (R3+off32),V16 + VPMSUMW V2,V16,V2 + BC 18,0,v2 + + LVX (R4+off48),V3 + LVX (R3+off48),V17 + VPMSUMW V3,V17,V3 + BC 18,0,v3 + + LVX (R4+off64),V4 + LVX (R3+off64),V16 + VPMSUMW V4,V16,V4 + BC 18,0,v4 + + LVX (R4+off80),V5 + LVX (R3+off80),V17 + VPMSUMW V5,V17,V5 + BC 18,0,v5 + + LVX (R4+off96),V6 + LVX (R3+off96),V16 + VPMSUMW V6,V16,V6 + BC 18,0,v6 + + LVX (R4+off112),V7 + LVX (R3+off112),V17 + VPMSUMW V7,V17,V7 + BC 18,0,v7 + + ADD $128,R3 + ADD $128,R4 + + LVX (R4),V8 + LVX (R3),V16 + VPMSUMW V8,V16,V8 + BC 18,0,v8 + + LVX (R4+off16),V9 + LVX (R3+off16),V17 + VPMSUMW V9,V17,V9 + BC 18,0,v9 + + LVX (R4+off32),V10 + LVX (R3+off32),V16 + VPMSUMW V10,V16,V10 + BC 18,0,v10 + + LVX (R4+off48),V11 + LVX (R3+off48),V17 + VPMSUMW V11,V17,V11 + BC 18,0,v11 + + LVX (R4+off64),V12 + LVX (R3+off64),V16 + VPMSUMW V12,V16,V12 + BC 18,0,v12 + + LVX (R4+off80),V13 + LVX (R3+off80),V17 + VPMSUMW V13,V17,V13 + BC 18,0,v13 + + LVX (R4+off96),V14 + LVX (R3+off96),V16 + VPMSUMW V14,V16,V14 + BC 18,0,v14 + + LVX (R4+off112),V15 + LVX (R3+off112),V17 + VPMSUMW V15,V17,V15 + + VXOR V19,V15,V19 +v14: VXOR V20,V14,V20 +v13: VXOR V19,V13,V19 +v12: VXOR V20,V12,V20 +v11: VXOR V19,V11,V19 +v10: VXOR V20,V10,V20 +v9: VXOR V19,V9,V19 +v8: VXOR V20,V8,V20 +v7: VXOR V19,V7,V19 +v6: VXOR V20,V6,V20 +v5: VXOR V19,V5,V19 +v4: VXOR V20,V4,V20 +v3: VXOR V19,V3,V19 +v2: VXOR V20,V2,V20 +v1: VXOR V19,V1,V19 +v0: VXOR V20,V0,V20 + + VXOR V19,V20,V0 + + BR barrett_reduction + +zero: + // This case is the original crc, so just return it + MOVW R10,ret+32(FP) + RET |