Diffstat (limited to 'src/internal/bytealg/compare_ppc64x.s')
-rw-r--r-- | src/internal/bytealg/compare_ppc64x.s | 502
1 file changed, 502 insertions, 0 deletions
diff --git a/src/internal/bytealg/compare_ppc64x.s b/src/internal/bytealg/compare_ppc64x.s
new file mode 100644
index 0000000..cbe0525
--- /dev/null
+++ b/src/internal/bytealg/compare_ppc64x.s
@@ -0,0 +1,502 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build ppc64 || ppc64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
+	// incoming:
+	// R3 a addr -> R5
+	// R4 a len -> R3
+	// R5 a cap unused
+	// R6 b addr -> R6
+	// R7 b len -> R4
+	// R8 b cap unused
+	MOVD	R3, R5
+	MOVD	R4, R3
+	MOVD	R7, R4
+	CMP	R5,R6,CR7
+	CMP	R3,R4,CR6
+	BEQ	CR7,equal
+	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
+	CMP	R16,$1
+	BNE	power8
+	BR	cmpbodyp9<>(SB)
+power8:
+	BR	cmpbody<>(SB)
+equal:
+	BEQ	CR6,done
+	MOVD	$1, R8
+	BGT	CR6,greater
+	NEG	R8
+greater:
+	MOVD	R8, R3
+	RET
+done:
+	MOVD	$0, R3
+	RET
+
+TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
+	// incoming:
+	// R3 a addr -> R5
+	// R4 a len -> R3
+	// R5 b addr -> R6
+	// R6 b len -> R4
+	MOVD	R6, R7
+	MOVD	R5, R6
+	MOVD	R3, R5
+	MOVD	R4, R3
+	MOVD	R7, R4
+	CMP	R5,R6,CR7
+	CMP	R3,R4,CR6
+	BEQ	CR7,equal
+	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
+	CMP	R16,$1
+	BNE	power8
+	BR	cmpbodyp9<>(SB)
+power8:
+	BR	cmpbody<>(SB)
+equal:
+	BEQ	CR6,done
+	MOVD	$1, R8
+	BGT	CR6,greater
+	NEG	R8
+greater:
+	MOVD	R8, R3
+	RET
+
+done:
+	MOVD	$0, R3
+	RET
+
+#ifdef GOARCH_ppc64le
+DATA byteswap<>+0(SB)/8, $0x0706050403020100
+DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
+GLOBL byteswap<>+0(SB), RODATA, $16
+#define SWAP V21
+#endif
+
+// Do an efficient memcmp for ppc64le/ppc64/POWER8
+// R3 = a len
+// R4 = b len
+// R5 = a addr
+// R6 = b addr
+// On exit:
+// R3 = return value
+TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
+	MOVD	R3,R8		// set up length
+	CMP	R3,R4,CR2	// unequal?
+	BLT	CR2,setuplen	// BLT CR2
+	MOVD	R4,R8		// use R4 for comparison len
+setuplen:
+	CMP	R8,$32		// optimize >= 32
+	MOVD	R8,R9
+	BLT	setup8a		// optimize < 32
+	MOVD	$16,R10		// set offsets to load into vectors
+	CMP	R8,$64
+	BLT	cmp32		// process size 32-63
+
+	DCBT	(R5)		// optimize >= 64
+	DCBT	(R6)		// cache hint
+	MOVD	$32,R11		// set offsets to load into vector
+	MOVD	$48,R12		// set offsets to load into vector
+
+loop64a:// process size 64 and greater
+	LXVD2X	(R5)(R0),V3	// load bytes of A at offset 0 into vector
+	LXVD2X	(R6)(R0),V4	// load bytes of B at offset 0 into vector
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different	// jump out if it's different
+
+	LXVD2X	(R5)(R10),V3	// load bytes of A at offset 16 into vector
+	LXVD2X	(R6)(R10),V4	// load bytes of B at offset 16 into vector
+
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	LXVD2X	(R5)(R11),V3	// load bytes of A at offset 32 into vector
+	LXVD2X	(R6)(R11),V4	// load bytes of B at offset 32 into vector
+
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	LXVD2X	(R5)(R12),V3	// load bytes of A at offset 48 into vector
+	LXVD2X	(R6)(R12),V4	// load bytes of B at offset 48 into vector
+
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	ADD	$-64,R9,R9	// reduce remaining size by 64
+	ADD	$64,R5,R5	// increment to next 64 bytes of A
+	ADD	$64,R6,R6	// increment to next 64 bytes of B
+	CMPU	R9,$64
+	BGE	loop64a		// loop back to loop64a only if there are >= 64 bytes remaining
+
+	CMPU	R9,$32
+	BGE	cmp32		// loop to cmp32 if there are 32-64 bytes remaining
+	CMPU	R9,$0
+	BNE	rem		// loop to rem if the remainder is not 0
+
+	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
+	BLT	CR2,less	// jump to less if len(A)<len(B)
+	BR	greater		// jump to greater otherwise
+cmp32:
+	LXVD2X	(R5)(R0),V3	// load bytes of A at offset 0 into vector
+	LXVD2X	(R6)(R0),V4	// load bytes of B at offset 0 into vector
+
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	LXVD2X	(R5)(R10),V3	// load bytes of A at offset 16 into vector
+	LXVD2X	(R6)(R10),V4	// load bytes of B at offset 16 into vector
+
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	ADD	$-32,R9,R9	// reduce remaining size by 32
+	ADD	$32,R5,R5	// increment to next 32 bytes of A
+	ADD	$32,R6,R6	// increment to next 32 bytes of B
+	CMPU	R9,$0
+	BNE	rem		// loop to rem if the remainder is not 0
+	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
+	BLT	CR2,less	// jump to less if len(A)<len(B)
+	BR	greater		// jump to greater otherwise
+rem:
+	MOVD	R9,R8
+	ANDCC	$24,R8,R9	// Any 8 byte chunks?
+	BEQ	leftover	// and result is 0
+	BR	setup8a
+
+different:
+#ifdef GOARCH_ppc64le
+	MOVD	$byteswap<>+00(SB), R16
+	LXVD2X	(R16)(R0),SWAP	// Set up swap string
+
+	VPERM	V3,V3,SWAP,V3
+	VPERM	V4,V4,SWAP,V4
+#endif
+	MFVSRD	VS35,R16	// move upper doublewords of A and B into GPR for comparison
+	MFVSRD	VS36,R10
+
+	CMPU	R16,R10
+	BEQ	lower
+	BGT	greater
+	MOVD	$-1,R3		// return value if A < B
+	RET
+lower:
+	VSLDOI	$8,V3,V3,V3	// move lower doublewords of A and B into GPR for comparison
+	MFVSRD	VS35,R16
+	VSLDOI	$8,V4,V4,V4
+	MFVSRD	VS36,R10
+
+	CMPU	R16,R10
+	BGT	greater
+	MOVD	$-1,R3		// return value if A < B
+	RET
+setup8a:
+	SRADCC	$3,R8,R9	// get the 8 byte count
+	BEQ	leftover	// shifted value is 0
+	CMPU	R8,$8		// optimize 8byte move
+	BEQ	size8
+	CMPU	R8,$16
+	BEQ	size16
+	MOVD	R9,CTR		// loop count for doublewords
+loop8:
+#ifdef GOARCH_ppc64le
+	MOVDBR	(R5+R0),R16	// doublewords to compare
+	MOVDBR	(R6+R0),R10	// LE compare order
+#else
+	MOVD	(R5+R0),R16	// doublewords to compare
+	MOVD	(R6+R0),R10	// BE compare order
+#endif
+	ADD	$8,R5
+	ADD	$8,R6
+	CMPU	R16,R10		// match?
+	BC	8,2,loop8	// bt ctr <> 0 && cr
+	BGT	greater
+	BLT	less
+leftover:
+	ANDCC	$7,R8,R9	// check for leftover bytes
+	BEQ	zeroremainder
+simplecheck:
+	MOVD	R0,R14
+	CMP	R9,$4		// process 4 bytes
+	BLT	halfword
+#ifdef GOARCH_ppc64le
+	MOVWBR	(R5)(R14),R10
+	MOVWBR	(R6)(R14),R11
+#else
+	MOVWZ	(R5)(R14),R10
+	MOVWZ	(R6)(R14),R11
+#endif
+	CMPU	R10,R11
+	BGT	greater
+	BLT	less
+	ADD	$-4,R9
+	ADD	$4,R14
+	PCALIGN	$16
+
+halfword:
+	CMP	R9,$2		// process 2 bytes
+	BLT	byte
+#ifdef GOARCH_ppc64le
+	MOVHBR	(R5)(R14),R10
+	MOVHBR	(R6)(R14),R11
+#else
+	MOVHZ	(R5)(R14),R10
+	MOVHZ	(R6)(R14),R11
+#endif
+	CMPU	R10,R11
+	BGT	greater
+	BLT	less
+	ADD	$-2,R9
+	ADD	$2,R14
+	PCALIGN	$16
+byte:
+	CMP	R9,$0		// process 1 byte
+	BEQ	skip
+	MOVBZ	(R5)(R14),R10
+	MOVBZ	(R6)(R14),R11
+	CMPU	R10,R11
+	BGT	greater
+	BLT	less
+	PCALIGN	$16
+skip:
+	BEQ	CR2,equal
+	BGT	CR2,greater
+
+less:	MOVD	$-1,R3		// return value if A < B
+	RET
+size16:
+	LXVD2X	(R5)(R0),V3	// load bytes of A at offset 0 into vector
+	LXVD2X	(R6)(R0),V4	// load bytes of B at offset 0 into vector
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+zeroremainder:
+	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
+	BLT	CR2,less	// jump to less if len(A)<len(B)
+	BR	greater		// jump to greater otherwise
+size8:
+#ifdef GOARCH_ppc64le
+	MOVDBR	(R5+R0),R16	// doublewords to compare
+	MOVDBR	(R6+R0),R10	// LE compare order
+#else
+	MOVD	(R5+R0),R16	// doublewords to compare
+	MOVD	(R6+R0),R10	// BE compare order
+#endif
+	CMPU	R16,R10		// match?
+	BGT	greater
+	BLT	less
+	BGT	CR2,greater	// 2nd len > 1st len
+	BLT	CR2,less	// 2nd len < 1st len
+equal:
+	MOVD	$0, R3		// return value if A == B
+	RET
+greater:
+	MOVD	$1,R3		// return value if A > B
+	RET
+
+// Do an efficient memcmp for ppc64le/ppc64/POWER9
+// R3 = a len
+// R4 = b len
+// R5 = a addr
+// R6 = b addr
+// On exit:
+// R3 = return value
+TEXT cmpbodyp9<>(SB),NOSPLIT|NOFRAME,$0-0
+	MOVD	R3,R8		// set up length
+	CMP	R3,R4,CR2	// unequal?
+	BLT	CR2,setuplen	// BLT CR2
+	MOVD	R4,R8		// use R4 for comparison len
+setuplen:
+	CMP	R8,$16		// optimize for size<16
+	MOVD	R8,R9
+	BLT	simplecheck
+	MOVD	$16,R10		// set offsets to load into vectors
+	CMP	R8,$32		// optimize for size 16-31
+	BLT	cmp16
+	CMP	R8,$64
+	BLT	cmp32		// optimize for size 32-63
+	DCBT	(R5)		// optimize for size>=64
+	DCBT	(R6)		// cache hint
+
+	MOVD	$32,R11		// set offsets to load into vector
+	MOVD	$48,R12		// set offsets to load into vector
+
+loop64a:// process size 64 and greater
+	LXVB16X	(R0)(R5),V3	// load bytes of A at offset 0 into vector
+	LXVB16X	(R0)(R6),V4	// load bytes of B at offset 0 into vector
+	VCMPNEBCC	V3,V4,V1	// record comparison into V1
+	BNE	CR6,different	// jump out if it's different
+
+	LXVB16X	(R10)(R5),V3	// load bytes of A at offset 16 into vector
+	LXVB16X	(R10)(R6),V4	// load bytes of B at offset 16 into vector
+	VCMPNEBCC	V3,V4,V1
+	BNE	CR6,different
+
+	LXVB16X	(R11)(R5),V3	// load bytes of A at offset 32 into vector
+	LXVB16X	(R11)(R6),V4	// load bytes of B at offset 32 into vector
+	VCMPNEBCC	V3,V4,V1
+	BNE	CR6,different
+
+	LXVB16X	(R12)(R5),V3	// load bytes of A at offset 48 into vector
+	LXVB16X	(R12)(R6),V4	// load bytes of B at offset 48 into vector
+	VCMPNEBCC	V3,V4,V1
+	BNE	CR6,different
+
+	ADD	$-64,R9,R9	// reduce remaining size by 64
+	ADD	$64,R5,R5	// increment to next 64 bytes of A
+	ADD	$64,R6,R6	// increment to next 64 bytes of B
+	CMPU	R9,$64
+	BGE	loop64a		// loop back to loop64a only if there are >= 64 bytes remaining
+
+	CMPU	R9,$32
+	BGE	cmp32		// loop to cmp32 if there are 32-64 bytes remaining
+	CMPU	R9,$16
+	BGE	cmp16		// loop to cmp16 if there are 16-31 bytes left
+	CMPU	R9,$0
+	BNE	simplecheck	// loop to simplecheck for remaining bytes
+
+	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
+	BLT	CR2,less	// jump to less if len(A)<len(B)
+	BR	greater		// jump to greater otherwise
+cmp32:
+	LXVB16X	(R0)(R5),V3	// load bytes of A at offset 0 into vector
+	LXVB16X	(R0)(R6),V4	// load bytes of B at offset 0 into vector
+
+	VCMPNEBCC	V3,V4,V1	// record comparison into V1
+	BNE	CR6,different	// jump out if it's different
+
+	LXVB16X	(R10)(R5),V3	// load bytes of A at offset 16 into vector
+	LXVB16X	(R10)(R6),V4	// load bytes of B at offset 16 into vector
+	VCMPNEBCC	V3,V4,V1
+	BNE	CR6,different
+
+	ADD	$-32,R9,R9	// reduce remaining size by 32
+	ADD	$32,R5,R5	// increment to next 32 bytes of A
+	ADD	$32,R6,R6	// increment to next 32 bytes of B
+	CMPU	R9,$16		// loop to cmp16 if there are 16-31 bytes left
+	BGE	cmp16
+	CMPU	R9,$0
+	BNE	simplecheck	// loop to simplecheck for remainder bytes
+	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
+	BLT	CR2,less	// jump to less if len(A)<len(B)
+	BR	greater		// jump to greater otherwise
+different:
+
+	MFVSRD	VS35,R16	// move upper doublewords of A and B into GPR for comparison
+	MFVSRD	VS36,R10
+
+	CMPU	R16,R10
+	BEQ	lower
+	BGT	greater
+	MOVD	$-1,R3		// return value if A < B
+	RET
+lower:
+	MFVSRLD	VS35,R16	// next move lower doublewords of A and B into GPR for comparison
+	MFVSRLD	VS36,R10
+
+	CMPU	R16,R10
+	BGT	greater
+	MOVD	$-1,R3		// return value if A < B
+	RET
+
+greater:
+	MOVD	$1,R3		// return value if A > B
+	RET
+cmp16:
+	ANDCC	$16,R9,R31
+	BEQ	tail
+
+	LXVB16X	(R0)(R5),V3	// load 16 bytes of A into vector
+	LXVB16X	(R0)(R6),V4	// load 16 bytes of B into vector
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	ADD	$16,R5
+	ADD	$16,R6
+tail:
+	ANDCC	$15,R9		// Load the last 16 bytes (we know there are at least 32b)
+	BEQ	end
+
+	ADD	R9,R5
+	ADD	R9,R6
+	MOVD	$-16,R10
+
+	LXVB16X	(R10)(R5),V3	// load last 16 bytes of A into vector
+	LXVB16X	(R10)(R6),V4	// load last 16 bytes of B into vector
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+end:
+	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
+	BLT	CR2,less	// jump to less if len(A)<len(B)
+	BR	greater		// jump to greater otherwise
+simplecheck:
+	MOVD	$0,R14		// process 8 bytes
+	CMP	R9,$8
+	BLT	word
+#ifdef GOARCH_ppc64le
+	MOVDBR	(R5+R14),R10
+	MOVDBR	(R6+R14),R11
+#else
+	MOVD	(R5+R14),R10
+	MOVD	(R6+R14),R11
+#endif
+	CMPU	R10,R11
+	BGT	greater
+	BLT	less
+	ADD	$8,R14
+	ADD	$-8,R9
+	PCALIGN	$16
+word:
+	CMP	R9,$4		// process 4 bytes
+	BLT	halfword
+#ifdef GOARCH_ppc64le
+	MOVWBR	(R5+R14),R10
+	MOVWBR	(R6+R14),R11
+#else
+	MOVWZ	(R5+R14),R10
+	MOVWZ	(R6+R14),R11
+#endif
+	CMPU	R10,R11
+	BGT	greater
+	BLT	less
+	ADD	$4,R14
+	ADD	$-4,R9
+	PCALIGN	$16
+halfword:
+	CMP	R9,$2		// process 2 bytes
+	BLT	byte
+#ifdef GOARCH_ppc64le
+	MOVHBR	(R5+R14),R10
+	MOVHBR	(R6+R14),R11
+#else
+	MOVHZ	(R5+R14),R10
+	MOVHZ	(R6+R14),R11
+#endif
+	CMPU	R10,R11
+	BGT	greater
+	BLT	less
+	ADD	$2,R14
+	ADD	$-2,R9
+	PCALIGN	$16
+byte:
+	CMP	R9,$0		// process 1 byte
+	BEQ	skip
+	MOVBZ	(R5+R14),R10
+	MOVBZ	(R6+R14),R11
+	CMPU	R10,R11
+	BGT	greater
+	BLT	less
+	PCALIGN	$16
+skip:
+	BEQ	CR2,equal
+	BGT	CR2,greater
+less:
+	MOVD	$-1,R3		// return value if A < B
+	RET
+equal:
+	MOVD	$0, R3		// return value if A == B
+	RET
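Note (not part of the diff): ·Compare is the internal/bytealg entry point that backs bytes.Compare, and runtime·cmpstring backs string comparisons, so both assembly bodies must return -1, 0, or +1 in R3, with equal prefixes broken by length via the CR2 compare set up on entry. A minimal Go sketch of that contract, for illustration only:

package main

import (
	"bytes"
	"fmt"
)

func main() {
	// On ppc64/ppc64le these calls dispatch to the assembly above
	// (POWER9 body if available, otherwise the POWER8 body).
	fmt.Println(bytes.Compare([]byte("abc"), []byte("abd")))  // -1: bytes differ inside the common prefix
	fmt.Println(bytes.Compare([]byte("abc"), []byte("abc")))  // 0: same bytes, same length
	fmt.Println(bytes.Compare([]byte("abcd"), []byte("abc"))) // 1: equal prefix, first argument longer (the CR2 length tie-break)
}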