Diffstat (limited to 'src/internal/bytealg/compare_ppc64x.s')
-rw-r--r-- | src/internal/bytealg/compare_ppc64x.s | 342 |
1 file changed, 342 insertions, 0 deletions
diff --git a/src/internal/bytealg/compare_ppc64x.s b/src/internal/bytealg/compare_ppc64x.s
new file mode 100644
index 0000000..2629251
--- /dev/null
+++ b/src/internal/bytealg/compare_ppc64x.s
@@ -0,0 +1,342 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build ppc64 || ppc64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// Helper names for x-form loads in BE ordering.
+#ifdef GOARCH_ppc64le
+#define _LDBEX MOVDBR
+#define _LWBEX MOVWBR
+#define _LHBEX MOVHBR
+#else
+#define _LDBEX MOVD
+#define _LWBEX MOVW
+#define _LHBEX MOVH
+#endif
+
+#ifdef GOPPC64_power9
+#define SETB_CR0(rout) SETB CR0, rout
+#define SETB_CR1(rout) SETB CR1, rout
+#define SETB_INIT()
+#define SETB_CR0_NE(rout) SETB_CR0(rout)
+#else
+// A helper macro to emulate SETB on P8. This assumes
+// -1 is in R20, and 1 is in R21. crxlt and crxeq must
+// also refer to the same CR field.
+#define _SETB(crxlt, crxeq, rout) \
+	ISEL	crxeq,R0,R21,rout \
+	ISEL	crxlt,R20,rout,rout
+
+// A special case when it is known the comparison
+// will always be not equal. The result must be -1 or 1.
+#define SETB_CR0_NE(rout) \
+	ISEL	CR0LT,R20,R21,rout
+
+#define SETB_CR0(rout) _SETB(CR0LT, CR0EQ, rout)
+#define SETB_CR1(rout) _SETB(CR1LT, CR1EQ, rout)
+#define SETB_INIT() \
+	MOVD	$-1,R20 \
+	MOVD	$1,R21
+#endif
+
+TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
+	// incoming:
+	// R3 a addr
+	// R4 a len
+	// R6 b addr
+	// R7 b len
+	//
+	// on entry to cmpbody:
+	// R3 return value if len(a) == len(b)
+	// R5 a addr
+	// R6 b addr
+	// R9 min(len(a),len(b))
+	SETB_INIT()
+	MOVD	R3,R5
+	CMP	R4,R7,CR0
+	CMP	R3,R6,CR7
+	ISEL	CR0LT,R4,R7,R9
+	SETB_CR0(R3)
+	BC	$12,30,LR	// beqlr cr7
+	BR	cmpbody<>(SB)
+
+TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
+	// incoming:
+	// R3 a addr -> R5
+	// R4 a len -> R3
+	// R5 b addr -> R6
+	// R6 b len -> R4
+	//
+	// on entry to cmpbody:
+	// R3 return value if the compared lengths are the same
+	// R5 a addr
+	// R6 b addr
+	// R9 min(len(a),len(b))
+	SETB_INIT()
+	CMP	R4,R6,CR0
+	CMP	R3,R5,CR7
+	ISEL	CR0LT,R4,R6,R9
+	MOVD	R5,R6
+	MOVD	R3,R5
+	SETB_CR0(R3)
+	BC	$12,30,LR	// beqlr cr7
+	BR	cmpbody<>(SB)
+
+#ifdef GOARCH_ppc64le
+DATA byteswap<>+0(SB)/8, $0x0706050403020100
+DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
+GLOBL byteswap<>+0(SB), RODATA, $16
+#define SWAP V21
+#endif
+
+TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
+start:
+	CMP	R9,$16,CR0
+	CMP	R9,$32,CR1
+	CMP	R9,$64,CR2
+	MOVD	$16,R10
+	BLT	cmp8
+	BLT	CR1,cmp16
+	BLT	CR2,cmp32
+
+cmp64:	// >= 64B
+	DCBT	(R5)	// optimize for size>=64
+	DCBT	(R6)	// cache hint
+
+	SRD	$6,R9,R14	// There is at least one iteration.
+	MOVD	R14,CTR
+	ANDCC	$63,R9,R9
+	CMP	R9,$16,CR1	// Do setup for tail check early on.
+	CMP	R9,$32,CR2
+	CMP	R9,$48,CR3
+	ADD	$-16,R9,R9
+
+	MOVD	$32,R11	// set offsets to load into vector
+	MOVD	$48,R12	// set offsets to load into vector
+
+	PCALIGN	$16
+cmp64_loop:
+	LXVD2X	(R5)(R0),V3	// load bytes of A at offset 0 into vector
+	LXVD2X	(R6)(R0),V4	// load bytes of B at offset 0 into vector
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different	// jump out if it's different
+
+	LXVD2X	(R5)(R10),V3	// load bytes of A at offset 16 into vector
+	LXVD2X	(R6)(R10),V4	// load bytes of B at offset 16 into vector
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	LXVD2X	(R5)(R11),V3	// load bytes of A at offset 32 into vector
+	LXVD2X	(R6)(R11),V4	// load bytes of B at offset 32 into vector
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	LXVD2X	(R5)(R12),V3	// load bytes of A at offset 48 into vector
+	LXVD2X	(R6)(R12),V4	// load bytes of B at offset 48 into vector
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	ADD	$64,R5,R5	// increment to next 64 bytes of A
+	ADD	$64,R6,R6	// increment to next 64 bytes of B
+	BDNZ	cmp64_loop
+	BC	$12,2,LR	// beqlr
+
+	// Finish out tail with minimal overlapped checking.
+	// Note, 0 tail is handled by beqlr above.
+	BLE	CR1,cmp64_tail_gt0
+	BLE	CR2,cmp64_tail_gt16
+	BLE	CR3,cmp64_tail_gt32
+
+cmp64_tail_gt48:	// 49 - 63 B
+	LXVD2X	(R0)(R5),V3
+	LXVD2X	(R0)(R6),V4
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	LXVD2X	(R5)(R10),V3
+	LXVD2X	(R6)(R10),V4
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	LXVD2X	(R5)(R11),V3
+	LXVD2X	(R6)(R11),V4
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	BR	cmp64_tail_gt0
+
+	PCALIGN	$16
+cmp64_tail_gt32:	// 33 - 48B
+	LXVD2X	(R0)(R5),V3
+	LXVD2X	(R0)(R6),V4
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	LXVD2X	(R5)(R10),V3
+	LXVD2X	(R6)(R10),V4
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	BR	cmp64_tail_gt0
+
+	PCALIGN	$16
+cmp64_tail_gt16:	// 17 - 32B
+	LXVD2X	(R0)(R5),V3
+	LXVD2X	(R0)(R6),V4
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	BR	cmp64_tail_gt0
+
+	PCALIGN	$16
+cmp64_tail_gt0:	// 1 - 16B
+	LXVD2X	(R5)(R9),V3
+	LXVD2X	(R6)(R9),V4
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	RET
+
+	PCALIGN	$16
+cmp32:	// 32 - 63B
+	ANDCC	$31,R9,R9
+
+	LXVD2X	(R0)(R5),V3
+	LXVD2X	(R0)(R6),V4
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	LXVD2X	(R10)(R5),V3
+	LXVD2X	(R10)(R6),V4
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	BC	$12,2,LR	// beqlr
+	ADD	R9,R10,R10
+
+	LXVD2X	(R9)(R5),V3
+	LXVD2X	(R9)(R6),V4
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	LXVD2X	(R10)(R5),V3
+	LXVD2X	(R10)(R6),V4
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+	RET
+
+	PCALIGN	$16
+cmp16:	// 16 - 31B
+	ANDCC	$15,R9,R9
+	LXVD2X	(R0)(R5),V3
+	LXVD2X	(R0)(R6),V4
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+	BC	$12,2,LR	// beqlr
+
+	LXVD2X	(R9)(R5),V3
+	LXVD2X	(R9)(R6),V4
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+	RET
+
+	PCALIGN	$16
+different:
+#ifdef GOARCH_ppc64le
+	MOVD	$byteswap<>+00(SB),R16
+	LXVD2X	(R16)(R0),SWAP	// Set up swap string
+
+	VPERM	V3,V3,SWAP,V3
+	VPERM	V4,V4,SWAP,V4
+#endif
+
+	MFVSRD	VS35,R16	// move upper doublewords of A and B into GPR for comparison
+	MFVSRD	VS36,R10
+
+	CMPU	R16,R10
+	BEQ	lower
+	SETB_CR0_NE(R3)
+	RET
+
+	PCALIGN	$16
+lower:
+	VSLDOI	$8,V3,V3,V3	// move lower doublewords of A and B into GPR for comparison
+	MFVSRD	VS35,R16
+	VSLDOI	$8,V4,V4,V4
+	MFVSRD	VS36,R10
+
+	CMPU	R16,R10
+	SETB_CR0_NE(R3)
+	RET
+
+	PCALIGN	$16
+cmp8:	// 8 - 15B (0 - 15B if GOPPC64_power10)
+#ifdef GOPPC64_power10
+	SLD	$56,R9,R9
+	LXVLL	R5,R9,V3	// Load bytes starting from MSB to LSB, unused are zero filled.
+	LXVLL	R6,R9,V4
+	VCMPUQ	V3,V4,CR0	// Compare as a 128b integer.
+	SETB_CR0(R6)
+	ISEL	CR0EQ,R3,R6,R3	// If equal, length determines the return value.
+	RET
+#else
+	CMP	R9,$8
+	BLT	cmp4
+	ANDCC	$7,R9,R9
+	_LDBEX	(R0)(R5),R10
+	_LDBEX	(R0)(R6),R11
+	_LDBEX	(R9)(R5),R12
+	_LDBEX	(R9)(R6),R14
+	CMPU	R10,R11,CR0
+	SETB_CR0(R5)
+	CMPU	R12,R14,CR1
+	SETB_CR1(R6)
+	CRAND	CR0EQ,CR1EQ,CR1EQ	// If both equal, length determines return value.
+	ISEL	CR0EQ,R6,R5,R4
+	ISEL	CR1EQ,R3,R4,R3
+	RET
+
+	PCALIGN	$16
+cmp4:	// 4 - 7B
+	CMP	R9,$4
+	BLT	cmp2
+	ANDCC	$3,R9,R9
+	_LWBEX	(R0)(R5),R10
+	_LWBEX	(R0)(R6),R11
+	_LWBEX	(R9)(R5),R12
+	_LWBEX	(R9)(R6),R14
+	RLDIMI	$32,R10,$0,R12
+	RLDIMI	$32,R11,$0,R14
+	CMPU	R12,R14
+	BR	cmp0
+
+	PCALIGN	$16
+cmp2:	// 2 - 3B
+	CMP	R9,$2
+	BLT	cmp1
+	ANDCC	$1,R9,R9
+	_LHBEX	(R0)(R5),R10
+	_LHBEX	(R0)(R6),R11
+	_LHBEX	(R9)(R5),R12
+	_LHBEX	(R9)(R6),R14
+	RLDIMI	$32,R10,$0,R12
+	RLDIMI	$32,R11,$0,R14
+	CMPU	R12,R14
+	BR	cmp0
+
+	PCALIGN	$16
+cmp1:
+	CMP	R9,$0
+	BEQ	cmp0
+	MOVBZ	(R5),R10
+	MOVBZ	(R6),R11
+	CMPU	R10,R11
+cmp0:
+	SETB_CR0(R6)
+	ISEL	CR0EQ,R3,R6,R3
+	RET
+#endif
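
For reference, the contract this assembly implements can be stated in a few lines of Go. The sketch below mirrors the portable fallback semantics of internal/bytealg.Compare (and bytes.Compare), not the vectorized code path; the function and variable names are illustrative only. It compares the first min(len(a), len(b)) bytes, and if all of them match, lets the operand lengths decide, which is exactly the result the entry points above precompute into R3 before branching to cmpbody.

	package main

	import "fmt"

	// compare returns -1, 0, or +1 with the same contract as
	// bytes.Compare. The assembly entry points precompute the
	// length-based result in R3 and min(len(a), len(b)) in R9.
	func compare(a, b []byte) int {
		n := len(a)
		if len(b) < n {
			n = len(b)
		}
		// The first mismatching byte decides the result, just as
		// the first unequal 16-byte chunk sends cmpbody to the
		// "different" label.
		for i := 0; i < n; i++ {
			switch {
			case a[i] < b[i]:
				return -1
			case a[i] > b[i]:
				return 1
			}
		}
		// All compared bytes are equal: the shorter operand sorts first.
		switch {
		case len(a) < len(b):
			return -1
		case len(a) > len(b):
			return 1
		}
		return 0
	}

	func main() {
		fmt.Println(compare([]byte("abc"), []byte("abd")))  // -1
		fmt.Println(compare([]byte("abcd"), []byte("abc"))) // 1
		fmt.Println(compare([]byte("abc"), []byte("abc")))  // 0
	}

The SETB/ISEL macros at the top of the file encode this same three-way result (-1, 0, or 1) from a single CR-field comparison, which is why the pre-P9 path keeps -1 and 1 preloaded in R20 and R21 via SETB_INIT.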