Diffstat (limited to 'src/internal/bytealg/indexbyte_ppc64x.s')
-rw-r--r-- | src/internal/bytealg/indexbyte_ppc64x.s | 314
1 file changed, 314 insertions, 0 deletions
diff --git a/src/internal/bytealg/indexbyte_ppc64x.s b/src/internal/bytealg/indexbyte_ppc64x.s
new file mode 100644
index 0000000..b6714f4
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_ppc64x.s
@@ -0,0 +1,314 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build ppc64 || ppc64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
+	// R3 = byte array pointer
+	// R4 = length
+	MOVD	R6, R5		// R5 = byte
+	BR	indexbytebody<>(SB)
+
+TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32
+	// R3 = string
+	// R4 = length
+	// R5 = byte
+	BR	indexbytebody<>(SB)
+
+#ifndef GOPPC64_power9
+#ifdef GOARCH_ppc64le
+DATA indexbytevbperm<>+0(SB)/8, $0x3830282018100800
+DATA indexbytevbperm<>+8(SB)/8, $0x7870686058504840
+#else
+DATA indexbytevbperm<>+0(SB)/8, $0x0008101820283038
+DATA indexbytevbperm<>+8(SB)/8, $0x4048505860687078
+#endif
+GLOBL indexbytevbperm<>+0(SB), RODATA, $16
+#endif
+
+// Some operations are endian specific; choose the correct opcode based on GOARCH.
+// Note, _VCZBEBB is only available on power9 and newer.
+#ifdef GOARCH_ppc64le
+#define _LDBEX	MOVDBR
+#define _LWBEX	MOVWBR
+#define _LHBEX	MOVHBR
+#define _VCZBEBB	VCTZLSBB
+#else
+#define _LDBEX	MOVD
+#define _LWBEX	MOVW
+#define _LHBEX	MOVH
+#define _VCZBEBB	VCLZLSBB
+#endif
+
+// R3 = addr of string
+// R4 = len of string
+// R5 = byte to find
+// On exit:
+// R3 = return value
+TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
+	CMPU	R4,$32
+
+#ifndef GOPPC64_power9
+	// Load VBPERMQ constant to reduce compare into an ordered bit mask.
+	MOVD	$indexbytevbperm<>+00(SB),R16
+	LXVD2X	(R16),V0	// Set up swap string
+#endif
+
+	MTVRD	R5,V1
+	VSPLTB	$7,V1,V1	// Replicate byte across V1
+
+	BLT	cmp16		// Jump to the small string case if it's <32 bytes.
+
+	CMP	R4,$64,CR1
+	MOVD	$16,R11
+	MOVD	R3,R8
+	BLT	CR1,cmp32	// Special case for length 32 - 63
+	MOVD	$32,R12
+	MOVD	$48,R6
+
+	RLDICR	$0,R4,$63-6,R9	// R9 = len &^ 63
+	ADD	R3,R9,R9	// R9 = &s[len &^ 63]
+	ANDCC	$63,R4		// (len &= 63) cmp 0.
+
+	PCALIGN	$16
+loop64:
+	LXVD2X	(R0)(R8),V2	// Scan 64 bytes at a time, starting at &s[0]
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat0	// Match found at R8, jump out
+
+	LXVD2X	(R11)(R8),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat1	// Match found at R8+16 bytes, jump out
+
+	LXVD2X	(R12)(R8),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat2	// Match found at R8+32 bytes, jump out
+
+	LXVD2X	(R6)(R8),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat3	// Match found at R8+48 bytes, jump out
+
+	ADD	$64,R8
+	CMPU	R8,R9,CR1
+	BNE	CR1,loop64	// R8 != &s[len &^ 63]?
+
+	PCALIGN	$32
+	BEQ	notfound	// Is tail length 0? CR0 is set before entering loop64.
+
+	CMP	R4,$32		// Tail length >= 32, use cmp32 path.
+	CMP	R4,$16,CR1
+	BGE	cmp32
+
+	ADD	R8,R4,R9
+	ADD	$-16,R9
+	BLE	CR1,cmp64_tail_gt0
+
+cmp64_tail_gt16:	// Tail length 17 - 32
+	LXVD2X	(R0)(R8),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat0
+
+cmp64_tail_gt0:	// Tail length 1 - 16
+	MOVD	R9,R8
+	LXVD2X	(R0)(R9),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat0
+
+	BR	notfound
+
+cmp32:	// Length 32 - 63
+
+	// Bytes 0 - 15
+	LXVD2X	(R0)(R8),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat0
+
+	// Bytes 16 - 31
+	LXVD2X	(R8)(R11),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat1	// Match found at R8+16 bytes, jump out
+
+	BEQ	notfound	// Is length <= 32? (CR0 holds this comparison on entry to cmp32)
+	CMP	R4,$48
+
+	ADD	R4,R8,R9	// Compute &s[len(s)-16]
+	ADD	$32,R8,R8
+	ADD	$-16,R9,R9
+	ISEL	CR0GT,R8,R9,R8	// R8 = len(s) <= 48 ? R9 : R8
+
+	// Bytes 32 - 47
+	LXVD2X	(R0)(R8),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat0	// Match found at R8, jump out
+
+	BLE	notfound
+
+	// Bytes 48 - 63
+	MOVD	R9,R8		// R9 holds the final check.
+	LXVD2X	(R0)(R9),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat0	// Match found at R8, jump out
+
+	BR	notfound
+
+// If ISA 3.0 instructions are unavailable, we need to account for the extra 16 added by CNTLZW.
+#ifndef GOPPC64_power9
+#define ADJUST_FOR_CNTLZW	-16
+#else
+#define ADJUST_FOR_CNTLZW	0
+#endif
+
+// Now, find the index of the 16B vector the match was discovered in. If CNTLZW is used
+// to determine the offset into the 16B vector, it will overcount by 16. Account for it here.
+foundat3:
+	SUB	R3,R8,R3
+	ADD	$48+ADJUST_FOR_CNTLZW,R3
+	BR	vfound
+foundat2:
+	SUB	R3,R8,R3
+	ADD	$32+ADJUST_FOR_CNTLZW,R3
+	BR	vfound
+foundat1:
+	SUB	R3,R8,R3
+	ADD	$16+ADJUST_FOR_CNTLZW,R3
+	BR	vfound
+foundat0:
+	SUB	R3,R8,R3
+	ADD	$0+ADJUST_FOR_CNTLZW,R3
+vfound:
+	// Map equal values into a 16 bit value with earlier matches setting higher bits.
+#ifndef GOPPC64_power9
+	VBPERMQ	V6,V0,V6
+	MFVRD	V6,R4
+	CNTLZW	R4,R4
+#else
+#ifdef GOARCH_ppc64le
+	// Put the value back into LE ordering by swapping doublewords.
+	XXPERMDI	V6,V6,$2,V6
+#endif
+	_VCZBEBB	V6,R4
+#endif
+	ADD	R3,R4,R3
+	RET
+
+cmp16:	// Length 16 - 31
+	CMPU	R4,$16
+	ADD	R4,R3,R9
+	BLT	cmp8
+
+	ADD	$-16,R9,R9	// &s[len(s)-16]
+
+	// Bytes 0 - 15
+	LXVD2X	(R0)(R3),V2
+	VCMPEQUBCC	V2,V1,V6
+	MOVD	R3,R8
+	BNE	CR6,foundat0	// Match found at R8, jump out
+
+	BEQ	notfound
+
+	// Bytes 16 - 30
+	MOVD	R9,R8		// R9 holds the final check.
+	LXVD2X	(R0)(R9),V2
+	VCMPEQUBCC	V2,V1,V6
+	BNE	CR6,foundat0	// Match found at R8, jump out
+
+	BR	notfound
+
+
+cmp8:	// Length 8 - 15
+#ifdef GOPPC64_power10
+	// Load all the bytes into a single VSR in BE order.
+	SLD	$56,R4,R5
+	LXVLL	R3,R5,V2
+	// Compare and count the number which don't match.
+	VCMPEQUB	V2,V1,V6
+	VCLZLSBB	V6,R3
+	// If the count is the number of bytes or more, no match was found.
+	CMPU	R3,R4
+	MOVD	$-1,R5
+	// Otherwise, the count is the index of the first match.
+	ISEL	CR0LT,R3,R5,R3
+	RET
+#else
+	RLDIMI	$8,R5,$48,R5	// Replicate the byte across the register.
+	RLDIMI	$16,R5,$32,R5
+	RLDIMI	$32,R5,$0,R5
+	CMPU	R4,$8
+	BLT	cmp4
+	MOVD	$-8,R11
+	ADD	$-8,R4,R4
+
+	_LDBEX	(R0)(R3),R10
+	_LDBEX	(R11)(R9),R11
+	CMPB	R10,R5,R10
+	CMPB	R11,R5,R11
+	CMPU	R10,$0
+	CMPU	R11,$0,CR1
+	CNTLZD	R10,R10
+	CNTLZD	R11,R11
+	SRD	$3,R10,R3
+	SRD	$3,R11,R11
+	BNE	found
+
+	ADD	R4,R11,R4
+	MOVD	$-1,R3
+	ISEL	CR1EQ,R3,R4,R3
+	RET
+
+cmp4:	// Length 4 - 7
+	CMPU	R4,$4
+	BLT	cmp2
+	MOVD	$-4,R11
+	ADD	$-4,R4,R4
+
+	_LWBEX	(R0)(R3),R10
+	_LWBEX	(R11)(R9),R11
+	CMPB	R10,R5,R10
+	CMPB	R11,R5,R11
+	CNTLZW	R10,R10
+	CNTLZW	R11,R11
+	CMPU	R10,$32
+	CMPU	R11,$32,CR1
+	SRD	$3,R10,R3
+	SRD	$3,R11,R11
+	BNE	found
+
+	ADD	R4,R11,R4
+	MOVD	$-1,R3
+	ISEL	CR1EQ,R3,R4,R3
+	RET
+
+cmp2:	// Length 2 - 3
+	CMPU	R4,$2
+	BLT	cmp1
+
+	_LHBEX	(R0)(R3),R10
+	CMPB	R10,R5,R10
+	SLDCC	$48,R10,R10
+	CNTLZD	R10,R10
+	SRD	$3,R10,R3
+	BNE	found
+
+cmp1:	// Length 1
+	MOVD	$-1,R3
+	ANDCC	$1,R4,R31
+	BEQ	found
+
+	MOVBZ	-1(R9),R10
+	CMPB	R10,R5,R10
+	ANDCC	$1,R10
+	ADD	$-1,R4
+	ISEL	CR0EQ,R3,R4,R3
+
found:
+	RET
+#endif
+
+notfound:
+	MOVD	$-1,R3
+	RET
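For readers skimming the diff, a minimal pure-Go sketch of the contract indexbytebody implements may be useful: return the index of the first occurrence of the byte, or -1 when it is absent. bytes.IndexByte and strings.IndexByte are the exported functions that ultimately dispatch to these routines; the helper name below is illustrative, not part of the package.

	package main

	import "fmt"

	// indexByte is a reference sketch of indexbytebody's contract: the index
	// of the first occurrence of c in b, or -1 when c does not occur.
	func indexByte(b []byte, c byte) int {
		for i, x := range b {
			if x == c {
				return i
			}
		}
		return -1
	}

	func main() {
		fmt.Println(indexByte([]byte("hello gopher"), 'g')) // 6
		fmt.Println(indexByte([]byte("hello gopher"), 'z')) // -1
	}

The assembly exists purely to compute this result 16-64 bytes at a time with VSX loads instead of one byte per iteration.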
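The least obvious step is the pre-power9 vfound path: VCMPEQUBCC sets each matching byte lane of V6 to 0xFF, VBPERMQ with the indexbytevbperm constant gathers one bit per lane into a 16-bit mask whose higher bits correspond to earlier bytes, and CNTLZW on that mask gives the offset of the first match, overcounting by 16 because the mask sits in the low half of a 32-bit register (the ADJUST_FOR_CNTLZW correction folded into the foundatN labels). The following is a pure-Go model of that arithmetic, written as an illustration of the bit trick rather than a transcription of the vector code:

	package main

	import (
		"fmt"
		"math/bits"
	)

	// firstMatchOffset models the pre-power9 vfound math for one 16-byte
	// chunk: build a 16-bit mask with earlier matches in higher bits, then
	// count leading zeros and subtract the 16 extra zeros CNTLZW sees in a
	// 32-bit register (the ADJUST_FOR_CNTLZW correction).
	func firstMatchOffset(chunk [16]byte, c byte) int {
		var mask uint32
		for i, x := range chunk {
			if x == c {
				mask |= 1 << (15 - i) // earlier byte => higher bit
			}
		}
		if mask == 0 {
			return -1 // no match anywhere in this chunk
		}
		return bits.LeadingZeros32(mask) - 16
	}

	func main() {
		chunk := [16]byte{'a', 'b', 'c', 'b'}
		fmt.Println(firstMatchOffset(chunk, 'b')) // 1 (first of the two matches)
		fmt.Println(firstMatchOffset(chunk, 'z')) // -1
	}

On power9 and newer the same answer comes directly from VCTZLSBB/VCLZLSBB (the _VCZBEBB macro), which is why ADJUST_FOR_CNTLZW is 0 on that path.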