summary | refs | log | tree | commit | diff | stats
path: root/src/internal/bytealg/indexbyte_ppc64x.s
diff options
context:
space:
mode:
Diffstat (limited to 'src/internal/bytealg/indexbyte_ppc64x.s')
-rw-r--r--  src/internal/bytealg/indexbyte_ppc64x.s  314
1 files changed, 314 insertions, 0 deletions
diff --git a/src/internal/bytealg/indexbyte_ppc64x.s b/src/internal/bytealg/indexbyte_ppc64x.s
new file mode 100644
index 0000000..b6714f4
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_ppc64x.s
@@ -0,0 +1,314 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build ppc64 || ppc64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// func IndexByte(b []byte, c byte) int
+// Thin trampoline into the shared search body. Under ABIInternal the slice
+// arrives as R3=ptr, R4=len (R5 presumably holds cap and the byte arg lands
+// in R6 — layout inferred from the MOVD below; confirm against the internal
+// ABI register assignment). The cap is dead here, so R5 is repurposed.
+TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
+ // R3 = byte array pointer
+ // R4 = length
+ MOVD R6, R5 // R5 = byte to search for (body expects it in R5)
+ BR indexbytebody<>(SB)
+
+// func IndexByteString(s string, c byte) int
+// A string has no cap word, so all three inputs are already in the
+// registers the shared body expects; tail-branch directly into it.
+TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32
+ // R3 = string
+ // R4 = length
+ // R5 = byte
+ BR indexbytebody<>(SB)
+
+// Pre-power9 only: VBPERMQ selection constant used in vfound to compress the
+// 16 per-byte compare results into a 16-bit mask (earlier bytes -> higher
+// bits). The two variants are byte-reversed so the mask orders match the
+// load element order on each endianness.
+#ifndef GOPPC64_power9
+#ifdef GOARCH_ppc64le
+DATA indexbytevbperm<>+0(SB)/8, $0x3830282018100800
+DATA indexbytevbperm<>+8(SB)/8, $0x7870686058504840
+#else
+DATA indexbytevbperm<>+0(SB)/8, $0x0008101820283038
+DATA indexbytevbperm<>+8(SB)/8, $0x4048505860687078
+#endif
+GLOBL indexbytevbperm<>+0(SB), RODATA, $16
+#endif
+
+// Some operations are endian specific, choose the correct opcode based on GOARCH.
+// Note, _VCZBEBB is only available on power9 and newer.
+#ifdef GOARCH_ppc64le
+#define _LDBEX MOVDBR
+#define _LWBEX MOVWBR
+#define _LHBEX MOVHBR
+#define _VCZBEBB VCTZLSBB
+#else
+#define _LDBEX MOVD
+#define _LWBEX MOVW
+#define _LHBEX MOVH
+#define _VCZBEBB VCLZLSBB
+#endif
+
+// R3 = addr of string
+// R4 = len of string
+// R5 = byte to find
+// On exit:
+// R3 = return value (index of first match, or -1 if the byte is not found)
+//
+// Strategy: splat the target byte across a vector register, then compare
+// 16 bytes at a time with VCMPEQUBCC (CR6 records "any equal"). Lengths
+// >= 32 use 64B/32B unrolled paths; lengths < 16 fall back to scalar CMPB
+// tricks. Short tails are handled with overlapping re-reads of the final
+// 16 bytes rather than byte loops.
+TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
+ CMPU R4,$32
+
+#ifndef GOPPC64_power9
+ // Load VBPERMQ constant to reduce compare into an ordered bit mask.
+ MOVD $indexbytevbperm<>+00(SB),R16
+ LXVD2X (R16),V0 // Set up swap string
+#endif
+
+ MTVRD R5,V1
+ VSPLTB $7,V1,V1 // Replicate byte across V1
+
+ BLT cmp16 // Jump to the small string case if it's <32 bytes.
+
+ CMP R4,$64,CR1
+ MOVD $16,R11
+ MOVD R3,R8
+ BLT CR1,cmp32 // Special case for length 32 - 63
+ MOVD $32,R12
+ MOVD $48,R6
+
+ RLDICR $0,R4,$63-6,R9 // R9 = len &^ 63
+ ADD R3,R9,R9 // R9 = &s[len &^ 63]
+ ANDCC $63,R4 // (len &= 63) cmp 0.
+
+ PCALIGN $16
+loop64:
+ LXVD2X (R0)(R8),V2 // Scan 64 bytes at a time, starting at &s[0]
+ VCMPEQUBCC V2,V1,V6
+ BNE CR6,foundat0 // Match found at R8, jump out
+
+ LXVD2X (R11)(R8),V2
+ VCMPEQUBCC V2,V1,V6
+ BNE CR6,foundat1 // Match found at R8+16 bytes, jump out
+
+ LXVD2X (R12)(R8),V2
+ VCMPEQUBCC V2,V1,V6
+ BNE CR6,foundat2 // Match found at R8+32 bytes, jump out
+
+ LXVD2X (R6)(R8),V2
+ VCMPEQUBCC V2,V1,V6
+ BNE CR6,foundat3 // Match found at R8+48 bytes, jump out
+
+ ADD $64,R8
+ CMPU R8,R9,CR1
+ BNE CR1,loop64 // R8 != &s[len &^ 63]?
+
+ PCALIGN $32
+ BEQ notfound // Is tail length 0? CR0 is set before entering loop64.
+
+ CMP R4,$32 // Tail length >= 32, use cmp32 path.
+ CMP R4,$16,CR1
+ BGE cmp32
+
+ ADD R8,R4,R9
+ ADD $-16,R9 // R9 = address of the final (possibly overlapping) 16B read
+ BLE CR1,cmp64_tail_gt0
+
+cmp64_tail_gt16: // Tail length 17 - 32
+ LXVD2X (R0)(R8),V2
+ VCMPEQUBCC V2,V1,V6
+ BNE CR6,foundat0
+
+cmp64_tail_gt0: // Tail length 1 - 16
+ MOVD R9,R8
+ LXVD2X (R0)(R9),V2
+ VCMPEQUBCC V2,V1,V6
+ BNE CR6,foundat0
+
+ BR notfound
+
+cmp32: // Length 32 - 63
+
+ // Bytes 0 - 15
+ LXVD2X (R0)(R8),V2
+ VCMPEQUBCC V2,V1,V6
+ BNE CR6,foundat0
+
+ // Bytes 16 - 31
+ LXVD2X (R8)(R11),V2
+ VCMPEQUBCC V2,V1,V6
+ BNE CR6,foundat1 // Match found at R8+16 bytes, jump out
+
+ BEQ notfound // Is length <= 32? (CR0 holds this comparison on entry to cmp32)
+ CMP R4,$48
+
+ ADD R4,R8,R9 // Compute &s[len(s)-16]
+ ADD $32,R8,R8
+ ADD $-16,R9,R9
+ ISEL CR0GT,R8,R9,R8 // R8 = len(s) <= 48 ? R9 : R8
+
+ // Bytes 32 - 47 (or the final 16 bytes when len <= 48)
+ LXVD2X (R0)(R8),V2
+ VCMPEQUBCC V2,V1,V6
+ BNE CR6,foundat0 // Match found at R8, jump out
+
+ BLE notfound
+
+ // Bytes 48 - 63
+ MOVD R9,R8 // R9 holds the final check.
+ LXVD2X (R0)(R9),V2
+ VCMPEQUBCC V2,V1,V6
+ BNE CR6,foundat0 // Match found at R8, jump out
+
+ BR notfound
+
+// If ISA 3.0 instructions are unavailable, we need to account for the extra 16 added by CNTLZW.
+#ifndef GOPPC64_power9
+#define ADJUST_FOR_CNTLZW -16
+#else
+#define ADJUST_FOR_CNTLZW 0
+#endif
+
+// Now, find the index of the 16B vector the match was discovered in. If CNTLZW is used
+// to determine the offset into the 16B vector, it will overcount by 16. Account for it here.
+// On entry: R8 = address of the 16B chunk that matched (plus foundatN offset), V6 = compare mask.
+foundat3:
+ SUB R3,R8,R3
+ ADD $48+ADJUST_FOR_CNTLZW,R3
+ BR vfound
+foundat2:
+ SUB R3,R8,R3
+ ADD $32+ADJUST_FOR_CNTLZW,R3
+ BR vfound
+foundat1:
+ SUB R3,R8,R3
+ ADD $16+ADJUST_FOR_CNTLZW,R3
+ BR vfound
+foundat0:
+ SUB R3,R8,R3
+ ADD $0+ADJUST_FOR_CNTLZW,R3
+vfound:
+ // Map equal values into a 16 bit value with earlier matches setting higher bits.
+#ifndef GOPPC64_power9
+ VBPERMQ V6,V0,V6
+ MFVRD V6,R4
+ CNTLZW R4,R4 // Leading zeros = index of first match within the 16B chunk (+16, adjusted above).
+#else
+#ifdef GOARCH_ppc64le
+ // Put the value back into LE ordering by swapping doublewords.
+ XXPERMDI V6,V6,$2,V6
+#endif
+ _VCZBEBB V6,R4 // Count bytes before the first match directly (ISA 3.0).
+#endif
+ ADD R3,R4,R3
+ RET
+
+cmp16: // Length 16 - 31
+ CMPU R4,$16
+ ADD R4,R3,R9
+ BLT cmp8
+
+ ADD $-16,R9,R9 // &s[len(s)-16]
+
+ // Bytes 0 - 15
+ LXVD2X (R0)(R3),V2
+ VCMPEQUBCC V2,V1,V6
+ MOVD R3,R8
+ BNE CR6,foundat0 // Match found at R8, jump out
+
+ BEQ notfound // Length is exactly 16 and no match (CR0 set by CMPU above).
+
+ // Final 16 bytes (overlaps the first read when len < 32)
+ MOVD R9,R8 // R9 holds the final check.
+ LXVD2X (R0)(R9),V2
+ VCMPEQUBCC V2,V1,V6
+ BNE CR6,foundat0 // Match found at R8, jump out
+
+ BR notfound
+
+
+cmp8: // Length 8 - 15
+#ifdef GOPPC64_power10
+ // Load all the bytes into a single VSR in BE order.
+ SLD $56,R4,R5
+ LXVLL R3,R5,V2 // Length-controlled load; never reads past &s[len].
+ // Compare and count the number which don't match.
+ VCMPEQUB V2,V1,V6
+ VCLZLSBB V6,R3
+ // If count is the number of bytes, or more. No matches are found.
+ CMPU R3,R4
+ MOVD $-1,R5
+ // Otherwise, the count is the index of the first match.
+ ISEL CR0LT,R3,R5,R3
+ RET
+#else
+ RLDIMI $8,R5,$48,R5 // Replicating the byte across the register.
+ RLDIMI $16,R5,$32,R5
+ RLDIMI $32,R5,$0,R5
+ CMPU R4,$8
+ BLT cmp4
+ MOVD $-8,R11
+ ADD $-8,R4,R4
+
+ // Two overlapping 8B big-endian loads cover the whole 8-15B range.
+ _LDBEX (R0)(R3),R10
+ _LDBEX (R11)(R9),R11
+ CMPB R10,R5,R10 // 0xFF in each byte position that matches.
+ CMPB R11,R5,R11
+ CMPU R10,$0
+ CMPU R11,$0,CR1
+ CNTLZD R10,R10
+ CNTLZD R11,R11
+ SRD $3,R10,R3 // Bit index -> byte index.
+ SRD $3,R11,R11
+ BNE found // Match in the first 8 bytes: R3 already holds the index.
+
+ ADD R4,R11,R4 // Index within second load + (len-8) = index in s.
+ MOVD $-1,R3
+ ISEL CR1EQ,R3,R4,R3 // -1 if the second load had no match either.
+ RET
+
+cmp4: // Length 4 - 7
+ CMPU R4,$4
+ BLT cmp2
+ MOVD $-4,R11
+ ADD $-4,R4,R4
+
+ // Two overlapping 4B loads, same scheme as cmp8.
+ _LWBEX (R0)(R3),R10
+ _LWBEX (R11)(R9),R11
+ CMPB R10,R5,R10
+ CMPB R11,R5,R11
+ CNTLZW R10,R10
+ CNTLZW R11,R11
+ CMPU R10,$32 // CNTLZW == 32 means no match in the word.
+ CMPU R11,$32,CR1
+ SRD $3,R10,R3
+ SRD $3,R11,R11
+ BNE found
+
+ ADD R4,R11,R4
+ MOVD $-1,R3
+ ISEL CR1EQ,R3,R4,R3
+ RET
+
+cmp2: // Length 2 - 3
+ CMPU R4,$2
+ BLT cmp1
+
+ _LHBEX (R0)(R3),R10
+ CMPB R10,R5,R10
+ SLDCC $48,R10,R10 // Keep only the two loaded bytes; sets CR0 for the BNE below.
+ CNTLZD R10,R10
+ SRD $3,R10,R3
+ BNE found
+
+cmp1: // Length 1 (only reached when no match yet; checks the final byte if len is odd)
+ MOVD $-1,R3
+ ANDCC $1,R4,R31
+ BEQ found // Even length: final byte was already covered above.
+
+ MOVBZ -1(R9),R10 // R9 = &s[len]; load the last byte.
+ CMPB R10,R5,R10
+ ANDCC $1,R10
+ ADD $-1,R4
+ ISEL CR0EQ,R3,R4,R3 // len-1 on match, else -1.
+
+found:
+ RET
+#endif
+
+notfound:
+ MOVD $-1,R3
+ RET
+
+