summary refs log tree commit diff stats
path: root/src/internal/bytealg/count_ppc64x.s
diff options
context:
space:
mode:
Diffstat (limited to 'src/internal/bytealg/count_ppc64x.s')
-rw-r--r-- src/internal/bytealg/count_ppc64x.s 154
1 files changed, 154 insertions, 0 deletions
diff --git a/src/internal/bytealg/count_ppc64x.s b/src/internal/bytealg/count_ppc64x.s
new file mode 100644
index 0000000..55e02ce
--- /dev/null
+++ b/src/internal/bytealg/count_ppc64x.s
@@ -0,0 +1,154 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build ppc64le || ppc64
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Count<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
+ // In: R3 = pointer to the bytes to scan
+ // R4 = number of bytes to scan
+ // R6 = byte value to count
+ MOVD R6, R5 // countbytebody takes the byte in R5
+ MTVRD R6, V1 // place the byte in V1 ...
+ VSPLTB $7, V1, V1 // ... and splat it into every byte of V1
+ BR countbytebody<>(SB) // shared body produces the result in R3
+
+TEXT ·CountString<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-32
+ // R3 = string data pointer
+ // R4 = length
+ // R5 = byte to count (already in the register countbytebody expects)
+ MTVRD R5, V1 // move compare byte
+ VSPLTB $7, V1, V1 // replicate byte across V1
+ BR countbytebody<>(SB) // shared body produces the result in R3
+
+// countbytebody counts occurrences of one byte value in a buffer.
+// In: R3 = addr of string, R4 = len of string, R5 = byte to count,
+// V1 = byte to count, splatted.
+// Out: R3 = return value (number of matching bytes).
+// Scratch: R4-R6, R12, R14, R18, R20, R21, CTR, CR0, V0, V2, V4, V5
+// (the exact set depends on GOPPC64_power10).
+TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
+ MOVD $0, R18 // R18 = running match count (scaled 8x on the popcount paths)
+
+#ifndef GOPPC64_power10
+ RLDIMI $8, R5, $48, R5 // copy the low byte into the next byte up
+ RLDIMI $16, R5, $32, R5 // copy the low halfword into the next halfword up
+ RLDIMI $32, R5, $0, R5 // fill reg with the byte to count (used by CMPB in the tail)
+#endif
+
+ CMPU R4, $32 // Check if it's a small string (<32 bytes)
+ BLT tail // Jump to the small string case
+ SRD $5, R4, R20 // R20 = len / 32 = number of 32B loop iterations
+ MOVD R20, CTR // BDNZ counts iterations in CTR
+ MOVD $16, R21 // index of the second 16B load in each iteration
+ XXLXOR V4, V4, V4 // zero the accumulator for the first 16B
+ XXLXOR V5, V5, V5 // zero the accumulator for the second 16B
+
+ PCALIGN $16 // align the loop entry
+cmploop:
+ LXVD2X (R0)(R3), V0 // Count 32B per loop with two vector accumulators.
+ LXVD2X (R21)(R3), V2 // second 16B, at R3+16
+ VCMPEQUB V2, V1, V2 // each byte becomes 0xFF if equal to the target, else 0
+ VCMPEQUB V0, V1, V0
+ VPOPCNTD V2, V2 // A match is 0xFF or 0. Count the bits into doubleword buckets.
+ VPOPCNTD V0, V0
+ VADDUDM V0, V4, V4 // Accumulate the popcounts. They are 8x the count.
+ VADDUDM V2, V5, V5 // The count will be fixed up afterwards.
+ ADD $32, R3 // advance past the 32B just counted
+ BDNZ cmploop // decrement CTR and loop while it is nonzero
+
+ VADDUDM V4, V5, V5 // merge the two accumulators
+ MFVSRD V5, R18 // pull out one doubleword of the sum
+ VSLDOI $8, V5, V5, V5 // rotate by 8 bytes to expose the other doubleword
+ MFVSRD V5, R21 // pull out the other doubleword
+ ADD R21, R18, R18 // R18 = 8x the matches found by the loop
+ ANDCC $31, R4, R4 // R4 = bytes left over (len mod 32); sets CR0 for the BEQ
+ // Skip the tail processing if no bytes remaining.
+ BEQ tail_0
+
+#ifdef GOPPC64_power10
+ SRD $3, R18, R18 // Fix the vector loop count before counting the tail on P10.
+
+tail: // Count the last 0 - 31 bytes.
+ CMP R4, $16
+ BLE small_tail_p10 // 16 or fewer bytes: one length-limited load is enough
+ LXV 0(R3), V0 // more than 16 left: count a full 16B first
+ VCMPEQUB V0, V1, V0
+ VCNTMBB V0, $1, R14 // Sum the value of bit 0 of each byte of the compare into R14.
+ SRD $56, R14, R14 // The result of VCNTMBB is shifted. Unshift it.
+ ADD R14, R18, R18
+ ADD $16, R3, R3 // advance past the 16B just counted
+ ANDCC $15, R4, R4 // R4 = bytes still left after that 16B
+
+small_tail_p10:
+ SLD $56, R4, R6 // LXVLL reads the length from the top byte of its register
+ LXVLL R3, R6, V0 // load R4 bytes starting at R3
+ VCMPEQUB V0, V1, V0
+ VCLRRB V0, R4, V0 // If <16B being compared, clear matches of the 16-R4 bytes.
+ VCNTMBB V0, $1, R14 // Sum the value of bit 0 of each byte of the compare into R14.
+ SRD $56, R14, R14 // The result of VCNTMBB is shifted. Unshift it.
+ ADD R14, R18, R3 // R3 = final count; R18 is already unscaled on this path
+ RET
+
+#else
+tail: // Count the last 0 - 31 bytes.
+ CMP R4, $16
+ BLT tail_8 // fewer than 16 bytes left
+ MOVD (R3), R12 // load 16B as two doublewords
+ MOVD 8(R3), R14
+ CMPB R12, R5, R12 // each byte becomes 0xFF if equal to the target, else 0
+ CMPB R14, R5, R14
+ POPCNTD R12, R12 // 8 set bits per matching byte
+ POPCNTD R14, R14
+ ADD R12, R18, R18 // accumulate, still scaled 8x
+ ADD R14, R18, R18
+ ADD $16, R3, R3 // advance the pointer
+ ADD $-16, R4, R4 // and shrink the remaining length
+
+tail_8: // Count the remaining 0 - 15 bytes.
+ CMP R4, $8
+ BLT tail_4 // fewer than 8 bytes left
+ MOVD (R3), R12 // load 8B
+ CMPB R12, R5, R12
+ POPCNTD R12, R12
+ ADD R12, R18, R18
+ ADD $8, R3, R3
+ ADD $-8, R4, R4
+
+tail_4: // Count the remaining 0 - 7 bytes.
+ CMP R4, $4
+ BLT tail_2 // fewer than 4 bytes left
+ MOVWZ (R3), R12 // load 4B, zero-extended
+ CMPB R12, R5, R12 // the zero upper bytes can match too when counting 0x00
+ SLD $32, R12, R12 // Remove non-participating matches.
+ POPCNTD R12, R12
+ ADD R12, R18, R18
+ ADD $4, R3, R3
+ ADD $-4, R4, R4
+
+tail_2: // Count the remaining 0 - 3 bytes.
+ CMP R4, $2
+ BLT tail_1 // fewer than 2 bytes left
+ MOVHZ (R3), R12 // load 2B, zero-extended
+ CMPB R12, R5, R12 // the zero upper bytes can match too when counting 0x00
+ SLD $48, R12, R12 // Remove non-participating matches.
+ POPCNTD R12, R12
+ ADD R12, R18, R18
+ ADD $2, R3, R3
+ ADD $-2, R4, R4
+
+tail_1: // Count the remaining 0 - 1 bytes.
+ CMP R4, $1
+ BLT tail_0 // nothing left
+ MOVBZ (R3), R12 // load the last byte, zero-extended
+ CMPB R12, R5, R12
+ ANDCC $0x8, R12, R12 // keep one bit of the low byte's 0xFF/0: 8 on a match (8x scale), else 0
+ ADD R12, R18, R18
+#endif
+
+tail_0: // No remaining tail to count.
+ SRD $3, R18, R3 // Fixup count, it is off by 8x.
+ RET