diff options
Diffstat (limited to 'src/internal/bytealg/count_ppc64x.s')
-rw-r--r-- | src/internal/bytealg/count_ppc64x.s | 154 |
1 file changed, 154 insertions, 0 deletions
diff --git a/src/internal/bytealg/count_ppc64x.s b/src/internal/bytealg/count_ppc64x.s new file mode 100644 index 0000000..55e02ce --- /dev/null +++ b/src/internal/bytealg/count_ppc64x.s @@ -0,0 +1,154 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build ppc64le || ppc64 + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·Count<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40 + // R3 = byte array pointer + // R4 = length + // R6 = byte to count + MTVRD R6, V1 // move compare byte + MOVD R6, R5 // keep the byte in R5 too; the scalar (pre-P10) tail in countbytebody compares against R5 + VSPLTB $7, V1, V1 // replicate byte across V1 + BR countbytebody<>(SB) + +TEXT ·CountString<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-32 + // R3 = byte array pointer + // R4 = length + // R5 = byte to count + MTVRD R5, V1 // move compare byte + VSPLTB $7, V1, V1 // replicate byte across V1 + BR countbytebody<>(SB) + +// R3: addr of string +// R4: len of string +// R5: byte to count +// V1: byte to count, splatted. +// On exit: +// R3: return value +TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0 + MOVD $0, R18 // byte count + +#ifndef GOPPC64_power10 + RLDIMI $8, R5, $48, R5 + RLDIMI $16, R5, $32, R5 + RLDIMI $32, R5, $0, R5 // fill reg with the byte to count +#endif + + CMPU R4, $32 // Check if it's a small string (<32 bytes) + BLT tail // Jump to the small string case + SRD $5, R4, R20 // R20 = len / 32 = main loop iterations + MOVD R20, CTR + MOVD $16, R21 // byte offset of the second 16B vector load + XXLXOR V4, V4, V4 // zero the two match accumulators + XXLXOR V5, V5, V5 + + PCALIGN $16 +cmploop: + LXVD2X (R0)(R3), V0 // Count 32B per loop with two vector accumulators. + LXVD2X (R21)(R3), V2 + VCMPEQUB V2, V1, V2 + VCMPEQUB V0, V1, V0 + VPOPCNTD V2, V2 // A match is 0xFF or 0. Count the bits into doubleword buckets. + VPOPCNTD V0, V0 + VADDUDM V0, V4, V4 // Accumulate the popcounts. They are 8x the count. + VADDUDM V2, V5, V5 // The count will be fixed up afterwards. 
 + ADD $32, R3 + BDNZ cmploop + + VADDUDM V4, V5, V5 // merge the two accumulators + MFVSRD V5, R18 // extract low doubleword of the summed bit count + VSLDOI $8, V5, V5, V5 // rotate the high doubleword into the low position + MFVSRD V5, R21 + ADD R21, R18, R18 // R18 = total matched bits (8x the byte count) + ANDCC $31, R4, R4 // R4 = remaining 0-31 tail bytes + // Skip the tail processing if no bytes remaining. + BEQ tail_0 + +#ifdef GOPPC64_power10 + SRD $3, R18, R18 // Fix the vector loop count before counting the tail on P10. + +tail: // Count the last 0 - 31 bytes. + CMP R4, $16 + BLE small_tail_p10 + LXV 0(R3), V0 + VCMPEQUB V0, V1, V0 + VCNTMBB V0, $1, R14 // Sum the value of bit 0 of each byte of the compare into R14. + SRD $56, R14, R14 // The result of VCNTMBB is shifted. Unshift it. + ADD R14, R18, R18 + ADD $16, R3, R3 + ANDCC $15, R4, R4 + +small_tail_p10: + SLD $56, R4, R6 + LXVLL R3, R6, V0 + VCMPEQUB V0, V1, V0 + VCLRRB V0, R4, V0 // If <16B being compared, clear matches of the 16-R4 bytes. + VCNTMBB V0, $1, R14 // Sum the value of bit 0 of each byte of the compare into R14. + SRD $56, R14, R14 // The result of VCNTMBB is shifted. Unshift it. + ADD R14, R18, R3 + RET + +#else +tail: // Count the last 0 - 31 bytes. + CMP R4, $16 + BLT tail_8 + MOVD (R3), R12 + MOVD 8(R3), R14 + CMPB R12, R5, R12 // 0xFF in each byte of R12 that matches the byte in R5 + CMPB R14, R5, R14 + POPCNTD R12, R12 + POPCNTD R14, R14 + ADD R12, R18, R18 + ADD R14, R18, R18 + ADD $16, R3, R3 + ADD $-16, R4, R4 + +tail_8: // Count the remaining 0 - 15 bytes. + CMP R4, $8 + BLT tail_4 + MOVD (R3), R12 + CMPB R12, R5, R12 + POPCNTD R12, R12 + ADD R12, R18, R18 + ADD $8, R3, R3 + ADD $-8, R4, R4 + +tail_4: // Count the remaining 0 - 7 bytes. + CMP R4, $4 + BLT tail_2 + MOVWZ (R3), R12 + CMPB R12, R5, R12 + SLD $32, R12, R12 // Remove non-participating matches. + POPCNTD R12, R12 + ADD R12, R18, R18 + ADD $4, R3, R3 + ADD $-4, R4, R4 + +tail_2: // Count the remaining 0 - 3 bytes. + CMP R4, $2 + BLT tail_1 + MOVHZ (R3), R12 + CMPB R12, R5, R12 + SLD $48, R12, R12 // Remove non-participating matches. + POPCNTD R12, R12 + ADD R12, R18, R18 + ADD $2, R3, R3 + ADD $-2, R4, R4 + +tail_1: // Count the remaining 0 - 1 bytes. 
 + CMP R4, $1 + BLT tail_0 + MOVBZ (R3), R12 + CMPB R12, R5, R12 + ANDCC $0x8, R12, R12 // match mask is 0xFF or 0; keep one bit so a match adds 8, fixed up by the final SRD $3 + ADD R12, R18, R18 +#endif + +tail_0: // No remaining tail to count. + SRD $3, R18, R3 // Fixup count, it is off by 8x. + RET |