diff options
Diffstat (limited to 'src/internal/bytealg/count_ppc64x.s')
-rw-r--r-- | src/internal/bytealg/count_ppc64x.s | 154 |
1 file changed, 154 insertions, 0 deletions
diff --git a/src/internal/bytealg/count_ppc64x.s b/src/internal/bytealg/count_ppc64x.s new file mode 100644 index 0000000..55e02ce --- /dev/null +++ b/src/internal/bytealg/count_ppc64x.s @@ -0,0 +1,154 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build ppc64le || ppc64 + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·Count<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40 + // R3 = byte array pointer + // R4 = length + // R6 = byte to count + MTVRD R6, V1 // move compare byte + MOVD R6, R5 // keep the byte in R5 too; the scalar (pre-P10) tail in countbytebody compares against R5 + VSPLTB $7, V1, V1 // replicate byte across V1 + BR countbytebody<>(SB) + +TEXT ·CountString<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-32 + // R3 = byte array pointer + // R4 = length + // R5 = byte to count + MTVRD R5, V1 // move compare byte + VSPLTB $7, V1, V1 // replicate byte across V1 + BR countbytebody<>(SB) + +// R3: addr of string +// R4: len of string +// R5: byte to count +// V1: byte to count, splatted. +// On exit: +// R3: return value +TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0 + MOVD $0, R18 // byte count + +#ifndef GOPPC64_power10 + RLDIMI $8, R5, $48, R5 + RLDIMI $16, R5, $32, R5 + RLDIMI $32, R5, $0, R5 // fill reg with the byte to count +#endif + + CMPU R4, $32 // Check if it's a small string (<32 bytes) + BLT tail // Jump to the small string case + SRD $5, R4, R20 // R20 = len / 32 = main loop iterations + MOVD R20, CTR + MOVD $16, R21 // byte offset of the second 16B vector load + XXLXOR V4, V4, V4 // zero the two match accumulators + XXLXOR V5, V5, V5 + + PCALIGN $16 +cmploop: + LXVD2X (R0)(R3), V0 // Count 32B per loop with two vector accumulators. + LXVD2X (R21)(R3), V2 + VCMPEQUB V2, V1, V2 + VCMPEQUB V0, V1, V0 + VPOPCNTD V2, V2 // A match is 0xFF or 0. Count the bits into doubleword buckets. + VPOPCNTD V0, V0 + VADDUDM V0, V4, V4 // Accumulate the popcounts. They are 8x the count. + VADDUDM V2, V5, V5 // The count will be fixed up afterwards. 
 + ADD $32, R3 + BDNZ cmploop + + VADDUDM V4, V5, V5 // merge the two accumulators + MFVSRD V5, R18 // extract low doubleword of the summed bit count + VSLDOI $8, V5, V5, V5 // rotate the high doubleword into the low position + MFVSRD V5, R21 + ADD R21, R18, R18 // R18 = total matched bits (8x the byte count) + ANDCC $31, R4, R4 // R4 = remaining 0-31 tail bytes + // Skip the tail processing if no bytes remaining. + BEQ tail_0 + +#ifdef GOPPC64_power10 + SRD $3, R18, R18 // Fix the vector loop count before counting the tail on P10. + +tail: // Count the last 0 - 31 bytes. + CMP R4, $16 + BLE small_tail_p10 + LXV 0(R3), V0 + VCMPEQUB V0, V1, V0 + VCNTMBB V0, $1, R14 // Sum the value of bit 0 of each byte of the compare into R14. + SRD $56, R14, R14 // The result of VCNTMBB is shifted. Unshift it. + ADD R14, R18, R18 + ADD $16, R3, R3 + ANDCC $15, R4, R4 + +small_tail_p10: + SLD $56, R4, R6 + LXVLL R3, R6, V0 + VCMPEQUB V0, V1, V0 + VCLRRB V0, R4, V0 // If <16B being compared, clear matches of the 16-R4 bytes. + VCNTMBB V0, $1, R14 // Sum the value of bit 0 of each byte of the compare into R14. + SRD $56, R14, R14 // The result of VCNTMBB is shifted. Unshift it. + ADD R14, R18, R3 + RET + +#else +tail: // Count the last 0 - 31 bytes. + CMP R4, $16 + BLT tail_8 + MOVD (R3), R12 + MOVD 8(R3), R14 + CMPB R12, R5, R12 // 0xFF in each byte of R12 that matches the byte in R5 + CMPB R14, R5, R14 + POPCNTD R12, R12 + POPCNTD R14, R14 + ADD R12, R18, R18 + ADD R14, R18, R18 + ADD $16, R3, R3 + ADD $-16, R4, R4 + +tail_8: // Count the remaining 0 - 15 bytes. + CMP R4, $8 + BLT tail_4 + MOVD (R3), R12 + CMPB R12, R5, R12 + POPCNTD R12, R12 + ADD R12, R18, R18 + ADD $8, R3, R3 + ADD $-8, R4, R4 + +tail_4: // Count the remaining 0 - 7 bytes. + CMP R4, $4 + BLT tail_2 + MOVWZ (R3), R12 + CMPB R12, R5, R12 + SLD $32, R12, R12 // Remove non-participating matches. + POPCNTD R12, R12 + ADD R12, R18, R18 + ADD $4, R3, R3 + ADD $-4, R4, R4 + +tail_2: // Count the remaining 0 - 3 bytes. + CMP R4, $2 + BLT tail_1 + MOVHZ (R3), R12 + CMPB R12, R5, R12 + SLD $48, R12, R12 // Remove non-participating matches. + POPCNTD R12, R12 + ADD R12, R18, R18 + ADD $2, R3, R3 + ADD $-2, R4, R4 + +tail_1: // Count the remaining 0 - 1 bytes. 
 + CMP R4, $1 + BLT tail_0 + MOVBZ (R3), R12 + CMPB R12, R5, R12 + ANDCC $0x8, R12, R12 // match mask is 0xFF or 0; keep one bit so a match adds 8, fixed up by the final SRD $3 + ADD R12, R18, R18 +#endif + +tail_0: // No remaining tail to count. + SRD $3, R18, R3 // Fixup count, it is off by 8x. + RET |