diff options
Diffstat (limited to '')
-rw-r--r-- | src/internal/bytealg/count_amd64.s | 201 |
1 files changed, 201 insertions, 0 deletions
diff --git a/src/internal/bytealg/count_amd64.s b/src/internal/bytealg/count_amd64.s new file mode 100644 index 0000000..fa864c4 --- /dev/null +++ b/src/internal/bytealg/count_amd64.s @@ -0,0 +1,201 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·Count(SB),NOSPLIT,$0-40 + CMPB internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1 + JEQ 2(PC) + JMP ·countGeneric(SB) + MOVQ b_base+0(FP), SI + MOVQ b_len+8(FP), BX + MOVB c+24(FP), AL + LEAQ ret+32(FP), R8 + JMP countbody<>(SB) + +TEXT ·CountString(SB),NOSPLIT,$0-32 + CMPB internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1 + JEQ 2(PC) + JMP ·countGenericString(SB) + MOVQ s_base+0(FP), SI + MOVQ s_len+8(FP), BX + MOVB c+16(FP), AL + LEAQ ret+24(FP), R8 + JMP countbody<>(SB) + +// input: +// SI: data +// BX: data len +// AL: byte sought +// R8: address to put result +// This function requires the POPCNT instruction. +TEXT countbody<>(SB),NOSPLIT,$0 + // Shuffle X0 around so that each byte contains + // the character we're looking for. + MOVD AX, X0 + PUNPCKLBW X0, X0 + PUNPCKLBW X0, X0 + PSHUFL $0, X0, X0 + + CMPQ BX, $16 + JLT small + + MOVQ $0, R12 // Accumulator + + MOVQ SI, DI + + CMPQ BX, $32 + JA avx2 +sse: + LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes + JMP sseloopentry + +sseloop: + // Move the next 16-byte chunk of the data into X1. + MOVOU (DI), X1 + // Compare bytes in X0 to X1. + PCMPEQB X0, X1 + // Take the top bit of each byte in X1 and put the result in DX. + PMOVMSKB X1, DX + // Count number of matching bytes + POPCNTL DX, DX + // Accumulate into R12 + ADDQ DX, R12 + // Advance to next block. + ADDQ $16, DI +sseloopentry: + CMPQ DI, AX + JBE sseloop + + // Get the number of bytes to consider in the last 16 bytes + ANDQ $15, BX + JZ end + + // Create mask to ignore overlap between previous 16 byte block + // and the next. + MOVQ $16,CX + SUBQ BX, CX + MOVQ $0xFFFF, R10 + SARQ CL, R10 + SALQ CL, R10 + + // Process the last 16-byte chunk. This chunk may overlap with the + // chunks we've already searched so we need to mask part of it. + MOVOU (AX), X1 + PCMPEQB X0, X1 + PMOVMSKB X1, DX + // Apply mask + ANDQ R10, DX + POPCNTL DX, DX + ADDQ DX, R12 +end: + MOVQ R12, (R8) + RET + +// handle for lengths < 16 +small: + TESTQ BX, BX + JEQ endzero + + // Check if we'll load across a page boundary. + LEAQ 16(SI), AX + TESTW $0xff0, AX + JEQ endofpage + + // We must ignore high bytes as they aren't part of our slice. + // Create mask. + MOVB BX, CX + MOVQ $1, R10 + SALQ CL, R10 + SUBQ $1, R10 + + // Load data + MOVOU (SI), X1 + // Compare target byte with each byte in data. + PCMPEQB X0, X1 + // Move result bits to integer register. + PMOVMSKB X1, DX + // Apply mask + ANDQ R10, DX + POPCNTL DX, DX + // Directly return DX, we don't need to accumulate + // since we have <16 bytes. + MOVQ DX, (R8) + RET +endzero: + MOVQ $0, (R8) + RET + +endofpage: + // We must ignore low bytes as they aren't part of our slice. + MOVQ $16,CX + SUBQ BX, CX + MOVQ $0xFFFF, R10 + SARQ CL, R10 + SALQ CL, R10 + + // Load data into the high end of X1. + MOVOU -16(SI)(BX*1), X1 + // Compare target byte with each byte in data. + PCMPEQB X0, X1 + // Move result bits to integer register. + PMOVMSKB X1, DX + // Apply mask + ANDQ R10, DX + // Directly return DX, we don't need to accumulate + // since we have <16 bytes. + POPCNTL DX, DX + MOVQ DX, (R8) + RET + +avx2: + CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1 + JNE sse + MOVD AX, X0 + LEAQ -32(SI)(BX*1), R11 + VPBROADCASTB X0, Y1 +avx2_loop: + VMOVDQU (DI), Y2 + VPCMPEQB Y1, Y2, Y3 + VPMOVMSKB Y3, DX + POPCNTL DX, DX + ADDQ DX, R12 + ADDQ $32, DI + CMPQ DI, R11 + JLE avx2_loop + + // If last block is already processed, + // skip to the end. + CMPQ DI, R11 + JEQ endavx + + // Load address of the last 32 bytes. + // There is an overlap with the previous block. + MOVQ R11, DI + VMOVDQU (DI), Y2 + VPCMPEQB Y1, Y2, Y3 + VPMOVMSKB Y3, DX + // Exit AVX mode. + VZEROUPPER + + // Create mask to ignore overlap between previous 32 byte block + // and the next. + ANDQ $31, BX + MOVQ $32,CX + SUBQ BX, CX + MOVQ $0xFFFFFFFF, R10 + SARQ CL, R10 + SALQ CL, R10 + // Apply mask + ANDQ R10, DX + POPCNTL DX, DX + ADDQ DX, R12 + MOVQ R12, (R8) + RET +endavx: + // Exit AVX mode. + VZEROUPPER + MOVQ R12, (R8) + RET |