Diffstat (limited to 'src/internal/bytealg/count_amd64.s')
-rw-r--r--  src/internal/bytealg/count_amd64.s | 229
1 file changed, 229 insertions(+), 0 deletions(-)
diff --git a/src/internal/bytealg/count_amd64.s b/src/internal/bytealg/count_amd64.s
new file mode 100644
index 0000000..3a8dc36
--- /dev/null
+++ b/src/internal/bytealg/count_amd64.s
@@ -0,0 +1,229 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "asm_amd64.h"
+#include "textflag.h"
+
+TEXT ·Count(SB),NOSPLIT,$0-40
+#ifndef hasPOPCNT
+	CMPB	internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
+	JEQ	2(PC)
+	JMP	·countGeneric(SB)
+#endif
+	MOVQ	b_base+0(FP), SI
+	MOVQ	b_len+8(FP), BX
+	MOVB	c+24(FP), AL
+	LEAQ	ret+32(FP), R8
+	JMP	countbody<>(SB)
+
+TEXT ·CountString(SB),NOSPLIT,$0-32
+#ifndef hasPOPCNT
+	CMPB	internal∕cpu·X86+const_offsetX86HasPOPCNT(SB), $1
+	JEQ	2(PC)
+	JMP	·countGenericString(SB)
+#endif
+	MOVQ	s_base+0(FP), SI
+	MOVQ	s_len+8(FP), BX
+	MOVB	c+16(FP), AL
+	LEAQ	ret+24(FP), R8
+	JMP	countbody<>(SB)
+
+// input:
+//   SI: data
+//   BX: data len
+//   AL: byte sought
+//   R8: address to put result
+// This function requires the POPCNT instruction.
+TEXT countbody<>(SB),NOSPLIT,$0
+	// Shuffle X0 around so that each byte contains
+	// the character we're looking for.
+	MOVD AX, X0
+	PUNPCKLBW X0, X0
+	PUNPCKLBW X0, X0
+	PSHUFL $0, X0, X0
+
+	CMPQ BX, $16
+	JLT small
+
+	MOVQ $0, R12 // Accumulator
+
+	MOVQ SI, DI
+
+	CMPQ BX, $64
+	JAE avx2
+sse:
+	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
+	JMP	sseloopentry
+
+	PCALIGN $16
+sseloop:
+	// Move the next 16-byte chunk of the data into X1.
+	MOVOU	(DI), X1
+	// Compare bytes in X0 to X1.
+	PCMPEQB	X0, X1
+	// Take the top bit of each byte in X1 and put the result in DX.
+	PMOVMSKB X1, DX
+	// Count number of matching bytes
+	POPCNTL DX, DX
+	// Accumulate into R12
+	ADDQ DX, R12
+	// Advance to next block.
+	ADDQ $16, DI
+sseloopentry:
+	CMPQ	DI, AX
+	JBE	sseloop
+
+	// Get the number of bytes to consider in the last 16 bytes
+	ANDQ $15, BX
+	JZ end
+
+	// Create mask to ignore overlap between previous 16 byte block
+	// and the next.
+	MOVQ $16,CX
+	SUBQ BX, CX
+	MOVQ $0xFFFF, R10
+	SARQ CL, R10
+	SALQ CL, R10
+
+	// Process the last 16-byte chunk. This chunk may overlap with the
+	// chunks we've already searched so we need to mask part of it.
+	MOVOU	(AX), X1
+	PCMPEQB	X0, X1
+	PMOVMSKB X1, DX
+	// Apply mask
+	ANDQ R10, DX
+	POPCNTL DX, DX
+	ADDQ DX, R12
+end:
+	MOVQ R12, (R8)
+	RET
+
+// handle for lengths < 16
+small:
+	TESTQ	BX, BX
+	JEQ	endzero
+
+	// Check if we'll load across a page boundary.
+	LEAQ	16(SI), AX
+	TESTW	$0xff0, AX
+	JEQ	endofpage
+
+	// We must ignore high bytes as they aren't part of our slice.
+	// Create mask.
+	MOVB BX, CX
+	MOVQ $1, R10
+	SALQ CL, R10
+	SUBQ $1, R10
+
+	// Load data
+	MOVOU	(SI), X1
+	// Compare target byte with each byte in data.
+	PCMPEQB	X0, X1
+	// Move result bits to integer register.
+	PMOVMSKB	X1, DX
+	// Apply mask
+	ANDQ R10, DX
+	POPCNTL DX, DX
+	// Directly return DX, we don't need to accumulate
+	// since we have <16 bytes.
+	MOVQ	DX, (R8)
+	RET
+endzero:
+	MOVQ $0, (R8)
+	RET
+
+endofpage:
+	// We must ignore low bytes as they aren't part of our slice.
+	MOVQ $16,CX
+	SUBQ BX, CX
+	MOVQ $0xFFFF, R10
+	SARQ CL, R10
+	SALQ CL, R10
+
+	// Load data into the high end of X1.
+	MOVOU	-16(SI)(BX*1), X1
+	// Compare target byte with each byte in data.
+	PCMPEQB	X0, X1
+	// Move result bits to integer register.
+	PMOVMSKB	X1, DX
+	// Apply mask
+	ANDQ R10, DX
+	// Directly return DX, we don't need to accumulate
+	// since we have <16 bytes.
+	POPCNTL DX, DX
+	MOVQ DX, (R8)
+	RET
+
+avx2:
+#ifndef hasAVX2
+	CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
+	JNE sse
+#endif
+	MOVD AX, X0
+	LEAQ -64(SI)(BX*1), R11
+	LEAQ (SI)(BX*1), R13
+	VPBROADCASTB X0, Y1
+	PCALIGN $32
+avx2_loop:
+	VMOVDQU	(DI), Y2
+	VMOVDQU	32(DI), Y4
+	VPCMPEQB	Y1, Y2, Y3
+	VPCMPEQB	Y1, Y4, Y5
+	VPMOVMSKB	Y3, DX
+	VPMOVMSKB	Y5, CX
+	POPCNTL	DX, DX
+	POPCNTL	CX, CX
+	ADDQ	DX, R12
+	ADDQ	CX, R12
+	ADDQ	$64, DI
+	CMPQ	DI, R11
+	JLE	avx2_loop
+
+	// If last block is already processed,
+	// skip to the end.
+	//
+	// This check is NOT an optimization; if the input length is a
+	// multiple of 64, we must not go through the last leg of the
+	// function because the bit shift count passed to SALQ below would
+	// be 64, which is outside of the 0-63 range supported by those
+	// instructions.
+	//
+	// Tests in the bytes and strings packages with input lengths that
+	// are multiples of 64 will break if this condition were removed.
+	CMPQ DI, R13
+	JEQ endavx
+
+	// Load address of the last 64 bytes.
+	// There is an overlap with the previous block.
+	MOVQ R11, DI
+	VMOVDQU (DI), Y2
+	VMOVDQU 32(DI), Y4
+	VPCMPEQB Y1, Y2, Y3
+	VPCMPEQB Y1, Y4, Y5
+	VPMOVMSKB Y3, DX
+	VPMOVMSKB Y5, CX
+	// Exit AVX mode.
+	VZEROUPPER
+	SALQ $32, CX
+	ORQ CX, DX
+
+	// Create mask to ignore overlap between previous 64 byte block
+	// and the next.
+	ANDQ $63, BX
+	MOVQ $64, CX
+	SUBQ BX, CX
+	MOVQ $0xFFFFFFFFFFFFFFFF, R10
+	SALQ CL, R10
+	// Apply mask
+	ANDQ R10, DX
+	POPCNTQ DX, DX
+	ADDQ DX, R12
+	MOVQ R12, (R8)
+	RET
+endavx:
+	// Exit AVX mode.
+	VZEROUPPER
+	MOVQ R12, (R8)
+	RET
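The frame sizes and argument offsets in the two TEXT headers correspond to the Go declarations these symbols implement (kept in the package's Go source). A sketch of those declarations:

    package bytealg

    // Count returns the number of instances of c in b.
    // Frame $0-40: b_base+0, b_len+8, b_cap+16, c+24, ret+32.
    func Count(b []byte, c byte) int

    // CountString returns the number of instances of c in s.
    // Frame $0-32: s_base+0, s_len+8, c+16, ret+24.
    func CountString(s string, c byte) int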
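The core of countbody is one idea applied at two widths: broadcast the target byte across a vector register, compare a whole chunk at once (PCMPEQB/VPCMPEQB), collapse the per-byte results into a bitmask (PMOVMSKB/VPMOVMSKB), and add the mask's POPCNT to an accumulator; the tail is handled with one overlapping load whose already-counted bits are masked off. A minimal Go sketch of that structure, with matchMask as a hypothetical scalar stand-in for the SIMD compare (the real non-POPCNT fallbacks, ·countGeneric and ·countGenericString, are plain byte loops):

    package main

    import (
    	"fmt"
    	"math/bits"
    )

    // matchMask is a hypothetical scalar stand-in for PCMPEQB+PMOVMSKB:
    // bit i of the result is set iff chunk[i] == c.
    func matchMask(chunk []byte, c byte) uint16 {
    	var m uint16
    	for i, x := range chunk {
    		if x == c {
    			m |= 1 << i
    		}
    	}
    	return m
    }

    // count mirrors countbody's shape: whole 16-byte chunks, then one
    // overlapping 16-byte load for the tail with the bits of the
    // already-counted bytes shifted away (the SARQ/SALQ mask above).
    func count(b []byte, c byte) int {
    	n, i := 0, 0
    	for ; i+16 <= len(b); i += 16 {
    		n += bits.OnesCount16(matchMask(b[i:i+16], c))
    	}
    	if rem := len(b) - i; rem > 0 {
    		if len(b) >= 16 {
    			shift := uint(16 - rem)
    			m := matchMask(b[len(b)-16:], c) >> shift << shift
    			n += bits.OnesCount16(m)
    		} else {
    			// The assembly's "small" path; here a plain loop.
    			for _, x := range b[i:] {
    				if x == c {
    					n++
    				}
    			}
    		}
    	}
    	return n
    }

    func main() {
    	fmt.Println(count([]byte("abracadabra"), 'a')) // 5
    }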
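For inputs shorter than 16 bytes the routine still prefers a single 16-byte load, which reads past the end of the slice; that is harmless unless the load crosses into a possibly unmapped page, which is what the TESTW $0xff0 check rules out. The two masks then discard the lanes that lie outside the slice. A sketch of those three computations, with the helper names invented here for illustration:

    package bytealgsketch

    // mayCrossPage reports whether a 16-byte load at p could touch the
    // next 4KiB page: that happens iff p+16 falls in the first 16 bytes
    // of a page, i.e. address bits 4..11 of p+16 are all zero (the
    // TESTW $0xff0, AX test).
    func mayCrossPage(p uintptr) bool {
    	return (p+16)&0xff0 == 0
    }

    // lowMask keeps the match bits of the first n (0 < n < 16) bytes of
    // a 16-byte load: (1<<n)-1, as built by SALQ CL, R10 / SUBQ $1, R10.
    func lowMask(n uint) uint16 {
    	return uint16(1)<<n - 1
    }

    // highMask keeps the match bits of the last n (0 < n < 16) bytes,
    // for the endofpage path, which instead loads the 16 bytes ending at
    // the slice's end so the access stays on the current page.
    func highMask(n uint) uint16 {
    	shift := 16 - n
    	return 0xFFFF >> shift << shift
    }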
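The AVX2 epilogue applies the same overlapping-tail trick at 64-byte granularity, and the long comment about the CMPQ DI, R13 guard comes down to the x86 shift-count rule: SALQ reduces its count mod 64, so a count of 64 (which arises exactly when the length is a multiple of 64) would shift by zero, leave the mask all ones, and double-count the whole last block. A sketch of the mask, valid only for a nonzero remainder:

    package bytealgsketch

    // tailMask64 keeps the high rem match bits of the overlapping
    // 64-byte tail, mirroring MOVQ $0xFFFFFFFFFFFFFFFF, R10 /
    // SALQ CL, R10 with CL = 64 - rem. rem must be 1..63; the rem == 0
    // case is exactly what the JEQ endavx guard branches around.
    // (Go itself defines a 64-bit shift by 64 as 0, so this sketch would
    // not reproduce the hardware hazard; the constraint documents the
    // assembly.)
    func tailMask64(rem uint) uint64 {
    	return ^uint64(0) << (64 - rem)
    }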
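These entry points back the single-byte fast paths of bytes.Count and strings.Count, so the easiest way to exercise them is through the standard library:

    package main

    import (
    	"bytes"
    	"fmt"
    	"strings"
    )

    func main() {
    	// Both dispatch to the assembly above when the separator is a
    	// single byte (and POPCNT is available).
    	fmt.Println(bytes.Count([]byte("cheese"), []byte("e"))) // 3
    	fmt.Println(strings.Count("cheese", "e"))               // 3
    }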