diff options
Diffstat (limited to 'src/internal/bytealg/indexbyte_amd64.s')
-rw-r--r-- | src/internal/bytealg/indexbyte_amd64.s | 149 |
1 files changed, 149 insertions, 0 deletions
diff --git a/src/internal/bytealg/indexbyte_amd64.s b/src/internal/bytealg/indexbyte_amd64.s new file mode 100644 index 0000000..1ca70e3 --- /dev/null +++ b/src/internal/bytealg/indexbyte_amd64.s @@ -0,0 +1,149 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·IndexByte(SB), NOSPLIT, $0-40 + MOVQ b_base+0(FP), SI + MOVQ b_len+8(FP), BX + MOVB c+24(FP), AL + LEAQ ret+32(FP), R8 + JMP indexbytebody<>(SB) + +TEXT ·IndexByteString(SB), NOSPLIT, $0-32 + MOVQ s_base+0(FP), SI + MOVQ s_len+8(FP), BX + MOVB c+16(FP), AL + LEAQ ret+24(FP), R8 + JMP indexbytebody<>(SB) + +// input: +// SI: data +// BX: data len +// AL: byte sought +// R8: address to put result +TEXT indexbytebody<>(SB), NOSPLIT, $0 + // Shuffle X0 around so that each byte contains + // the character we're looking for. + MOVD AX, X0 + PUNPCKLBW X0, X0 + PUNPCKLBW X0, X0 + PSHUFL $0, X0, X0 + + CMPQ BX, $16 + JLT small + + MOVQ SI, DI + + CMPQ BX, $32 + JA avx2 +sse: + LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes + JMP sseloopentry + +sseloop: + // Move the next 16-byte chunk of the data into X1. + MOVOU (DI), X1 + // Compare bytes in X0 to X1. + PCMPEQB X0, X1 + // Take the top bit of each byte in X1 and put the result in DX. + PMOVMSKB X1, DX + // Find first set bit, if any. + BSFL DX, DX + JNZ ssesuccess + // Advance to next block. + ADDQ $16, DI +sseloopentry: + CMPQ DI, AX + JB sseloop + + // Search the last 16-byte chunk. This chunk may overlap with the + // chunks we've already searched, but that's ok. + MOVQ AX, DI + MOVOU (AX), X1 + PCMPEQB X0, X1 + PMOVMSKB X1, DX + BSFL DX, DX + JNZ ssesuccess + +failure: + MOVQ $-1, (R8) + RET + +// We've found a chunk containing the byte. +// The chunk was loaded from DI. +// The index of the matching byte in the chunk is DX. +// The start of the data is SI. +ssesuccess: + SUBQ SI, DI // Compute offset of chunk within data. + ADDQ DX, DI // Add offset of byte within chunk. + MOVQ DI, (R8) + RET + +// handle for lengths < 16 +small: + TESTQ BX, BX + JEQ failure + + // Check if we'll load across a page boundary. + LEAQ 16(SI), AX + TESTW $0xff0, AX + JEQ endofpage + + MOVOU (SI), X1 // Load data + PCMPEQB X0, X1 // Compare target byte with each byte in data. + PMOVMSKB X1, DX // Move result bits to integer register. + BSFL DX, DX // Find first set bit. + JZ failure // No set bit, failure. + CMPL DX, BX + JAE failure // Match is past end of data. + MOVQ DX, (R8) + RET + +endofpage: + MOVOU -16(SI)(BX*1), X1 // Load data into the high end of X1. + PCMPEQB X0, X1 // Compare target byte with each byte in data. + PMOVMSKB X1, DX // Move result bits to integer register. + MOVL BX, CX + SHLL CX, DX + SHRL $16, DX // Shift desired bits down to bottom of register. + BSFL DX, DX // Find first set bit. + JZ failure // No set bit, failure. + MOVQ DX, (R8) + RET + +avx2: +#ifndef hasAVX2 + CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1 + JNE sse +#endif + MOVD AX, X0 + LEAQ -32(SI)(BX*1), R11 + VPBROADCASTB X0, Y1 +avx2_loop: + VMOVDQU (DI), Y2 + VPCMPEQB Y1, Y2, Y3 + VPTEST Y3, Y3 + JNZ avx2success + ADDQ $32, DI + CMPQ DI, R11 + JLT avx2_loop + MOVQ R11, DI + VMOVDQU (DI), Y2 + VPCMPEQB Y1, Y2, Y3 + VPTEST Y3, Y3 + JNZ avx2success + VZEROUPPER + MOVQ $-1, (R8) + RET + +avx2success: + VPMOVMSKB Y3, DX + BSFL DX, DX + SUBQ SI, DI + ADDQ DI, DX + MOVQ DX, (R8) + VZEROUPPER + RET |