summaryrefslogtreecommitdiffstats
path: root/src/internal/bytealg/indexbyte_amd64.s
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/internal/bytealg/indexbyte_amd64.s147
1 files changed, 147 insertions, 0 deletions
diff --git a/src/internal/bytealg/indexbyte_amd64.s b/src/internal/bytealg/indexbyte_amd64.s
new file mode 100644
index 0000000..f78093c
--- /dev/null
+++ b/src/internal/bytealg/indexbyte_amd64.s
@@ -0,0 +1,147 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·IndexByte(SB), NOSPLIT, $0-40
+ MOVQ b_base+0(FP), SI
+ MOVQ b_len+8(FP), BX
+ MOVB c+24(FP), AL
+ LEAQ ret+32(FP), R8
+ JMP indexbytebody<>(SB)
+
+TEXT ·IndexByteString(SB), NOSPLIT, $0-32
+ MOVQ s_base+0(FP), SI
+ MOVQ s_len+8(FP), BX
+ MOVB c+16(FP), AL
+ LEAQ ret+24(FP), R8
+ JMP indexbytebody<>(SB)
+
+// input:
+// SI: data
+// BX: data len
+// AL: byte sought
+// R8: address to put result
+TEXT indexbytebody<>(SB), NOSPLIT, $0
+ // Shuffle X0 around so that each byte contains
+ // the character we're looking for.
+ MOVD AX, X0
+ PUNPCKLBW X0, X0
+ PUNPCKLBW X0, X0
+ PSHUFL $0, X0, X0
+
+ CMPQ BX, $16
+ JLT small
+
+ MOVQ SI, DI
+
+ CMPQ BX, $32
+ JA avx2
+sse:
+ LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes
+ JMP sseloopentry
+
+sseloop:
+ // Move the next 16-byte chunk of the data into X1.
+ MOVOU (DI), X1
+ // Compare bytes in X0 to X1.
+ PCMPEQB X0, X1
+ // Take the top bit of each byte in X1 and put the result in DX.
+ PMOVMSKB X1, DX
+ // Find first set bit, if any.
+ BSFL DX, DX
+ JNZ ssesuccess
+ // Advance to next block.
+ ADDQ $16, DI
+sseloopentry:
+ CMPQ DI, AX
+ JB sseloop
+
+ // Search the last 16-byte chunk. This chunk may overlap with the
+ // chunks we've already searched, but that's ok.
+ MOVQ AX, DI
+ MOVOU (AX), X1
+ PCMPEQB X0, X1
+ PMOVMSKB X1, DX
+ BSFL DX, DX
+ JNZ ssesuccess
+
+failure:
+ MOVQ $-1, (R8)
+ RET
+
+// We've found a chunk containing the byte.
+// The chunk was loaded from DI.
+// The index of the matching byte in the chunk is DX.
+// The start of the data is SI.
+ssesuccess:
+ SUBQ SI, DI // Compute offset of chunk within data.
+ ADDQ DX, DI // Add offset of byte within chunk.
+ MOVQ DI, (R8)
+ RET
+
+// handle for lengths < 16
+small:
+ TESTQ BX, BX
+ JEQ failure
+
+ // Check if we'll load across a page boundary.
+ LEAQ 16(SI), AX
+ TESTW $0xff0, AX
+ JEQ endofpage
+
+ MOVOU (SI), X1 // Load data
+ PCMPEQB X0, X1 // Compare target byte with each byte in data.
+ PMOVMSKB X1, DX // Move result bits to integer register.
+ BSFL DX, DX // Find first set bit.
+ JZ failure // No set bit, failure.
+ CMPL DX, BX
+ JAE failure // Match is past end of data.
+ MOVQ DX, (R8)
+ RET
+
+endofpage:
+ MOVOU -16(SI)(BX*1), X1 // Load data into the high end of X1.
+ PCMPEQB X0, X1 // Compare target byte with each byte in data.
+ PMOVMSKB X1, DX // Move result bits to integer register.
+ MOVL BX, CX
+ SHLL CX, DX
+ SHRL $16, DX // Shift desired bits down to bottom of register.
+ BSFL DX, DX // Find first set bit.
+ JZ failure // No set bit, failure.
+ MOVQ DX, (R8)
+ RET
+
+avx2:
+ CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
+ JNE sse
+ MOVD AX, X0
+ LEAQ -32(SI)(BX*1), R11
+ VPBROADCASTB X0, Y1
+avx2_loop:
+ VMOVDQU (DI), Y2
+ VPCMPEQB Y1, Y2, Y3
+ VPTEST Y3, Y3
+ JNZ avx2success
+ ADDQ $32, DI
+ CMPQ DI, R11
+ JLT avx2_loop
+ MOVQ R11, DI
+ VMOVDQU (DI), Y2
+ VPCMPEQB Y1, Y2, Y3
+ VPTEST Y3, Y3
+ JNZ avx2success
+ VZEROUPPER
+ MOVQ $-1, (R8)
+ RET
+
+avx2success:
+ VPMOVMSKB Y3, DX
+ BSFL DX, DX
+ SUBQ SI, DI
+ ADDQ DI, DX
+ MOVQ DX, (R8)
+ VZEROUPPER
+ RET