summaryrefslogtreecommitdiffstats
path: root/src/internal/bytealg/index_amd64.s
diff options
context:
space:
mode:
Diffstat (limited to 'src/internal/bytealg/index_amd64.s')
-rw-r--r--src/internal/bytealg/index_amd64.s276
1 files changed, 276 insertions, 0 deletions
diff --git a/src/internal/bytealg/index_amd64.s b/src/internal/bytealg/index_amd64.s
new file mode 100644
index 0000000..0431491
--- /dev/null
+++ b/src/internal/bytealg/index_amd64.s
@@ -0,0 +1,276 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Index(SB),NOSPLIT,$0-56
+ MOVQ a_base+0(FP), DI
+ MOVQ a_len+8(FP), DX
+ MOVQ b_base+24(FP), R8
+ MOVQ b_len+32(FP), AX
+ MOVQ DI, R10
+ LEAQ ret+48(FP), R11
+ JMP indexbody<>(SB)
+
+TEXT ·IndexString(SB),NOSPLIT,$0-40
+ MOVQ a_base+0(FP), DI
+ MOVQ a_len+8(FP), DX
+ MOVQ b_base+16(FP), R8
+ MOVQ b_len+24(FP), AX
+ MOVQ DI, R10
+ LEAQ ret+32(FP), R11
+ JMP indexbody<>(SB)
+
+// AX: length of string, that we are searching for
+// DX: length of string, in which we are searching
+// DI: pointer to string, in which we are searching
+// R8: pointer to string, that we are searching for
+// R11: address, where to put return value
+// Note: We want len in DX and AX, because PCMPESTRI implicitly consumes them
+TEXT indexbody<>(SB),NOSPLIT,$0
+ CMPQ AX, DX
+ JA fail
+ CMPQ DX, $16
+ JAE sse42
+no_sse42:
+ CMPQ AX, $2
+ JA _3_or_more
+ MOVW (R8), R8
+ LEAQ -1(DI)(DX*1), DX
+loop2:
+ MOVW (DI), SI
+ CMPW SI,R8
+ JZ success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop2
+ JMP fail
+_3_or_more:
+ CMPQ AX, $3
+ JA _4_or_more
+ MOVW 1(R8), BX
+ MOVW (R8), R8
+ LEAQ -2(DI)(DX*1), DX
+loop3:
+ MOVW (DI), SI
+ CMPW SI,R8
+ JZ partial_success3
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop3
+ JMP fail
+partial_success3:
+ MOVW 1(DI), SI
+ CMPW SI,BX
+ JZ success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop3
+ JMP fail
+_4_or_more:
+ CMPQ AX, $4
+ JA _5_or_more
+ MOVL (R8), R8
+ LEAQ -3(DI)(DX*1), DX
+loop4:
+ MOVL (DI), SI
+ CMPL SI,R8
+ JZ success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop4
+ JMP fail
+_5_or_more:
+ CMPQ AX, $7
+ JA _8_or_more
+ LEAQ 1(DI)(DX*1), DX
+ SUBQ AX, DX
+ MOVL -4(R8)(AX*1), BX
+ MOVL (R8), R8
+loop5to7:
+ MOVL (DI), SI
+ CMPL SI,R8
+ JZ partial_success5to7
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop5to7
+ JMP fail
+partial_success5to7:
+ MOVL -4(AX)(DI*1), SI
+ CMPL SI,BX
+ JZ success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop5to7
+ JMP fail
+_8_or_more:
+ CMPQ AX, $8
+ JA _9_or_more
+ MOVQ (R8), R8
+ LEAQ -7(DI)(DX*1), DX
+loop8:
+ MOVQ (DI), SI
+ CMPQ SI,R8
+ JZ success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop8
+ JMP fail
+_9_or_more:
+ CMPQ AX, $15
+ JA _16_or_more
+ LEAQ 1(DI)(DX*1), DX
+ SUBQ AX, DX
+ MOVQ -8(R8)(AX*1), BX
+ MOVQ (R8), R8
+loop9to15:
+ MOVQ (DI), SI
+ CMPQ SI,R8
+ JZ partial_success9to15
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop9to15
+ JMP fail
+partial_success9to15:
+ MOVQ -8(AX)(DI*1), SI
+ CMPQ SI,BX
+ JZ success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop9to15
+ JMP fail
+_16_or_more:
+ CMPQ AX, $16
+ JA _17_or_more
+ MOVOU (R8), X1
+ LEAQ -15(DI)(DX*1), DX
+loop16:
+ MOVOU (DI), X2
+ PCMPEQB X1, X2
+ PMOVMSKB X2, SI
+ CMPQ SI, $0xffff
+ JE success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop16
+ JMP fail
+_17_or_more:
+ CMPQ AX, $31
+ JA _32_or_more
+ LEAQ 1(DI)(DX*1), DX
+ SUBQ AX, DX
+ MOVOU -16(R8)(AX*1), X0
+ MOVOU (R8), X1
+loop17to31:
+ MOVOU (DI), X2
+ PCMPEQB X1,X2
+ PMOVMSKB X2, SI
+ CMPQ SI, $0xffff
+ JE partial_success17to31
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop17to31
+ JMP fail
+partial_success17to31:
+ MOVOU -16(AX)(DI*1), X3
+ PCMPEQB X0, X3
+ PMOVMSKB X3, SI
+ CMPQ SI, $0xffff
+ JE success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop17to31
+ JMP fail
+// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
+// So no need to check cpuid
+_32_or_more:
+ CMPQ AX, $32
+ JA _33_to_63
+ VMOVDQU (R8), Y1
+ LEAQ -31(DI)(DX*1), DX
+loop32:
+ VMOVDQU (DI), Y2
+ VPCMPEQB Y1, Y2, Y3
+ VPMOVMSKB Y3, SI
+ CMPL SI, $0xffffffff
+ JE success_avx2
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop32
+ JMP fail_avx2
+_33_to_63:
+ LEAQ 1(DI)(DX*1), DX
+ SUBQ AX, DX
+ VMOVDQU -32(R8)(AX*1), Y0
+ VMOVDQU (R8), Y1
+loop33to63:
+ VMOVDQU (DI), Y2
+ VPCMPEQB Y1, Y2, Y3
+ VPMOVMSKB Y3, SI
+ CMPL SI, $0xffffffff
+ JE partial_success33to63
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop33to63
+ JMP fail_avx2
+partial_success33to63:
+ VMOVDQU -32(AX)(DI*1), Y3
+ VPCMPEQB Y0, Y3, Y4
+ VPMOVMSKB Y4, SI
+ CMPL SI, $0xffffffff
+ JE success_avx2
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop33to63
+fail_avx2:
+ VZEROUPPER
+fail:
+ MOVQ $-1, (R11)
+ RET
+success_avx2:
+ VZEROUPPER
+ JMP success
+sse42:
+#ifndef hasSSE42
+ CMPB internal∕cpu·X86+const_offsetX86HasSSE42(SB), $1
+ JNE no_sse42
+#endif
+ CMPQ AX, $12
+ // PCMPESTRI is slower than normal compare,
+ // so using it makes sense only if we advance 4+ bytes per compare
+ // This value was determined experimentally and is the ~same
+ // on Nehalem (first with SSE42) and Haswell.
+ JAE _9_or_more
+ LEAQ 16(R8), SI
+ TESTW $0xff0, SI
+ JEQ no_sse42
+ MOVOU (R8), X1
+ LEAQ -15(DI)(DX*1), SI
+ MOVQ $16, R9
+ SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
+loop_sse42:
+ // 0x0c means: unsigned byte compare (bits 0,1 are 00)
+ // for equality (bits 2,3 are 11)
+ // result is not masked or inverted (bits 4,5 are 00)
+ // and corresponds to first matching byte (bit 6 is 0)
+ PCMPESTRI $0x0c, (DI), X1
+ // CX == 16 means no match,
+ // CX > R9 means partial match at the end of the string,
+ // otherwise sep is at offset CX from X1 start
+ CMPQ CX, R9
+ JBE sse42_success
+ ADDQ R9, DI
+ CMPQ DI, SI
+ JB loop_sse42
+ PCMPESTRI $0x0c, -1(SI), X1
+ CMPQ CX, R9
+ JA fail
+ LEAQ -1(SI), DI
+sse42_success:
+ ADDQ CX, DI
+success:
+ SUBQ R10, DI
+ MOVQ DI, (R11)
+ RET