diff options
Diffstat (limited to 'src/internal/bytealg/index_amd64.s')
-rw-r--r-- | src/internal/bytealg/index_amd64.s | 274 |
1 files changed, 274 insertions, 0 deletions
diff --git a/src/internal/bytealg/index_amd64.s b/src/internal/bytealg/index_amd64.s new file mode 100644 index 0000000..6193b57 --- /dev/null +++ b/src/internal/bytealg/index_amd64.s @@ -0,0 +1,274 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·Index(SB),NOSPLIT,$0-56 + MOVQ a_base+0(FP), DI + MOVQ a_len+8(FP), DX + MOVQ b_base+24(FP), R8 + MOVQ b_len+32(FP), AX + MOVQ DI, R10 + LEAQ ret+48(FP), R11 + JMP indexbody<>(SB) + +TEXT ·IndexString(SB),NOSPLIT,$0-40 + MOVQ a_base+0(FP), DI + MOVQ a_len+8(FP), DX + MOVQ b_base+16(FP), R8 + MOVQ b_len+24(FP), AX + MOVQ DI, R10 + LEAQ ret+32(FP), R11 + JMP indexbody<>(SB) + +// AX: length of string, that we are searching for +// DX: length of string, in which we are searching +// DI: pointer to string, in which we are searching +// R8: pointer to string, that we are searching for +// R11: address, where to put return value +// Note: We want len in DX and AX, because PCMPESTRI implicitly consumes them +TEXT indexbody<>(SB),NOSPLIT,$0 + CMPQ AX, DX + JA fail + CMPQ DX, $16 + JAE sse42 +no_sse42: + CMPQ AX, $2 + JA _3_or_more + MOVW (R8), R8 + LEAQ -1(DI)(DX*1), DX +loop2: + MOVW (DI), SI + CMPW SI,R8 + JZ success + ADDQ $1,DI + CMPQ DI,DX + JB loop2 + JMP fail +_3_or_more: + CMPQ AX, $3 + JA _4_or_more + MOVW 1(R8), BX + MOVW (R8), R8 + LEAQ -2(DI)(DX*1), DX +loop3: + MOVW (DI), SI + CMPW SI,R8 + JZ partial_success3 + ADDQ $1,DI + CMPQ DI,DX + JB loop3 + JMP fail +partial_success3: + MOVW 1(DI), SI + CMPW SI,BX + JZ success + ADDQ $1,DI + CMPQ DI,DX + JB loop3 + JMP fail +_4_or_more: + CMPQ AX, $4 + JA _5_or_more + MOVL (R8), R8 + LEAQ -3(DI)(DX*1), DX +loop4: + MOVL (DI), SI + CMPL SI,R8 + JZ success + ADDQ $1,DI + CMPQ DI,DX + JB loop4 + JMP fail +_5_or_more: + CMPQ AX, $7 + JA _8_or_more + LEAQ 1(DI)(DX*1), DX + SUBQ AX, DX + MOVL -4(R8)(AX*1), BX + MOVL (R8), R8 +loop5to7: + MOVL (DI), SI + CMPL SI,R8 + JZ partial_success5to7 + ADDQ $1,DI + CMPQ DI,DX + JB loop5to7 + JMP fail +partial_success5to7: + MOVL -4(AX)(DI*1), SI + CMPL SI,BX + JZ success + ADDQ $1,DI + CMPQ DI,DX + JB loop5to7 + JMP fail +_8_or_more: + CMPQ AX, $8 + JA _9_or_more + MOVQ (R8), R8 + LEAQ -7(DI)(DX*1), DX +loop8: + MOVQ (DI), SI + CMPQ SI,R8 + JZ success + ADDQ $1,DI + CMPQ DI,DX + JB loop8 + JMP fail +_9_or_more: + CMPQ AX, $15 + JA _16_or_more + LEAQ 1(DI)(DX*1), DX + SUBQ AX, DX + MOVQ -8(R8)(AX*1), BX + MOVQ (R8), R8 +loop9to15: + MOVQ (DI), SI + CMPQ SI,R8 + JZ partial_success9to15 + ADDQ $1,DI + CMPQ DI,DX + JB loop9to15 + JMP fail +partial_success9to15: + MOVQ -8(AX)(DI*1), SI + CMPQ SI,BX + JZ success + ADDQ $1,DI + CMPQ DI,DX + JB loop9to15 + JMP fail +_16_or_more: + CMPQ AX, $16 + JA _17_or_more + MOVOU (R8), X1 + LEAQ -15(DI)(DX*1), DX +loop16: + MOVOU (DI), X2 + PCMPEQB X1, X2 + PMOVMSKB X2, SI + CMPQ SI, $0xffff + JE success + ADDQ $1,DI + CMPQ DI,DX + JB loop16 + JMP fail +_17_or_more: + CMPQ AX, $31 + JA _32_or_more + LEAQ 1(DI)(DX*1), DX + SUBQ AX, DX + MOVOU -16(R8)(AX*1), X0 + MOVOU (R8), X1 +loop17to31: + MOVOU (DI), X2 + PCMPEQB X1,X2 + PMOVMSKB X2, SI + CMPQ SI, $0xffff + JE partial_success17to31 + ADDQ $1,DI + CMPQ DI,DX + JB loop17to31 + JMP fail +partial_success17to31: + MOVOU -16(AX)(DI*1), X3 + PCMPEQB X0, X3 + PMOVMSKB X3, SI + CMPQ SI, $0xffff + JE success + ADDQ $1,DI + CMPQ DI,DX + JB loop17to31 + JMP fail +// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63 +// So no need to check cpuid +_32_or_more: + CMPQ AX, $32 + JA _33_to_63 + VMOVDQU (R8), Y1 + LEAQ -31(DI)(DX*1), DX +loop32: + VMOVDQU (DI), Y2 + VPCMPEQB Y1, Y2, Y3 + VPMOVMSKB Y3, SI + CMPL SI, $0xffffffff + JE success_avx2 + ADDQ $1,DI + CMPQ DI,DX + JB loop32 + JMP fail_avx2 +_33_to_63: + LEAQ 1(DI)(DX*1), DX + SUBQ AX, DX + VMOVDQU -32(R8)(AX*1), Y0 + VMOVDQU (R8), Y1 +loop33to63: + VMOVDQU (DI), Y2 + VPCMPEQB Y1, Y2, Y3 + VPMOVMSKB Y3, SI + CMPL SI, $0xffffffff + JE partial_success33to63 + ADDQ $1,DI + CMPQ DI,DX + JB loop33to63 + JMP fail_avx2 +partial_success33to63: + VMOVDQU -32(AX)(DI*1), Y3 + VPCMPEQB Y0, Y3, Y4 + VPMOVMSKB Y4, SI + CMPL SI, $0xffffffff + JE success_avx2 + ADDQ $1,DI + CMPQ DI,DX + JB loop33to63 +fail_avx2: + VZEROUPPER +fail: + MOVQ $-1, (R11) + RET +success_avx2: + VZEROUPPER + JMP success +sse42: + CMPB internal∕cpu·X86+const_offsetX86HasSSE42(SB), $1 + JNE no_sse42 + CMPQ AX, $12 + // PCMPESTRI is slower than normal compare, + // so using it makes sense only if we advance 4+ bytes per compare + // This value was determined experimentally and is the ~same + // on Nehalem (first with SSE42) and Haswell. + JAE _9_or_more + LEAQ 16(R8), SI + TESTW $0xff0, SI + JEQ no_sse42 + MOVOU (R8), X1 + LEAQ -15(DI)(DX*1), SI + MOVQ $16, R9 + SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9 +loop_sse42: + // 0x0c means: unsigned byte compare (bits 0,1 are 00) + // for equality (bits 2,3 are 11) + // result is not masked or inverted (bits 4,5 are 00) + // and corresponds to first matching byte (bit 6 is 0) + PCMPESTRI $0x0c, (DI), X1 + // CX == 16 means no match, + // CX > R9 means partial match at the end of the string, + // otherwise sep is at offset CX from X1 start + CMPQ CX, R9 + JBE sse42_success + ADDQ R9, DI + CMPQ DI, SI + JB loop_sse42 + PCMPESTRI $0x0c, -1(SI), X1 + CMPQ CX, R9 + JA fail + LEAQ -1(SI), DI +sse42_success: + ADDQ CX, DI +success: + SUBQ R10, DI + MOVQ DI, (R11) + RET |