diff options
Diffstat (limited to 'src/internal/bytealg/equal_amd64.s')
-rw-r--r-- | src/internal/bytealg/equal_amd64.s | 154 |
1 files changed, 154 insertions, 0 deletions
diff --git a/src/internal/bytealg/equal_amd64.s b/src/internal/bytealg/equal_amd64.s
new file mode 100644
index 0000000..c816409
--- /dev/null
+++ b/src/internal/bytealg/equal_amd64.s
@@ -0,0 +1,154 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool -- stores 1 in ret when the size bytes at a and at b are identical, 0 otherwise.
+TEXT runtime·memequal(SB),NOSPLIT,$0-25	// no local frame; args at a+0, b+8, size+16, one-byte result at ret+24
+	MOVQ	a+0(FP), SI	// SI = a
+	MOVQ	b+8(FP), DI	// DI = b
+	CMPQ	SI, DI
+	JEQ	eq	// identical pointers: equal without reading any memory
+	MOVQ	size+16(FP), BX	// BX = byte count, as memeqbody expects
+	LEAQ	ret+24(FP), AX	// AX = address of the result byte; memeqbody writes through it
+	JMP	memeqbody<>(SB)	// tail jump: memeqbody's RET returns straight to our caller
+eq:
+	MOVB	$1, ret+24(FP)	// result = true
+	RET
+
+// memequal_varlen(a, b unsafe.Pointer) bool -- same comparison as memequal, but the size is read from the closure instead of an argument.
+TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17	// no local frame; args at a+0, b+8, one-byte result at ret+16
+	MOVQ	a+0(FP), SI	// SI = a
+	MOVQ	b+8(FP), DI	// DI = b
+	CMPQ	SI, DI
+	JEQ	eq	// identical pointers: equal without reading any memory
+	MOVQ	8(DX), BX	// compiler stores size at offset 8 in the closure
+	LEAQ	ret+16(FP), AX	// AX = address of the result byte; memeqbody writes through it
+	JMP	memeqbody<>(SB)	// tail jump: memeqbody's RET returns straight to our caller
+eq:
+	MOVB	$1, ret+16(FP)	// result = true
+	RET
+
+// a in SI (advanced as bytes are consumed; overwritten with loaded data on the small path)
+// b in DI (advanced as bytes are consumed; overwritten with loaded data on the small path)
+// count in BX (bytes still to compare; decremented by the loops)
+// address of result byte in AX (every exit stores 0 or 1 there; registers CX, DX, X0-X7 / Y0-Y6 are used as scratch)
+TEXT memeqbody<>(SB),NOSPLIT,$0-0
+	CMPQ	BX, $8
+	JB	small	// fewer than 8 bytes: single masked 8-byte compare
+	CMPQ	BX, $64
+	JB	bigloop	// 8..63 bytes: 8 bytes at a time
+	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
+	JE	hugeloop_avx2	// HasAVX2 flag byte is 1: use the ymm loop, otherwise fall into the xmm loop
+
+	// 64 bytes at a time using xmm registers
+hugeloop:
+	CMPQ	BX, $64
+	JB	bigloop	// fewer than 64 bytes left: finish with the 8-byte loop
+	MOVOU	(SI), X0	// unaligned 16-byte loads: four chunks from a (X0,X2,X4,X6) and four from b (X1,X3,X5,X7)
+	MOVOU	(DI), X1
+	MOVOU	16(SI), X2
+	MOVOU	16(DI), X3
+	MOVOU	32(SI), X4
+	MOVOU	32(DI), X5
+	MOVOU	48(SI), X6
+	MOVOU	48(DI), X7
+	PCMPEQB	X1, X0	// per byte: 0xff where equal, 0x00 where different
+	PCMPEQB	X3, X2
+	PCMPEQB	X5, X4
+	PCMPEQB	X7, X6
+	PAND	X2, X0	// fold the four masks into X0; a byte stays 0xff only if it matched in all four chunks
+	PAND	X6, X4
+	PAND	X4, X0
+	PMOVMSKB X0, DX	// DX = top bit of each of the 16 mask bytes
+	ADDQ	$64, SI	// advance pointers and count before the test: these clobber flags, so CMPL must come after
+	ADDQ	$64, DI
+	SUBQ	$64, BX
+	CMPL	DX, $0xffff	// all 16 bits set <=> all 64 bytes matched
+	JEQ	hugeloop
+	MOVB	$0, (AX)	// mismatch: result = false
+	RET
+
+	// 64 bytes at a time using ymm registers
+hugeloop_avx2:
+	CMPQ	BX, $64
+	JB	bigloop_avx2	// fewer than 64 bytes left: leave the AVX loop
+	VMOVDQU	(SI), Y0	// unaligned 32-byte loads: two chunks from a (Y0,Y2) and two from b (Y1,Y3)
+	VMOVDQU	(DI), Y1
+	VMOVDQU	32(SI), Y2
+	VMOVDQU	32(DI), Y3
+	VPCMPEQB	Y1, Y0, Y4	// Y4 = per-byte equality mask for the first 32 bytes
+	VPCMPEQB	Y2, Y3, Y5	// Y5 = per-byte equality mask for the second 32 bytes
+	VPAND	Y4, Y5, Y6	// Y6 = bytes that matched in both halves
+	VPMOVMSKB Y6, DX	// DX = top bit of each of the 32 mask bytes
+	ADDQ	$64, SI	// advance pointers and count before the test: these clobber flags, so CMPL must come after
+	ADDQ	$64, DI
+	SUBQ	$64, BX
+	CMPL	DX, $0xffffffff	// all 32 bits set <=> all 64 bytes matched
+	JEQ	hugeloop_avx2
+	VZEROUPPER	// zero the upper ymm halves before returning (presumably to avoid AVX/SSE transition penalties)
+	MOVB	$0, (AX)	// mismatch: result = false
+	RET
+
+bigloop_avx2:
+	VZEROUPPER	// same cleanup on the way out of the AVX loop; falls through into bigloop
+
+	// 8 bytes at a time using 64-bit register
+bigloop:
+	CMPQ	BX, $8
+	JBE	leftover	// 8 or fewer bytes left: one final load handles them
+	MOVQ	(SI), CX
+	MOVQ	(DI), DX
+	ADDQ	$8, SI
+	ADDQ	$8, DI
+	SUBQ	$8, BX
+	CMPQ	CX, DX	// compared last so its flags feed the JEQ below
+	JEQ	bigloop
+	MOVB	$0, (AX)	// mismatch: result = false
+	RET
+
+	// remaining 0-8 bytes
+leftover:
+	MOVQ	-8(SI)(BX*1), CX	// the 8 bytes ending at the end of a; may overlap bytes already compared, in bounds because this path is only reached when the original count was >= 8
+	MOVQ	-8(DI)(BX*1), DX	// the 8 bytes ending at the end of b
+	CMPQ	CX, DX
+	SETEQ	(AX)	// result = (last 8 bytes equal)
+	RET
+
+small:
+	CMPQ	BX, $0
+	JEQ	equal	// zero length: ZF from this compare makes the SETEQ at equal store 1
+
+	LEAQ	0(BX*8), CX	// CX = 8*count (number of meaningful bits)
+	NEGQ	CX	// CX = -8*count; 64-bit shifts use the count mod 64, i.e. 64 - 8*count
+
+	CMPB	SI, $0xf8	// look only at the low byte of the address
+	JA	si_high	// low byte above 0xf8: an 8-byte load starting at SI could run past the next 256-byte (and so possibly page) boundary
+
+	// load at SI won't cross a page boundary.
+	MOVQ	(SI), SI	// wanted bytes land in the low end of SI; the bytes above them are garbage removed by the SHLQ below
+	JMP	si_finish
+si_high:
+	// address ends in 11111xxx. Load up to bytes we want, move to correct position.
+	MOVQ	-8(SI)(BX*1), SI	// the 8 bytes ending at the end of the data: wanted bytes are in the high end
+	SHRQ	CX, SI	// shift them down to the low end, matching the layout of the other path
+si_finish:
+
+	// same for DI.
+	CMPB	DI, $0xf8
+	JA	di_high
+	MOVQ	(DI), DI
+	JMP	di_finish
+di_high:
+	MOVQ	-8(DI)(BX*1), DI
+	SHRQ	CX, DI
+di_finish:
+
+	SUBQ	SI, DI	// low 8*count bits of the difference are zero exactly when the wanted bytes match
+	SHLQ	CX, DI	// shift out the unwanted high bytes; ZF is set iff the remaining bits are all zero
+equal:
+	SETEQ	(AX)	// result = ZF (from SHLQ above, or from CMPQ BX, $0 on the zero-length path)
+	RET
+
|