diff options
Diffstat (limited to 'src/internal/bytealg/equal_amd64.s')
-rw-r--r-- | src/internal/bytealg/equal_amd64.s | 165 |
1 files changed, 165 insertions, 0 deletions
diff --git a/src/internal/bytealg/equal_amd64.s b/src/internal/bytealg/equal_amd64.s new file mode 100644 index 0000000..79a0520 --- /dev/null +++ b/src/internal/bytealg/equal_amd64.s @@ -0,0 +1,165 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "asm_amd64.h" +#include "textflag.h" + +// memequal(a, b unsafe.Pointer, size uintptr) bool +TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT,$0-25 + // AX = a (want in SI) + // BX = b (want in DI) + // CX = size (want in BX) + CMPQ AX, BX + JNE neq + MOVQ $1, AX // return 1 + RET +neq: + MOVQ AX, SI + MOVQ BX, DI + MOVQ CX, BX + JMP memeqbody<>(SB) + +// memequal_varlen(a, b unsafe.Pointer) bool +TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT,$0-17 + // AX = a (want in SI) + // BX = b (want in DI) + // 8(DX) = size (want in BX) + CMPQ AX, BX + JNE neq + MOVQ $1, AX // return 1 + RET +neq: + MOVQ AX, SI + MOVQ BX, DI + MOVQ 8(DX), BX // compiler stores size at offset 8 in the closure + JMP memeqbody<>(SB) + +// Input: +// a in SI +// b in DI +// count in BX +// Output: +// result in AX +TEXT memeqbody<>(SB),NOSPLIT,$0-0 + CMPQ BX, $8 + JB small + CMPQ BX, $64 + JB bigloop +#ifndef hasAVX2 + CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1 + JE hugeloop_avx2 + + // 64 bytes at a time using xmm registers + PCALIGN $16 +hugeloop: + CMPQ BX, $64 + JB bigloop + MOVOU (SI), X0 + MOVOU (DI), X1 + MOVOU 16(SI), X2 + MOVOU 16(DI), X3 + MOVOU 32(SI), X4 + MOVOU 32(DI), X5 + MOVOU 48(SI), X6 + MOVOU 48(DI), X7 + PCMPEQB X1, X0 + PCMPEQB X3, X2 + PCMPEQB X5, X4 + PCMPEQB X7, X6 + PAND X2, X0 + PAND X6, X4 + PAND X4, X0 + PMOVMSKB X0, DX + ADDQ $64, SI + ADDQ $64, DI + SUBQ $64, BX + CMPL DX, $0xffff + JEQ hugeloop + XORQ AX, AX // return 0 + RET +#endif + + // 64 bytes at a time using ymm registers + PCALIGN $16 +hugeloop_avx2: + CMPQ BX, $64 + JB bigloop_avx2 + VMOVDQU (SI), Y0 + VMOVDQU (DI), Y1 + VMOVDQU 32(SI), Y2 + VMOVDQU 32(DI), Y3 + VPCMPEQB Y1, Y0, Y4 + VPCMPEQB Y2, Y3, Y5 + VPAND Y4, Y5, Y6 + VPMOVMSKB Y6, DX + ADDQ $64, SI + ADDQ $64, DI + SUBQ $64, BX + CMPL DX, $0xffffffff + JEQ hugeloop_avx2 + VZEROUPPER + XORQ AX, AX // return 0 + RET + +bigloop_avx2: + VZEROUPPER + + // 8 bytes at a time using 64-bit register + PCALIGN $16 +bigloop: + CMPQ BX, $8 + JBE leftover + MOVQ (SI), CX + MOVQ (DI), DX + ADDQ $8, SI + ADDQ $8, DI + SUBQ $8, BX + CMPQ CX, DX + JEQ bigloop + XORQ AX, AX // return 0 + RET + + // remaining 0-8 bytes +leftover: + MOVQ -8(SI)(BX*1), CX + MOVQ -8(DI)(BX*1), DX + CMPQ CX, DX + SETEQ AX + RET + +small: + CMPQ BX, $0 + JEQ equal + + LEAQ 0(BX*8), CX + NEGQ CX + + CMPB SI, $0xf8 + JA si_high + + // load at SI won't cross a page boundary. + MOVQ (SI), SI + JMP si_finish +si_high: + // address ends in 11111xxx. Load up to bytes we want, move to correct position. + MOVQ -8(SI)(BX*1), SI + SHRQ CX, SI +si_finish: + + // same for DI. + CMPB DI, $0xf8 + JA di_high + MOVQ (DI), DI + JMP di_finish +di_high: + MOVQ -8(DI)(BX*1), DI + SHRQ CX, DI +di_finish: + + SUBQ SI, DI + SHLQ CX, DI +equal: + SETEQ AX + RET |