summary refs log tree commit diff stats
path: root/src/internal/bytealg/equal_amd64.s
diff options
context:
space:
mode:
Diffstat (limited to 'src/internal/bytealg/equal_amd64.s')
-rw-r--r-- src/internal/bytealg/equal_amd64.s 162
1 files changed, 162 insertions, 0 deletions
diff --git a/src/internal/bytealg/equal_amd64.s b/src/internal/bytealg/equal_amd64.s
new file mode 100644
index 0000000..d178a33
--- /dev/null
+++ b/src/internal/bytealg/equal_amd64.s
@@ -0,0 +1,162 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "asm_amd64.h"
+#include "textflag.h"
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT,$0-25
+ // Register arguments on entry: AX = a, BX = b, CX = size.
+ // memeqbody expects them as SI = a, DI = b, BX = size.
+ // Identical pointers are reported equal without reading any memory.
+ CMPQ AX, BX
+ JEQ same
+ MOVQ BX, DI // copy b out before BX is reused for the size
+ MOVQ CX, BX
+ MOVQ AX, SI
+ JMP memeqbody<>(SB) // tail call; memeqbody produces the result in AX
+same:
+ MOVQ $1, AX // return 1
+ RET
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT,$0-17
+ // On entry: AX = a, BX = b, and the size is read from 8(DX).
+ // memeqbody expects them as SI = a, DI = b, BX = size.
+ // Identical pointers are reported equal without reading any memory.
+ CMPQ AX, BX
+ JEQ same
+ MOVQ BX, DI // copy b out before BX is reused for the size
+ MOVQ 8(DX), BX // compiler stores size at offset 8 in the closure
+ MOVQ AX, SI
+ JMP memeqbody<>(SB) // tail call; memeqbody produces the result in AX
+same:
+ MOVQ $1, AX // return 1
+ RET
+
+// memeqbody reports whether the count bytes at a and b are identical.
+// Input: a in SI, b in DI, count in BX.
+// Output: result in AX. The final-compare exits use SETEQ (1 when equal,
+// 0 otherwise); the early mismatch exits clear AX with XORQ.
+// May overwrite BX, CX, DX, SI, DI and the flags, plus X0-X7 or Y0-Y6
+// on the 64-bytes-at-a-time paths.
+TEXT memeqbody<>(SB),NOSPLIT,$0-0
+ CMPQ BX, $8
+ JB small // fewer than 8 bytes: one masked 8-byte compare
+ CMPQ BX, $64
+ JB bigloop // 8..63 bytes: 8-byte loop only
+#ifndef hasAVX2
+ CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
+ JE hugeloop_avx2 // feature byte is 1: use the ymm loop instead
+
+ // 64 bytes at a time using xmm registers (assembled only when hasAVX2 is not defined)
+hugeloop:
+ CMPQ BX, $64
+ JB bigloop // under 64 bytes left: finish 8 bytes at a time
+ MOVOU (SI), X0 // X0, X2, X4, X6 = next four 16-byte pieces of a
+ MOVOU (DI), X1 // X1, X3, X5, X7 = the matching pieces of b
+ MOVOU 16(SI), X2
+ MOVOU 16(DI), X3
+ MOVOU 32(SI), X4
+ MOVOU 32(DI), X5
+ MOVOU 48(SI), X6
+ MOVOU 48(DI), X7
+ PCMPEQB X1, X0 // each byte becomes 0xff where a and b match, else 0x00
+ PCMPEQB X3, X2
+ PCMPEQB X5, X4
+ PCMPEQB X7, X6
+ PAND X2, X0 // AND the four compare results together into X0
+ PAND X6, X4
+ PAND X4, X0
+ PMOVMSKB X0, DX // DX = top bit of each of the 16 bytes in X0
+ ADDQ $64, SI // step both pointers forward and shrink the count
+ ADDQ $64, DI
+ SUBQ $64, BX
+ CMPL DX, $0xffff // all 16 bits set: the whole 64-byte chunk matched
+ JEQ hugeloop
+ XORQ AX, AX // return 0
+ RET
+#endif
+
+ // 64 bytes at a time using ymm registers (the only 64-byte loop assembled when hasAVX2 is defined)
+hugeloop_avx2:
+ CMPQ BX, $64
+ JB bigloop_avx2 // under 64 bytes left: leave the ymm loop
+ VMOVDQU (SI), Y0 // Y0, Y2 = next two 32-byte pieces of a
+ VMOVDQU (DI), Y1 // Y1, Y3 = the matching pieces of b
+ VMOVDQU 32(SI), Y2
+ VMOVDQU 32(DI), Y3
+ VPCMPEQB Y1, Y0, Y4 // Y4 = per-byte match mask for the first 32 bytes
+ VPCMPEQB Y2, Y3, Y5 // Y5 = per-byte match mask for the second 32 bytes
+ VPAND Y4, Y5, Y6 // Y6 = both masks ANDed together
+ VPMOVMSKB Y6, DX // DX = top bit of each of the 32 bytes in Y6
+ ADDQ $64, SI // step both pointers forward and shrink the count
+ ADDQ $64, DI
+ SUBQ $64, BX
+ CMPL DX, $0xffffffff // all 32 bits set: the whole 64-byte chunk matched
+ JEQ hugeloop_avx2
+ VZEROUPPER // zero the upper ymm halves before returning
+ XORQ AX, AX // return 0
+ RET
+
+bigloop_avx2:
+ VZEROUPPER // zero the upper ymm halves, then fall through into bigloop
+
+ // 8 bytes at a time using 64-bit register
+bigloop:
+ CMPQ BX, $8
+ JBE leftover // 8 or fewer bytes left: compare the final 8-byte window
+ MOVQ (SI), CX // CX = next 8 bytes of a
+ MOVQ (DI), DX // DX = next 8 bytes of b
+ ADDQ $8, SI
+ ADDQ $8, DI
+ SUBQ $8, BX
+ CMPQ CX, DX
+ JEQ bigloop
+ XORQ AX, AX // return 0
+ RET
+
+ // remaining 0-8 bytes: compare the 8 bytes that end at SI+BX / DI+BX.
+ // This path is only reached when the original count was at least 8, so
+ // the window may overlap bytes that were already compared.
+leftover:
+ MOVQ -8(SI)(BX*1), CX
+ MOVQ -8(DI)(BX*1), DX
+ CMPQ CX, DX
+ SETEQ AX
+ RET
+
+ // count is 0-7 here.
+small:
+ CMPQ BX, $0
+ JEQ equal // zero length: ZF is set, so SETEQ at equal yields 1
+
+ LEAQ 0(BX*8), CX // CX = 8*count
+ NEGQ CX // CX = -8*count; as a 64-bit shift count this acts as 64-8*count
+
+ CMPB SI, $0xf8 // test the low byte of the address
+ JA si_high // low byte is 0xf9-0xff
+
+ // load at SI won't cross a page boundary.
+ MOVQ (SI), SI // wanted bytes are the low count bytes; the rest is shifted out later
+ JMP si_finish
+si_high:
+ // low address byte is 0xf9-0xff. Load the 8 bytes ending at SI+count, move the wanted bytes to the low end.
+ MOVQ -8(SI)(BX*1), SI
+ SHRQ CX, SI // shift right by 64-8*count bits
+si_finish:
+
+ // same for DI.
+ CMPB DI, $0xf8
+ JA di_high
+ MOVQ (DI), DI
+ JMP di_finish
+di_high:
+ MOVQ -8(DI)(BX*1), DI
+ SHRQ CX, DI
+di_finish:
+
+ SUBQ SI, DI // low count bytes of the difference are zero iff the inputs match
+ SHLQ CX, DI // drop the upper 8-count bytes; ZF is set iff the rest is zero
+equal:
+ SETEQ AX // consumes ZF from SHLQ above or from CMPQ BX, $0
+ RET