summaryrefslogtreecommitdiffstats
path: root/src/internal/bytealg/compare_amd64.s
diff options
context:
space:
mode:
Diffstat (limited to 'src/internal/bytealg/compare_amd64.s')
-rw-r--r--src/internal/bytealg/compare_amd64.s228
1 files changed, 228 insertions, 0 deletions
diff --git a/src/internal/bytealg/compare_amd64.s b/src/internal/bytealg/compare_amd64.s
new file mode 100644
index 0000000..900b92a
--- /dev/null
+++ b/src/internal/bytealg/compare_amd64.s
@@ -0,0 +1,228 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Compare(SB),NOSPLIT,$0-56
+ MOVQ a_base+0(FP), SI
+ MOVQ a_len+8(FP), BX
+ MOVQ b_base+24(FP), DI
+ MOVQ b_len+32(FP), DX
+ LEAQ ret+48(FP), R9
+ JMP cmpbody<>(SB)
+
+TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
+ MOVQ a_base+0(FP), SI
+ MOVQ a_len+8(FP), BX
+ MOVQ b_base+16(FP), DI
+ MOVQ b_len+24(FP), DX
+ LEAQ ret+32(FP), R9
+ JMP cmpbody<>(SB)
+
+// input:
+// SI = a
+// DI = b
+// BX = alen
+// DX = blen
+// R9 = address of output word (stores -1/0/1 here)
+TEXT cmpbody<>(SB),NOSPLIT,$0-0
+ CMPQ SI, DI
+ JEQ allsame
+ CMPQ BX, DX
+ MOVQ DX, R8
+ CMOVQLT BX, R8 // R8 = min(alen, blen) = # of bytes to compare
+ CMPQ R8, $8
+ JB small
+
+ CMPQ R8, $63
+ JBE loop
+ CMPB internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
+ JEQ big_loop_avx2
+ JMP big_loop
+loop:
+ CMPQ R8, $16
+ JBE _0through16
+ MOVOU (SI), X0
+ MOVOU (DI), X1
+ PCMPEQB X0, X1
+ PMOVMSKB X1, AX
+ XORQ $0xffff, AX // convert EQ to NE
+ JNE diff16 // branch if at least one byte is not equal
+ ADDQ $16, SI
+ ADDQ $16, DI
+ SUBQ $16, R8
+ JMP loop
+
+diff64:
+ ADDQ $48, SI
+ ADDQ $48, DI
+ JMP diff16
+diff48:
+ ADDQ $32, SI
+ ADDQ $32, DI
+ JMP diff16
+diff32:
+ ADDQ $16, SI
+ ADDQ $16, DI
+ // AX = bit mask of differences
+diff16:
+ BSFQ AX, BX // index of first byte that differs
+ XORQ AX, AX
+ MOVB (SI)(BX*1), CX
+ CMPB CX, (DI)(BX*1)
+ SETHI AX
+ LEAQ -1(AX*2), AX // convert 1/0 to +1/-1
+ MOVQ AX, (R9)
+ RET
+
+ // 0 through 16 bytes left, alen>=8, blen>=8
+_0through16:
+ CMPQ R8, $8
+ JBE _0through8
+ MOVQ (SI), AX
+ MOVQ (DI), CX
+ CMPQ AX, CX
+ JNE diff8
+_0through8:
+ MOVQ -8(SI)(R8*1), AX
+ MOVQ -8(DI)(R8*1), CX
+ CMPQ AX, CX
+ JEQ allsame
+
+ // AX and CX contain parts of a and b that differ.
+diff8:
+ BSWAPQ AX // reverse order of bytes
+ BSWAPQ CX
+ XORQ AX, CX
+ BSRQ CX, CX // index of highest bit difference
+ SHRQ CX, AX // move a's bit to bottom
+ ANDQ $1, AX // mask bit
+ LEAQ -1(AX*2), AX // 1/0 => +1/-1
+ MOVQ AX, (R9)
+ RET
+
+ // 0-7 bytes in common
+small:
+ LEAQ (R8*8), CX // bytes left -> bits left
+ NEGQ CX // - bits lift (== 64 - bits left mod 64)
+ JEQ allsame
+
+ // load bytes of a into high bytes of AX
+ CMPB SI, $0xf8
+ JA si_high
+ MOVQ (SI), SI
+ JMP si_finish
+si_high:
+ MOVQ -8(SI)(R8*1), SI
+ SHRQ CX, SI
+si_finish:
+ SHLQ CX, SI
+
+ // load bytes of b in to high bytes of BX
+ CMPB DI, $0xf8
+ JA di_high
+ MOVQ (DI), DI
+ JMP di_finish
+di_high:
+ MOVQ -8(DI)(R8*1), DI
+ SHRQ CX, DI
+di_finish:
+ SHLQ CX, DI
+
+ BSWAPQ SI // reverse order of bytes
+ BSWAPQ DI
+ XORQ SI, DI // find bit differences
+ JEQ allsame
+ BSRQ DI, CX // index of highest bit difference
+ SHRQ CX, SI // move a's bit to bottom
+ ANDQ $1, SI // mask bit
+ LEAQ -1(SI*2), AX // 1/0 => +1/-1
+ MOVQ AX, (R9)
+ RET
+
+allsame:
+ XORQ AX, AX
+ XORQ CX, CX
+ CMPQ BX, DX
+ SETGT AX // 1 if alen > blen
+ SETEQ CX // 1 if alen == blen
+ LEAQ -1(CX)(AX*2), AX // 1,0,-1 result
+ MOVQ AX, (R9)
+ RET
+
+ // this works for >= 64 bytes of data.
+big_loop:
+ MOVOU (SI), X0
+ MOVOU (DI), X1
+ PCMPEQB X0, X1
+ PMOVMSKB X1, AX
+ XORQ $0xffff, AX
+ JNE diff16
+
+ MOVOU 16(SI), X0
+ MOVOU 16(DI), X1
+ PCMPEQB X0, X1
+ PMOVMSKB X1, AX
+ XORQ $0xffff, AX
+ JNE diff32
+
+ MOVOU 32(SI), X0
+ MOVOU 32(DI), X1
+ PCMPEQB X0, X1
+ PMOVMSKB X1, AX
+ XORQ $0xffff, AX
+ JNE diff48
+
+ MOVOU 48(SI), X0
+ MOVOU 48(DI), X1
+ PCMPEQB X0, X1
+ PMOVMSKB X1, AX
+ XORQ $0xffff, AX
+ JNE diff64
+
+ ADDQ $64, SI
+ ADDQ $64, DI
+ SUBQ $64, R8
+ CMPQ R8, $64
+ JBE loop
+ JMP big_loop
+
+ // Compare 64-bytes per loop iteration.
+ // Loop is unrolled and uses AVX2.
+big_loop_avx2:
+ VMOVDQU (SI), Y2
+ VMOVDQU (DI), Y3
+ VMOVDQU 32(SI), Y4
+ VMOVDQU 32(DI), Y5
+ VPCMPEQB Y2, Y3, Y0
+ VPMOVMSKB Y0, AX
+ XORL $0xffffffff, AX
+ JNE diff32_avx2
+ VPCMPEQB Y4, Y5, Y6
+ VPMOVMSKB Y6, AX
+ XORL $0xffffffff, AX
+ JNE diff64_avx2
+
+ ADDQ $64, SI
+ ADDQ $64, DI
+ SUBQ $64, R8
+ CMPQ R8, $64
+ JB big_loop_avx2_exit
+ JMP big_loop_avx2
+
+ // Avoid AVX->SSE transition penalty and search first 32 bytes of 64 byte chunk.
+diff32_avx2:
+ VZEROUPPER
+ JMP diff16
+
+ // Same as diff32_avx2, but for last 32 bytes.
+diff64_avx2:
+ VZEROUPPER
+ JMP diff48
+
+ // For <64 bytes remainder jump to normal loop.
+big_loop_avx2_exit:
+ VZEROUPPER
+ JMP loop