diff options
Diffstat (limited to 'src/internal/bytealg/equal_386.s')
-rw-r--r-- | src/internal/bytealg/equal_386.s | 129 |
1 files changed, 129 insertions, 0 deletions
diff --git a/src/internal/bytealg/equal_386.s b/src/internal/bytealg/equal_386.s new file mode 100644 index 0000000..8723363 --- /dev/null +++ b/src/internal/bytealg/equal_386.s @@ -0,0 +1,129 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +// memequal(a, b unsafe.Pointer, size uintptr) bool +TEXT runtime·memequal(SB),NOSPLIT,$0-13 + MOVL a+0(FP), SI + MOVL b+4(FP), DI + CMPL SI, DI + JEQ eq + MOVL size+8(FP), BX + LEAL ret+12(FP), AX + JMP memeqbody<>(SB) +eq: + MOVB $1, ret+12(FP) + RET + +// memequal_varlen(a, b unsafe.Pointer) bool +TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-9 + MOVL a+0(FP), SI + MOVL b+4(FP), DI + CMPL SI, DI + JEQ eq + MOVL 4(DX), BX // compiler stores size at offset 4 in the closure + LEAL ret+8(FP), AX + JMP memeqbody<>(SB) +eq: + MOVB $1, ret+8(FP) + RET + +// a in SI +// b in DI +// count in BX +// address of result byte in AX +TEXT memeqbody<>(SB),NOSPLIT,$0-0 + CMPL BX, $4 + JB small + + // 64 bytes at a time using xmm registers +hugeloop: + CMPL BX, $64 + JB bigloop + CMPB internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1 + JNE bigloop + MOVOU (SI), X0 + MOVOU (DI), X1 + MOVOU 16(SI), X2 + MOVOU 16(DI), X3 + MOVOU 32(SI), X4 + MOVOU 32(DI), X5 + MOVOU 48(SI), X6 + MOVOU 48(DI), X7 + PCMPEQB X1, X0 + PCMPEQB X3, X2 + PCMPEQB X5, X4 + PCMPEQB X7, X6 + PAND X2, X0 + PAND X6, X4 + PAND X4, X0 + PMOVMSKB X0, DX + ADDL $64, SI + ADDL $64, DI + SUBL $64, BX + CMPL DX, $0xffff + JEQ hugeloop + MOVB $0, (AX) + RET + + // 4 bytes at a time using 32-bit register +bigloop: + CMPL BX, $4 + JBE leftover + MOVL (SI), CX + MOVL (DI), DX + ADDL $4, SI + ADDL $4, DI + SUBL $4, BX + CMPL CX, DX + JEQ bigloop + MOVB $0, (AX) + RET + + // remaining 0-4 bytes +leftover: + MOVL -4(SI)(BX*1), CX + MOVL -4(DI)(BX*1), DX + CMPL CX, DX + SETEQ (AX) + RET + +small: + CMPL BX, $0 + JEQ equal + + LEAL 0(BX*8), CX + NEGL CX + + MOVL SI, DX + CMPB DX, $0xfc + JA si_high + + // load at SI won't cross a page boundary. + MOVL (SI), SI + JMP si_finish +si_high: + // address ends in 111111xx. Load up to bytes we want, move to correct position. + MOVL -4(SI)(BX*1), SI + SHRL CX, SI +si_finish: + + // same for DI. + MOVL DI, DX + CMPB DX, $0xfc + JA di_high + MOVL (DI), DI + JMP di_finish +di_high: + MOVL -4(DI)(BX*1), DI + SHRL CX, DI +di_finish: + + SUBL SI, DI + SHLL CX, DI +equal: + SETEQ (AX) + RET |