summaryrefslogtreecommitdiffstats
path: root/src/internal/bytealg/equal_arm64.s
diff options
context:
space:
mode:
Diffstat (limited to 'src/internal/bytealg/equal_arm64.s')
-rw-r--r--src/internal/bytealg/equal_arm64.s121
1 files changed, 121 insertions, 0 deletions
diff --git a/src/internal/bytealg/equal_arm64.s b/src/internal/bytealg/equal_arm64.s
new file mode 100644
index 0000000..d3aabba
--- /dev/null
+++ b/src/internal/bytealg/equal_arm64.s
@@ -0,0 +1,121 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25
+ // short path to handle 0-byte case
+ CBZ R2, equal
+ B memeqbody<>(SB)
+equal:
+ MOVD $1, R0
+ RET
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT,$0-17
+ CMP R0, R1
+ BEQ eq
+ MOVD 8(R26), R2 // compiler stores size at offset 8 in the closure
+ CBZ R2, eq
+ B memeqbody<>(SB)
+eq:
+ MOVD $1, R0
+ RET
+
+// input:
+// R0: pointer a
+// R1: pointer b
+// R2: data len
+// at return: result in R0
+TEXT memeqbody<>(SB),NOSPLIT,$0
+ CMP $1, R2
+ // handle 1-byte special case for better performance
+ BEQ one
+ CMP $16, R2
+ // handle specially if length < 16
+ BLO tail
+ BIC $0x3f, R2, R3
+ CBZ R3, chunk16
+ // work with 64-byte chunks
+ ADD R3, R0, R6 // end of chunks
+chunk64_loop:
+ VLD1.P (R0), [V0.D2, V1.D2, V2.D2, V3.D2]
+ VLD1.P (R1), [V4.D2, V5.D2, V6.D2, V7.D2]
+ VCMEQ V0.D2, V4.D2, V8.D2
+ VCMEQ V1.D2, V5.D2, V9.D2
+ VCMEQ V2.D2, V6.D2, V10.D2
+ VCMEQ V3.D2, V7.D2, V11.D2
+ VAND V8.B16, V9.B16, V8.B16
+ VAND V8.B16, V10.B16, V8.B16
+ VAND V8.B16, V11.B16, V8.B16
+ CMP R0, R6
+ VMOV V8.D[0], R4
+ VMOV V8.D[1], R5
+ CBZ R4, not_equal
+ CBZ R5, not_equal
+ BNE chunk64_loop
+ AND $0x3f, R2, R2
+ CBZ R2, equal
+chunk16:
+ // work with 16-byte chunks
+ BIC $0xf, R2, R3
+ CBZ R3, tail
+ ADD R3, R0, R6 // end of chunks
+chunk16_loop:
+ LDP.P 16(R0), (R4, R5)
+ LDP.P 16(R1), (R7, R9)
+ EOR R4, R7
+ CBNZ R7, not_equal
+ EOR R5, R9
+ CBNZ R9, not_equal
+ CMP R0, R6
+ BNE chunk16_loop
+ AND $0xf, R2, R2
+ CBZ R2, equal
+tail:
+ // special compare of tail with length < 16
+ TBZ $3, R2, lt_8
+ MOVD (R0), R4
+ MOVD (R1), R5
+ EOR R4, R5
+ CBNZ R5, not_equal
+ SUB $8, R2, R6 // offset of the last 8 bytes
+ MOVD (R0)(R6), R4
+ MOVD (R1)(R6), R5
+ EOR R4, R5
+ CBNZ R5, not_equal
+ B equal
+lt_8:
+ TBZ $2, R2, lt_4
+ MOVWU (R0), R4
+ MOVWU (R1), R5
+ EOR R4, R5
+ CBNZ R5, not_equal
+ SUB $4, R2, R6 // offset of the last 4 bytes
+ MOVWU (R0)(R6), R4
+ MOVWU (R1)(R6), R5
+ EOR R4, R5
+ CBNZ R5, not_equal
+ B equal
+lt_4:
+ TBZ $1, R2, lt_2
+ MOVHU.P 2(R0), R4
+ MOVHU.P 2(R1), R5
+ CMP R4, R5
+ BNE not_equal
+lt_2:
+ TBZ $0, R2, equal
+one:
+ MOVBU (R0), R4
+ MOVBU (R1), R5
+ CMP R4, R5
+ BNE not_equal
+equal:
+ MOVD $1, R0
+ RET
+not_equal:
+ MOVB ZR, R0
+ RET