summary refs log tree commit diff stats
path: root/src/internal/bytealg/equal_arm64.s
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--  src/internal/bytealg/equal_arm64.s  136
1 files changed, 136 insertions, 0 deletions
diff --git a/src/internal/bytealg/equal_arm64.s b/src/internal/bytealg/equal_arm64.s
new file mode 100644
index 0000000..01aa7b7
--- /dev/null
+++ b/src/internal/bytealg/equal_arm64.s
@@ -0,0 +1,136 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// memequal(a, b unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25
+ MOVD size+16(FP), R1
+ // short path to handle 0-byte case
+ CBZ R1, equal
+ MOVD a+0(FP), R0
+ MOVD b+8(FP), R2
+ MOVD $ret+24(FP), R8
+ B memeqbody<>(SB)
+equal:
+ MOVD $1, R0
+ MOVB R0, ret+24(FP)
+ RET
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB),NOSPLIT,$40-17
+ MOVD a+0(FP), R3
+ MOVD b+8(FP), R4
+ CMP R3, R4
+ BEQ eq
+ MOVD 8(R26), R5 // compiler stores size at offset 8 in the closure
+ CBZ R5, eq
+ MOVD R3, 8(RSP)
+ MOVD R4, 16(RSP)
+ MOVD R5, 24(RSP)
+ BL runtime·memequal(SB)
+ MOVBU 32(RSP), R3
+ MOVB R3, ret+16(FP)
+ RET
+eq:
+ MOVD $1, R3
+ MOVB R3, ret+16(FP)
+ RET
+
+// input:
+// R0: pointer a
+// R1: data len
+// R2: pointer b
+// R8: address to put result
+TEXT memeqbody<>(SB),NOSPLIT,$0
+ CMP $1, R1
+ // handle 1-byte special case for better performance
+ BEQ one
+ CMP $16, R1
+ // handle specially if length < 16
+ BLO tail
+ BIC $0x3f, R1, R3
+ CBZ R3, chunk16
+ // work with 64-byte chunks
+ ADD R3, R0, R6 // end of chunks
+chunk64_loop:
+ VLD1.P (R0), [V0.D2, V1.D2, V2.D2, V3.D2]
+ VLD1.P (R2), [V4.D2, V5.D2, V6.D2, V7.D2]
+ VCMEQ V0.D2, V4.D2, V8.D2
+ VCMEQ V1.D2, V5.D2, V9.D2
+ VCMEQ V2.D2, V6.D2, V10.D2
+ VCMEQ V3.D2, V7.D2, V11.D2
+ VAND V8.B16, V9.B16, V8.B16
+ VAND V8.B16, V10.B16, V8.B16
+ VAND V8.B16, V11.B16, V8.B16
+ CMP R0, R6
+ VMOV V8.D[0], R4
+ VMOV V8.D[1], R5
+ CBZ R4, not_equal
+ CBZ R5, not_equal
+ BNE chunk64_loop
+ AND $0x3f, R1, R1
+ CBZ R1, equal
+chunk16:
+ // work with 16-byte chunks
+ BIC $0xf, R1, R3
+ CBZ R3, tail
+ ADD R3, R0, R6 // end of chunks
+chunk16_loop:
+ LDP.P 16(R0), (R4, R5)
+ LDP.P 16(R2), (R7, R9)
+ EOR R4, R7
+ CBNZ R7, not_equal
+ EOR R5, R9
+ CBNZ R9, not_equal
+ CMP R0, R6
+ BNE chunk16_loop
+ AND $0xf, R1, R1
+ CBZ R1, equal
+tail:
+ // special compare of tail with length < 16
+ TBZ $3, R1, lt_8
+ MOVD (R0), R4
+ MOVD (R2), R5
+ EOR R4, R5
+ CBNZ R5, not_equal
+ SUB $8, R1, R6 // offset of the last 8 bytes
+ MOVD (R0)(R6), R4
+ MOVD (R2)(R6), R5
+ EOR R4, R5
+ CBNZ R5, not_equal
+ B equal
+lt_8:
+ TBZ $2, R1, lt_4
+ MOVWU (R0), R4
+ MOVWU (R2), R5
+ EOR R4, R5
+ CBNZ R5, not_equal
+ SUB $4, R1, R6 // offset of the last 4 bytes
+ MOVWU (R0)(R6), R4
+ MOVWU (R2)(R6), R5
+ EOR R4, R5
+ CBNZ R5, not_equal
+ B equal
+lt_4:
+ TBZ $1, R1, lt_2
+ MOVHU.P 2(R0), R4
+ MOVHU.P 2(R2), R5
+ CMP R4, R5
+ BNE not_equal
+lt_2:
+ TBZ $0, R1, equal
+one:
+ MOVBU (R0), R4
+ MOVBU (R2), R5
+ CMP R4, R5
+ BNE not_equal
+equal:
+ MOVD $1, R0
+ MOVB R0, (R8)
+ RET
+not_equal:
+ MOVB ZR, (R8)
+ RET