summaryrefslogtreecommitdiffstats
path: root/src/internal/bytealg/equal_arm64.s
blob: d3aabba5871eedf52d848decc3bf3838a10a7ae4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "textflag.h"

// memequal(a, b unsafe.Pointer, size uintptr) bool
TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25
	// short path to handle 0-byte case
	CBZ	R2, equal
	B	memeqbody<>(SB)
equal:
	MOVD	$1, R0
	RET

// memequal_varlen(a, b unsafe.Pointer) bool
TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT,$0-17
	CMP	R0, R1
	BEQ	eq
	MOVD	8(R26), R2    // compiler stores size at offset 8 in the closure
	CBZ	R2, eq
	B	memeqbody<>(SB)
eq:
	MOVD	$1, R0
	RET

// input:
// R0: pointer a
// R1: pointer b
// R2: data len
// at return: result in R0
TEXT memeqbody<>(SB),NOSPLIT,$0
	CMP	$1, R2
	// handle 1-byte special case for better performance
	BEQ	one
	CMP	$16, R2
	// handle specially if length < 16
	BLO	tail
	BIC	$0x3f, R2, R3
	CBZ	R3, chunk16
	// work with 64-byte chunks
	ADD	R3, R0, R6	// end of chunks
chunk64_loop:
	VLD1.P	(R0), [V0.D2, V1.D2, V2.D2, V3.D2]
	VLD1.P	(R1), [V4.D2, V5.D2, V6.D2, V7.D2]
	VCMEQ	V0.D2, V4.D2, V8.D2
	VCMEQ	V1.D2, V5.D2, V9.D2
	VCMEQ	V2.D2, V6.D2, V10.D2
	VCMEQ	V3.D2, V7.D2, V11.D2
	VAND	V8.B16, V9.B16, V8.B16
	VAND	V8.B16, V10.B16, V8.B16
	VAND	V8.B16, V11.B16, V8.B16
	CMP	R0, R6
	VMOV	V8.D[0], R4
	VMOV	V8.D[1], R5
	CBZ	R4, not_equal
	CBZ	R5, not_equal
	BNE	chunk64_loop
	AND	$0x3f, R2, R2
	CBZ	R2, equal
chunk16:
	// work with 16-byte chunks
	BIC	$0xf, R2, R3
	CBZ	R3, tail
	ADD	R3, R0, R6	// end of chunks
chunk16_loop:
	LDP.P	16(R0), (R4, R5)
	LDP.P	16(R1), (R7, R9)
	EOR	R4, R7
	CBNZ	R7, not_equal
	EOR	R5, R9
	CBNZ	R9, not_equal
	CMP	R0, R6
	BNE	chunk16_loop
	AND	$0xf, R2, R2
	CBZ	R2, equal
tail:
	// special compare of tail with length < 16
	TBZ	$3, R2, lt_8
	MOVD	(R0), R4
	MOVD	(R1), R5
	EOR	R4, R5
	CBNZ	R5, not_equal
	SUB	$8, R2, R6	// offset of the last 8 bytes
	MOVD	(R0)(R6), R4
	MOVD	(R1)(R6), R5
	EOR	R4, R5
	CBNZ	R5, not_equal
	B	equal
lt_8:
	TBZ	$2, R2, lt_4
	MOVWU	(R0), R4
	MOVWU	(R1), R5
	EOR	R4, R5
	CBNZ	R5, not_equal
	SUB	$4, R2, R6	// offset of the last 4 bytes
	MOVWU	(R0)(R6), R4
	MOVWU	(R1)(R6), R5
	EOR	R4, R5
	CBNZ	R5, not_equal
	B	equal
lt_4:
	TBZ	$1, R2, lt_2
	MOVHU.P	2(R0), R4
	MOVHU.P	2(R1), R5
	CMP	R4, R5
	BNE	not_equal
lt_2:
	TBZ	$0, R2, equal
one:
	MOVBU	(R0), R4
	MOVBU	(R1), R5
	CMP	R4, R5
	BNE	not_equal
equal:
	MOVD	$1, R0
	RET
not_equal:
	MOVB	ZR, R0
	RET