src/runtime/memclr_arm64.s


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182

// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// See memclrNoHeapPointers Go doc for important implementation constraints.

// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
// Also called from assembly in sys_windows_arm64.s without g (but using Go stack convention).
//
// Zeroes n bytes starting at ptr.
//
// Register usage in this routine:
//   R0  current destination address (ptr on entry)
//   R1  remaining byte count (n on entry); biased or negative in some loops
//   R3  scratch (DCZID_EL0 value, tail size bits, length after alignment)
//   R4  number of bytes needed to reach the next alignment boundary
//   R5  DC ZVA block size in bytes (cached in block_size<>)
//   R6  R5 - 1, used as the block alignment mask
//   R7  ptr + n, one past the last byte (only on the n < 16 path)
//   R9  scratch while computing the block size
//
// Strategy:
//   n <  16  a pair of possibly overlapping stores sized by the bits of n
//   n == 16  a single 16-byte STP
//   n >  16  align the destination to 16 bytes, then zero with DC ZVA when it
//            is permitted and the block size is usable, otherwise with a
//            64-bytes-per-iteration STP loop; the tail (< 64 bytes) is
//            finished with up to three 16-byte stores plus one overlapping
//            16-byte store that ends exactly at the end of the buffer.
TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB),NOSPLIT,$0-16
	CMP	$16, R1
	// If n is equal to 16 bytes, use zero_exact_16 to zero
	BEQ	zero_exact_16

	// If n is greater than 16 bytes, use zero_by_16 to zero
	BHI	zero_by_16

	// n is less than 16 bytes.
	// R7 = end of the buffer, so the second store of each pair below can be
	// addressed backwards from the end and overlap the first one.
	ADD	R1, R0, R7
	// Bit 3 of n set means 8 <= n <= 15: two 8-byte stores cover the range.
	TBZ	$3, R1, less_than_8
	MOVD	ZR, (R0)
	MOVD	ZR, -8(R7)
	RET

less_than_8:
	// Bit 2 of n set means 4 <= n <= 7: two 4-byte stores cover the range.
	TBZ	$2, R1, less_than_4
	MOVW	ZR, (R0)
	MOVW	ZR, -4(R7)
	RET

less_than_4:
	// n is 0..3. Nothing to do for n == 0.
	CBZ	R1, ending
	// n >= 1: clear the first byte.
	MOVB	ZR, (R0)
	// Bit 1 of n set means n is 2 or 3: clear the last two bytes as well.
	TBZ	$1, R1, ending
	MOVH	ZR, -2(R7)

ending:
	RET

zero_exact_16:
	// n is exactly 16 bytes
	STP	(ZR, ZR), (R0)
	RET

zero_by_16:
	// n greater than 16 bytes, check if the start address is aligned.
	// R4 = (-ptr) & 15 = number of bytes up to the next 16-byte boundary.
	NEG	R0, R4
	ANDS	$15, R4, R4
	// Try zeroing using zva if the start address is aligned with 16
	BEQ	try_zva

	// Non-aligned store: zero the first 16 bytes (n > 16, so this stays
	// inside the buffer); it covers the R4 bytes skipped below.
	STP	(ZR, ZR), (R0)
	// Make the destination aligned
	SUB	R4, R1, R1
	ADD	R4, R0, R0
	B	try_zva

tail_maybe_long:
	// Remainder after the DC ZVA loop: it is smaller than one block, but a
	// block may be larger than 64 bytes, so the 64-byte loop may still apply.
	CMP	$64, R1
	BHS	no_zva

tail63:
	// Fewer than 64 bytes remain. Only the low 6 bits of R1 are examined
	// here, so R1 may arrive biased (see loop_64).
	// R3 = number of whole 16-byte chunks * 16 (0, 16, 32 or 48).
	ANDS	$48, R1, R3
	BEQ	last16
	// Enter the store sequence so that 48 -> three stores,
	// 32 -> two stores, 16 -> one store. Each STP.P post-increments R0 by 16.
	CMPW	$32, R3
	BEQ	last48
	BLT	last32
	STP.P	(ZR, ZR), 16(R0)
last48:
	STP.P	(ZR, ZR), 16(R0)
last32:
	STP.P	(ZR, ZR), 16(R0)
	// The last store length is at most 16, so it is safe to use
	// stp to write last 16 bytes. The store ends exactly at the end of the
	// buffer and may overlap bytes that were already zeroed; this label is
	// only reached on the n > 16 route.
last16:
	ANDS	$15, R1, R1
	CBZ	R1, last_end
	ADD	R1, R0, R0
	STP	(ZR, ZR), -16(R0)
last_end:
	RET

no_zva:
	// Plain store loop, 64 bytes per iteration.
	// R0 is biased by -16 so the pre-indexed STP.W at 64(R0) both stores the
	// fourth chunk and advances R0 by 64. R1 is biased by -64 so the loop
	// can continue while the result of the subtraction is non-negative.
	SUB	$16, R0, R0
	SUB	$64, R1, R1

loop_64:
	STP	(ZR, ZR), 16(R0)
	STP	(ZR, ZR), 32(R0)
	STP	(ZR, ZR), 48(R0)
	STP.W	(ZR, ZR), 64(R0)
	SUBS	$64, R1, R1
	BGE	loop_64
	// Test whether any bytes are left (low 6 bits of the biased count).
	// The ADD below removes the -16 bias from R0 and does not alter the
	// flags consumed by BNE.
	ANDS	$63, R1, ZR
	ADD	$16, R0, R0
	BNE	tail63
	RET

try_zva:
	// Try using the ZVA feature to zero entire cache lines
	// It is not meaningful to use ZVA if the block size is less than 64,
	// so make sure that n is greater than or equal to 64
	CMP	$63, R1
	BLE	tail63

	CMP	$128, R1
	// Ensure n is at least 128 bytes, so that there is enough left to zero
	// after alignment.
	BLT	no_zva
	// Check if ZVA is allowed from user code, and if so get the block size.
	// Cached value: bit 31 set = ZVA prohibited, non-zero = block size in
	// bytes, zero = not probed yet.
	MOVW	block_size<>(SB), R5
	TBNZ	$31, R5, no_zva
	CBNZ	R5, zero_by_line
	// DCZID_EL0 bit assignments
	// [63:5] Reserved
	// [4]    DZP, if bit set DC ZVA instruction is prohibited, else permitted
	// [3:0]  log2 of the block size in words, eg. if it returns 0x4 then block size is 16 words
	MRS	DCZID_EL0, R3
	TBZ	$4, R3, init
	// ZVA not available: cache an all-ones value (bit 31 set) so later calls
	// skip the probe and go straight to the store loop.
	MOVW	$~0, R5
	MOVW	R5, block_size<>(SB)
	B	no_zva

init:
	// R5 = 4 << DCZID_EL0[3:0] = block size in bytes (4 bytes per word).
	MOVW	$4, R9
	ANDW	$15, R3, R5
	LSLW	R5, R9, R5
	MOVW	R5, block_size<>(SB)

	// The block size is a power of two, so non-zero low 6 bits mean
	// the block size is less than 64.
	ANDS	$63, R5, R9
	BNE	no_zva

zero_by_line:
	CMP	R5, R1
	// n is smaller than one block: not enough memory to use DC ZVA
	BLO	no_zva
	// R6 = block alignment mask, R4 = bytes up to the next block boundary.
	SUB	$1, R5, R6
	NEG	R0, R4
	ANDS	R6, R4, R4
	// Already aligned
	BEQ	aligned

	// R3 = length that would remain once the block boundary is reached
	SUB	R4, R1, R3

	// Fall back to the store loop unless the remaining length after
	// alignment is at least 64 bytes (the prolog below stores whole 64-byte
	// groups and can run up to 64 bytes past the boundary) and at least one
	// block (R3 >= R5).
	CMP	$64, R3
	CCMP	GE, R3, R5, $10  // condition code GE, NZCV=0b1010
	BLT	no_zva

	// We now have at least 64 bytes to zero, update n
	MOVD	R3, R1

loop_zva_prolog:
	// Zero 64 bytes per iteration until the block boundary has been passed.
	// The flags for BGE come from SUBS; the STP and ADD after it leave them
	// unchanged.
	STP	(ZR, ZR), (R0)
	STP	(ZR, ZR), 16(R0)
	STP	(ZR, ZR), 32(R0)
	SUBS	$64, R4, R4
	STP	(ZR, ZR), 48(R0)
	ADD	$64, R0, R0
	BGE	loop_zva_prolog

	// R4 is now negative (the overshoot); adding it moves R0 back onto the
	// block boundary.
	ADD	R4, R0, R0

aligned:
	// Bias n by one block so the loop below can test the carry from SUBS.
	SUB	R5, R1, R1

loop_zva:
	// Zero one whole block at R0, then advance by the block size.
	WORD	$0xd50b7420 // DC ZVA, R0
	ADD	R5, R0, R0
	SUBS	R5, R1, R1
	BHS	loop_zva
	// R1 & (block size - 1) = bytes left over after the last whole block.
	ANDS	R6, R1, R1
	BNE	tail_maybe_long
	RET

// block_size caches the DC ZVA block size for memclrNoHeapPointers.
// It is accessed with 32-bit loads and stores, with these states:
//   0               DCZID_EL0 has not been read yet
//   bit 31 set (~0) DC ZVA is prohibited; always use the store loop
//   otherwise       block size in bytes (4 << DCZID_EL0[3:0])
// NOTE(review): it is read and written with plain MOVW and no
// synchronization; presumably benign because every writer stores the same
// value — confirm.
GLOBL block_size<>(SB), NOPTR, $8