1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
|
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "textflag.h"
// See memclrNoHeapPointers Go doc for important implementation constraints.
// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
// Also called from assembly in sys_windows_arm64.s without g (but using Go stack convention).
TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB),NOSPLIT,$0-16
#ifndef GOEXPERIMENT_regabiargs
MOVD ptr+0(FP), R0
MOVD n+8(FP), R1
#endif
CMP $16, R1
// If n is equal to 16 bytes, use zero_exact_16 to zero
BEQ zero_exact_16
// If n is greater than 16 bytes, use zero_by_16 to zero
BHI zero_by_16
// n is less than 16 bytes
ADD R1, R0, R7
TBZ $3, R1, less_than_8
MOVD ZR, (R0)
MOVD ZR, -8(R7)
RET
less_than_8:
TBZ $2, R1, less_than_4
MOVW ZR, (R0)
MOVW ZR, -4(R7)
RET
less_than_4:
CBZ R1, ending
MOVB ZR, (R0)
TBZ $1, R1, ending
MOVH ZR, -2(R7)
ending:
RET
zero_exact_16:
// n is exactly 16 bytes
STP (ZR, ZR), (R0)
RET
zero_by_16:
// n greater than 16 bytes, check if the start address is aligned
NEG R0, R4
ANDS $15, R4, R4
// Try zeroing using zva if the start address is aligned with 16
BEQ try_zva
// Non-aligned store
STP (ZR, ZR), (R0)
// Make the destination aligned
SUB R4, R1, R1
ADD R4, R0, R0
B try_zva
tail_maybe_long:
CMP $64, R1
BHS no_zva
tail63:
ANDS $48, R1, R3
BEQ last16
CMPW $32, R3
BEQ last48
BLT last32
STP.P (ZR, ZR), 16(R0)
last48:
STP.P (ZR, ZR), 16(R0)
last32:
STP.P (ZR, ZR), 16(R0)
// The last store length is at most 16, so it is safe to use
// stp to write last 16 bytes
last16:
ANDS $15, R1, R1
CBZ R1, last_end
ADD R1, R0, R0
STP (ZR, ZR), -16(R0)
last_end:
RET
no_zva:
SUB $16, R0, R0
SUB $64, R1, R1
loop_64:
STP (ZR, ZR), 16(R0)
STP (ZR, ZR), 32(R0)
STP (ZR, ZR), 48(R0)
STP.W (ZR, ZR), 64(R0)
SUBS $64, R1, R1
BGE loop_64
ANDS $63, R1, ZR
ADD $16, R0, R0
BNE tail63
RET
try_zva:
// Try using the ZVA feature to zero entire cache lines
// It is not meaningful to use ZVA if the block size is less than 64,
// so make sure that n is greater than or equal to 64
CMP $63, R1
BLE tail63
CMP $128, R1
// Ensure n is at least 128 bytes, so that there is enough to copy after
// alignment.
BLT no_zva
// Check if ZVA is allowed from user code, and if so get the block size
MOVW block_size<>(SB), R5
TBNZ $31, R5, no_zva
CBNZ R5, zero_by_line
// DCZID_EL0 bit assignments
// [63:5] Reserved
// [4] DZP, if bit set DC ZVA instruction is prohibited, else permitted
// [3:0] log2 of the block size in words, eg. if it returns 0x4 then block size is 16 words
MRS DCZID_EL0, R3
TBZ $4, R3, init
// ZVA not available
MOVW $~0, R5
MOVW R5, block_size<>(SB)
B no_zva
init:
MOVW $4, R9
ANDW $15, R3, R5
LSLW R5, R9, R5
MOVW R5, block_size<>(SB)
ANDS $63, R5, R9
// Block size is less than 64.
BNE no_zva
zero_by_line:
CMP R5, R1
// Not enough memory to reach alignment
BLO no_zva
SUB $1, R5, R6
NEG R0, R4
ANDS R6, R4, R4
// Already aligned
BEQ aligned
// check there is enough to copy after alignment
SUB R4, R1, R3
// Check that the remaining length to ZVA after alignment
// is greater than 64.
CMP $64, R3
CCMP GE, R3, R5, $10 // condition code GE, NZCV=0b1010
BLT no_zva
// We now have at least 64 bytes to zero, update n
MOVD R3, R1
loop_zva_prolog:
STP (ZR, ZR), (R0)
STP (ZR, ZR), 16(R0)
STP (ZR, ZR), 32(R0)
SUBS $64, R4, R4
STP (ZR, ZR), 48(R0)
ADD $64, R0, R0
BGE loop_zva_prolog
ADD R4, R0, R0
aligned:
SUB R5, R1, R1
loop_zva:
WORD $0xd50b7420 // DC ZVA, R0
ADD R5, R0, R0
SUBS R5, R1, R1
BHS loop_zva
ANDS R6, R1, R1
BNE tail_maybe_long
RET
GLOBL block_size<>(SB), NOPTR, $8
|