1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
|
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build ppc64 || ppc64le
#include "textflag.h"
// See memclrNoHeapPointers Go doc for important implementation constraints.
// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-16
// R3 = ptr
// R4 = n
// Determine if there are doublewords to clear
check:
ANDCC $7, R4, R5 // R5: leftover bytes to clear
SRD $3, R4, R6 // R6: double words to clear
CMP R6, $0, CR1 // CR1[EQ] set if no double words
BC 12, 6, nozerolarge // only single bytes
CMP R4, $512
BLT under512 // special case for < 512
ANDCC $127, R3, R8 // check for 128 alignment of address
BEQ zero512setup
ANDCC $7, R3, R15
BEQ zero512xsetup // at least 8 byte aligned
// zero bytes up to 8 byte alignment
ANDCC $1, R3, R15 // check for byte alignment
BEQ byte2
MOVB R0, 0(R3) // zero 1 byte
ADD $1, R3 // bump ptr by 1
ADD $-1, R4
byte2:
ANDCC $2, R3, R15 // check for 2 byte alignment
BEQ byte4
MOVH R0, 0(R3) // zero 2 bytes
ADD $2, R3 // bump ptr by 2
ADD $-2, R4
byte4:
ANDCC $4, R3, R15 // check for 4 byte alignment
BEQ zero512xsetup
MOVW R0, 0(R3) // zero 4 bytes
ADD $4, R3 // bump ptr by 4
ADD $-4, R4
BR zero512xsetup // ptr should now be 8 byte aligned
under512:
SRDCC $3, R6, R7 // 64 byte chunks?
XXLXOR VS32, VS32, VS32 // clear VS32 (V0)
BEQ lt64gt8
// Prepare to clear 64 bytes at a time.
zero64setup:
DCBTST (R3) // prepare data cache
MOVD R7, CTR // number of 64 byte chunks
MOVD $16, R8
MOVD $32, R16
MOVD $48, R17
zero64:
STXVD2X VS32, (R3+R0) // store 16 bytes
STXVD2X VS32, (R3+R8)
STXVD2X VS32, (R3+R16)
STXVD2X VS32, (R3+R17)
ADD $64, R3
ADD $-64, R4
BDNZ zero64 // dec ctr, br zero64 if ctr not 0
SRDCC $3, R4, R6 // remaining doublewords
BEQ nozerolarge
lt64gt8:
CMP R4, $32
BLT lt32gt8
MOVD $16, R8
STXVD2X VS32, (R3+R0)
STXVD2X VS32, (R3+R8)
ADD $-32, R4
ADD $32, R3
lt32gt8:
CMP R4, $16
BLT lt16gt8
STXVD2X VS32, (R3+R0)
ADD $16, R3
ADD $-16, R4
lt16gt8:
CMP R4, $8
BLT nozerolarge
MOVD R0, 0(R3)
ADD $8, R3
ADD $-8, R4
nozerolarge:
ANDCC $7, R4, R5 // any remaining bytes
BC 4, 1, LR // ble lr
zerotail:
MOVD R5, CTR // set up to clear tail bytes
zerotailloop:
MOVB R0, 0(R3) // clear single bytes
ADD $1, R3
BDNZ zerotailloop // dec ctr, br zerotailloop if ctr not 0
RET
zero512xsetup: // 512 chunk with extra needed
ANDCC $8, R3, R11 // 8 byte alignment?
BEQ zero512setup16
MOVD R0, 0(R3) // clear 8 bytes
ADD $8, R3 // update ptr to next 8
ADD $-8, R4 // dec count by 8
zero512setup16:
ANDCC $127, R3, R14 // < 128 byte alignment
BEQ zero512setup // handle 128 byte alignment
MOVD $128, R15
SUB R14, R15, R14 // find increment to 128 alignment
SRD $4, R14, R15 // number of 16 byte chunks
zero512presetup:
MOVD R15, CTR // loop counter of 16 bytes
XXLXOR VS32, VS32, VS32 // clear VS32 (V0)
zero512preloop: // clear up to 128 alignment
STXVD2X VS32, (R3+R0) // clear 16 bytes
ADD $16, R3 // update ptr
ADD $-16, R4 // dec count
BDNZ zero512preloop
zero512setup: // setup for dcbz loop
CMP R4, $512 // check if at least 512
BLT remain
SRD $9, R4, R8 // loop count for 512 chunks
MOVD R8, CTR // set up counter
MOVD $128, R9 // index regs for 128 bytes
MOVD $256, R10
MOVD $384, R11
PCALIGN $32
zero512:
DCBZ (R3+R0) // clear first chunk
DCBZ (R3+R9) // clear second chunk
DCBZ (R3+R10) // clear third chunk
DCBZ (R3+R11) // clear fourth chunk
ADD $512, R3
BDNZ zero512
ANDCC $511, R4
remain:
CMP R4, $128 // check if 128 byte chunks left
BLT smaller
DCBZ (R3+R0) // clear 128
ADD $128, R3
ADD $-128, R4
BR remain
smaller:
ANDCC $127, R4, R7 // find leftovers
BEQ done
CMP R7, $64 // more than 64, do 64 at a time
XXLXOR VS32, VS32, VS32
BLT lt64gt8 // less than 64
SRD $6, R7, R7 // set up counter for 64
BR zero64setup
done:
RET
|