src/runtime/memclr_ppc64x.s


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167

// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ppc64 || ppc64le

#include "textflag.h"

// See memclrNoHeapPointers Go doc for important implementation constraints.

// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
//
// Zeroes n bytes starting at ptr. The store width is chosen by size and
// alignment:
//   - n < 8:    byte stores only.
//   - n < 512:  32 bytes per iteration (two 16-byte vector stores), then
//               doublewords, then trailing bytes.
//   - n >= 512: store 1/2/4/8 bytes and then 16-byte vectors until the
//               address is 128-byte aligned, clear with DCBZ (4 x 128 bytes
//               per iteration, then single 128-byte blocks), and finish the
//               remaining < 128 bytes with the smaller paths above.
//               Each DCBZ is treated as clearing 128 bytes: the address and
//               count always step by 128 per DCBZ.
//
// Register roles in this routine:
//   R0        zero source for the scalar stores and zero index for the
//             indexed stores; never written here, so it is relied upon to
//             hold 0 on entry
//   R3        current address (ptr on entry), advanced as memory is cleared
//   R4        bytes remaining (n on entry); updated on every path except the
//             final byte loop, which counts with CTR only
//   R5        trailing byte count (R4 & 7)
//   R6        doubleword count
//   R7        32-byte chunk count, or leftover byte/doubleword count
//   R8        scratch: alignment test, index 16 for the second vector store,
//             512-byte chunk count
//   R9-R11    DCBZ index offsets 128/256/384 (R11 is also used as scratch)
//   R14, R15  scratch for the alignment computations
//   VS32      all-zero vector used by the 16-byte stores
//   CTR       counter for every counted loop
TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-16
	// Without GOEXPERIMENT_regabiargs the arguments are loaded from the
	// frame. With it these loads are skipped, so R3 and R4 are expected to
	// already hold ptr and n.
#ifndef GOEXPERIMENT_regabiargs
	MOVD ptr+0(FP), R3
	MOVD n+8(FP), R4
#endif

	// Determine if there are doublewords to clear
check:
	ANDCC $7, R4, R5  // R5: leftover bytes to clear
	SRD   $3, R4, R6  // R6: double words to clear
	CMP   R6, $0, CR1 // CR1[EQ] set if no double words

	// BC 12, 6: BO=12 branches when the CR bit is set; BI=6 selects CR1[EQ].
	BC    12, 6, nozerolarge // branch if CR1[EQ]: n < 8, only single bytes
	CMP   R4, $512
	BLT   under512           // special case for < 512
	ANDCC $127, R3, R8       // check for 128 alignment of address
	BEQ   zero512setup

	ANDCC $7, R3, R15
	BEQ   zero512xsetup // at least 8 byte aligned

	// zero bytes up to 8 byte alignment
	// Only reached with n >= 512, so the at most 1+2+4 bytes stored below
	// cannot exceed the count. Each step tests one low address bit and
	// stores a naturally aligned 1, 2 or 4 byte zero if that bit is set.

	ANDCC $1, R3, R15 // check for byte alignment
	BEQ   byte2
	MOVB  R0, 0(R3)   // zero 1 byte
	ADD   $1, R3      // bump ptr by 1
	ADD   $-1, R4

byte2:
	ANDCC $2, R3, R15 // check for 2 byte alignment
	BEQ   byte4
	MOVH  R0, 0(R3)   // zero 2 bytes
	ADD   $2, R3      // bump ptr by 2
	ADD   $-2, R4

byte4:
	ANDCC $4, R3, R15   // check for 4 byte alignment
	BEQ   zero512xsetup
	MOVW  R0, 0(R3)     // zero 4 bytes
	ADD   $4, R3        // bump ptr by 4
	ADD   $-4, R4
	BR    zero512xsetup // ptr should now be 8 byte aligned

	// 8 <= n < 512: CTR is preloaded with the doubleword count for zero8,
	// and is overwritten in zero32setup if at least one 32-byte chunk exists.
under512:
	MOVD  R6, CTR     // R6 = number of double words
	SRDCC $2, R6, R7  // 32 byte chunks?
	BNE   zero32setup

	// Clear double words
	// Entry requirement: CTR holds a non-zero doubleword count.

zero8:
	MOVD R0, 0(R3)    // double word
	ADD  $8, R3
	ADD  $-8, R4
	BC   16, 0, zero8 // dec ctr, br zero8 if ctr not 0
	BR   nozerolarge  // handle leftovers

	// Prepare to clear 32 bytes at a time.
	// Entry requirement: R7 holds a non-zero count of 32-byte chunks.

zero32setup:
	DCBTST (R3)             // prepare data cache
	XXLXOR VS32, VS32, VS32 // clear VS32 (V0)
	MOVD   R7, CTR          // number of 32 byte chunks
	MOVD   $16, R8

zero32:
	STXVD2X VS32, (R3+R0)   // store 16 bytes
	STXVD2X VS32, (R3+R8)
	ADD     $32, R3
	ADD     $-32, R4
	BC      16, 0, zero32   // dec ctr, br zero32 if ctr not 0
	// Rotate left by 61 and mask off the top 3 bits: R6 = R4 >> 3.
	// The record form sets CR0 so BEQ can skip the doubleword loop.
	RLDCLCC $61, R4, $3, R6 // remaining doublewords
	BEQ     nozerolarge
	MOVD    R6, CTR         // set up the CTR for doublewords
	BR      zero8

	// Fewer than 8 bytes remain in R4. R5 is recomputed here because R4 may
	// have changed since the entry code computed it.
nozerolarge:
	ANDCC $7, R4, R5 // any remaining bytes
	// BC 4, 1: BO=4 branches when the CR bit is clear; BI=1 selects CR0[GT].
	// R5 is in 0..7, so this returns through LR exactly when R5 == 0.
	BC    4, 1, LR   // ble lr

zerotail:
	MOVD R5, CTR // set up to clear tail bytes

	// R4 is not updated in this loop; CTR alone tracks the remaining bytes.
zerotailloop:
	MOVB R0, 0(R3)           // clear single bytes
	ADD  $1, R3
	BC   16, 0, zerotailloop // dec ctr, br zerotailloop if ctr not 0
	RET

	// n >= 512 and R3 is 8-byte aligned but not yet known to be 128-byte
	// aligned. Store one doubleword if needed so that R3 becomes 16-byte
	// aligned for the vector stores that follow.
zero512xsetup:  // 512 chunk with extra needed
	ANDCC $8, R3, R11    // 8 byte alignment?
	BEQ   zero512setup16
	MOVD  R0, 0(R3)      // clear 8 bytes
	ADD   $8, R3         // update ptr to next 8
	ADD   $-8, R4        // dec count by 8

	// R3 is 16-byte aligned here, so the distance to the next 128-byte
	// boundary (128 - (R3 & 127)) is a multiple of 16 and the shift by 4
	// below yields an exact, non-zero count of 16-byte stores.
zero512setup16:
	ANDCC $127, R3, R14 // < 128 byte alignment
	BEQ   zero512setup  // handle 128 byte alignment
	MOVD  $128, R15
	SUB   R14, R15, R14 // find increment to 128 alignment
	SRD   $4, R14, R15  // number of 16 byte chunks

zero512presetup:
	MOVD   R15, CTR         // loop counter of 16 bytes
	XXLXOR VS32, VS32, VS32 // clear VS32 (V0)

zero512preloop:  // clear up to 128 alignment
	STXVD2X VS32, (R3+R0)         // clear 16 bytes
	ADD     $16, R3               // update ptr
	ADD     $-16, R4              // dec count
	BC      16, 0, zero512preloop

	// R3 is 128-byte aligned here. The alignment steps above may have
	// consumed up to 127 bytes, so R4 can now be below 512 and must be
	// re-checked before entering the 512-byte loop.
zero512setup:  // setup for dcbz loop
	CMP  R4, $512   // check if at least 512
	BLT  remain
	SRD  $9, R4, R8 // loop count for 512 chunks
	MOVD R8, CTR    // set up counter
	MOVD $128, R9   // index regs for 128 bytes
	MOVD $256, R10
	MOVD $384, R11

	// Four DCBZ per iteration at offsets 0/128/256/384 from R3.
zero512:
	DCBZ (R3+R0)        // clear first chunk
	DCBZ (R3+R9)        // clear second chunk
	DCBZ (R3+R10)       // clear third chunk
	DCBZ (R3+R11)       // clear fourth chunk
	ADD  $512, R3
	ADD  $-512, R4
	BC   16, 0, zero512 // dec ctr, br zero512 if ctr not 0

	// Fewer than 512 bytes remain; clear one 128-byte block per pass using
	// a compare-and-branch loop on R4 rather than CTR.
remain:
	CMP  R4, $128  // check if 128 byte chunks left
	BLT  smaller
	DCBZ (R3+R0)   // clear 128
	ADD  $128, R3
	ADD  $-128, R4
	BR   remain

	// R4 < 128 here, so R7 = R4 & 127 equals R4. Hand the leftovers to the
	// shared 32-byte, doubleword and byte paths.
smaller:
	ANDCC $127, R4, R7 // find leftovers
	BEQ   done
	CMP   R7, $64      // more than 64, do 32 at a time
	BLT   zero8setup   // less than 64, do 8 at a time
	SRD   $5, R7, R7   // set up counter for 32
	BR    zero32setup

	// Fewer than 64 bytes remain: R7 becomes the doubleword count. If it is
	// zero only tail bytes are left.
zero8setup:
	SRDCC $3, R7, R7  // less than 8 bytes
	BEQ   nozerolarge
	MOVD  R7, CTR
	BR    zero8

done:
	RET