summaryrefslogtreecommitdiffstats
path: root/src/runtime/memmove_ppc64x.s
blob: 5fa51c0a4cf9d673c9f6d8775bec3bf8ec821150 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ppc64 || ppc64le

#include "textflag.h"

// See memmove Go doc for important implementation constraints.

// func memmove(to, from unsafe.Pointer, n uintptr)

// target (destination) address; memmove's first argument, "to"
#define TGT R3
// source address; memmove's second argument, "from"
#define SRC R4
// length to move in bytes; memmove's third argument, "n"
#define LEN R5
// number of doublewords (8 byte units) still to copy; starts as LEN >> 3
#define DWORDS R6
// number of trailing bytes < 8 (LEN & 7)
#define BYTES R7
// const 16 used as index for the vector loads/stores
#define IDX16 R8
// temp used for copies, etc.
#define TMP R9
// number of 32 byte blocks (DWORDS >> 2); used by the backward copy loop
#define QWORDS R10
// const 32 and const 48 used as index values in the 64 byte forward loop
#define IDX32 R14
#define IDX48 R15
// number of 64 byte chunks (DWORDS >> 3); used by the forward copy loop
#define OCTWORDS R16

TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-24
	// Copies LEN bytes from SRC to TGT. When TGT lies inside
	// [SRC, SRC+LEN) the copy runs backward from the end of the buffers
	// so that overlapping source bytes are read before they are overwritten;
	// otherwise the copy runs forward.
	//
	// Inputs:
	// R3 = TGT = to
	// R4 = SRC = from
	// R5 = LEN = n
	//
	// Registers written: R3, R4, R6-R10, R14-R16, VS32-VS35, CTR, CR0-CR3.
	//
	// Condition register fields are set once up front and consumed later:
	//   CR0: (LEN & 7) vs 0, consumed by the BEQ at "backward"
	//   CR1: DWORDS vs 0, consumed before the forward copy and at "nobackwardtail"
	//   CR2: overlap test, consumed immediately
	//   CR3: copy of CR0, consumed at "checkbytes"
	// Instructions placed between the setup and those branches must not
	// modify the corresponding field.

	// Determine if there are doublewords to
	// copy so a more efficient move can be done
check:
	ANDCC	$7, LEN, BYTES	// R7: bytes to copy (LEN & 7); sets CR0
	SRD	$3, LEN, DWORDS	// R6: double words to copy (LEN >> 3)
	MOVFL	CR0, CR3	// save CR from ANDCC; CR3[EQ] set if no trailing bytes
	CMP	DWORDS, $0, CR1	// CR1[EQ] set if no double words to copy

	// Determine overlap by subtracting dest - src and comparing against the
	// length.  This catches the cases where src and dest are in different types
	// of storage such as stack and static to avoid doing backward move when not
	// necessary.
	// The compare is unsigned, so a destination below the source yields a
	// wrapped-around (large) difference and takes the forward path.

	SUB	SRC, TGT, TMP	// dest - src
	CMPU	TMP, LEN, CR2	// < len?
	BC	12, 8, backward // BLT CR2 backward

	// Copying forward if no overlap.

	BC	12, 6, checkbytes	// BEQ CR1, checkbytes (no doublewords, only bytes)
	SRDCC	$3, DWORDS, OCTWORDS	// 64 byte chunks? (overwrites CR0)
	MOVD	$16, IDX16		// needed by the vector copies in both paths below
	BEQ	lt64gt8			// < 64 bytes

	// Prepare for moves of 64 bytes at a time.

forward64setup:
	DCBTST	(TGT)			// prepare data cache
	DCBT	(SRC)
	MOVD	OCTWORDS, CTR		// Number of 64 byte chunks
	MOVD	$32, IDX32
	MOVD	$48, IDX48
	PCALIGN	$32

	// Each iteration loads a full 64 byte chunk into VS32-VS35 before
	// storing any of it, then advances SRC and TGT by 64.
forward64:
	LXVD2X	(R0)(SRC), VS32		// load 64 bytes
	LXVD2X	(IDX16)(SRC), VS33
	LXVD2X	(IDX32)(SRC), VS34
	LXVD2X	(IDX48)(SRC), VS35
	ADD	$64, SRC
	STXVD2X	VS32, (R0)(TGT)		// store 64 bytes
	STXVD2X	VS33, (IDX16)(TGT)
	STXVD2X	VS34, (IDX32)(TGT)
	STXVD2X VS35, (IDX48)(TGT)
	ADD	$64,TGT			// bump up for next set
	BC	16, 0, forward64	// continue: decrement CTR, loop while CTR != 0
	ANDCC	$7, DWORDS		// remaining doublewords (DWORDS & 7)
	BEQ	checkbytes		// only bytes remain

	// Fewer than 8 doublewords remain. Copy 32 bytes if at least 4 remain.
lt64gt8:
	CMP	DWORDS, $4
	BLT	lt32gt8
	LXVD2X	(R0)(SRC), VS32
	LXVD2X	(IDX16)(SRC), VS33
	ADD	$-4, DWORDS
	STXVD2X	VS32, (R0)(TGT)
	STXVD2X	VS33, (IDX16)(TGT)
	ADD	$32, SRC
	ADD	$32, TGT

lt32gt8:
	// At this point fewer than 4 doublewords (< 32 bytes) remain,
	// possibly none if the 32 byte step above consumed them.
	// Move 16 bytes if possible
	CMP     DWORDS, $2
	BLT     lt16
	LXVD2X	(R0)(SRC), VS32
	ADD	$-2, DWORDS
	STXVD2X	VS32, (R0)(TGT)
	ADD     $16, SRC
	ADD     $16, TGT

lt16:	// Move 8 bytes if possible
	CMP     DWORDS, $1
	BLT     checkbytes
	MOVD    0(SRC), TMP
	ADD	$8, SRC
	MOVD    TMP, 0(TGT)
	ADD     $8, TGT
	// All doublewords are copied. CR3 still holds the (LEN & 7) == 0 result
	// saved at the top; return if there are no trailing bytes.
checkbytes:
	BC	12, 14, LR		// BEQ CR3, lr
	// 1 to 7 trailing bytes remain; copy them as word, halfword, byte.
lt8:	// Move word if possible
	CMP BYTES, $4
	BLT lt4
	MOVWZ 0(SRC), TMP
	ADD $-4, BYTES
	MOVW TMP, 0(TGT)
	ADD $4, SRC
	ADD $4, TGT
lt4:	// Move halfword if possible
	CMP BYTES, $2
	BLT lt2
	MOVHZ 0(SRC), TMP
	ADD $-2, BYTES
	MOVH TMP, 0(TGT)
	ADD $2, SRC
	ADD $2, TGT
lt2:	// Move last byte if 1 left
	CMP BYTES, $1
	BC 12, 0, LR	// blt lr: return if BYTES < 1
	MOVBZ 0(SRC), TMP
	MOVBZ TMP, 0(TGT)
	RET

backward:
	// Copying backwards proceeds by copying R7 bytes then copying R6 double words.
	// R3 and R4 are advanced to the end of the destination/source buffers
	// respectively and moved back as we copy.

	ADD	LEN, SRC, SRC		// end of source
	ADD	TGT, LEN, TGT		// end of dest

	// CR0 still holds the result of the ANDCC at "check": EQ means
	// LEN & 7 == 0, so there is no byte tail to copy.
	BEQ	nobackwardtail		// earlier condition

	MOVD	BYTES, CTR			// bytes to move

	// Copy the trailing BYTES bytes one at a time, last byte first.
backwardtailloop:
	MOVBZ 	-1(SRC), TMP		// load the byte just below SRC
	SUB	$1,SRC
	MOVBZ 	TMP, -1(TGT)		// store it just below TGT
	SUB	$1,TGT
	BDNZ	backwardtailloop

nobackwardtail:
	BC	4, 5, LR		// blelr cr1, return if DWORDS == 0
	SRDCC	$2,DWORDS,QWORDS	// Compute number of 32B blocks and compare to 0
	BNE	backward32setup		// If QWORDS != 0, start the 32B copy loop.

backward24:
	// DWORDS is a value between 1-3.
	// The single compare against 2 below drives both conditional returns;
	// the MOVD loads/stores in between leave CR0 untouched.
	// SRC/TGT are not updated here; fixed negative offsets are used instead.
	CMP	DWORDS, $2

	MOVD 	-8(SRC), TMP
	MOVD 	TMP, -8(TGT)
	BC	12, 0, LR		// bltlr, return if DWORDS == 1

	MOVD 	-16(SRC), TMP
	MOVD 	TMP, -16(TGT)
	BC	12, 2, LR		// beqlr, return if DWORDS == 2

	MOVD 	-24(SRC), TMP
	MOVD 	TMP, -24(TGT)
	RET

backward32setup:
	// The CR0 result of this ANDCC is consumed after the loop; nothing in
	// the loop body writes CR0.
	ANDCC   $3,DWORDS		// Compute remaining DWORDS and compare to 0
	MOVD	QWORDS, CTR		// set up loop ctr
	MOVD	$16, IDX16		// 32 bytes at a time

	// Each iteration steps SRC and TGT down by 32 and loads the whole
	// 32 byte block before storing any of it.
backward32loop:
	SUB	$32, TGT
	SUB	$32, SRC
	LXVD2X	(R0)(SRC), VS32		// load 16x2 bytes
	LXVD2X	(IDX16)(SRC), VS33
	STXVD2X	VS32, (R0)(TGT)		// store 16x2 bytes
	STXVD2X	VS33, (IDX16)(TGT)
	BDNZ	backward32loop
	BC	12, 2, LR		// beqlr, return if DWORDS == 0
	BR	backward24		// copy the remaining 1-3 doublewords