summaryrefslogtreecommitdiffstats
path: root/src/runtime/memmove_riscv64.s
blob: f5db86562b449cd73265cb5593bf9ba586e22f1e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// See memmove Go doc for important implementation constraints.

// void runtime·memmove(void*, void*, uintptr)
//
// Copies n bytes from "from" to "to". The direction is chosen from the
// relative order of the two pointers: ascending addresses when "to" is not
// above "from", descending addresses (starting at the end of both buffers)
// when "to" is above "from". Nothing is copied when the pointers are equal
// or when n is zero.
//
// Register use, as established by the code below:
//	X10      to   (moved along as bytes are copied)
//	X11      from (moved in step with X10)
//	X12      n, the number of bytes still to copy
//	X9       size in bytes of the chunk currently being tested or copied
//	X5, X6   low three bits of X10 and X11; X5 then counts alignment bytes
//	X14-X21  data held between the loads and the stores of one chunk
//
// Both directions have the same shape: an optional byte-wise prologue that
// brings both pointers to an 8-byte boundary (taken only when the pointers
// share the same low three bits), then 64/32/16/8-byte loops built from
// 8-byte moves, then 4-byte and 1-byte loops built from byte moves for the
// tail. When the low three bits of the pointers differ, a loop that copies
// 8 bytes per iteration using byte moves replaces the 8-byte-move loops.
//
// NOTE(review): the size tests use BLT/BGE rather than the unsigned forms;
// presumably n is never large enough for that to matter - confirm against
// the callers.
TEXT runtime·memmove<ABIInternal>(SB),NOSPLIT,$-0-24
	// X10 = to
	// X11 = from
	// X12 = n
	// Nothing to do if source and destination coincide or the length is zero.
	BEQ	X10, X11, done
	BEQZ	X12, done

	// If the destination is ahead of the source, start at the end of the
	// buffer and go backward.
	BGTU	X10, X11, backward

	// If less than 8 bytes, skip the alignment work and go straight to the
	// byte-move tail loops (groups of four, then single bytes).
	MOV	$8, X9
	BLT	X12, X9, f_loop4_check

	// Check alignment - if the low three bits of the two pointers differ,
	// use the byte-move loop; if both are already 8-byte aligned, skip the
	// alignment prologue.
	AND	$7, X10, X5
	AND	$7, X11, X6
	BNE	X5, X6, f_loop8_unaligned_check
	BEQZ	X5, f_loop_check

	// Move one byte at a time until we reach 8 byte alignment.
	// X9 is still 8 here, so X5 becomes 8 - (to & 7): the number of bytes
	// up to the next 8-byte boundary (1..7). Those bytes are deducted from
	// X12 up front; X12 was at least 8, so it stays positive.
	SUB	X5, X9, X5
	SUB	X5, X12, X12
f_align:
	// Invariant: X5 = bytes still to copy before X10/X11 are aligned.
	ADD	$-1, X5
	MOVB	0(X11), X14
	MOVB	X14, 0(X10)
	ADD	$1, X10
	ADD	$1, X11
	BNEZ	X5, f_align

f_loop_check:
	// Both pointers are 8-byte aligned. Pick the largest chunk size that
	// fits in the remaining count. Falling through to f_loop64 leaves
	// X9 = 64 for that loop's own test; every other target reloads X9 at
	// its *_check label.
	MOV	$16, X9
	BLT	X12, X9, f_loop8_check
	MOV	$32, X9
	BLT	X12, X9, f_loop16_check
	MOV	$64, X9
	BLT	X12, X9, f_loop32_check
f_loop64:
	// Copy 64 bytes per iteration; all eight loads precede the stores.
	MOV	0(X11), X14
	MOV	8(X11), X15
	MOV	16(X11), X16
	MOV	24(X11), X17
	MOV	32(X11), X18
	MOV	40(X11), X19
	MOV	48(X11), X20
	MOV	56(X11), X21
	MOV	X14, 0(X10)
	MOV	X15, 8(X10)
	MOV	X16, 16(X10)
	MOV	X17, 24(X10)
	MOV	X18, 32(X10)
	MOV	X19, 40(X10)
	MOV	X20, 48(X10)
	MOV	X21, 56(X10)
	ADD	$64, X10
	ADD	$64, X11
	ADD	$-64, X12
	// Repeat while at least 64 bytes remain; finish if none remain,
	// otherwise fall through to the smaller chunk sizes.
	BGE	X12, X9, f_loop64
	BEQZ	X12, done

f_loop32_check:
	MOV	$32, X9
	BLT	X12, X9, f_loop16_check
f_loop32:
	// Copy 32 bytes per iteration.
	MOV	0(X11), X14
	MOV	8(X11), X15
	MOV	16(X11), X16
	MOV	24(X11), X17
	MOV	X14, 0(X10)
	MOV	X15, 8(X10)
	MOV	X16, 16(X10)
	MOV	X17, 24(X10)
	ADD	$32, X10
	ADD	$32, X11
	ADD	$-32, X12
	BGE	X12, X9, f_loop32
	BEQZ	X12, done

f_loop16_check:
	MOV	$16, X9
	BLT	X12, X9, f_loop8_check
f_loop16:
	// Copy 16 bytes per iteration.
	MOV	0(X11), X14
	MOV	8(X11), X15
	MOV	X14, 0(X10)
	MOV	X15, 8(X10)
	ADD	$16, X10
	ADD	$16, X11
	ADD	$-16, X12
	BGE	X12, X9, f_loop16
	BEQZ	X12, done

f_loop8_check:
	MOV	$8, X9
	BLT	X12, X9, f_loop4_check
f_loop8:
	// Copy 8 bytes per iteration with a single 8-byte move.
	MOV	0(X11), X14
	MOV	X14, 0(X10)
	ADD	$8, X10
	ADD	$8, X11
	ADD	$-8, X12
	BGE	X12, X9, f_loop8
	BEQZ	X12, done
	// Fewer than 8 bytes left: jump over the unaligned loop to the tail.
	JMP	f_loop4_check

f_loop8_unaligned_check:
	// Reached when the low three bits of X10 and X11 differ.
	MOV	$8, X9
	BLT	X12, X9, f_loop4_check
f_loop8_unaligned:
	// Copy 8 bytes per iteration using byte moves only; all eight loads
	// precede the stores.
	MOVB	0(X11), X14
	MOVB	1(X11), X15
	MOVB	2(X11), X16
	MOVB	3(X11), X17
	MOVB	4(X11), X18
	MOVB	5(X11), X19
	MOVB	6(X11), X20
	MOVB	7(X11), X21
	MOVB	X14, 0(X10)
	MOVB	X15, 1(X10)
	MOVB	X16, 2(X10)
	MOVB	X17, 3(X10)
	MOVB	X18, 4(X10)
	MOVB	X19, 5(X10)
	MOVB	X20, 6(X10)
	MOVB	X21, 7(X10)
	ADD	$8, X10
	ADD	$8, X11
	ADD	$-8, X12
	// Falls through to the tail loops once fewer than 8 bytes remain.
	BGE	X12, X9, f_loop8_unaligned

f_loop4_check:
	// Tail: fewer than 8 bytes remain on every path that reaches here.
	MOV	$4, X9
	BLT	X12, X9, f_loop1
f_loop4:
	// Copy 4 bytes per iteration using byte moves.
	MOVB	0(X11), X14
	MOVB	1(X11), X15
	MOVB	2(X11), X16
	MOVB	3(X11), X17
	MOVB	X14, 0(X10)
	MOVB	X15, 1(X10)
	MOVB	X16, 2(X10)
	MOVB	X17, 3(X10)
	ADD	$4, X10
	ADD	$4, X11
	ADD	$-4, X12
	BGE	X12, X9, f_loop4

f_loop1:
	// Copy the remaining bytes one at a time; the zero test is at the top
	// so that an empty remainder exits immediately.
	BEQZ	X12, done
	MOVB	0(X11), X14
	MOVB	X14, 0(X10)
	ADD	$1, X10
	ADD	$1, X11
	ADD	$-1, X12
	JMP	f_loop1

backward:
	// Point X10 and X11 one past the last byte of each buffer. Every
	// backward loop lowers the pointers first and then copies.
	ADD	X10, X12, X10
	ADD	X11, X12, X11

	// If less than 8 bytes, skip the alignment work and go straight to the
	// byte-move tail loops (groups of four, then single bytes).
	MOV	$8, X9
	BLT	X12, X9, b_loop4_check

	// Check alignment - if the low three bits of the two end pointers
	// differ, use the byte-move loop; if both are already 8-byte aligned,
	// skip the alignment prologue.
	AND	$7, X10, X5
	AND	$7, X11, X6
	BNE	X5, X6, b_loop8_unaligned_check
	BEQZ	X5, b_loop_check

	// Move one byte at a time until we reach 8 byte alignment.
	// Going downward, X5 = (to+n) & 7 is already the number of bytes above
	// the previous 8-byte boundary (1..7), so it is deducted from X12 as is.
	SUB	X5, X12, X12
b_align:
	// Invariant: X5 = bytes still to copy before X10/X11 are aligned.
	ADD	$-1, X5
	ADD	$-1, X10
	ADD	$-1, X11
	MOVB	0(X11), X14
	MOVB	X14, 0(X10)
	BNEZ	X5, b_align

b_loop_check:
	// Both pointers are 8-byte aligned. Pick the largest chunk size that
	// fits in the remaining count. Falling through to b_loop64 leaves
	// X9 = 64 for that loop's own test; every other target reloads X9 at
	// its *_check label.
	MOV	$16, X9
	BLT	X12, X9, b_loop8_check
	MOV	$32, X9
	BLT	X12, X9, b_loop16_check
	MOV	$64, X9
	BLT	X12, X9, b_loop32_check
b_loop64:
	// Step both pointers down by 64, then copy that 64-byte chunk; all
	// eight loads precede the stores.
	ADD	$-64, X10
	ADD	$-64, X11
	MOV	0(X11), X14
	MOV	8(X11), X15
	MOV	16(X11), X16
	MOV	24(X11), X17
	MOV	32(X11), X18
	MOV	40(X11), X19
	MOV	48(X11), X20
	MOV	56(X11), X21
	MOV	X14, 0(X10)
	MOV	X15, 8(X10)
	MOV	X16, 16(X10)
	MOV	X17, 24(X10)
	MOV	X18, 32(X10)
	MOV	X19, 40(X10)
	MOV	X20, 48(X10)
	MOV	X21, 56(X10)
	ADD	$-64, X12
	// Repeat while at least 64 bytes remain; finish if none remain,
	// otherwise fall through to the smaller chunk sizes.
	BGE	X12, X9, b_loop64
	BEQZ	X12, done

b_loop32_check:
	MOV	$32, X9
	BLT	X12, X9, b_loop16_check
b_loop32:
	// Step down by 32 and copy 32 bytes per iteration.
	ADD	$-32, X10
	ADD	$-32, X11
	MOV	0(X11), X14
	MOV	8(X11), X15
	MOV	16(X11), X16
	MOV	24(X11), X17
	MOV	X14, 0(X10)
	MOV	X15, 8(X10)
	MOV	X16, 16(X10)
	MOV	X17, 24(X10)
	ADD	$-32, X12
	BGE	X12, X9, b_loop32
	BEQZ	X12, done

b_loop16_check:
	MOV	$16, X9
	BLT	X12, X9, b_loop8_check
b_loop16:
	// Step down by 16 and copy 16 bytes per iteration.
	ADD	$-16, X10
	ADD	$-16, X11
	MOV	0(X11), X14
	MOV	8(X11), X15
	MOV	X14, 0(X10)
	MOV	X15, 8(X10)
	ADD	$-16, X12
	BGE	X12, X9, b_loop16
	BEQZ	X12, done

b_loop8_check:
	MOV	$8, X9
	BLT	X12, X9, b_loop4_check
b_loop8:
	// Step down by 8 and copy 8 bytes with a single 8-byte move.
	ADD	$-8, X10
	ADD	$-8, X11
	MOV	0(X11), X14
	MOV	X14, 0(X10)
	ADD	$-8, X12
	BGE	X12, X9, b_loop8
	BEQZ	X12, done
	// Fewer than 8 bytes left: jump over the unaligned loop to the tail.
	JMP	b_loop4_check

b_loop8_unaligned_check:
	// Reached when the low three bits of X10 and X11 differ.
	MOV	$8, X9
	BLT	X12, X9, b_loop4_check
b_loop8_unaligned:
	// Step down by 8 and copy 8 bytes using byte moves only; all eight
	// loads precede the stores.
	ADD	$-8, X10
	ADD	$-8, X11
	MOVB	0(X11), X14
	MOVB	1(X11), X15
	MOVB	2(X11), X16
	MOVB	3(X11), X17
	MOVB	4(X11), X18
	MOVB	5(X11), X19
	MOVB	6(X11), X20
	MOVB	7(X11), X21
	MOVB	X14, 0(X10)
	MOVB	X15, 1(X10)
	MOVB	X16, 2(X10)
	MOVB	X17, 3(X10)
	MOVB	X18, 4(X10)
	MOVB	X19, 5(X10)
	MOVB	X20, 6(X10)
	MOVB	X21, 7(X10)
	ADD	$-8, X12
	// Falls through to the tail loops once fewer than 8 bytes remain.
	BGE	X12, X9, b_loop8_unaligned

b_loop4_check:
	// Tail: fewer than 8 bytes remain on every path that reaches here.
	MOV	$4, X9
	BLT	X12, X9, b_loop1
b_loop4:
	// Step down by 4 and copy 4 bytes using byte moves.
	ADD	$-4, X10
	ADD	$-4, X11
	MOVB	0(X11), X14
	MOVB	1(X11), X15
	MOVB	2(X11), X16
	MOVB	3(X11), X17
	MOVB	X14, 0(X10)
	MOVB	X15, 1(X10)
	MOVB	X16, 2(X10)
	MOVB	X17, 3(X10)
	ADD	$-4, X12
	BGE	X12, X9, b_loop4

b_loop1:
	// Copy the remaining bytes one at a time, highest address first; the
	// zero test is at the top so that an empty remainder exits immediately.
	BEQZ	X12, done
	ADD	$-1, X10
	ADD	$-1, X11
	MOVB	0(X11), X14
	MOVB	X14, 0(X10)
	ADD	$-1, X12
	JMP	b_loop1

done:
	RET