summary refs log tree commit diff stats
path: root/src/runtime/memmove_riscv64.s
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- src/runtime/memmove_riscv64.s | 319
1 file changed, 319 insertions(+), 0 deletions(-)
diff --git a/src/runtime/memmove_riscv64.s b/src/runtime/memmove_riscv64.s
new file mode 100644
index 0000000..e099a64
--- /dev/null
+++ b/src/runtime/memmove_riscv64.s
@@ -0,0 +1,319 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// See memmove Go doc for important implementation constraints.
+
+// void runtime·memmove(void*, void*, uintptr)
+//
+// Copies n bytes from "from" to "to", handling overlapping buffers:
+// when to > from the copy runs backward starting from the end of the
+// buffers, otherwise it runs forward. Frameless NOSPLIT leaf routine
+// ($-0, 24 bytes of arguments = three 8-byte values); clobbers
+// X5, X6, X9, X14-X21.
+TEXT runtime·memmove<ABIInternal>(SB),NOSPLIT,$-0-24
+ // ABIInternal argument registers:
+ // X10 = to
+ // X11 = from
+ // X12 = n
+ // Nothing to do when to == from or n == 0.
+ BEQ X10, X11, done
+ BEQZ X12, done
+
+ // If the destination is ahead of the source, start at the end of the
+ // buffer and go backward.
+ BGTU X10, X11, backward
+
+ // ---- Forward copy (to < from) ----
+
+ // If less than 8 bytes, do single byte copies.
+ MOV $8, X9
+ BLT X12, X9, f_loop4_check
+
+ // Check alignment - if alignment differs we have to do one byte at a time.
+ AND $7, X10, X5
+ AND $7, X11, X6
+ BNE X5, X6, f_loop8_unaligned_check
+ BEQZ X5, f_loop_check
+
+ // Move one byte at a time until we reach 8 byte alignment.
+ // X5 = 8 - (to & 7): bytes to copy before the pointers are 8-aligned.
+ // to and from share the same low 3 bits here, so aligning one aligns both.
+ SUB X5, X9, X5
+ SUB X5, X12, X12
+f_align:
+ SUB $1, X5
+ MOVB 0(X11), X14
+ MOVB X14, 0(X10)
+ ADD $1, X10
+ ADD $1, X11
+ BNEZ X5, f_align
+
+ // Both pointers 8-aligned: dispatch to the largest chunked loop that
+ // fits the remaining length (64/32/16/8-byte steps).
+f_loop_check:
+ MOV $16, X9
+ BLT X12, X9, f_loop8_check
+ MOV $32, X9
+ BLT X12, X9, f_loop16_check
+ MOV $64, X9
+ BLT X12, X9, f_loop32_check
+ // 64 bytes per iteration via eight 8-byte loads then eight stores.
+f_loop64:
+ MOV 0(X11), X14
+ MOV 8(X11), X15
+ MOV 16(X11), X16
+ MOV 24(X11), X17
+ MOV 32(X11), X18
+ MOV 40(X11), X19
+ MOV 48(X11), X20
+ MOV 56(X11), X21
+ MOV X14, 0(X10)
+ MOV X15, 8(X10)
+ MOV X16, 16(X10)
+ MOV X17, 24(X10)
+ MOV X18, 32(X10)
+ MOV X19, 40(X10)
+ MOV X20, 48(X10)
+ MOV X21, 56(X10)
+ ADD $64, X10
+ ADD $64, X11
+ SUB $64, X12
+ BGE X12, X9, f_loop64
+ BEQZ X12, done
+ // 1..63 bytes remain: fall through to progressively smaller loops.
+
+f_loop32_check:
+ MOV $32, X9
+ BLT X12, X9, f_loop16_check
+f_loop32:
+ MOV 0(X11), X14
+ MOV 8(X11), X15
+ MOV 16(X11), X16
+ MOV 24(X11), X17
+ MOV X14, 0(X10)
+ MOV X15, 8(X10)
+ MOV X16, 16(X10)
+ MOV X17, 24(X10)
+ ADD $32, X10
+ ADD $32, X11
+ SUB $32, X12
+ BGE X12, X9, f_loop32
+ BEQZ X12, done
+
+f_loop16_check:
+ MOV $16, X9
+ BLT X12, X9, f_loop8_check
+f_loop16:
+ MOV 0(X11), X14
+ MOV 8(X11), X15
+ MOV X14, 0(X10)
+ MOV X15, 8(X10)
+ ADD $16, X10
+ ADD $16, X11
+ SUB $16, X12
+ BGE X12, X9, f_loop16
+ BEQZ X12, done
+
+f_loop8_check:
+ MOV $8, X9
+ BLT X12, X9, f_loop4_check
+f_loop8:
+ MOV 0(X11), X14
+ MOV X14, 0(X10)
+ ADD $8, X10
+ ADD $8, X11
+ SUB $8, X12
+ BGE X12, X9, f_loop8
+ BEQZ X12, done
+ JMP f_loop4_check
+
+ // to and from have different low 3 bits: no 8-byte loop is possible,
+ // so copy 8 bytes per iteration as individual byte loads/stores.
+f_loop8_unaligned_check:
+ MOV $8, X9
+ BLT X12, X9, f_loop4_check
+f_loop8_unaligned:
+ MOVB 0(X11), X14
+ MOVB 1(X11), X15
+ MOVB 2(X11), X16
+ MOVB 3(X11), X17
+ MOVB 4(X11), X18
+ MOVB 5(X11), X19
+ MOVB 6(X11), X20
+ MOVB 7(X11), X21
+ MOVB X14, 0(X10)
+ MOVB X15, 1(X10)
+ MOVB X16, 2(X10)
+ MOVB X17, 3(X10)
+ MOVB X18, 4(X10)
+ MOVB X19, 5(X10)
+ MOVB X20, 6(X10)
+ MOVB X21, 7(X10)
+ ADD $8, X10
+ ADD $8, X11
+ SUB $8, X12
+ BGE X12, X9, f_loop8_unaligned
+
+ // Tail: fewer than 8 bytes remain (any alignment).
+f_loop4_check:
+ MOV $4, X9
+ BLT X12, X9, f_loop1
+f_loop4:
+ MOVB 0(X11), X14
+ MOVB 1(X11), X15
+ MOVB 2(X11), X16
+ MOVB 3(X11), X17
+ MOVB X14, 0(X10)
+ MOVB X15, 1(X10)
+ MOVB X16, 2(X10)
+ MOVB X17, 3(X10)
+ ADD $4, X10
+ ADD $4, X11
+ SUB $4, X12
+ BGE X12, X9, f_loop4
+
+ // Final 0..3 bytes, one at a time.
+f_loop1:
+ BEQZ X12, done
+ MOVB 0(X11), X14
+ MOVB X14, 0(X10)
+ ADD $1, X10
+ ADD $1, X11
+ SUB $1, X12
+ JMP f_loop1
+
+ // ---- Backward copy (to > from, buffers may overlap) ----
+ // Mirror of the forward path: point X10/X11 one past the end of each
+ // buffer; every loop below pre-decrements before loading/storing.
+backward:
+ ADD X10, X12, X10
+ ADD X11, X12, X11
+
+ // If less than 8 bytes, do single byte copies.
+ MOV $8, X9
+ BLT X12, X9, b_loop4_check
+
+ // Check alignment - if alignment differs we have to do one byte at a time.
+ AND $7, X10, X5
+ AND $7, X11, X6
+ BNE X5, X6, b_loop8_unaligned_check
+ BEQZ X5, b_loop_check
+
+ // Move one byte at a time until we reach 8 byte alignment.
+ // X5 = to & 7: copying X5 bytes backward leaves both pointers 8-aligned.
+ SUB X5, X12, X12
+b_align:
+ SUB $1, X5
+ SUB $1, X10
+ SUB $1, X11
+ MOVB 0(X11), X14
+ MOVB X14, 0(X10)
+ BNEZ X5, b_align
+
+ // Dispatch to the largest chunked loop that fits the remaining length.
+b_loop_check:
+ MOV $16, X9
+ BLT X12, X9, b_loop8_check
+ MOV $32, X9
+ BLT X12, X9, b_loop16_check
+ MOV $64, X9
+ BLT X12, X9, b_loop32_check
+ // 64 bytes per iteration, pointers stepped down before the transfers.
+b_loop64:
+ SUB $64, X10
+ SUB $64, X11
+ MOV 0(X11), X14
+ MOV 8(X11), X15
+ MOV 16(X11), X16
+ MOV 24(X11), X17
+ MOV 32(X11), X18
+ MOV 40(X11), X19
+ MOV 48(X11), X20
+ MOV 56(X11), X21
+ MOV X14, 0(X10)
+ MOV X15, 8(X10)
+ MOV X16, 16(X10)
+ MOV X17, 24(X10)
+ MOV X18, 32(X10)
+ MOV X19, 40(X10)
+ MOV X20, 48(X10)
+ MOV X21, 56(X10)
+ SUB $64, X12
+ BGE X12, X9, b_loop64
+ BEQZ X12, done
+
+b_loop32_check:
+ MOV $32, X9
+ BLT X12, X9, b_loop16_check
+b_loop32:
+ SUB $32, X10
+ SUB $32, X11
+ MOV 0(X11), X14
+ MOV 8(X11), X15
+ MOV 16(X11), X16
+ MOV 24(X11), X17
+ MOV X14, 0(X10)
+ MOV X15, 8(X10)
+ MOV X16, 16(X10)
+ MOV X17, 24(X10)
+ SUB $32, X12
+ BGE X12, X9, b_loop32
+ BEQZ X12, done
+
+b_loop16_check:
+ MOV $16, X9
+ BLT X12, X9, b_loop8_check
+b_loop16:
+ SUB $16, X10
+ SUB $16, X11
+ MOV 0(X11), X14
+ MOV 8(X11), X15
+ MOV X14, 0(X10)
+ MOV X15, 8(X10)
+ SUB $16, X12
+ BGE X12, X9, b_loop16
+ BEQZ X12, done
+
+b_loop8_check:
+ MOV $8, X9
+ BLT X12, X9, b_loop4_check
+b_loop8:
+ SUB $8, X10
+ SUB $8, X11
+ MOV 0(X11), X14
+ MOV X14, 0(X10)
+ SUB $8, X12
+ BGE X12, X9, b_loop8
+ BEQZ X12, done
+ JMP b_loop4_check
+
+ // Mismatched low bits: 8 bytes per iteration as byte loads/stores.
+b_loop8_unaligned_check:
+ MOV $8, X9
+ BLT X12, X9, b_loop4_check
+b_loop8_unaligned:
+ SUB $8, X10
+ SUB $8, X11
+ MOVB 0(X11), X14
+ MOVB 1(X11), X15
+ MOVB 2(X11), X16
+ MOVB 3(X11), X17
+ MOVB 4(X11), X18
+ MOVB 5(X11), X19
+ MOVB 6(X11), X20
+ MOVB 7(X11), X21
+ MOVB X14, 0(X10)
+ MOVB X15, 1(X10)
+ MOVB X16, 2(X10)
+ MOVB X17, 3(X10)
+ MOVB X18, 4(X10)
+ MOVB X19, 5(X10)
+ MOVB X20, 6(X10)
+ MOVB X21, 7(X10)
+ SUB $8, X12
+ BGE X12, X9, b_loop8_unaligned
+
+ // Tail: fewer than 8 bytes remain (any alignment).
+b_loop4_check:
+ MOV $4, X9
+ BLT X12, X9, b_loop1
+b_loop4:
+ SUB $4, X10
+ SUB $4, X11
+ MOVB 0(X11), X14
+ MOVB 1(X11), X15
+ MOVB 2(X11), X16
+ MOVB 3(X11), X17
+ MOVB X14, 0(X10)
+ MOVB X15, 1(X10)
+ MOVB X16, 2(X10)
+ MOVB X17, 3(X10)
+ SUB $4, X12
+ BGE X12, X9, b_loop4
+
+ // Final 0..3 bytes, one at a time.
+b_loop1:
+ BEQZ X12, done
+ SUB $1, X10
+ SUB $1, X11
+ MOVB 0(X11), X14
+ MOVB X14, 0(X10)
+ SUB $1, X12
+ JMP b_loop1
+
+done:
+ RET