summaryrefslogtreecommitdiffstats
path: root/src/runtime/memmove_arm64.s
diff options
context:
space:
mode:
Diffstat (limited to 'src/runtime/memmove_arm64.s')
-rw-r--r--src/runtime/memmove_arm64.s238
1 files changed, 238 insertions, 0 deletions
diff --git a/src/runtime/memmove_arm64.s b/src/runtime/memmove_arm64.s
new file mode 100644
index 0000000..8ec3ed8
--- /dev/null
+++ b/src/runtime/memmove_arm64.s
@@ -0,0 +1,238 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// See memmove Go doc for important implementation constraints.
+
+// Register map
+//
+// dstin R0
+// src R1
+// count R2
+// dst R3 (same as R0, but gets modified in unaligned cases)
+// srcend R4
+// dstend R5
+// data R6-R17
+// tmp1 R14
+
+// Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+// copies of up to 128 bytes, and large copies. The overhead of the overlap
+// check is negligible since it is only required for large copies.
+//
+// Large copies use a software pipelined loop processing 64 bytes per iteration.
+// For copies of 1024 bytes or more, src or dst (per arm64UseAlignedLoads) is
+// 16-byte aligned first. The loop tail always copies 64 bytes from the end.
+
+// func memmove(to, from unsafe.Pointer, n uintptr)
+TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-24
+ CBZ R2, copy0 // count == 0: nothing to copy
+
+ // Small copies: 1..16 bytes
+ CMP $16, R2
+ BLE copy16 // count <= 16; NOTE(review): signed compare on an unsigned count, assumes count < 1<<63 - confirm
+
+ // Large copies
+ CMP $128, R2
+ BHI copy_long // count > 128 (unsigned)
+ CMP $32, R2
+ BHI copy32_128 // count in 33..128
+
+ // Small copies: 17..32 bytes.
+ LDP (R1), (R6, R7) // first 16 source bytes
+ ADD R1, R2, R4 // R4 points just past the last source byte
+ LDP -16(R4), (R12, R13) // last 16 source bytes (overlaps the first 16 when count < 32)
+ STP (R6, R7), (R0) // stores start only after both loads have been issued
+ ADD R0, R2, R5 // R5 points just past the last destination byte
+ STP (R12, R13), -16(R5)
+ RET
+
+// Small copies: 1..16 bytes.
+copy16:
+ ADD R1, R2, R4 // R4 points just past the last source byte
+ ADD R0, R2, R5 // R5 points just past the last destination byte
+ CMP $8, R2
+ BLT copy7 // count < 8
+ MOVD (R1), R6 // count in 8..16: first 8 source bytes
+ MOVD -8(R4), R7 // last 8 source bytes (overlaps the first 8 when count < 16)
+ MOVD R6, (R0) // both loads precede both stores
+ MOVD R7, -8(R5)
+ RET
+
+copy7:
+ TBZ $2, R2, copy3 // count is 1..7 here; bit 2 clear means count is 1..3
+ MOVWU (R1), R6 // count in 4..7: first 4 source bytes
+ MOVWU -4(R4), R7 // last 4 source bytes
+ MOVW R6, (R0)
+ MOVW R7, -4(R5)
+ RET
+
+copy3:
+ TBZ $1, R2, copy1 // count is 1..3 here; bit 1 clear means count is 1
+ MOVHU (R1), R6 // count in 2..3: first 2 source bytes
+ MOVHU -2(R4), R7 // last 2 source bytes
+ MOVH R6, (R0)
+ MOVH R7, -2(R5)
+ RET
+
+copy1:
+ MOVBU (R1), R6 // single byte
+ MOVB R6, (R0) // falls through to the RET at copy0
+
+copy0:
+ RET
+
+ // Medium copies: 33..128 bytes.
+copy32_128:
+ ADD R1, R2, R4 // R4 points just past the last source byte
+ ADD R0, R2, R5 // R5 points just past the last destination byte
+ LDP (R1), (R6, R7) // source bytes 0..15
+ LDP 16(R1), (R8, R9) // source bytes 16..31
+ LDP -32(R4), (R10, R11) // 16 bytes starting 32 before the source end
+ LDP -16(R4), (R12, R13) // final 16 source bytes
+ CMP $64, R2
+ BHI copy128 // count > 64
+ STP (R6, R7), (R0) // count in 33..64: first 32 plus last 32 bytes cover the range
+ STP (R8, R9), 16(R0)
+ STP (R10, R11), -32(R5)
+ STP (R12, R13), -16(R5)
+ RET
+
+ // Copy 65..128 bytes.
+copy128:
+ LDP 32(R1), (R14, R15) // source bytes 32..47
+ LDP 48(R1), (R16, R17) // source bytes 48..63
+ CMP $96, R2
+ BLS copy96 // count <= 96: first 64 plus last 32 bytes cover the range
+ LDP -64(R4), (R2, R3) // count is not read again: R2, R3 reused as data
+ LDP -48(R4), (R1, R4) // last read of src/srcend: R1, R4 reused as data
+ STP (R2, R3), -64(R5) // every load for this size has been issued before this first store
+ STP (R1, R4), -48(R5)
+
+copy96:
+ STP (R6, R7), (R0) // first 64 bytes
+ STP (R8, R9), 16(R0)
+ STP (R14, R15), 32(R0)
+ STP (R16, R17), 48(R0)
+ STP (R10, R11), -32(R5) // last 32 bytes
+ STP (R12, R13), -16(R5)
+ RET
+
+ // Copy more than 128 bytes.
+copy_long:
+ ADD R1, R2, R4 // R4 points just past the last source byte
+ ADD R0, R2, R5 // R5 points just past the last destination byte
+ MOVD ZR, R7 // default R7 = 0: forward realignment offset (R7 & 15) becomes 0
+ MOVD ZR, R8 // default R8 = 0: backward realignment offset (R8 & 15) becomes 0
+
+ CMP $1024, R2
+ BLT backward_check // count < 1024: keep R7 = R8 = 0, i.e. no realignment
+ // feature detect to decide how to align
+ MOVBU runtime·arm64UseAlignedLoads(SB), R6
+ CBNZ R6, use_aligned_loads // nonzero flag: realign on the source side
+ MOVD R0, R7 // flag is zero: realign on dst ...
+ MOVD R5, R8 // ... and on dstend for the backward copy
+ B backward_check
+use_aligned_loads:
+ MOVD R1, R7 // realign on src ...
+ MOVD R4, R8 // ... and on srcend for the backward copy
+ // R7 and R8 are used here for the realignment calculation. In
+ // the use_aligned_loads case, R7 is the src pointer and R8 is
+ // srcend pointer, which is used in the backward copy case.
+ // When doing aligned stores, R7 is the dst pointer and R8 is
+ // the dstend pointer.
+
+backward_check:
+ // Use backward copy if there is an overlap.
+ SUB R1, R0, R14 // R14 = dst - src
+ CBZ R14, copy0 // dst == src: return without copying
+ CMP R2, R14
+ BCC copy_long_backward // (dst - src) < count, unsigned: dst starts inside the source range
+
+ // Copy 16 bytes and then align src (R1) or dst (R0) to 16-byte alignment.
+ LDP (R1), (R12, R13) // Load A
+ AND $15, R7, R14 // Calculate the realignment offset
+ SUB R14, R1, R1 // move src back by the realignment offset
+ SUB R14, R0, R3 // move dst back same amount as src
+ ADD R14, R2, R2 // count grows by the same offset, matching the moved-back pointers
+ LDP 16(R1), (R6, R7) // Load B
+ STP (R12, R13), (R0) // Store A
+ LDP 32(R1), (R8, R9) // Load C
+ LDP 48(R1), (R10, R11) // Load D
+ LDP.W 64(R1), (R12, R13) // Load E
+ // 80 bytes have been loaded; if less than 80+64 bytes remain, copy from the end
+ SUBS $144, R2, R2
+ BLS copy64_from_end // adjusted count <= 144 (unsigned): skip the loop
+
+loop64:
+ STP (R6, R7), 16(R3) // Store B
+ LDP 16(R1), (R6, R7) // Load B (next iteration)
+ STP (R8, R9), 32(R3) // Store C
+ LDP 32(R1), (R8, R9) // Load C
+ STP (R10, R11), 48(R3) // Store D
+ LDP 48(R1), (R10, R11) // Load D
+ STP.W (R12, R13), 64(R3) // Store E
+ LDP.W 64(R1), (R12, R13) // Load E
+ SUBS $64, R2, R2
+ BHI loop64 // continue while the remaining count before this SUBS was above 64 (unsigned)
+
+ // Write the last iteration and copy 64 bytes from the end.
+copy64_from_end:
+ LDP -64(R4), (R14, R15) // Load F
+ STP (R6, R7), 16(R3) // Store B
+ LDP -48(R4), (R6, R7) // Load G
+ STP (R8, R9), 32(R3) // Store C
+ LDP -32(R4), (R8, R9) // Load H
+ STP (R10, R11), 48(R3) // Store D
+ LDP -16(R4), (R10, R11) // Load I
+ STP (R12, R13), 64(R3) // Store E
+ STP (R14, R15), -64(R5) // Store F
+ STP (R6, R7), -48(R5) // Store G
+ STP (R8, R9), -32(R5) // Store H
+ STP (R10, R11), -16(R5) // Store I
+ RET
+
+ // Large backward copy for overlapping copies.
+ // Copy 16 bytes and then align srcend (R4) or dstend (R5) to 16-byte alignment.
+copy_long_backward:
+ LDP -16(R4), (R12, R13) // last 16 source bytes
+ AND $15, R8, R14 // realignment offset taken from R8 (0 when R8 was left at zero)
+ SUB R14, R4, R4 // move srcend back by the realignment offset
+ SUB R14, R2, R2 // count shrinks by the same offset
+ LDP -16(R4), (R6, R7) // start loading the 64 bytes below the adjusted srcend
+ STP (R12, R13), -16(R5) // store the last 16 bytes at the original dstend
+ LDP -32(R4), (R8, R9)
+ LDP -48(R4), (R10, R11)
+ LDP.W -64(R4), (R12, R13) // .W form, as in the forward path's Load E
+ SUB R14, R5, R5 // move dstend back by the same offset as srcend
+ SUBS $128, R2, R2
+ BLS copy64_from_start // adjusted count <= 128 (unsigned): skip the loop
+
+loop64_backward:
+ STP (R6, R7), -16(R5) // store the 64 bytes loaded on the previous pass, highest address first
+ LDP -16(R4), (R6, R7) // and load the next 64 bytes below them
+ STP (R8, R9), -32(R5)
+ LDP -32(R4), (R8, R9)
+ STP (R10, R11), -48(R5)
+ LDP -48(R4), (R10, R11)
+ STP.W (R12, R13), -64(R5)
+ LDP.W -64(R4), (R12, R13)
+ SUBS $64, R2, R2
+ BHI loop64_backward // continue while the remaining count before this SUBS was above 64 (unsigned)
+
+ // Write the last iteration and copy 64 bytes from the start.
+copy64_from_start:
+ LDP 48(R1), (R2, R3) // count is not read again: R2, R3 reused as data
+ STP (R6, R7), -16(R5) // pending 64 bytes from the last pass are stored between the loads
+ LDP 32(R1), (R6, R7)
+ STP (R8, R9), -32(R5)
+ LDP 16(R1), (R8, R9)
+ STP (R10, R11), -48(R5)
+ LDP (R1), (R10, R11) // first 16 source bytes (R1 is never modified on the backward path)
+ STP (R12, R13), -64(R5)
+ STP (R2, R3), 48(R0) // first 64 bytes go to the original dst (R0 is never modified)
+ STP (R6, R7), 32(R0)
+ STP (R8, R9), 16(R0)
+ STP (R10, R11), (R0)
+ RET