summaryrefslogtreecommitdiffstats
path: root/src/runtime/memmove_ppc64x.s
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-16 19:19:13 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-16 19:19:13 +0000
commitccd992355df7192993c666236047820244914598 (patch)
treef00fea65147227b7743083c6148396f74cd66935 /src/runtime/memmove_ppc64x.s
parentInitial commit. (diff)
downloadgolang-1.21-ccd992355df7192993c666236047820244914598.tar.xz
golang-1.21-ccd992355df7192993c666236047820244914598.zip
Adding upstream version 1.21.8.upstream/1.21.8
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/runtime/memmove_ppc64x.s')
-rw-r--r--src/runtime/memmove_ppc64x.s220
1 files changed, 220 insertions, 0 deletions
diff --git a/src/runtime/memmove_ppc64x.s b/src/runtime/memmove_ppc64x.s
new file mode 100644
index 0000000..18b9c85
--- /dev/null
+++ b/src/runtime/memmove_ppc64x.s
@@ -0,0 +1,220 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build ppc64 || ppc64le
+
+#include "textflag.h"
+
+// See memmove Go doc for important implementation constraints.
+
+// func memmove(to, from unsafe.Pointer, n uintptr)
+
+// target address (first argument, "to")
+#define TGT R3
+// source address (second argument, "from")
+#define SRC R4
+// length to move, in bytes
+#define LEN R5
+// number of doublewords (LEN >> 3)
+#define DWORDS R6
+// number of trailing bytes (LEN & 7, so always < 8)
+#define BYTES R7
+// const 16 used as index
+#define IDX16 R8
+// temp used for copies, etc.
+#define TMP R9
+// number of 32 byte chunks (DWORDS >> 2); loop count of the backward copy
+#define QWORDS R10
+// index values (const 32 and const 48), then the number of 64 byte chunks (DWORDS >> 3)
+#define IDX32 R14
+#define IDX48 R15
+#define OCTWORDS R16
+
+TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-24
+ // R3 = TGT = to (destination pointer)
+ // R4 = SRC = from (source pointer)
+ // R5 = LEN = n (byte count); the ranges may overlap, see the backward path
+
+ // Determine if there are doublewords to
+ // copy so a more efficient move can be done
+check:
+#ifdef GOPPC64_power10
+ CMP LEN, $16 // more than 16 bytes: take the general path at mcopy
+ BGT mcopy
+ SLD $56, LEN, TMP // TMP = LEN << 56, the length operand handed to LXVL/STXVL
+ LXVL SRC, TMP, V0 // the single load completes before the single store below
+ STXVL V0, TGT, TMP
+ RET
+#endif
+mcopy:
+ ANDCC $7, LEN, BYTES // R7 = LEN & 7: tail bytes to copy; result recorded in CR0
+ SRD $3, LEN, DWORDS // R6 = LEN >> 3: double words to copy
+ MOVFL CR0, CR3 // save CR0 from ANDCC in CR3; checkbytes tests CR3[EQ] later
+ CMP DWORDS, $0, CR1 // CR1[EQ] set if no double words to copy
+
+ // Determine overlap by subtracting dest - src and comparing against the
+ // length. This catches the cases where src and dest are in different types
+ // of storage such as stack and static to avoid doing backward move when not
+ // necessary.
+
+ SUB SRC, TGT, TMP // TMP = dest - src
+ CMPU TMP, LEN, CR2 // unsigned compare into CR2: (dest - src) < len?
+ BC 12, 8, backward // BLT CR2 backward
+
+ // Copying forward if no overlap.
+
+ BC 12, 6, checkbytes // BEQ CR1, checkbytes (no doublewords, only tail bytes)
+ SRDCC $3, DWORDS, OCTWORDS // OCTWORDS = DWORDS >> 3: 64 byte chunks? result recorded in CR0
+ MOVD $16, IDX16
+ BEQ lt64gt8 // OCTWORDS == 0, i.e. < 64 bytes
+
+ // Prepare for moves of 64 bytes at a time.
+
+forward64setup:
+ DCBTST (TGT) // prepare data cache
+ DCBT (SRC)
+ MOVD OCTWORDS, CTR // Number of 64 byte chunks
+ MOVD $32, IDX32
+ MOVD $48, IDX48
+ PCALIGN $16
+
+forward64:
+ LXVD2X (R0)(SRC), VS32 // load 64 bytes
+ LXVD2X (IDX16)(SRC), VS33
+ LXVD2X (IDX32)(SRC), VS34
+ LXVD2X (IDX48)(SRC), VS35
+ ADD $64, SRC
+ STXVD2X VS32, (R0)(TGT) // store 64 bytes
+ STXVD2X VS33, (IDX16)(TGT)
+ STXVD2X VS34, (IDX32)(TGT)
+ STXVD2X VS35, (IDX48)(TGT)
+ ADD $64,TGT // bump up for next set
+ BC 16, 0, forward64 // continue: decrement CTR, loop while CTR != 0
+ ANDCC $7, DWORDS // DWORDS &= 7: remaining doublewords
+ BEQ checkbytes // only bytes remain
+
+lt64gt8:
+ CMP DWORDS, $4 // reached with DWORDS in 1-7
+ BLT lt32gt8
+ LXVD2X (R0)(SRC), VS32 // move 32 bytes: both loads are issued before the stores
+ LXVD2X (IDX16)(SRC), VS33
+ ADD $-4, DWORDS
+ STXVD2X VS32, (R0)(TGT)
+ STXVD2X VS33, (IDX16)(TGT)
+ ADD $32, SRC
+ ADD $32, TGT
+
+lt32gt8:
+ // At this point DWORDS is 0-3, i.e. fewer than 32 bytes of doublewords remain
+ // Move 16 bytes if possible
+ CMP DWORDS, $2
+ BLT lt16
+ LXVD2X (R0)(SRC), VS32
+ ADD $-2, DWORDS
+ STXVD2X VS32, (R0)(TGT)
+ ADD $16, SRC
+ ADD $16, TGT
+
+lt16: // Move 8 bytes if possible (DWORDS is 0 or 1 here)
+ CMP DWORDS, $1
+ BLT checkbytes
+#ifdef GOPPC64_power10
+ ADD $8, BYTES // BYTES += 8: last doubleword and tail bytes go out in one LXVL/STXVL pair
+ SLD $56, BYTES, TMP
+ LXVL SRC, TMP, V0
+ STXVL V0, TGT, TMP
+ RET
+#endif
+
+ MOVD 0(SRC), TMP
+ ADD $8, SRC
+ MOVD TMP, 0(TGT)
+ ADD $8, TGT
+checkbytes:
+ BC 12, 14, LR // BEQ lr on CR3: return if BYTES == 0 (CR3 saved from ANDCC $7 at mcopy)
+#ifdef GOPPC64_power10
+ SLD $56, BYTES, TMP
+ LXVL SRC, TMP, V0
+ STXVL V0, TGT, TMP
+ RET
+#endif
+lt8: // Move word if possible
+ CMP BYTES, $4
+ BLT lt4
+ MOVWZ 0(SRC), TMP
+ ADD $-4, BYTES
+ MOVW TMP, 0(TGT)
+ ADD $4, SRC
+ ADD $4, TGT
+lt4: // Move halfword if possible
+ CMP BYTES, $2
+ BLT lt2
+ MOVHZ 0(SRC), TMP
+ ADD $-2, BYTES
+ MOVH TMP, 0(TGT)
+ ADD $2, SRC
+ ADD $2, TGT
+lt2: // Move last byte if 1 left
+ CMP BYTES, $1
+ BC 12, 0, LR // blt lr, return if BYTES < 1 (nothing left)
+ MOVBZ 0(SRC), TMP
+ MOVBZ TMP, 0(TGT)
+ RET
+
+backward:
+ // Copying backwards proceeds by copying R7 bytes then copying R6 double words.
+ // R3 and R4 are advanced to the end of the destination/source buffers
+ // respectively and moved back as we copy.
+
+ ADD LEN, SRC, SRC // end of source
+ ADD TGT, LEN, TGT // end of dest
+
+ BEQ nobackwardtail // earlier condition: CR0 still holds the ANDCC $7 result (BYTES == 0)
+
+ MOVD BYTES, CTR // bytes to move
+
+backwardtailloop:
+ MOVBZ -1(SRC), TMP // point to last byte
+ SUB $1,SRC
+ MOVBZ TMP, -1(TGT)
+ SUB $1,TGT
+ BDNZ backwardtailloop
+
+nobackwardtail:
+ BC 4, 5, LR // blelr cr1, return if DWORDS == 0
+ SRDCC $2,DWORDS,QWORDS // Compute number of 32B blocks and compare to 0
+ BNE backward32setup // If QWORDS != 0, start the 32B copy loop.
+
+backward24:
+ // DWORDS is a value between 1-3.
+ CMP DWORDS, $2 // CR0 from this compare drives both conditional returns below
+
+ MOVD -8(SRC), TMP
+ MOVD TMP, -8(TGT)
+ BC 12, 0, LR // bltlr, return if DWORDS == 1
+
+ MOVD -16(SRC), TMP
+ MOVD TMP, -16(TGT)
+ BC 12, 2, LR // beqlr, return if DWORDS == 2
+
+ MOVD -24(SRC), TMP
+ MOVD TMP, -24(TGT)
+ RET
+
+backward32setup:
+ ANDCC $3,DWORDS // DWORDS &= 3: compute remaining DWORDS and compare to 0
+ MOVD QWORDS, CTR // set up loop ctr
+ MOVD $16, IDX16 // 32 bytes at a time
+ PCALIGN $16
+
+backward32loop:
+ SUB $32, TGT
+ SUB $32, SRC
+ LXVD2X (R0)(SRC), VS32 // load 16x2 bytes
+ LXVD2X (IDX16)(SRC), VS33
+ STXVD2X VS32, (R0)(TGT) // store 16x2 bytes
+ STXVD2X VS33, (IDX16)(TGT)
+ BDNZ backward32loop
+ BC 12, 2, LR // beqlr, return if DWORDS == 0 (CR0 from ANDCC $3 in backward32setup)
+ BR backward24