Diffstat
-rw-r--r--	src/runtime/memclr_ppc64x.s	174
1 file changed, 174 insertions(+), 0 deletions(-)
diff --git a/src/runtime/memclr_ppc64x.s b/src/runtime/memclr_ppc64x.s
new file mode 100644
index 0000000..3543255
--- /dev/null
+++ b/src/runtime/memclr_ppc64x.s
@@ -0,0 +1,174 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build ppc64 || ppc64le
+
+#include "textflag.h"
+
+// See memclrNoHeapPointers Go doc for important implementation constraints.
+
+// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
+TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-16
+ // R3 = ptr
+ // R4 = n
+
+ // Determine if there are doublewords to clear
+check:
+ ANDCC $7, R4, R5 // R5: leftover bytes to clear
+ SRD $3, R4, R6 // R6: double words to clear
+ CMP R6, $0, CR1 // CR1[EQ] set if no double words
+
+ BC 12, 6, nozerolarge // only single bytes
+ CMP R4, $512
+ BLT under512 // special case for < 512
+ ANDCC $127, R3, R8 // check for 128 alignment of address
+ BEQ zero512setup
+
+ ANDCC $7, R3, R15
+ BEQ zero512xsetup // at least 8 byte aligned
+
+ // zero bytes up to 8 byte alignment
+
+ ANDCC $1, R3, R15 // check for byte alignment
+ BEQ byte2
+ MOVB R0, 0(R3) // zero 1 byte
+ ADD $1, R3 // bump ptr by 1
+ ADD $-1, R4
+
+byte2:
+ ANDCC $2, R3, R15 // check for 2 byte alignment
+ BEQ byte4
+ MOVH R0, 0(R3) // zero 2 bytes
+ ADD $2, R3 // bump ptr by 2
+ ADD $-2, R4
+
+byte4:
+ ANDCC $4, R3, R15 // check for 4 byte alignment
+ BEQ zero512xsetup
+ MOVW R0, 0(R3) // zero 4 bytes
+ ADD $4, R3 // bump ptr by 4
+ ADD $-4, R4
+ BR zero512xsetup // ptr should now be 8 byte aligned
+
+under512:
+ SRDCC $3, R6, R7 // 64 byte chunks?
+ XXLXOR VS32, VS32, VS32 // clear VS32 (V0)
+ BEQ lt64gt8
+
+ // Prepare to clear 64 bytes at a time.
+
+zero64setup:
+ DCBTST (R3) // prepare data cache
+ MOVD R7, CTR // number of 64 byte chunks
+ MOVD $16, R8
+ MOVD $32, R16
+ MOVD $48, R17
+
+zero64:
+ STXVD2X VS32, (R3+R0) // store 16 bytes
+ STXVD2X VS32, (R3+R8)
+ STXVD2X VS32, (R3+R16)
+ STXVD2X VS32, (R3+R17)
+ ADD $64, R3
+ ADD $-64, R4
+ BDNZ zero64 // dec ctr, br zero64 if ctr not 0
+ SRDCC $3, R4, R6 // remaining doublewords
+ BEQ nozerolarge
+
+lt64gt8:
+ CMP R4, $32
+ BLT lt32gt8
+ MOVD $16, R8
+ STXVD2X VS32, (R3+R0)
+ STXVD2X VS32, (R3+R8)
+ ADD $-32, R4
+ ADD $32, R3
+lt32gt8:
+ CMP R4, $16
+ BLT lt16gt8
+ STXVD2X VS32, (R3+R0)
+ ADD $16, R3
+ ADD $-16, R4
+lt16gt8:
+ CMP R4, $8
+ BLT nozerolarge
+ MOVD R0, 0(R3)
+ ADD $8, R3
+ ADD $-8, R4
+
+nozerolarge:
+ ANDCC $7, R4, R5 // any remaining bytes
+ BC 4, 1, LR // ble lr
+
+zerotail:
+ MOVD R5, CTR // set up to clear tail bytes
+
+zerotailloop:
+ MOVB R0, 0(R3) // clear single bytes
+ ADD $1, R3
+ BDNZ zerotailloop // dec ctr, br zerotailloop if ctr not 0
+ RET
+
+zero512xsetup: // 512 chunk with extra needed
+ ANDCC $8, R3, R11 // 8 byte alignment?
+ BEQ zero512setup16
+ MOVD R0, 0(R3) // clear 8 bytes
+ ADD $8, R3 // update ptr to next 8
+ ADD $-8, R4 // dec count by 8
+
+zero512setup16:
+ ANDCC $127, R3, R14 // < 128 byte alignment
+ BEQ zero512setup // handle 128 byte alignment
+ MOVD $128, R15
+ SUB R14, R15, R14 // find increment to 128 alignment
+ SRD $4, R14, R15 // number of 16 byte chunks
+
+zero512presetup:
+ MOVD R15, CTR // loop counter of 16 bytes
+ XXLXOR VS32, VS32, VS32 // clear VS32 (V0)
+
+zero512preloop: // clear up to 128 alignment
+ STXVD2X VS32, (R3+R0) // clear 16 bytes
+ ADD $16, R3 // update ptr
+ ADD $-16, R4 // dec count
+ BDNZ zero512preloop
+
+zero512setup: // setup for dcbz loop
+ CMP R4, $512 // check if at least 512
+ BLT remain
+ SRD $9, R4, R8 // loop count for 512 chunks
+ MOVD R8, CTR // set up counter
+ MOVD $128, R9 // index regs for 128 bytes
+ MOVD $256, R10
+ MOVD $384, R11
+ PCALIGN $32
+
+zero512:
+ DCBZ (R3+R0) // clear first chunk
+ DCBZ (R3+R9) // clear second chunk
+ DCBZ (R3+R10) // clear third chunk
+ DCBZ (R3+R11) // clear fourth chunk
+ ADD $512, R3
+ BDNZ zero512
+ ANDCC $511, R4
+
+remain:
+ CMP R4, $128 // check if 128 byte chunks left
+ BLT smaller
+ DCBZ (R3+R0) // clear 128
+ ADD $128, R3
+ ADD $-128, R4
+ BR remain
+
+smaller:
+ ANDCC $127, R4, R7 // find leftovers
+ BEQ done
+ CMP R7, $64 // more than 64, do 64 at a time
+ XXLXOR VS32, VS32, VS32
+ BLT lt64gt8 // less than 64
+ SRD $6, R7, R7 // set up counter for 64
+ BR zero64setup
+
+done:
+ RET
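
For reference only, and not part of the diff above: a minimal pure-Go sketch of the contract this assembly implements, i.e. zero n bytes starting at ptr, where the memory is known to contain no heap pointers. The function name memclrSketch and the demo in main are illustrative assumptions, not runtime code, and the byte-at-a-time loop deliberately skips the alignment handling, VSX stores, and dcbz fast paths the assembly uses.

package main

import "unsafe"

// memclrSketch zeroes n bytes starting at ptr. Illustrative only; the real
// runtime·memclrNoHeapPointers is the optimized ppc64x assembly in the diff.
func memclrSketch(ptr unsafe.Pointer, n uintptr) {
	b := unsafe.Slice((*byte)(ptr), n) // view the region as a byte slice
	for i := range b {
		b[i] = 0
	}
}

func main() {
	buf := make([]byte, 32)
	for i := range buf {
		buf[i] = 0xFF // fill with non-zero bytes
	}
	memclrSketch(unsafe.Pointer(&buf[0]), uintptr(len(buf)))
	// buf now holds 32 zero bytes, the same result the assembly produces.
}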