summaryrefslogtreecommitdiffstats
path: root/src/math/big/arith_ppc64x.s
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-28 13:14:23 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-28 13:14:23 +0000
commit73df946d56c74384511a194dd01dbe099584fd1a (patch)
treefd0bcea490dd81327ddfbb31e215439672c9a068 /src/math/big/arith_ppc64x.s
parentInitial commit. (diff)
downloadgolang-1.16-73df946d56c74384511a194dd01dbe099584fd1a.tar.xz
golang-1.16-73df946d56c74384511a194dd01dbe099584fd1a.zip
Adding upstream version 1.16.10.upstream/1.16.10upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/math/big/arith_ppc64x.s')
-rw-r--r--src/math/big/arith_ppc64x.s481
1 files changed, 481 insertions, 0 deletions
diff --git a/src/math/big/arith_ppc64x.s b/src/math/big/arith_ppc64x.s
new file mode 100644
index 0000000..b299ccc
--- /dev/null
+++ b/src/math/big/arith_ppc64x.s
@@ -0,0 +1,481 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !math_big_pure_go,ppc64 !math_big_pure_go,ppc64le
+
+#include "textflag.h"
+
+// This file provides fast assembly versions for the elementary
+// arithmetic operations on vectors implemented in arith.go.
+
+// func mulWW(x, y Word) (z1, z0 Word)
+TEXT ·mulWW(SB), NOSPLIT, $0
+ MOVD x+0(FP), R4
+ MOVD y+8(FP), R5
+ MULHDU R4, R5, R6
+ MULLD R4, R5, R7
+ MOVD R6, z1+16(FP)
+ MOVD R7, z0+24(FP)
+ RET
+
+// func addVV(z, y, y []Word) (c Word)
+// z[i] = x[i] + y[i] for all i, carrying
+TEXT ·addVV(SB), NOSPLIT, $0
+ MOVD z_len+8(FP), R7 // R7 = z_len
+ MOVD x+24(FP), R8 // R8 = x[]
+ MOVD y+48(FP), R9 // R9 = y[]
+ MOVD z+0(FP), R10 // R10 = z[]
+
+ // If z_len = 0, we are done
+ CMP R0, R7
+ MOVD R0, R4
+ BEQ done
+
+ // Process the first iteration out of the loop so we can
+ // use MOVDU and avoid 3 index registers updates.
+ MOVD 0(R8), R11 // R11 = x[i]
+ MOVD 0(R9), R12 // R12 = y[i]
+ ADD $-1, R7 // R7 = z_len - 1
+ ADDC R12, R11, R15 // R15 = x[i] + y[i], set CA
+ CMP R0, R7
+ MOVD R15, 0(R10) // z[i]
+ BEQ final // If z_len was 1, we are done
+
+ SRD $2, R7, R5 // R5 = z_len/4
+ CMP R0, R5
+ MOVD R5, CTR // Set up loop counter
+ BEQ tail // If R5 = 0, we can't use the loop
+
+ // Process 4 elements per iteration. Unrolling this loop
+ // means a performance trade-off: we will lose performance
+ // for small values of z_len (0.90x in the worst case), but
+ // gain significant performance as z_len increases (up to
+ // 1.45x).
+loop:
+ MOVD 8(R8), R11 // R11 = x[i]
+ MOVD 16(R8), R12 // R12 = x[i+1]
+ MOVD 24(R8), R14 // R14 = x[i+2]
+ MOVDU 32(R8), R15 // R15 = x[i+3]
+ MOVD 8(R9), R16 // R16 = y[i]
+ MOVD 16(R9), R17 // R17 = y[i+1]
+ MOVD 24(R9), R18 // R18 = y[i+2]
+ MOVDU 32(R9), R19 // R19 = y[i+3]
+ ADDE R11, R16, R20 // R20 = x[i] + y[i] + CA
+ ADDE R12, R17, R21 // R21 = x[i+1] + y[i+1] + CA
+ ADDE R14, R18, R22 // R22 = x[i+2] + y[i+2] + CA
+ ADDE R15, R19, R23 // R23 = x[i+3] + y[i+3] + CA
+ MOVD R20, 8(R10) // z[i]
+ MOVD R21, 16(R10) // z[i+1]
+ MOVD R22, 24(R10) // z[i+2]
+ MOVDU R23, 32(R10) // z[i+3]
+ ADD $-4, R7 // R7 = z_len - 4
+ BC 16, 0, loop // bdnz
+
+ // We may have more elements to read
+ CMP R0, R7
+ BEQ final
+
+ // Process the remaining elements, one at a time
+tail:
+ MOVDU 8(R8), R11 // R11 = x[i]
+ MOVDU 8(R9), R16 // R16 = y[i]
+ ADD $-1, R7 // R7 = z_len - 1
+ ADDE R11, R16, R20 // R20 = x[i] + y[i] + CA
+ CMP R0, R7
+ MOVDU R20, 8(R10) // z[i]
+ BEQ final // If R7 = 0, we are done
+
+ MOVDU 8(R8), R11
+ MOVDU 8(R9), R16
+ ADD $-1, R7
+ ADDE R11, R16, R20
+ CMP R0, R7
+ MOVDU R20, 8(R10)
+ BEQ final
+
+ MOVD 8(R8), R11
+ MOVD 8(R9), R16
+ ADDE R11, R16, R20
+ MOVD R20, 8(R10)
+
+final:
+ ADDZE R4 // Capture CA
+
+done:
+ MOVD R4, c+72(FP)
+ RET
+
+// func subVV(z, x, y []Word) (c Word)
+// z[i] = x[i] - y[i] for all i, carrying
+TEXT ·subVV(SB), NOSPLIT, $0
+ MOVD z_len+8(FP), R7 // R7 = z_len
+ MOVD x+24(FP), R8 // R8 = x[]
+ MOVD y+48(FP), R9 // R9 = y[]
+ MOVD z+0(FP), R10 // R10 = z[]
+
+ // If z_len = 0, we are done
+ CMP R0, R7
+ MOVD R0, R4
+ BEQ done
+
+ // Process the first iteration out of the loop so we can
+ // use MOVDU and avoid 3 index registers updates.
+ MOVD 0(R8), R11 // R11 = x[i]
+ MOVD 0(R9), R12 // R12 = y[i]
+ ADD $-1, R7 // R7 = z_len - 1
+ SUBC R12, R11, R15 // R15 = x[i] - y[i], set CA
+ CMP R0, R7
+ MOVD R15, 0(R10) // z[i]
+ BEQ final // If z_len was 1, we are done
+
+ SRD $2, R7, R5 // R5 = z_len/4
+ CMP R0, R5
+ MOVD R5, CTR // Set up loop counter
+ BEQ tail // If R5 = 0, we can't use the loop
+
+ // Process 4 elements per iteration. Unrolling this loop
+ // means a performance trade-off: we will lose performance
+ // for small values of z_len (0.92x in the worst case), but
+ // gain significant performance as z_len increases (up to
+ // 1.45x).
+loop:
+ MOVD 8(R8), R11 // R11 = x[i]
+ MOVD 16(R8), R12 // R12 = x[i+1]
+ MOVD 24(R8), R14 // R14 = x[i+2]
+ MOVDU 32(R8), R15 // R15 = x[i+3]
+ MOVD 8(R9), R16 // R16 = y[i]
+ MOVD 16(R9), R17 // R17 = y[i+1]
+ MOVD 24(R9), R18 // R18 = y[i+2]
+ MOVDU 32(R9), R19 // R19 = y[i+3]
+ SUBE R16, R11, R20 // R20 = x[i] - y[i] + CA
+ SUBE R17, R12, R21 // R21 = x[i+1] - y[i+1] + CA
+ SUBE R18, R14, R22 // R22 = x[i+2] - y[i+2] + CA
+ SUBE R19, R15, R23 // R23 = x[i+3] - y[i+3] + CA
+ MOVD R20, 8(R10) // z[i]
+ MOVD R21, 16(R10) // z[i+1]
+ MOVD R22, 24(R10) // z[i+2]
+ MOVDU R23, 32(R10) // z[i+3]
+ ADD $-4, R7 // R7 = z_len - 4
+ BC 16, 0, loop // bdnz
+
+ // We may have more elements to read
+ CMP R0, R7
+ BEQ final
+
+ // Process the remaining elements, one at a time
+tail:
+ MOVDU 8(R8), R11 // R11 = x[i]
+ MOVDU 8(R9), R16 // R16 = y[i]
+ ADD $-1, R7 // R7 = z_len - 1
+ SUBE R16, R11, R20 // R20 = x[i] - y[i] + CA
+ CMP R0, R7
+ MOVDU R20, 8(R10) // z[i]
+ BEQ final // If R7 = 0, we are done
+
+ MOVDU 8(R8), R11
+ MOVDU 8(R9), R16
+ ADD $-1, R7
+ SUBE R16, R11, R20
+ CMP R0, R7
+ MOVDU R20, 8(R10)
+ BEQ final
+
+ MOVD 8(R8), R11
+ MOVD 8(R9), R16
+ SUBE R16, R11, R20
+ MOVD R20, 8(R10)
+
+final:
+ ADDZE R4
+ XOR $1, R4
+
+done:
+ MOVD R4, c+72(FP)
+ RET
+
+// func addVW(z, x []Word, y Word) (c Word)
+TEXT ·addVW(SB), NOSPLIT, $0
+ MOVD z+0(FP), R10 // R10 = z[]
+ MOVD x+24(FP), R8 // R8 = x[]
+ MOVD y+48(FP), R4 // R4 = y = c
+ MOVD z_len+8(FP), R11 // R11 = z_len
+
+ CMP R0, R11 // If z_len is zero, return
+ BEQ done
+
+ // We will process the first iteration out of the loop so we capture
+ // the value of c. In the subsequent iterations, we will rely on the
+ // value of CA set here.
+ MOVD 0(R8), R20 // R20 = x[i]
+ ADD $-1, R11 // R11 = z_len - 1
+ ADDC R20, R4, R6 // R6 = x[i] + c
+ CMP R0, R11 // If z_len was 1, we are done
+ MOVD R6, 0(R10) // z[i]
+ BEQ final
+
+ // We will read 4 elements per iteration
+ SRD $2, R11, R9 // R9 = z_len/4
+ DCBT (R8)
+ CMP R0, R9
+ MOVD R9, CTR // Set up the loop counter
+ BEQ tail // If R9 = 0, we can't use the loop
+
+loop:
+ MOVD 8(R8), R20 // R20 = x[i]
+ MOVD 16(R8), R21 // R21 = x[i+1]
+ MOVD 24(R8), R22 // R22 = x[i+2]
+ MOVDU 32(R8), R23 // R23 = x[i+3]
+ ADDZE R20, R24 // R24 = x[i] + CA
+ ADDZE R21, R25 // R25 = x[i+1] + CA
+ ADDZE R22, R26 // R26 = x[i+2] + CA
+ ADDZE R23, R27 // R27 = x[i+3] + CA
+ MOVD R24, 8(R10) // z[i]
+ MOVD R25, 16(R10) // z[i+1]
+ MOVD R26, 24(R10) // z[i+2]
+ MOVDU R27, 32(R10) // z[i+3]
+ ADD $-4, R11 // R11 = z_len - 4
+ BC 16, 0, loop // bdnz
+
+ // We may have some elements to read
+ CMP R0, R11
+ BEQ final
+
+tail:
+ MOVDU 8(R8), R20
+ ADDZE R20, R24
+ ADD $-1, R11
+ MOVDU R24, 8(R10)
+ CMP R0, R11
+ BEQ final
+
+ MOVDU 8(R8), R20
+ ADDZE R20, R24
+ ADD $-1, R11
+ MOVDU R24, 8(R10)
+ CMP R0, R11
+ BEQ final
+
+ MOVD 8(R8), R20
+ ADDZE R20, R24
+ MOVD R24, 8(R10)
+
+final:
+ ADDZE R0, R4 // c = CA
+done:
+ MOVD R4, c+56(FP)
+ RET
+
+// func subVW(z, x []Word, y Word) (c Word)
+TEXT ·subVW(SB), NOSPLIT, $0
+ MOVD z+0(FP), R10 // R10 = z[]
+ MOVD x+24(FP), R8 // R8 = x[]
+ MOVD y+48(FP), R4 // R4 = y = c
+ MOVD z_len+8(FP), R11 // R11 = z_len
+
+ CMP R0, R11 // If z_len is zero, return
+ BEQ done
+
+ // We will process the first iteration out of the loop so we capture
+ // the value of c. In the subsequent iterations, we will rely on the
+ // value of CA set here.
+ MOVD 0(R8), R20 // R20 = x[i]
+ ADD $-1, R11 // R11 = z_len - 1
+ SUBC R4, R20, R6 // R6 = x[i] - c
+ CMP R0, R11 // If z_len was 1, we are done
+ MOVD R6, 0(R10) // z[i]
+ BEQ final
+
+ // We will read 4 elements per iteration
+ SRD $2, R11, R9 // R9 = z_len/4
+ DCBT (R8)
+ CMP R0, R9
+ MOVD R9, CTR // Set up the loop counter
+ BEQ tail // If R9 = 0, we can't use the loop
+
+ // The loop here is almost the same as the one used in s390x, but
+ // we don't need to capture CA every iteration because we've already
+ // done that above.
+loop:
+ MOVD 8(R8), R20
+ MOVD 16(R8), R21
+ MOVD 24(R8), R22
+ MOVDU 32(R8), R23
+ SUBE R0, R20
+ SUBE R0, R21
+ SUBE R0, R22
+ SUBE R0, R23
+ MOVD R20, 8(R10)
+ MOVD R21, 16(R10)
+ MOVD R22, 24(R10)
+ MOVDU R23, 32(R10)
+ ADD $-4, R11
+ BC 16, 0, loop // bdnz
+
+ // We may have some elements to read
+ CMP R0, R11
+ BEQ final
+
+tail:
+ MOVDU 8(R8), R20
+ SUBE R0, R20
+ ADD $-1, R11
+ MOVDU R20, 8(R10)
+ CMP R0, R11
+ BEQ final
+
+ MOVDU 8(R8), R20
+ SUBE R0, R20
+ ADD $-1, R11
+ MOVDU R20, 8(R10)
+ CMP R0, R11
+ BEQ final
+
+ MOVD 8(R8), R20
+ SUBE R0, R20
+ MOVD R20, 8(R10)
+
+final:
+ // Capture CA
+ SUBE R4, R4
+ NEG R4, R4
+
+done:
+ MOVD R4, c+56(FP)
+ RET
+
+TEXT ·shlVU(SB), NOSPLIT, $0
+ BR ·shlVU_g(SB)
+
+TEXT ·shrVU(SB), NOSPLIT, $0
+ BR ·shrVU_g(SB)
+
+// func mulAddVWW(z, x []Word, y, r Word) (c Word)
+TEXT ·mulAddVWW(SB), NOSPLIT, $0
+ MOVD z+0(FP), R10 // R10 = z[]
+ MOVD x+24(FP), R8 // R8 = x[]
+ MOVD y+48(FP), R9 // R9 = y
+ MOVD r+56(FP), R4 // R4 = r = c
+ MOVD z_len+8(FP), R11 // R11 = z_len
+
+ CMP R0, R11
+ BEQ done
+
+ MOVD 0(R8), R20
+ ADD $-1, R11
+ MULLD R9, R20, R6 // R6 = z0 = Low-order(x[i]*y)
+ MULHDU R9, R20, R7 // R7 = z1 = High-order(x[i]*y)
+ ADDC R4, R6 // R6 = z0 + r
+ ADDZE R7 // R7 = z1 + CA
+ CMP R0, R11
+ MOVD R7, R4 // R4 = c
+ MOVD R6, 0(R10) // z[i]
+ BEQ done
+
+ // We will read 4 elements per iteration
+ SRD $2, R11, R14 // R14 = z_len/4
+ DCBT (R8)
+ CMP R0, R14
+ MOVD R14, CTR // Set up the loop counter
+ BEQ tail // If R9 = 0, we can't use the loop
+
+loop:
+ MOVD 8(R8), R20 // R20 = x[i]
+ MOVD 16(R8), R21 // R21 = x[i+1]
+ MOVD 24(R8), R22 // R22 = x[i+2]
+ MOVDU 32(R8), R23 // R23 = x[i+3]
+ MULLD R9, R20, R24 // R24 = z0[i]
+ MULHDU R9, R20, R20 // R20 = z1[i]
+ ADDC R4, R24 // R24 = z0[i] + c
+ ADDZE R20 // R7 = z1[i] + CA
+ MULLD R9, R21, R25
+ MULHDU R9, R21, R21
+ ADDC R20, R25
+ ADDZE R21
+ MULLD R9, R22, R26
+ MULHDU R9, R22, R22
+ MULLD R9, R23, R27
+ MULHDU R9, R23, R23
+ ADDC R21, R26
+ ADDZE R22
+ MOVD R24, 8(R10) // z[i]
+ MOVD R25, 16(R10) // z[i+1]
+ ADDC R22, R27
+ ADDZE R23,R4 // update carry
+ MOVD R26, 24(R10) // z[i+2]
+ MOVDU R27, 32(R10) // z[i+3]
+ ADD $-4, R11 // R11 = z_len - 4
+ BC 16, 0, loop // bdnz
+
+ // We may have some elements to read
+ CMP R0, R11
+ BEQ done
+
+ // Process the remaining elements, one at a time
+tail:
+ MOVDU 8(R8), R20 // R20 = x[i]
+ MULLD R9, R20, R24 // R24 = z0[i]
+ MULHDU R9, R20, R25 // R25 = z1[i]
+ ADD $-1, R11 // R11 = z_len - 1
+ ADDC R4, R24
+ ADDZE R25
+ MOVDU R24, 8(R10) // z[i]
+ CMP R0, R11
+ MOVD R25, R4 // R4 = c
+ BEQ done // If R11 = 0, we are done
+
+ MOVDU 8(R8), R20
+ MULLD R9, R20, R24
+ MULHDU R9, R20, R25
+ ADD $-1, R11
+ ADDC R4, R24
+ ADDZE R25
+ MOVDU R24, 8(R10)
+ CMP R0, R11
+ MOVD R25, R4
+ BEQ done
+
+ MOVD 8(R8), R20
+ MULLD R9, R20, R24
+ MULHDU R9, R20, R25
+ ADD $-1, R11
+ ADDC R4, R24
+ ADDZE R25
+ MOVD R24, 8(R10)
+ MOVD R25, R4
+
+done:
+ MOVD R4, c+64(FP)
+ RET
+
+// func addMulVVW(z, x []Word, y Word) (c Word)
+TEXT ·addMulVVW(SB), NOSPLIT, $0
+ MOVD z+0(FP), R10 // R10 = z[]
+ MOVD x+24(FP), R8 // R8 = x[]
+ MOVD y+48(FP), R9 // R9 = y
+ MOVD z_len+8(FP), R22 // R22 = z_len
+
+ MOVD R0, R3 // R3 will be the index register
+ CMP R0, R22
+ MOVD R0, R4 // R4 = c = 0
+ MOVD R22, CTR // Initialize loop counter
+ BEQ done
+
+loop:
+ MOVD (R8)(R3), R20 // Load x[i]
+ MOVD (R10)(R3), R21 // Load z[i]
+ MULLD R9, R20, R6 // R6 = Low-order(x[i]*y)
+ MULHDU R9, R20, R7 // R7 = High-order(x[i]*y)
+ ADDC R21, R6 // R6 = z0
+ ADDZE R7 // R7 = z1
+ ADDC R4, R6 // R6 = z0 + c + 0
+ ADDZE R7, R4 // c += z1
+ MOVD R6, (R10)(R3) // Store z[i]
+ ADD $8, R3
+ BC 16, 0, loop // bdnz
+
+done:
+ MOVD R4, c+56(FP)
+ RET
+
+