summaryrefslogtreecommitdiffstats
path: root/src/math/big/arith_ppc64x.s
diff options
context:
space:
mode:
Diffstat (limited to 'src/math/big/arith_ppc64x.s')
-rw-r--r--src/math/big/arith_ppc64x.s633
1 files changed, 633 insertions, 0 deletions
diff --git a/src/math/big/arith_ppc64x.s b/src/math/big/arith_ppc64x.s
new file mode 100644
index 0000000..5fdbf40
--- /dev/null
+++ b/src/math/big/arith_ppc64x.s
@@ -0,0 +1,633 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !math_big_pure_go && (ppc64 || ppc64le)
+// +build !math_big_pure_go
+// +build ppc64 ppc64le
+
+#include "textflag.h"
+
+// This file provides fast assembly versions for the elementary
+// arithmetic operations on vectors implemented in arith.go.
+
+// func addVV(z, y, y []Word) (c Word)
+// z[i] = x[i] + y[i] for all i, carrying
+TEXT ·addVV(SB), NOSPLIT, $0
+ MOVD z_len+8(FP), R7 // R7 = z_len
+ MOVD x+24(FP), R8 // R8 = x[]
+ MOVD y+48(FP), R9 // R9 = y[]
+ MOVD z+0(FP), R10 // R10 = z[]
+
+ // If z_len = 0, we are done
+ CMP R0, R7
+ MOVD R0, R4
+ BEQ done
+
+ // Process the first iteration out of the loop so we can
+ // use MOVDU and avoid 3 index registers updates.
+ MOVD 0(R8), R11 // R11 = x[i]
+ MOVD 0(R9), R12 // R12 = y[i]
+ ADD $-1, R7 // R7 = z_len - 1
+ ADDC R12, R11, R15 // R15 = x[i] + y[i], set CA
+ CMP R0, R7
+ MOVD R15, 0(R10) // z[i]
+ BEQ final // If z_len was 1, we are done
+
+ SRD $2, R7, R5 // R5 = z_len/4
+ CMP R0, R5
+ MOVD R5, CTR // Set up loop counter
+ BEQ tail // If R5 = 0, we can't use the loop
+
+ // Process 4 elements per iteration. Unrolling this loop
+ // means a performance trade-off: we will lose performance
+ // for small values of z_len (0.90x in the worst case), but
+ // gain significant performance as z_len increases (up to
+ // 1.45x).
+
+ PCALIGN $32
+loop:
+ MOVD 8(R8), R11 // R11 = x[i]
+ MOVD 16(R8), R12 // R12 = x[i+1]
+ MOVD 24(R8), R14 // R14 = x[i+2]
+ MOVDU 32(R8), R15 // R15 = x[i+3]
+ MOVD 8(R9), R16 // R16 = y[i]
+ MOVD 16(R9), R17 // R17 = y[i+1]
+ MOVD 24(R9), R18 // R18 = y[i+2]
+ MOVDU 32(R9), R19 // R19 = y[i+3]
+ ADDE R11, R16, R20 // R20 = x[i] + y[i] + CA
+ ADDE R12, R17, R21 // R21 = x[i+1] + y[i+1] + CA
+ ADDE R14, R18, R22 // R22 = x[i+2] + y[i+2] + CA
+ ADDE R15, R19, R23 // R23 = x[i+3] + y[i+3] + CA
+ MOVD R20, 8(R10) // z[i]
+ MOVD R21, 16(R10) // z[i+1]
+ MOVD R22, 24(R10) // z[i+2]
+ MOVDU R23, 32(R10) // z[i+3]
+ ADD $-4, R7 // R7 = z_len - 4
+ BC 16, 0, loop // bdnz
+
+ // We may have more elements to read
+ CMP R0, R7
+ BEQ final
+
+ // Process the remaining elements, one at a time
+tail:
+ MOVDU 8(R8), R11 // R11 = x[i]
+ MOVDU 8(R9), R16 // R16 = y[i]
+ ADD $-1, R7 // R7 = z_len - 1
+ ADDE R11, R16, R20 // R20 = x[i] + y[i] + CA
+ CMP R0, R7
+ MOVDU R20, 8(R10) // z[i]
+ BEQ final // If R7 = 0, we are done
+
+ MOVDU 8(R8), R11
+ MOVDU 8(R9), R16
+ ADD $-1, R7
+ ADDE R11, R16, R20
+ CMP R0, R7
+ MOVDU R20, 8(R10)
+ BEQ final
+
+ MOVD 8(R8), R11
+ MOVD 8(R9), R16
+ ADDE R11, R16, R20
+ MOVD R20, 8(R10)
+
+final:
+ ADDZE R4 // Capture CA
+
+done:
+ MOVD R4, c+72(FP)
+ RET
+
+// func subVV(z, x, y []Word) (c Word)
+// z[i] = x[i] - y[i] for all i, carrying
+TEXT ·subVV(SB), NOSPLIT, $0
+ MOVD z_len+8(FP), R7 // R7 = z_len
+ MOVD x+24(FP), R8 // R8 = x[]
+ MOVD y+48(FP), R9 // R9 = y[]
+ MOVD z+0(FP), R10 // R10 = z[]
+
+ // If z_len = 0, we are done
+ CMP R0, R7
+ MOVD R0, R4
+ BEQ done
+
+ // Process the first iteration out of the loop so we can
+ // use MOVDU and avoid 3 index registers updates.
+ MOVD 0(R8), R11 // R11 = x[i]
+ MOVD 0(R9), R12 // R12 = y[i]
+ ADD $-1, R7 // R7 = z_len - 1
+ SUBC R12, R11, R15 // R15 = x[i] - y[i], set CA
+ CMP R0, R7
+ MOVD R15, 0(R10) // z[i]
+ BEQ final // If z_len was 1, we are done
+
+ SRD $2, R7, R5 // R5 = z_len/4
+ CMP R0, R5
+ MOVD R5, CTR // Set up loop counter
+ BEQ tail // If R5 = 0, we can't use the loop
+
+ // Process 4 elements per iteration. Unrolling this loop
+ // means a performance trade-off: we will lose performance
+ // for small values of z_len (0.92x in the worst case), but
+ // gain significant performance as z_len increases (up to
+ // 1.45x).
+
+ PCALIGN $32
+loop:
+ MOVD 8(R8), R11 // R11 = x[i]
+ MOVD 16(R8), R12 // R12 = x[i+1]
+ MOVD 24(R8), R14 // R14 = x[i+2]
+ MOVDU 32(R8), R15 // R15 = x[i+3]
+ MOVD 8(R9), R16 // R16 = y[i]
+ MOVD 16(R9), R17 // R17 = y[i+1]
+ MOVD 24(R9), R18 // R18 = y[i+2]
+ MOVDU 32(R9), R19 // R19 = y[i+3]
+ SUBE R16, R11, R20 // R20 = x[i] - y[i] + CA
+ SUBE R17, R12, R21 // R21 = x[i+1] - y[i+1] + CA
+ SUBE R18, R14, R22 // R22 = x[i+2] - y[i+2] + CA
+ SUBE R19, R15, R23 // R23 = x[i+3] - y[i+3] + CA
+ MOVD R20, 8(R10) // z[i]
+ MOVD R21, 16(R10) // z[i+1]
+ MOVD R22, 24(R10) // z[i+2]
+ MOVDU R23, 32(R10) // z[i+3]
+ ADD $-4, R7 // R7 = z_len - 4
+ BC 16, 0, loop // bdnz
+
+ // We may have more elements to read
+ CMP R0, R7
+ BEQ final
+
+ // Process the remaining elements, one at a time
+tail:
+ MOVDU 8(R8), R11 // R11 = x[i]
+ MOVDU 8(R9), R16 // R16 = y[i]
+ ADD $-1, R7 // R7 = z_len - 1
+ SUBE R16, R11, R20 // R20 = x[i] - y[i] + CA
+ CMP R0, R7
+ MOVDU R20, 8(R10) // z[i]
+ BEQ final // If R7 = 0, we are done
+
+ MOVDU 8(R8), R11
+ MOVDU 8(R9), R16
+ ADD $-1, R7
+ SUBE R16, R11, R20
+ CMP R0, R7
+ MOVDU R20, 8(R10)
+ BEQ final
+
+ MOVD 8(R8), R11
+ MOVD 8(R9), R16
+ SUBE R16, R11, R20
+ MOVD R20, 8(R10)
+
+final:
+ ADDZE R4
+ XOR $1, R4
+
+done:
+ MOVD R4, c+72(FP)
+ RET
+
+// func addVW(z, x []Word, y Word) (c Word)
+TEXT ·addVW(SB), NOSPLIT, $0
+ MOVD z+0(FP), R10 // R10 = z[]
+ MOVD x+24(FP), R8 // R8 = x[]
+ MOVD y+48(FP), R4 // R4 = y = c
+ MOVD z_len+8(FP), R11 // R11 = z_len
+
+ CMP R0, R11 // If z_len is zero, return
+ BEQ done
+
+ // We will process the first iteration out of the loop so we capture
+ // the value of c. In the subsequent iterations, we will rely on the
+ // value of CA set here.
+ MOVD 0(R8), R20 // R20 = x[i]
+ ADD $-1, R11 // R11 = z_len - 1
+ ADDC R20, R4, R6 // R6 = x[i] + c
+ CMP R0, R11 // If z_len was 1, we are done
+ MOVD R6, 0(R10) // z[i]
+ BEQ final
+
+ // We will read 4 elements per iteration
+ SRD $2, R11, R9 // R9 = z_len/4
+ DCBT (R8)
+ CMP R0, R9
+ MOVD R9, CTR // Set up the loop counter
+ BEQ tail // If R9 = 0, we can't use the loop
+ PCALIGN $32
+
+loop:
+ MOVD 8(R8), R20 // R20 = x[i]
+ MOVD 16(R8), R21 // R21 = x[i+1]
+ MOVD 24(R8), R22 // R22 = x[i+2]
+ MOVDU 32(R8), R23 // R23 = x[i+3]
+ ADDZE R20, R24 // R24 = x[i] + CA
+ ADDZE R21, R25 // R25 = x[i+1] + CA
+ ADDZE R22, R26 // R26 = x[i+2] + CA
+ ADDZE R23, R27 // R27 = x[i+3] + CA
+ MOVD R24, 8(R10) // z[i]
+ MOVD R25, 16(R10) // z[i+1]
+ MOVD R26, 24(R10) // z[i+2]
+ MOVDU R27, 32(R10) // z[i+3]
+ ADD $-4, R11 // R11 = z_len - 4
+ BC 16, 0, loop // bdnz
+
+ // We may have some elements to read
+ CMP R0, R11
+ BEQ final
+
+tail:
+ MOVDU 8(R8), R20
+ ADDZE R20, R24
+ ADD $-1, R11
+ MOVDU R24, 8(R10)
+ CMP R0, R11
+ BEQ final
+
+ MOVDU 8(R8), R20
+ ADDZE R20, R24
+ ADD $-1, R11
+ MOVDU R24, 8(R10)
+ CMP R0, R11
+ BEQ final
+
+ MOVD 8(R8), R20
+ ADDZE R20, R24
+ MOVD R24, 8(R10)
+
+final:
+ ADDZE R0, R4 // c = CA
+done:
+ MOVD R4, c+56(FP)
+ RET
+
+// func subVW(z, x []Word, y Word) (c Word)
+TEXT ·subVW(SB), NOSPLIT, $0
+ MOVD z+0(FP), R10 // R10 = z[]
+ MOVD x+24(FP), R8 // R8 = x[]
+ MOVD y+48(FP), R4 // R4 = y = c
+ MOVD z_len+8(FP), R11 // R11 = z_len
+
+ CMP R0, R11 // If z_len is zero, return
+ BEQ done
+
+ // We will process the first iteration out of the loop so we capture
+ // the value of c. In the subsequent iterations, we will rely on the
+ // value of CA set here.
+ MOVD 0(R8), R20 // R20 = x[i]
+ ADD $-1, R11 // R11 = z_len - 1
+ SUBC R4, R20, R6 // R6 = x[i] - c
+ CMP R0, R11 // If z_len was 1, we are done
+ MOVD R6, 0(R10) // z[i]
+ BEQ final
+
+ // We will read 4 elements per iteration
+ SRD $2, R11, R9 // R9 = z_len/4
+ DCBT (R8)
+ CMP R0, R9
+ MOVD R9, CTR // Set up the loop counter
+ BEQ tail // If R9 = 0, we can't use the loop
+
+ // The loop here is almost the same as the one used in s390x, but
+ // we don't need to capture CA every iteration because we've already
+ // done that above.
+
+ PCALIGN $32
+loop:
+ MOVD 8(R8), R20
+ MOVD 16(R8), R21
+ MOVD 24(R8), R22
+ MOVDU 32(R8), R23
+ SUBE R0, R20
+ SUBE R0, R21
+ SUBE R0, R22
+ SUBE R0, R23
+ MOVD R20, 8(R10)
+ MOVD R21, 16(R10)
+ MOVD R22, 24(R10)
+ MOVDU R23, 32(R10)
+ ADD $-4, R11
+ BC 16, 0, loop // bdnz
+
+ // We may have some elements to read
+ CMP R0, R11
+ BEQ final
+
+tail:
+ MOVDU 8(R8), R20
+ SUBE R0, R20
+ ADD $-1, R11
+ MOVDU R20, 8(R10)
+ CMP R0, R11
+ BEQ final
+
+ MOVDU 8(R8), R20
+ SUBE R0, R20
+ ADD $-1, R11
+ MOVDU R20, 8(R10)
+ CMP R0, R11
+ BEQ final
+
+ MOVD 8(R8), R20
+ SUBE R0, R20
+ MOVD R20, 8(R10)
+
+final:
+ // Capture CA
+ SUBE R4, R4
+ NEG R4, R4
+
+done:
+ MOVD R4, c+56(FP)
+ RET
+
+//func shlVU(z, x []Word, s uint) (c Word)
+TEXT ·shlVU(SB), NOSPLIT, $0
+ MOVD z+0(FP), R3
+ MOVD x+24(FP), R6
+ MOVD s+48(FP), R9
+ MOVD z_len+8(FP), R4
+ MOVD x_len+32(FP), R7
+ CMP R9, R0 // s==0 copy(z,x)
+ BEQ zeroshift
+ CMP R4, R0 // len(z)==0 return
+ BEQ done
+
+ ADD $-1, R4, R5 // len(z)-1
+ SUBC R9, $64, R4 // ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)
+ SLD $3, R5, R7
+ ADD R6, R7, R15 // save starting address &x[len(z)-1]
+ ADD R3, R7, R16 // save starting address &z[len(z)-1]
+ MOVD (R6)(R7), R14
+ SRD R4, R14, R7 // compute x[len(z)-1]>>ŝ into R7
+ CMP R5, R0 // iterate from i=len(z)-1 to 0
+ BEQ loopexit // Already at end?
+ MOVD 0(R15),R10 // x[i]
+ PCALIGN $32
+shloop:
+ SLD R9, R10, R10 // x[i]<<s
+ MOVDU -8(R15), R14
+ SRD R4, R14, R11 // x[i-1]>>ŝ
+ OR R11, R10, R10
+ MOVD R10, 0(R16) // z[i-1]=x[i]<<s | x[i-1]>>ŝ
+ MOVD R14, R10 // reuse x[i-1] for next iteration
+ ADD $-8, R16 // i--
+ CMP R15, R6 // &x[i-1]>&x[0]?
+ BGT shloop
+loopexit:
+ MOVD 0(R6), R4
+ SLD R9, R4, R4
+ MOVD R4, 0(R3) // z[0]=x[0]<<s
+ MOVD R7, c+56(FP) // store pre-computed x[len(z)-1]>>ŝ into c
+ RET
+
+zeroshift:
+ CMP R6, R0 // x is null, nothing to copy
+ BEQ done
+ CMP R6, R3 // if x is same as z, nothing to copy
+ BEQ done
+ CMP R7, R4
+ ISEL $0, R7, R4, R7 // Take the lower bound of lengths of x,z
+ SLD $3, R7, R7
+ SUB R6, R3, R11 // dest - src
+ CMPU R11, R7, CR2 // < len?
+ BLT CR2, backward // there is overlap, copy backwards
+ MOVD $0, R14
+ // shlVU processes backwards, but added a forward copy option
+ // since its faster on POWER
+repeat:
+ MOVD (R6)(R14), R15 // Copy 8 bytes at a time
+ MOVD R15, (R3)(R14)
+ ADD $8, R14
+ CMP R14, R7 // More 8 bytes left?
+ BLT repeat
+ BR done
+backward:
+ ADD $-8,R7, R14
+repeatback:
+ MOVD (R6)(R14), R15 // copy x into z backwards
+ MOVD R15, (R3)(R14) // copy 8 bytes at a time
+ SUB $8, R14
+ CMP R14, $-8 // More 8 bytes left?
+ BGT repeatback
+
+done:
+ MOVD R0, c+56(FP) // c=0
+ RET
+
+//func shrVU(z, x []Word, s uint) (c Word)
+TEXT ·shrVU(SB), NOSPLIT, $0
+ MOVD z+0(FP), R3
+ MOVD x+24(FP), R6
+ MOVD s+48(FP), R9
+ MOVD z_len+8(FP), R4
+ MOVD x_len+32(FP), R7
+
+ CMP R9, R0 // s==0, copy(z,x)
+ BEQ zeroshift
+ CMP R4, R0 // len(z)==0 return
+ BEQ done
+ SUBC R9, $64, R5 // ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)
+
+ MOVD 0(R6), R7
+ SLD R5, R7, R7 // compute x[0]<<ŝ
+ MOVD $1, R8 // iterate from i=1 to i<len(z)
+ CMP R8, R4
+ BGE loopexit // Already at end?
+
+ // vectorize if len(z) is >=3, else jump to scalar loop
+ CMP R4, $3
+ BLT scalar
+ MTVSRD R9, VS38 // s
+ VSPLTB $7, V6, V4
+ MTVSRD R5, VS39 // ŝ
+ VSPLTB $7, V7, V2
+ ADD $-2, R4, R16
+ PCALIGN $16
+loopback:
+ ADD $-1, R8, R10
+ SLD $3, R10
+ LXVD2X (R6)(R10), VS32 // load x[i-1], x[i]
+ SLD $3, R8, R12
+ LXVD2X (R6)(R12), VS33 // load x[i], x[i+1]
+
+ VSRD V0, V4, V3 // x[i-1]>>s, x[i]>>s
+ VSLD V1, V2, V5 // x[i]<<ŝ, x[i+1]<<ŝ
+ VOR V3, V5, V5 // Or(|) the two registers together
+ STXVD2X VS37, (R3)(R10) // store into z[i-1] and z[i]
+ ADD $2, R8 // Done processing 2 entries, i and i+1
+ CMP R8, R16 // Are there at least a couple of more entries left?
+ BLE loopback
+ CMP R8, R4 // Are we at the last element?
+ BEQ loopexit
+scalar:
+ ADD $-1, R8, R10
+ SLD $3, R10
+ MOVD (R6)(R10),R11
+ SRD R9, R11, R11 // x[len(z)-2] >> s
+ SLD $3, R8, R12
+ MOVD (R6)(R12), R12
+ SLD R5, R12, R12 // x[len(z)-1]<<ŝ
+ OR R12, R11, R11 // x[len(z)-2]>>s | x[len(z)-1]<<ŝ
+ MOVD R11, (R3)(R10) // z[len(z)-2]=x[len(z)-2]>>s | x[len(z)-1]<<ŝ
+loopexit:
+ ADD $-1, R4
+ SLD $3, R4
+ MOVD (R6)(R4), R5
+ SRD R9, R5, R5 // x[len(z)-1]>>s
+ MOVD R5, (R3)(R4) // z[len(z)-1]=x[len(z)-1]>>s
+ MOVD R7, c+56(FP) // store pre-computed x[0]<<ŝ into c
+ RET
+
+zeroshift:
+ CMP R6, R0 // x is null, nothing to copy
+ BEQ done
+ CMP R6, R3 // if x is same as z, nothing to copy
+ BEQ done
+ CMP R7, R4
+ ISEL $0, R7, R4, R7 // Take the lower bounds of lengths of x, z
+ SLD $3, R7, R7
+ MOVD $0, R14
+repeat:
+ MOVD (R6)(R14), R15 // copy 8 bytes at a time
+ MOVD R15, (R3)(R14) // shrVU processes bytes only forwards
+ ADD $8, R14
+ CMP R14, R7 // More 8 bytes left?
+ BLT repeat
+done:
+ MOVD R0, c+56(FP)
+ RET
+
+// func mulAddVWW(z, x []Word, y, r Word) (c Word)
+TEXT ·mulAddVWW(SB), NOSPLIT, $0
+ MOVD z+0(FP), R10 // R10 = z[]
+ MOVD x+24(FP), R8 // R8 = x[]
+ MOVD y+48(FP), R9 // R9 = y
+ MOVD r+56(FP), R4 // R4 = r = c
+ MOVD z_len+8(FP), R11 // R11 = z_len
+
+ CMP R0, R11
+ BEQ done
+
+ MOVD 0(R8), R20
+ ADD $-1, R11
+ MULLD R9, R20, R6 // R6 = z0 = Low-order(x[i]*y)
+ MULHDU R9, R20, R7 // R7 = z1 = High-order(x[i]*y)
+ ADDC R4, R6 // R6 = z0 + r
+ ADDZE R7 // R7 = z1 + CA
+ CMP R0, R11
+ MOVD R7, R4 // R4 = c
+ MOVD R6, 0(R10) // z[i]
+ BEQ done
+
+ // We will read 4 elements per iteration
+ SRD $2, R11, R14 // R14 = z_len/4
+ DCBT (R8)
+ CMP R0, R14
+ MOVD R14, CTR // Set up the loop counter
+ BEQ tail // If R9 = 0, we can't use the loop
+ PCALIGN $32
+
+loop:
+ MOVD 8(R8), R20 // R20 = x[i]
+ MOVD 16(R8), R21 // R21 = x[i+1]
+ MOVD 24(R8), R22 // R22 = x[i+2]
+ MOVDU 32(R8), R23 // R23 = x[i+3]
+ MULLD R9, R20, R24 // R24 = z0[i]
+ MULHDU R9, R20, R20 // R20 = z1[i]
+ ADDC R4, R24 // R24 = z0[i] + c
+ ADDZE R20 // R7 = z1[i] + CA
+ MULLD R9, R21, R25
+ MULHDU R9, R21, R21
+ ADDC R20, R25
+ ADDZE R21
+ MULLD R9, R22, R26
+ MULHDU R9, R22, R22
+ MULLD R9, R23, R27
+ MULHDU R9, R23, R23
+ ADDC R21, R26
+ ADDZE R22
+ MOVD R24, 8(R10) // z[i]
+ MOVD R25, 16(R10) // z[i+1]
+ ADDC R22, R27
+ ADDZE R23,R4 // update carry
+ MOVD R26, 24(R10) // z[i+2]
+ MOVDU R27, 32(R10) // z[i+3]
+ ADD $-4, R11 // R11 = z_len - 4
+ BC 16, 0, loop // bdnz
+
+ // We may have some elements to read
+ CMP R0, R11
+ BEQ done
+
+ // Process the remaining elements, one at a time
+tail:
+ MOVDU 8(R8), R20 // R20 = x[i]
+ MULLD R9, R20, R24 // R24 = z0[i]
+ MULHDU R9, R20, R25 // R25 = z1[i]
+ ADD $-1, R11 // R11 = z_len - 1
+ ADDC R4, R24
+ ADDZE R25
+ MOVDU R24, 8(R10) // z[i]
+ CMP R0, R11
+ MOVD R25, R4 // R4 = c
+ BEQ done // If R11 = 0, we are done
+
+ MOVDU 8(R8), R20
+ MULLD R9, R20, R24
+ MULHDU R9, R20, R25
+ ADD $-1, R11
+ ADDC R4, R24
+ ADDZE R25
+ MOVDU R24, 8(R10)
+ CMP R0, R11
+ MOVD R25, R4
+ BEQ done
+
+ MOVD 8(R8), R20
+ MULLD R9, R20, R24
+ MULHDU R9, R20, R25
+ ADD $-1, R11
+ ADDC R4, R24
+ ADDZE R25
+ MOVD R24, 8(R10)
+ MOVD R25, R4
+
+done:
+ MOVD R4, c+64(FP)
+ RET
+
+// func addMulVVW(z, x []Word, y Word) (c Word)
+TEXT ·addMulVVW(SB), NOSPLIT, $0
+ MOVD z+0(FP), R10 // R10 = z[]
+ MOVD x+24(FP), R8 // R8 = x[]
+ MOVD y+48(FP), R9 // R9 = y
+ MOVD z_len+8(FP), R22 // R22 = z_len
+
+ MOVD R0, R3 // R3 will be the index register
+ CMP R0, R22
+ MOVD R0, R4 // R4 = c = 0
+ MOVD R22, CTR // Initialize loop counter
+ BEQ done
+ PCALIGN $32
+
+loop:
+ MOVD (R8)(R3), R20 // Load x[i]
+ MOVD (R10)(R3), R21 // Load z[i]
+ MULLD R9, R20, R6 // R6 = Low-order(x[i]*y)
+ MULHDU R9, R20, R7 // R7 = High-order(x[i]*y)
+ ADDC R21, R6 // R6 = z0
+ ADDZE R7 // R7 = z1
+ ADDC R4, R6 // R6 = z0 + c + 0
+ ADDZE R7, R4 // c += z1
+ MOVD R6, (R10)(R3) // Store z[i]
+ ADD $8, R3
+ BC 16, 0, loop // bdnz
+
+done:
+ MOVD R4, c+56(FP)
+ RET
+
+