diff options
Diffstat (limited to 'src/math/big/arith_ppc64x.s')
-rw-r--r-- | src/math/big/arith_ppc64x.s | 633 |
1 files changed, 633 insertions, 0 deletions
diff --git a/src/math/big/arith_ppc64x.s b/src/math/big/arith_ppc64x.s new file mode 100644 index 0000000..5fdbf40 --- /dev/null +++ b/src/math/big/arith_ppc64x.s @@ -0,0 +1,633 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !math_big_pure_go && (ppc64 || ppc64le) +// +build !math_big_pure_go +// +build ppc64 ppc64le + +#include "textflag.h" + +// This file provides fast assembly versions for the elementary +// arithmetic operations on vectors implemented in arith.go. + +// func addVV(z, x, y []Word) (c Word) +// z[i] = x[i] + y[i] for all i, carrying +TEXT ·addVV(SB), NOSPLIT, $0 + MOVD z_len+8(FP), R7 // R7 = z_len + MOVD x+24(FP), R8 // R8 = x[] + MOVD y+48(FP), R9 // R9 = y[] + MOVD z+0(FP), R10 // R10 = z[] + + // If z_len = 0, we are done + CMP R0, R7 + MOVD R0, R4 + BEQ done + + // Process the first iteration out of the loop so we can + // use MOVDU and avoid 3 index registers updates. + MOVD 0(R8), R11 // R11 = x[i] + MOVD 0(R9), R12 // R12 = y[i] + ADD $-1, R7 // R7 = z_len - 1 + ADDC R12, R11, R15 // R15 = x[i] + y[i], set CA + CMP R0, R7 + MOVD R15, 0(R10) // z[i] + BEQ final // If z_len was 1, we are done + + SRD $2, R7, R5 // R5 = z_len/4 + CMP R0, R5 + MOVD R5, CTR // Set up loop counter + BEQ tail // If R5 = 0, we can't use the loop + + // Process 4 elements per iteration. Unrolling this loop + // means a performance trade-off: we will lose performance + // for small values of z_len (0.90x in the worst case), but + // gain significant performance as z_len increases (up to + // 1.45x). 
+ + PCALIGN $32 +loop: + MOVD 8(R8), R11 // R11 = x[i] + MOVD 16(R8), R12 // R12 = x[i+1] + MOVD 24(R8), R14 // R14 = x[i+2] + MOVDU 32(R8), R15 // R15 = x[i+3] + MOVD 8(R9), R16 // R16 = y[i] + MOVD 16(R9), R17 // R17 = y[i+1] + MOVD 24(R9), R18 // R18 = y[i+2] + MOVDU 32(R9), R19 // R19 = y[i+3] + ADDE R11, R16, R20 // R20 = x[i] + y[i] + CA + ADDE R12, R17, R21 // R21 = x[i+1] + y[i+1] + CA + ADDE R14, R18, R22 // R22 = x[i+2] + y[i+2] + CA + ADDE R15, R19, R23 // R23 = x[i+3] + y[i+3] + CA + MOVD R20, 8(R10) // z[i] + MOVD R21, 16(R10) // z[i+1] + MOVD R22, 24(R10) // z[i+2] + MOVDU R23, 32(R10) // z[i+3] + ADD $-4, R7 // R7 = z_len - 4 + BC 16, 0, loop // bdnz + + // We may have more elements to read + CMP R0, R7 + BEQ final + + // Process the remaining elements, one at a time +tail: + MOVDU 8(R8), R11 // R11 = x[i] + MOVDU 8(R9), R16 // R16 = y[i] + ADD $-1, R7 // R7 = z_len - 1 + ADDE R11, R16, R20 // R20 = x[i] + y[i] + CA + CMP R0, R7 + MOVDU R20, 8(R10) // z[i] + BEQ final // If R7 = 0, we are done + + MOVDU 8(R8), R11 + MOVDU 8(R9), R16 + ADD $-1, R7 + ADDE R11, R16, R20 + CMP R0, R7 + MOVDU R20, 8(R10) + BEQ final + + MOVD 8(R8), R11 + MOVD 8(R9), R16 + ADDE R11, R16, R20 + MOVD R20, 8(R10) + +final: + ADDZE R4 // Capture CA + +done: + MOVD R4, c+72(FP) + RET + +// func subVV(z, x, y []Word) (c Word) +// z[i] = x[i] - y[i] for all i, carrying +TEXT ·subVV(SB), NOSPLIT, $0 + MOVD z_len+8(FP), R7 // R7 = z_len + MOVD x+24(FP), R8 // R8 = x[] + MOVD y+48(FP), R9 // R9 = y[] + MOVD z+0(FP), R10 // R10 = z[] + + // If z_len = 0, we are done + CMP R0, R7 + MOVD R0, R4 + BEQ done + + // Process the first iteration out of the loop so we can + // use MOVDU and avoid 3 index registers updates. 
+ MOVD 0(R8), R11 // R11 = x[i] + MOVD 0(R9), R12 // R12 = y[i] + ADD $-1, R7 // R7 = z_len - 1 + SUBC R12, R11, R15 // R15 = x[i] - y[i], set CA + CMP R0, R7 + MOVD R15, 0(R10) // z[i] + BEQ final // If z_len was 1, we are done + + SRD $2, R7, R5 // R5 = z_len/4 + CMP R0, R5 + MOVD R5, CTR // Set up loop counter + BEQ tail // If R5 = 0, we can't use the loop + + // Process 4 elements per iteration. Unrolling this loop + // means a performance trade-off: we will lose performance + // for small values of z_len (0.92x in the worst case), but + // gain significant performance as z_len increases (up to + // 1.45x). + + PCALIGN $32 +loop: + MOVD 8(R8), R11 // R11 = x[i] + MOVD 16(R8), R12 // R12 = x[i+1] + MOVD 24(R8), R14 // R14 = x[i+2] + MOVDU 32(R8), R15 // R15 = x[i+3] + MOVD 8(R9), R16 // R16 = y[i] + MOVD 16(R9), R17 // R17 = y[i+1] + MOVD 24(R9), R18 // R18 = y[i+2] + MOVDU 32(R9), R19 // R19 = y[i+3] + SUBE R16, R11, R20 // R20 = x[i] - y[i] + CA + SUBE R17, R12, R21 // R21 = x[i+1] - y[i+1] + CA + SUBE R18, R14, R22 // R22 = x[i+2] - y[i+2] + CA + SUBE R19, R15, R23 // R23 = x[i+3] - y[i+3] + CA + MOVD R20, 8(R10) // z[i] + MOVD R21, 16(R10) // z[i+1] + MOVD R22, 24(R10) // z[i+2] + MOVDU R23, 32(R10) // z[i+3] + ADD $-4, R7 // R7 = z_len - 4 + BC 16, 0, loop // bdnz + + // We may have more elements to read + CMP R0, R7 + BEQ final + + // Process the remaining elements, one at a time +tail: + MOVDU 8(R8), R11 // R11 = x[i] + MOVDU 8(R9), R16 // R16 = y[i] + ADD $-1, R7 // R7 = z_len - 1 + SUBE R16, R11, R20 // R20 = x[i] - y[i] + CA + CMP R0, R7 + MOVDU R20, 8(R10) // z[i] + BEQ final // If R7 = 0, we are done + + MOVDU 8(R8), R11 + MOVDU 8(R9), R16 + ADD $-1, R7 + SUBE R16, R11, R20 + CMP R0, R7 + MOVDU R20, 8(R10) + BEQ final + + MOVD 8(R8), R11 + MOVD 8(R9), R16 + SUBE R16, R11, R20 + MOVD R20, 8(R10) + +final: + ADDZE R4 + XOR $1, R4 + +done: + MOVD R4, c+72(FP) + RET + +// func addVW(z, x []Word, y Word) (c Word) +TEXT ·addVW(SB), NOSPLIT, $0 + MOVD 
z+0(FP), R10 // R10 = z[] + MOVD x+24(FP), R8 // R8 = x[] + MOVD y+48(FP), R4 // R4 = y = c + MOVD z_len+8(FP), R11 // R11 = z_len + + CMP R0, R11 // If z_len is zero, return + BEQ done + + // We will process the first iteration out of the loop so we capture + // the value of c. In the subsequent iterations, we will rely on the + // value of CA set here. + MOVD 0(R8), R20 // R20 = x[i] + ADD $-1, R11 // R11 = z_len - 1 + ADDC R20, R4, R6 // R6 = x[i] + c + CMP R0, R11 // If z_len was 1, we are done + MOVD R6, 0(R10) // z[i] + BEQ final + + // We will read 4 elements per iteration + SRD $2, R11, R9 // R9 = z_len/4 + DCBT (R8) + CMP R0, R9 + MOVD R9, CTR // Set up the loop counter + BEQ tail // If R9 = 0, we can't use the loop + PCALIGN $32 + +loop: + MOVD 8(R8), R20 // R20 = x[i] + MOVD 16(R8), R21 // R21 = x[i+1] + MOVD 24(R8), R22 // R22 = x[i+2] + MOVDU 32(R8), R23 // R23 = x[i+3] + ADDZE R20, R24 // R24 = x[i] + CA + ADDZE R21, R25 // R25 = x[i+1] + CA + ADDZE R22, R26 // R26 = x[i+2] + CA + ADDZE R23, R27 // R27 = x[i+3] + CA + MOVD R24, 8(R10) // z[i] + MOVD R25, 16(R10) // z[i+1] + MOVD R26, 24(R10) // z[i+2] + MOVDU R27, 32(R10) // z[i+3] + ADD $-4, R11 // R11 = z_len - 4 + BC 16, 0, loop // bdnz + + // We may have some elements to read + CMP R0, R11 + BEQ final + +tail: + MOVDU 8(R8), R20 + ADDZE R20, R24 + ADD $-1, R11 + MOVDU R24, 8(R10) + CMP R0, R11 + BEQ final + + MOVDU 8(R8), R20 + ADDZE R20, R24 + ADD $-1, R11 + MOVDU R24, 8(R10) + CMP R0, R11 + BEQ final + + MOVD 8(R8), R20 + ADDZE R20, R24 + MOVD R24, 8(R10) + +final: + ADDZE R0, R4 // c = CA +done: + MOVD R4, c+56(FP) + RET + +// func subVW(z, x []Word, y Word) (c Word) +TEXT ·subVW(SB), NOSPLIT, $0 + MOVD z+0(FP), R10 // R10 = z[] + MOVD x+24(FP), R8 // R8 = x[] + MOVD y+48(FP), R4 // R4 = y = c + MOVD z_len+8(FP), R11 // R11 = z_len + + CMP R0, R11 // If z_len is zero, return + BEQ done + + // We will process the first iteration out of the loop so we capture + // the value of c. 
In the subsequent iterations, we will rely on the + // value of CA set here. + MOVD 0(R8), R20 // R20 = x[i] + ADD $-1, R11 // R11 = z_len - 1 + SUBC R4, R20, R6 // R6 = x[i] - c + CMP R0, R11 // If z_len was 1, we are done + MOVD R6, 0(R10) // z[i] + BEQ final + + // We will read 4 elements per iteration + SRD $2, R11, R9 // R9 = z_len/4 + DCBT (R8) + CMP R0, R9 + MOVD R9, CTR // Set up the loop counter + BEQ tail // If R9 = 0, we can't use the loop + + // The loop here is almost the same as the one used in s390x, but + // we don't need to capture CA every iteration because we've already + // done that above. + + PCALIGN $32 +loop: + MOVD 8(R8), R20 + MOVD 16(R8), R21 + MOVD 24(R8), R22 + MOVDU 32(R8), R23 + SUBE R0, R20 + SUBE R0, R21 + SUBE R0, R22 + SUBE R0, R23 + MOVD R20, 8(R10) + MOVD R21, 16(R10) + MOVD R22, 24(R10) + MOVDU R23, 32(R10) + ADD $-4, R11 + BC 16, 0, loop // bdnz + + // We may have some elements to read + CMP R0, R11 + BEQ final + +tail: + MOVDU 8(R8), R20 + SUBE R0, R20 + ADD $-1, R11 + MOVDU R20, 8(R10) + CMP R0, R11 + BEQ final + + MOVDU 8(R8), R20 + SUBE R0, R20 + ADD $-1, R11 + MOVDU R20, 8(R10) + CMP R0, R11 + BEQ final + + MOVD 8(R8), R20 + SUBE R0, R20 + MOVD R20, 8(R10) + +final: + // Capture CA + SUBE R4, R4 + NEG R4, R4 + +done: + MOVD R4, c+56(FP) + RET + +//func shlVU(z, x []Word, s uint) (c Word) +TEXT ·shlVU(SB), NOSPLIT, $0 + MOVD z+0(FP), R3 + MOVD x+24(FP), R6 + MOVD s+48(FP), R9 + MOVD z_len+8(FP), R4 + MOVD x_len+32(FP), R7 + CMP R9, R0 // s==0 copy(z,x) + BEQ zeroshift + CMP R4, R0 // len(z)==0 return + BEQ done + + ADD $-1, R4, R5 // len(z)-1 + SUBC R9, $64, R4 // ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64) + SLD $3, R5, R7 + ADD R6, R7, R15 // save starting address &x[len(z)-1] + ADD R3, R7, R16 // save starting address &z[len(z)-1] + MOVD (R6)(R7), R14 + SRD R4, R14, R7 // compute x[len(z)-1]>>ŝ into R7 + CMP R5, R0 // iterate from i=len(z)-1 to 0 + BEQ loopexit // Already at end? 
+ MOVD 0(R15),R10 // x[i] + PCALIGN $32 +shloop: + SLD R9, R10, R10 // x[i]<<s + MOVDU -8(R15), R14 + SRD R4, R14, R11 // x[i-1]>>ŝ + OR R11, R10, R10 + MOVD R10, 0(R16) // z[i]=x[i]<<s | x[i-1]>>ŝ + MOVD R14, R10 // reuse x[i-1] for next iteration + ADD $-8, R16 // i-- + CMP R15, R6 // &x[i-1]>&x[0]? + BGT shloop +loopexit: + MOVD 0(R6), R4 + SLD R9, R4, R4 + MOVD R4, 0(R3) // z[0]=x[0]<<s + MOVD R7, c+56(FP) // store pre-computed x[len(z)-1]>>ŝ into c + RET + +zeroshift: + CMP R6, R0 // x is null, nothing to copy + BEQ done + CMP R6, R3 // if x is same as z, nothing to copy + BEQ done + CMP R7, R4 + ISEL $0, R7, R4, R7 // Take the lower bound of lengths of x,z + SLD $3, R7, R7 + SUB R6, R3, R11 // dest - src + CMPU R11, R7, CR2 // < len? + BLT CR2, backward // there is overlap, copy backwards + MOVD $0, R14 + // shlVU processes backwards, but added a forward copy option + // since it's faster on POWER +repeat: + MOVD (R6)(R14), R15 // Copy 8 bytes at a time + MOVD R15, (R3)(R14) + ADD $8, R14 + CMP R14, R7 // More 8 bytes left? + BLT repeat + BR done +backward: + ADD $-8,R7, R14 +repeatback: + MOVD (R6)(R14), R15 // copy x into z backwards + MOVD R15, (R3)(R14) // copy 8 bytes at a time + SUB $8, R14 + CMP R14, $-8 // More 8 bytes left? + BGT repeatback + +done: + MOVD R0, c+56(FP) // c=0 + RET + +//func shrVU(z, x []Word, s uint) (c Word) +TEXT ·shrVU(SB), NOSPLIT, $0 + MOVD z+0(FP), R3 + MOVD x+24(FP), R6 + MOVD s+48(FP), R9 + MOVD z_len+8(FP), R4 + MOVD x_len+32(FP), R7 + + CMP R9, R0 // s==0, copy(z,x) + BEQ zeroshift + CMP R4, R0 // len(z)==0 return + BEQ done + SUBC R9, $64, R5 // ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64) + + MOVD 0(R6), R7 + SLD R5, R7, R7 // compute x[0]<<ŝ + MOVD $1, R8 // iterate from i=1 to i<len(z) + CMP R8, R4 + BGE loopexit // Already at end? 
+ + // vectorize if len(z) is >=3, else jump to scalar loop + CMP R4, $3 + BLT scalar + MTVSRD R9, VS38 // s + VSPLTB $7, V6, V4 + MTVSRD R5, VS39 // ŝ + VSPLTB $7, V7, V2 + ADD $-2, R4, R16 + PCALIGN $16 +loopback: + ADD $-1, R8, R10 + SLD $3, R10 + LXVD2X (R6)(R10), VS32 // load x[i-1], x[i] + SLD $3, R8, R12 + LXVD2X (R6)(R12), VS33 // load x[i], x[i+1] + + VSRD V0, V4, V3 // x[i-1]>>s, x[i]>>s + VSLD V1, V2, V5 // x[i]<<ŝ, x[i+1]<<ŝ + VOR V3, V5, V5 // Or(|) the two registers together + STXVD2X VS37, (R3)(R10) // store into z[i-1] and z[i] + ADD $2, R8 // Done processing 2 entries, i and i+1 + CMP R8, R16 // Are there at least a couple of more entries left? + BLE loopback + CMP R8, R4 // Are we at the last element? + BEQ loopexit +scalar: + ADD $-1, R8, R10 + SLD $3, R10 + MOVD (R6)(R10),R11 + SRD R9, R11, R11 // x[len(z)-2] >> s + SLD $3, R8, R12 + MOVD (R6)(R12), R12 + SLD R5, R12, R12 // x[len(z)-1]<<ŝ + OR R12, R11, R11 // x[len(z)-2]>>s | x[len(z)-1]<<ŝ + MOVD R11, (R3)(R10) // z[len(z)-2]=x[len(z)-2]>>s | x[len(z)-1]<<ŝ +loopexit: + ADD $-1, R4 + SLD $3, R4 + MOVD (R6)(R4), R5 + SRD R9, R5, R5 // x[len(z)-1]>>s + MOVD R5, (R3)(R4) // z[len(z)-1]=x[len(z)-1]>>s + MOVD R7, c+56(FP) // store pre-computed x[0]<<ŝ into c + RET + +zeroshift: + CMP R6, R0 // x is null, nothing to copy + BEQ done + CMP R6, R3 // if x is same as z, nothing to copy + BEQ done + CMP R7, R4 + ISEL $0, R7, R4, R7 // Take the lower bounds of lengths of x, z + SLD $3, R7, R7 + MOVD $0, R14 +repeat: + MOVD (R6)(R14), R15 // copy 8 bytes at a time + MOVD R15, (R3)(R14) // shrVU processes bytes only forwards + ADD $8, R14 + CMP R14, R7 // More 8 bytes left? 
+ BLT repeat +done: + MOVD R0, c+56(FP) + RET + +// func mulAddVWW(z, x []Word, y, r Word) (c Word) +TEXT ·mulAddVWW(SB), NOSPLIT, $0 + MOVD z+0(FP), R10 // R10 = z[] + MOVD x+24(FP), R8 // R8 = x[] + MOVD y+48(FP), R9 // R9 = y + MOVD r+56(FP), R4 // R4 = r = c + MOVD z_len+8(FP), R11 // R11 = z_len + + CMP R0, R11 + BEQ done + + MOVD 0(R8), R20 + ADD $-1, R11 + MULLD R9, R20, R6 // R6 = z0 = Low-order(x[i]*y) + MULHDU R9, R20, R7 // R7 = z1 = High-order(x[i]*y) + ADDC R4, R6 // R6 = z0 + r + ADDZE R7 // R7 = z1 + CA + CMP R0, R11 + MOVD R7, R4 // R4 = c + MOVD R6, 0(R10) // z[i] + BEQ done + + // We will read 4 elements per iteration + SRD $2, R11, R14 // R14 = z_len/4 + DCBT (R8) + CMP R0, R14 + MOVD R14, CTR // Set up the loop counter + BEQ tail // If R14 = 0, we can't use the loop + PCALIGN $32 + +loop: + MOVD 8(R8), R20 // R20 = x[i] + MOVD 16(R8), R21 // R21 = x[i+1] + MOVD 24(R8), R22 // R22 = x[i+2] + MOVDU 32(R8), R23 // R23 = x[i+3] + MULLD R9, R20, R24 // R24 = z0[i] + MULHDU R9, R20, R20 // R20 = z1[i] + ADDC R4, R24 // R24 = z0[i] + c + ADDZE R20 // R20 = z1[i] + CA + MULLD R9, R21, R25 + MULHDU R9, R21, R21 + ADDC R20, R25 + ADDZE R21 + MULLD R9, R22, R26 + MULHDU R9, R22, R22 + MULLD R9, R23, R27 + MULHDU R9, R23, R23 + ADDC R21, R26 + ADDZE R22 + MOVD R24, 8(R10) // z[i] + MOVD R25, 16(R10) // z[i+1] + ADDC R22, R27 + ADDZE R23,R4 // update carry + MOVD R26, 24(R10) // z[i+2] + MOVDU R27, 32(R10) // z[i+3] + ADD $-4, R11 // R11 = z_len - 4 + BC 16, 0, loop // bdnz + + // We may have some elements to read + CMP R0, R11 + BEQ done + + // Process the remaining elements, one at a time +tail: + MOVDU 8(R8), R20 // R20 = x[i] + MULLD R9, R20, R24 // R24 = z0[i] + MULHDU R9, R20, R25 // R25 = z1[i] + ADD $-1, R11 // R11 = z_len - 1 + ADDC R4, R24 + ADDZE R25 + MOVDU R24, 8(R10) // z[i] + CMP R0, R11 + MOVD R25, R4 // R4 = c + BEQ done // If R11 = 0, we are done + + MOVDU 8(R8), R20 + MULLD R9, R20, R24 + MULHDU R9, R20, R25 + ADD $-1, R11 + ADDC R4, R24 + 
ADDZE R25 + MOVDU R24, 8(R10) + CMP R0, R11 + MOVD R25, R4 + BEQ done + + MOVD 8(R8), R20 + MULLD R9, R20, R24 + MULHDU R9, R20, R25 + ADD $-1, R11 + ADDC R4, R24 + ADDZE R25 + MOVD R24, 8(R10) + MOVD R25, R4 + +done: + MOVD R4, c+64(FP) + RET + +// func addMulVVW(z, x []Word, y Word) (c Word) +TEXT ·addMulVVW(SB), NOSPLIT, $0 + MOVD z+0(FP), R10 // R10 = z[] + MOVD x+24(FP), R8 // R8 = x[] + MOVD y+48(FP), R9 // R9 = y + MOVD z_len+8(FP), R22 // R22 = z_len + + MOVD R0, R3 // R3 will be the index register + CMP R0, R22 + MOVD R0, R4 // R4 = c = 0 + MOVD R22, CTR // Initialize loop counter + BEQ done + PCALIGN $32 + +loop: + MOVD (R8)(R3), R20 // Load x[i] + MOVD (R10)(R3), R21 // Load z[i] + MULLD R9, R20, R6 // R6 = Low-order(x[i]*y) + MULHDU R9, R20, R7 // R7 = High-order(x[i]*y) + ADDC R21, R6 // R6 = z0 + ADDZE R7 // R7 = z1 + ADDC R4, R6 // R6 = z0 + c + 0 + ADDZE R7, R4 // c = z1 + CA + MOVD R6, (R10)(R3) // Store z[i] + ADD $8, R3 + BC 16, 0, loop // bdnz + +done: + MOVD R4, c+56(FP) + RET + + |