diff options
Diffstat (limited to 'src/math/big/arith_arm64.s')
-rw-r--r-- | src/math/big/arith_arm64.s | 584 |
1 files changed, 584 insertions, 0 deletions
diff --git a/src/math/big/arith_arm64.s b/src/math/big/arith_arm64.s new file mode 100644 index 0000000..7bfe08e --- /dev/null +++ b/src/math/big/arith_arm64.s @@ -0,0 +1,584 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !math_big_pure_go +// +build !math_big_pure_go + +#include "textflag.h" + +// This file provides fast assembly versions for the elementary +// arithmetic operations on vectors implemented in arith.go. + +// TODO: Consider re-implementing using Advanced SIMD +// once the assembler supports those instructions. + +// func mulWW(x, y Word) (z1, z0 Word) +TEXT ·mulWW(SB),NOSPLIT,$0 + MOVD x+0(FP), R0 + MOVD y+8(FP), R1 + MUL R0, R1, R2 + UMULH R0, R1, R3 + MOVD R3, z1+16(FP) + MOVD R2, z0+24(FP) + RET + + +// func addVV(z, x, y []Word) (c Word) +TEXT ·addVV(SB),NOSPLIT,$0 + MOVD z_len+8(FP), R0 + MOVD x+24(FP), R8 + MOVD y+48(FP), R9 + MOVD z+0(FP), R10 + ADDS $0, R0 // clear carry flag + TBZ $0, R0, two + MOVD.P 8(R8), R11 + MOVD.P 8(R9), R15 + ADCS R15, R11 + MOVD.P R11, 8(R10) + SUB $1, R0 +two: + TBZ $1, R0, loop + LDP.P 16(R8), (R11, R12) + LDP.P 16(R9), (R15, R16) + ADCS R15, R11 + ADCS R16, R12 + STP.P (R11, R12), 16(R10) + SUB $2, R0 +loop: + CBZ R0, done // careful not to touch the carry flag + LDP.P 32(R8), (R11, R12) + LDP -16(R8), (R13, R14) + LDP.P 32(R9), (R15, R16) + LDP -16(R9), (R17, R19) + ADCS R15, R11 + ADCS R16, R12 + ADCS R17, R13 + ADCS R19, R14 + STP.P (R11, R12), 32(R10) + STP (R13, R14), -16(R10) + SUB $4, R0 + B loop +done: + CSET HS, R0 // extract carry flag + MOVD R0, c+72(FP) + RET + + +// func subVV(z, x, y []Word) (c Word) +TEXT ·subVV(SB),NOSPLIT,$0 + MOVD z_len+8(FP), R0 + MOVD x+24(FP), R8 + MOVD y+48(FP), R9 + MOVD z+0(FP), R10 + CMP R0, R0 // set carry flag + TBZ $0, R0, two + MOVD.P 8(R8), R11 + MOVD.P 8(R9), R15 + SBCS R15, R11 + MOVD.P R11, 8(R10) + SUB $1, R0 +two: + TBZ $1, R0, loop + LDP.P 16(R8), (R11, R12) + LDP.P 16(R9), (R15, R16) + SBCS R15, R11 + SBCS R16, R12 + STP.P (R11, R12), 16(R10) + SUB $2, R0 +loop: + CBZ R0, done // careful not to touch the carry flag + LDP.P 32(R8), (R11, R12) + LDP -16(R8), (R13, R14) + LDP.P 32(R9), (R15, R16) + LDP -16(R9), (R17, R19) + SBCS R15, R11 + SBCS R16, R12 + SBCS R17, R13 + SBCS R19, R14 + STP.P (R11, R12), 32(R10) + STP (R13, R14), -16(R10) + SUB $4, R0 + B loop +done: + CSET LO, R0 // extract carry flag + MOVD R0, c+72(FP) + RET + +#define vwOneOp(instr, op1) \ + MOVD.P 8(R1), R4; \ + instr op1, R4; \ + MOVD.P R4, 8(R3); + +// handle the first 1~4 elements before starting iteration in addVW/subVW +#define vwPreIter(instr1, instr2, counter, target) \ + vwOneOp(instr1, R2); \ + SUB $1, counter; \ + CBZ counter, target; \ + vwOneOp(instr2, $0); \ + SUB $1, counter; \ + CBZ counter, target; \ + vwOneOp(instr2, $0); \ + SUB $1, counter; \ + CBZ counter, target; \ + vwOneOp(instr2, $0); + +// do one iteration of add or sub in addVW/subVW +#define vwOneIter(instr, counter, exit) \ + CBZ counter, exit; \ // careful not to touch the carry flag + LDP.P 32(R1), (R4, R5); \ + LDP -16(R1), (R6, R7); \ + instr $0, R4, R8; \ + instr $0, R5, R9; \ + instr $0, R6, R10; \ + instr $0, R7, R11; \ + STP.P (R8, R9), 32(R3); \ + STP (R10, R11), -16(R3); \ + SUB $4, counter; + +// do one iteration of copy in addVW/subVW +#define vwOneIterCopy(counter, exit) \ + CBZ counter, exit; \ + LDP.P 32(R1), (R4, R5); \ + LDP -16(R1), (R6, R7); \ + STP.P (R4, R5), 32(R3); \ + STP (R6, R7), -16(R3); \ + SUB $4, counter; + +// func addVW(z, x []Word, y Word) (c Word) +// The 'large' branch handles large 'z'. It checks the carry flag on every iteration +// and switches to copy if we are done with carries. The copying is skipped as well +// if 'x' and 'z' happen to share the same underlying storage. +// The overhead of the checking and branching is visible when 'z' are small (~5%), +// so set a threshold of 32, and remain the small-sized part entirely untouched. +TEXT ·addVW(SB),NOSPLIT,$0 + MOVD z+0(FP), R3 + MOVD z_len+8(FP), R0 + MOVD x+24(FP), R1 + MOVD y+48(FP), R2 + CMP $32, R0 + BGE large // large-sized 'z' and 'x' + CBZ R0, len0 // the length of z is 0 + MOVD.P 8(R1), R4 + ADDS R2, R4 // z[0] = x[0] + y, set carry + MOVD.P R4, 8(R3) + SUB $1, R0 + CBZ R0, len1 // the length of z is 1 + TBZ $0, R0, two + MOVD.P 8(R1), R4 // do it once + ADCS $0, R4 + MOVD.P R4, 8(R3) + SUB $1, R0 +two: // do it twice + TBZ $1, R0, loop + LDP.P 16(R1), (R4, R5) + ADCS $0, R4, R8 // c, z[i] = x[i] + c + ADCS $0, R5, R9 + STP.P (R8, R9), 16(R3) + SUB $2, R0 +loop: // do four times per round + vwOneIter(ADCS, R0, len1) + B loop +len1: + CSET HS, R2 // extract carry flag +len0: + MOVD R2, c+56(FP) +done: + RET +large: + AND $0x3, R0, R10 + AND $~0x3, R0 + // unrolling for the first 1~4 elements to avoid saving the carry + // flag in each step, adjust $R0 if we unrolled 4 elements + vwPreIter(ADDS, ADCS, R10, add4) + SUB $4, R0 +add4: + BCC copy + vwOneIter(ADCS, R0, len1) + B add4 +copy: + MOVD ZR, c+56(FP) + CMP R1, R3 + BEQ done +copy_4: // no carry flag, copy the rest + vwOneIterCopy(R0, done) + B copy_4 + +// func subVW(z, x []Word, y Word) (c Word) +// The 'large' branch handles large 'z'. It checks the carry flag on every iteration +// and switches to copy if we are done with carries. The copying is skipped as well +// if 'x' and 'z' happen to share the same underlying storage. +// The overhead of the checking and branching is visible when 'z' are small (~5%), +// so set a threshold of 32, and remain the small-sized part entirely untouched. +TEXT ·subVW(SB),NOSPLIT,$0 + MOVD z+0(FP), R3 + MOVD z_len+8(FP), R0 + MOVD x+24(FP), R1 + MOVD y+48(FP), R2 + CMP $32, R0 + BGE large // large-sized 'z' and 'x' + CBZ R0, len0 // the length of z is 0 + MOVD.P 8(R1), R4 + SUBS R2, R4 // z[0] = x[0] - y, set carry + MOVD.P R4, 8(R3) + SUB $1, R0 + CBZ R0, len1 // the length of z is 1 + TBZ $0, R0, two // do it once + MOVD.P 8(R1), R4 + SBCS $0, R4 + MOVD.P R4, 8(R3) + SUB $1, R0 +two: // do it twice + TBZ $1, R0, loop + LDP.P 16(R1), (R4, R5) + SBCS $0, R4, R8 // c, z[i] = x[i] + c + SBCS $0, R5, R9 + STP.P (R8, R9), 16(R3) + SUB $2, R0 +loop: // do four times per round + vwOneIter(SBCS, R0, len1) + B loop +len1: + CSET LO, R2 // extract carry flag +len0: + MOVD R2, c+56(FP) +done: + RET +large: + AND $0x3, R0, R10 + AND $~0x3, R0 + // unrolling for the first 1~4 elements to avoid saving the carry + // flag in each step, adjust $R0 if we unrolled 4 elements + vwPreIter(SUBS, SBCS, R10, sub4) + SUB $4, R0 +sub4: + BCS copy + vwOneIter(SBCS, R0, len1) + B sub4 +copy: + MOVD ZR, c+56(FP) + CMP R1, R3 + BEQ done +copy_4: // no carry flag, copy the rest + vwOneIterCopy(R0, done) + B copy_4 + +// func shlVU(z, x []Word, s uint) (c Word) +// This implementation handles the shift operation from the high word to the low word, +// which may be an error for the case where the low word of x overlaps with the high +// word of z. When calling this function directly, you need to pay attention to this +// situation. +TEXT ·shlVU(SB),NOSPLIT,$0 + LDP z+0(FP), (R0, R1) // R0 = z.ptr, R1 = len(z) + MOVD x+24(FP), R2 + MOVD s+48(FP), R3 + ADD R1<<3, R0 // R0 = &z[n] + ADD R1<<3, R2 // R2 = &x[n] + CBZ R1, len0 + CBZ R3, copy // if the number of shift is 0, just copy x to z + MOVD $64, R4 + SUB R3, R4 + // handling the most significant element x[n-1] + MOVD.W -8(R2), R6 + LSR R4, R6, R5 // return value + LSL R3, R6, R8 // x[i] << s + SUB $1, R1 +one: TBZ $0, R1, two + MOVD.W -8(R2), R6 + LSR R4, R6, R7 + ORR R8, R7 + LSL R3, R6, R8 + SUB $1, R1 + MOVD.W R7, -8(R0) +two: + TBZ $1, R1, loop + LDP.W -16(R2), (R6, R7) + LSR R4, R7, R10 + ORR R8, R10 + LSL R3, R7 + LSR R4, R6, R9 + ORR R7, R9 + LSL R3, R6, R8 + SUB $2, R1 + STP.W (R9, R10), -16(R0) +loop: + CBZ R1, done + LDP.W -32(R2), (R10, R11) + LDP 16(R2), (R12, R13) + LSR R4, R13, R23 + ORR R8, R23 // z[i] = (x[i] << s) | (x[i-1] >> (64 - s)) + LSL R3, R13 + LSR R4, R12, R22 + ORR R13, R22 + LSL R3, R12 + LSR R4, R11, R21 + ORR R12, R21 + LSL R3, R11 + LSR R4, R10, R20 + ORR R11, R20 + LSL R3, R10, R8 + STP.W (R20, R21), -32(R0) + STP (R22, R23), 16(R0) + SUB $4, R1 + B loop +done: + MOVD.W R8, -8(R0) // the first element x[0] + MOVD R5, c+56(FP) // the part moved out from x[n-1] + RET +copy: + CMP R0, R2 + BEQ len0 + TBZ $0, R1, ctwo + MOVD.W -8(R2), R4 + MOVD.W R4, -8(R0) + SUB $1, R1 +ctwo: + TBZ $1, R1, cloop + LDP.W -16(R2), (R4, R5) + STP.W (R4, R5), -16(R0) + SUB $2, R1 +cloop: + CBZ R1, len0 + LDP.W -32(R2), (R4, R5) + LDP 16(R2), (R6, R7) + STP.W (R4, R5), -32(R0) + STP (R6, R7), 16(R0) + SUB $4, R1 + B cloop +len0: + MOVD $0, c+56(FP) + RET + +// func shrVU(z, x []Word, s uint) (c Word) +// This implementation handles the shift operation from the low word to the high word, +// which may be an error for the case where the high word of x overlaps with the low +// word of z. When calling this function directly, you need to pay attention to this +// situation. +TEXT ·shrVU(SB),NOSPLIT,$0 + MOVD z+0(FP), R0 + MOVD z_len+8(FP), R1 + MOVD x+24(FP), R2 + MOVD s+48(FP), R3 + MOVD $0, R8 + MOVD $64, R4 + SUB R3, R4 + CBZ R1, len0 + CBZ R3, copy // if the number of shift is 0, just copy x to z + + MOVD.P 8(R2), R20 + LSR R3, R20, R8 + LSL R4, R20 + MOVD R20, c+56(FP) // deal with the first element + SUB $1, R1 + + TBZ $0, R1, two + MOVD.P 8(R2), R6 + LSL R4, R6, R20 + ORR R8, R20 + LSR R3, R6, R8 + MOVD.P R20, 8(R0) + SUB $1, R1 +two: + TBZ $1, R1, loop + LDP.P 16(R2), (R6, R7) + LSL R4, R6, R20 + LSR R3, R6 + ORR R8, R20 + LSL R4, R7, R21 + LSR R3, R7, R8 + ORR R6, R21 + STP.P (R20, R21), 16(R0) + SUB $2, R1 +loop: + CBZ R1, done + LDP.P 32(R2), (R10, R11) + LDP -16(R2), (R12, R13) + LSL R4, R10, R20 + LSR R3, R10 + ORR R8, R20 // z[i] = (x[i] >> s) | (x[i+1] << (64 - s)) + LSL R4, R11, R21 + LSR R3, R11 + ORR R10, R21 + LSL R4, R12, R22 + LSR R3, R12 + ORR R11, R22 + LSL R4, R13, R23 + LSR R3, R13, R8 + ORR R12, R23 + STP.P (R20, R21), 32(R0) + STP (R22, R23), -16(R0) + SUB $4, R1 + B loop +done: + MOVD R8, (R0) // deal with the last element + RET +copy: + CMP R0, R2 + BEQ len0 + TBZ $0, R1, ctwo + MOVD.P 8(R2), R3 + MOVD.P R3, 8(R0) + SUB $1, R1 +ctwo: + TBZ $1, R1, cloop + LDP.P 16(R2), (R4, R5) + STP.P (R4, R5), 16(R0) + SUB $2, R1 +cloop: + CBZ R1, len0 + LDP.P 32(R2), (R4, R5) + LDP -16(R2), (R6, R7) + STP.P (R4, R5), 32(R0) + STP (R6, R7), -16(R0) + SUB $4, R1 + B cloop +len0: + MOVD $0, c+56(FP) + RET + + +// func mulAddVWW(z, x []Word, y, r Word) (c Word) +TEXT ·mulAddVWW(SB),NOSPLIT,$0 + MOVD z+0(FP), R1 + MOVD z_len+8(FP), R0 + MOVD x+24(FP), R2 + MOVD y+48(FP), R3 + MOVD r+56(FP), R4 + // c, z = x * y + r + TBZ $0, R0, two + MOVD.P 8(R2), R5 + MUL R3, R5, R7 + UMULH R3, R5, R8 + ADDS R4, R7 + ADC $0, R8, R4 // c, z[i] = x[i] * y + r + MOVD.P R7, 8(R1) + SUB $1, R0 +two: + TBZ $1, R0, loop + LDP.P 16(R2), (R5, R6) + MUL R3, R5, R10 + UMULH R3, R5, R11 + ADDS R4, R10 + MUL R3, R6, R12 + UMULH R3, R6, R13 + ADCS R12, R11 + ADC $0, R13, R4 + + STP.P (R10, R11), 16(R1) + SUB $2, R0 +loop: + CBZ R0, done + LDP.P 32(R2), (R5, R6) + LDP -16(R2), (R7, R8) + + MUL R3, R5, R10 + UMULH R3, R5, R11 + ADDS R4, R10 + MUL R3, R6, R12 + UMULH R3, R6, R13 + ADCS R11, R12 + + MUL R3, R7, R14 + UMULH R3, R7, R15 + ADCS R13, R14 + MUL R3, R8, R16 + UMULH R3, R8, R17 + ADCS R15, R16 + ADC $0, R17, R4 + + STP.P (R10, R12), 32(R1) + STP (R14, R16), -16(R1) + SUB $4, R0 + B loop +done: + MOVD R4, c+64(FP) + RET + + +// func addMulVVW(z, x []Word, y Word) (c Word) +TEXT ·addMulVVW(SB),NOSPLIT,$0 + MOVD z+0(FP), R1 + MOVD z_len+8(FP), R0 + MOVD x+24(FP), R2 + MOVD y+48(FP), R3 + MOVD $0, R4 + + TBZ $0, R0, two + + MOVD.P 8(R2), R5 + MOVD (R1), R6 + + MUL R5, R3, R7 + UMULH R5, R3, R8 + + ADDS R7, R6 + ADC $0, R8, R4 + + MOVD.P R6, 8(R1) + SUB $1, R0 + +two: + TBZ $1, R0, loop + + LDP.P 16(R2), (R5, R10) + LDP (R1), (R6, R11) + + MUL R10, R3, R13 + UMULH R10, R3, R12 + + MUL R5, R3, R7 + UMULH R5, R3, R8 + + ADDS R4, R6 + ADCS R13, R11 + ADC $0, R12 + + ADDS R7, R6 + ADCS R8, R11 + ADC $0, R12, R4 + + STP.P (R6, R11), 16(R1) + SUB $2, R0 + +// The main loop of this code operates on a block of 4 words every iteration +// performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9] +// where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next +// 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z. +loop: + CBZ R0, done + + LDP.P 16(R2), (R5, R6) + LDP.P 16(R2), (R7, R8) + + LDP (R1), (R9, R10) + ADDS R4, R9 + MUL R6, R3, R14 + ADCS R14, R10 + MUL R7, R3, R15 + LDP 16(R1), (R11, R12) + ADCS R15, R11 + MUL R8, R3, R16 + ADCS R16, R12 + UMULH R8, R3, R20 + ADC $0, R20 + + MUL R5, R3, R13 + ADDS R13, R9 + UMULH R5, R3, R17 + ADCS R17, R10 + UMULH R6, R3, R21 + STP.P (R9, R10), 16(R1) + ADCS R21, R11 + UMULH R7, R3, R19 + ADCS R19, R12 + STP.P (R11, R12), 16(R1) + ADC $0, R20, R4 + + SUB $4, R0 + B loop + +done: + MOVD R4, c+56(FP) + RET + + |