summaryrefslogtreecommitdiffstats
path: root/src/math/big/arith_arm64.s
diff options
context:
space:
mode:
Diffstat (limited to 'src/math/big/arith_arm64.s')
-rw-r--r--src/math/big/arith_arm64.s583
1 files changed, 583 insertions, 0 deletions
diff --git a/src/math/big/arith_arm64.s b/src/math/big/arith_arm64.s
new file mode 100644
index 0000000..22357d0
--- /dev/null
+++ b/src/math/big/arith_arm64.s
@@ -0,0 +1,583 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !math_big_pure_go
+
+#include "textflag.h"
+
+// This file provides fast assembly versions for the elementary
+// arithmetic operations on vectors implemented in arith.go.
+
+// TODO: Consider re-implementing using Advanced SIMD
+// once the assembler supports those instructions.
+
+// func mulWW(x, y Word) (z1, z0 Word)
+TEXT ·mulWW(SB),NOSPLIT,$0
+ MOVD x+0(FP), R0
+ MOVD y+8(FP), R1
+ MUL R0, R1, R2
+ UMULH R0, R1, R3
+ MOVD R3, z1+16(FP)
+ MOVD R2, z0+24(FP)
+ RET
+
+
+// func addVV(z, x, y []Word) (c Word)
+TEXT ·addVV(SB),NOSPLIT,$0
+ MOVD z_len+8(FP), R0
+ MOVD x+24(FP), R8
+ MOVD y+48(FP), R9
+ MOVD z+0(FP), R10
+ ADDS $0, R0 // clear carry flag
+ TBZ $0, R0, two
+ MOVD.P 8(R8), R11
+ MOVD.P 8(R9), R15
+ ADCS R15, R11
+ MOVD.P R11, 8(R10)
+ SUB $1, R0
+two:
+ TBZ $1, R0, loop
+ LDP.P 16(R8), (R11, R12)
+ LDP.P 16(R9), (R15, R16)
+ ADCS R15, R11
+ ADCS R16, R12
+ STP.P (R11, R12), 16(R10)
+ SUB $2, R0
+loop:
+ CBZ R0, done // careful not to touch the carry flag
+ LDP.P 32(R8), (R11, R12)
+ LDP -16(R8), (R13, R14)
+ LDP.P 32(R9), (R15, R16)
+ LDP -16(R9), (R17, R19)
+ ADCS R15, R11
+ ADCS R16, R12
+ ADCS R17, R13
+ ADCS R19, R14
+ STP.P (R11, R12), 32(R10)
+ STP (R13, R14), -16(R10)
+ SUB $4, R0
+ B loop
+done:
+ CSET HS, R0 // extract carry flag
+ MOVD R0, c+72(FP)
+ RET
+
+
+// func subVV(z, x, y []Word) (c Word)
+TEXT ·subVV(SB),NOSPLIT,$0
+ MOVD z_len+8(FP), R0
+ MOVD x+24(FP), R8
+ MOVD y+48(FP), R9
+ MOVD z+0(FP), R10
+ CMP R0, R0 // set carry flag
+ TBZ $0, R0, two
+ MOVD.P 8(R8), R11
+ MOVD.P 8(R9), R15
+ SBCS R15, R11
+ MOVD.P R11, 8(R10)
+ SUB $1, R0
+two:
+ TBZ $1, R0, loop
+ LDP.P 16(R8), (R11, R12)
+ LDP.P 16(R9), (R15, R16)
+ SBCS R15, R11
+ SBCS R16, R12
+ STP.P (R11, R12), 16(R10)
+ SUB $2, R0
+loop:
+ CBZ R0, done // careful not to touch the carry flag
+ LDP.P 32(R8), (R11, R12)
+ LDP -16(R8), (R13, R14)
+ LDP.P 32(R9), (R15, R16)
+ LDP -16(R9), (R17, R19)
+ SBCS R15, R11
+ SBCS R16, R12
+ SBCS R17, R13
+ SBCS R19, R14
+ STP.P (R11, R12), 32(R10)
+ STP (R13, R14), -16(R10)
+ SUB $4, R0
+ B loop
+done:
+ CSET LO, R0 // extract carry flag
+ MOVD R0, c+72(FP)
+ RET
+
+#define vwOneOp(instr, op1) \
+ MOVD.P 8(R1), R4; \
+ instr op1, R4; \
+ MOVD.P R4, 8(R3);
+
+// handle the first 1~4 elements before starting iteration in addVW/subVW
+#define vwPreIter(instr1, instr2, counter, target) \
+ vwOneOp(instr1, R2); \
+ SUB $1, counter; \
+ CBZ counter, target; \
+ vwOneOp(instr2, $0); \
+ SUB $1, counter; \
+ CBZ counter, target; \
+ vwOneOp(instr2, $0); \
+ SUB $1, counter; \
+ CBZ counter, target; \
+ vwOneOp(instr2, $0);
+
+// do one iteration of add or sub in addVW/subVW
+#define vwOneIter(instr, counter, exit) \
+ CBZ counter, exit; \ // careful not to touch the carry flag
+ LDP.P 32(R1), (R4, R5); \
+ LDP -16(R1), (R6, R7); \
+ instr $0, R4, R8; \
+ instr $0, R5, R9; \
+ instr $0, R6, R10; \
+ instr $0, R7, R11; \
+ STP.P (R8, R9), 32(R3); \
+ STP (R10, R11), -16(R3); \
+ SUB $4, counter;
+
+// do one iteration of copy in addVW/subVW
+#define vwOneIterCopy(counter, exit) \
+ CBZ counter, exit; \
+ LDP.P 32(R1), (R4, R5); \
+ LDP -16(R1), (R6, R7); \
+ STP.P (R4, R5), 32(R3); \
+ STP (R6, R7), -16(R3); \
+ SUB $4, counter;
+
+// func addVW(z, x []Word, y Word) (c Word)
+// The 'large' branch handles large 'z'. It checks the carry flag on every iteration
+// and switches to copy if we are done with carries. The copying is skipped as well
+// if 'x' and 'z' happen to share the same underlying storage.
+// The overhead of the checking and branching is visible when 'z' are small (~5%),
+// so set a threshold of 32, and remain the small-sized part entirely untouched.
+TEXT ·addVW(SB),NOSPLIT,$0
+ MOVD z+0(FP), R3
+ MOVD z_len+8(FP), R0
+ MOVD x+24(FP), R1
+ MOVD y+48(FP), R2
+ CMP $32, R0
+ BGE large // large-sized 'z' and 'x'
+ CBZ R0, len0 // the length of z is 0
+ MOVD.P 8(R1), R4
+ ADDS R2, R4 // z[0] = x[0] + y, set carry
+ MOVD.P R4, 8(R3)
+ SUB $1, R0
+ CBZ R0, len1 // the length of z is 1
+ TBZ $0, R0, two
+ MOVD.P 8(R1), R4 // do it once
+ ADCS $0, R4
+ MOVD.P R4, 8(R3)
+ SUB $1, R0
+two: // do it twice
+ TBZ $1, R0, loop
+ LDP.P 16(R1), (R4, R5)
+ ADCS $0, R4, R8 // c, z[i] = x[i] + c
+ ADCS $0, R5, R9
+ STP.P (R8, R9), 16(R3)
+ SUB $2, R0
+loop: // do four times per round
+ vwOneIter(ADCS, R0, len1)
+ B loop
+len1:
+ CSET HS, R2 // extract carry flag
+len0:
+ MOVD R2, c+56(FP)
+done:
+ RET
+large:
+ AND $0x3, R0, R10
+ AND $~0x3, R0
+ // unrolling for the first 1~4 elements to avoid saving the carry
+ // flag in each step, adjust $R0 if we unrolled 4 elements
+ vwPreIter(ADDS, ADCS, R10, add4)
+ SUB $4, R0
+add4:
+ BCC copy
+ vwOneIter(ADCS, R0, len1)
+ B add4
+copy:
+ MOVD ZR, c+56(FP)
+ CMP R1, R3
+ BEQ done
+copy_4: // no carry flag, copy the rest
+ vwOneIterCopy(R0, done)
+ B copy_4
+
+// func subVW(z, x []Word, y Word) (c Word)
+// The 'large' branch handles large 'z'. It checks the carry flag on every iteration
+// and switches to copy if we are done with carries. The copying is skipped as well
+// if 'x' and 'z' happen to share the same underlying storage.
+// The overhead of the checking and branching is visible when 'z' are small (~5%),
+// so set a threshold of 32, and remain the small-sized part entirely untouched.
+TEXT ·subVW(SB),NOSPLIT,$0
+ MOVD z+0(FP), R3
+ MOVD z_len+8(FP), R0
+ MOVD x+24(FP), R1
+ MOVD y+48(FP), R2
+ CMP $32, R0
+ BGE large // large-sized 'z' and 'x'
+ CBZ R0, len0 // the length of z is 0
+ MOVD.P 8(R1), R4
+ SUBS R2, R4 // z[0] = x[0] - y, set carry
+ MOVD.P R4, 8(R3)
+ SUB $1, R0
+ CBZ R0, len1 // the length of z is 1
+ TBZ $0, R0, two // do it once
+ MOVD.P 8(R1), R4
+ SBCS $0, R4
+ MOVD.P R4, 8(R3)
+ SUB $1, R0
+two: // do it twice
+ TBZ $1, R0, loop
+ LDP.P 16(R1), (R4, R5)
+ SBCS $0, R4, R8 // c, z[i] = x[i] + c
+ SBCS $0, R5, R9
+ STP.P (R8, R9), 16(R3)
+ SUB $2, R0
+loop: // do four times per round
+ vwOneIter(SBCS, R0, len1)
+ B loop
+len1:
+ CSET LO, R2 // extract carry flag
+len0:
+ MOVD R2, c+56(FP)
+done:
+ RET
+large:
+ AND $0x3, R0, R10
+ AND $~0x3, R0
+ // unrolling for the first 1~4 elements to avoid saving the carry
+ // flag in each step, adjust $R0 if we unrolled 4 elements
+ vwPreIter(SUBS, SBCS, R10, sub4)
+ SUB $4, R0
+sub4:
+ BCS copy
+ vwOneIter(SBCS, R0, len1)
+ B sub4
+copy:
+ MOVD ZR, c+56(FP)
+ CMP R1, R3
+ BEQ done
+copy_4: // no carry flag, copy the rest
+ vwOneIterCopy(R0, done)
+ B copy_4
+
+// func shlVU(z, x []Word, s uint) (c Word)
+// This implementation handles the shift operation from the high word to the low word,
+// which may be an error for the case where the low word of x overlaps with the high
+// word of z. When calling this function directly, you need to pay attention to this
+// situation.
+TEXT ·shlVU(SB),NOSPLIT,$0
+ LDP z+0(FP), (R0, R1) // R0 = z.ptr, R1 = len(z)
+ MOVD x+24(FP), R2
+ MOVD s+48(FP), R3
+ ADD R1<<3, R0 // R0 = &z[n]
+ ADD R1<<3, R2 // R2 = &x[n]
+ CBZ R1, len0
+ CBZ R3, copy // if the number of shift is 0, just copy x to z
+ MOVD $64, R4
+ SUB R3, R4
+ // handling the most significant element x[n-1]
+ MOVD.W -8(R2), R6
+ LSR R4, R6, R5 // return value
+ LSL R3, R6, R8 // x[i] << s
+ SUB $1, R1
+one: TBZ $0, R1, two
+ MOVD.W -8(R2), R6
+ LSR R4, R6, R7
+ ORR R8, R7
+ LSL R3, R6, R8
+ SUB $1, R1
+ MOVD.W R7, -8(R0)
+two:
+ TBZ $1, R1, loop
+ LDP.W -16(R2), (R6, R7)
+ LSR R4, R7, R10
+ ORR R8, R10
+ LSL R3, R7
+ LSR R4, R6, R9
+ ORR R7, R9
+ LSL R3, R6, R8
+ SUB $2, R1
+ STP.W (R9, R10), -16(R0)
+loop:
+ CBZ R1, done
+ LDP.W -32(R2), (R10, R11)
+ LDP 16(R2), (R12, R13)
+ LSR R4, R13, R23
+ ORR R8, R23 // z[i] = (x[i] << s) | (x[i-1] >> (64 - s))
+ LSL R3, R13
+ LSR R4, R12, R22
+ ORR R13, R22
+ LSL R3, R12
+ LSR R4, R11, R21
+ ORR R12, R21
+ LSL R3, R11
+ LSR R4, R10, R20
+ ORR R11, R20
+ LSL R3, R10, R8
+ STP.W (R20, R21), -32(R0)
+ STP (R22, R23), 16(R0)
+ SUB $4, R1
+ B loop
+done:
+ MOVD.W R8, -8(R0) // the first element x[0]
+ MOVD R5, c+56(FP) // the part moved out from x[n-1]
+ RET
+copy:
+ CMP R0, R2
+ BEQ len0
+ TBZ $0, R1, ctwo
+ MOVD.W -8(R2), R4
+ MOVD.W R4, -8(R0)
+ SUB $1, R1
+ctwo:
+ TBZ $1, R1, cloop
+ LDP.W -16(R2), (R4, R5)
+ STP.W (R4, R5), -16(R0)
+ SUB $2, R1
+cloop:
+ CBZ R1, len0
+ LDP.W -32(R2), (R4, R5)
+ LDP 16(R2), (R6, R7)
+ STP.W (R4, R5), -32(R0)
+ STP (R6, R7), 16(R0)
+ SUB $4, R1
+ B cloop
+len0:
+ MOVD $0, c+56(FP)
+ RET
+
+// func shrVU(z, x []Word, s uint) (c Word)
+// This implementation handles the shift operation from the low word to the high word,
+// which may be an error for the case where the high word of x overlaps with the low
+// word of z. When calling this function directly, you need to pay attention to this
+// situation.
+TEXT ·shrVU(SB),NOSPLIT,$0
+ MOVD z+0(FP), R0
+ MOVD z_len+8(FP), R1
+ MOVD x+24(FP), R2
+ MOVD s+48(FP), R3
+ MOVD $0, R8
+ MOVD $64, R4
+ SUB R3, R4
+ CBZ R1, len0
+ CBZ R3, copy // if the number of shift is 0, just copy x to z
+
+ MOVD.P 8(R2), R20
+ LSR R3, R20, R8
+ LSL R4, R20
+ MOVD R20, c+56(FP) // deal with the first element
+ SUB $1, R1
+
+ TBZ $0, R1, two
+ MOVD.P 8(R2), R6
+ LSL R4, R6, R20
+ ORR R8, R20
+ LSR R3, R6, R8
+ MOVD.P R20, 8(R0)
+ SUB $1, R1
+two:
+ TBZ $1, R1, loop
+ LDP.P 16(R2), (R6, R7)
+ LSL R4, R6, R20
+ LSR R3, R6
+ ORR R8, R20
+ LSL R4, R7, R21
+ LSR R3, R7, R8
+ ORR R6, R21
+ STP.P (R20, R21), 16(R0)
+ SUB $2, R1
+loop:
+ CBZ R1, done
+ LDP.P 32(R2), (R10, R11)
+ LDP -16(R2), (R12, R13)
+ LSL R4, R10, R20
+ LSR R3, R10
+ ORR R8, R20 // z[i] = (x[i] >> s) | (x[i+1] << (64 - s))
+ LSL R4, R11, R21
+ LSR R3, R11
+ ORR R10, R21
+ LSL R4, R12, R22
+ LSR R3, R12
+ ORR R11, R22
+ LSL R4, R13, R23
+ LSR R3, R13, R8
+ ORR R12, R23
+ STP.P (R20, R21), 32(R0)
+ STP (R22, R23), -16(R0)
+ SUB $4, R1
+ B loop
+done:
+ MOVD R8, (R0) // deal with the last element
+ RET
+copy:
+ CMP R0, R2
+ BEQ len0
+ TBZ $0, R1, ctwo
+ MOVD.P 8(R2), R3
+ MOVD.P R3, 8(R0)
+ SUB $1, R1
+ctwo:
+ TBZ $1, R1, cloop
+ LDP.P 16(R2), (R4, R5)
+ STP.P (R4, R5), 16(R0)
+ SUB $2, R1
+cloop:
+ CBZ R1, len0
+ LDP.P 32(R2), (R4, R5)
+ LDP -16(R2), (R6, R7)
+ STP.P (R4, R5), 32(R0)
+ STP (R6, R7), -16(R0)
+ SUB $4, R1
+ B cloop
+len0:
+ MOVD $0, c+56(FP)
+ RET
+
+
+// func mulAddVWW(z, x []Word, y, r Word) (c Word)
+TEXT ·mulAddVWW(SB),NOSPLIT,$0
+ MOVD z+0(FP), R1
+ MOVD z_len+8(FP), R0
+ MOVD x+24(FP), R2
+ MOVD y+48(FP), R3
+ MOVD r+56(FP), R4
+ // c, z = x * y + r
+ TBZ $0, R0, two
+ MOVD.P 8(R2), R5
+ MUL R3, R5, R7
+ UMULH R3, R5, R8
+ ADDS R4, R7
+ ADC $0, R8, R4 // c, z[i] = x[i] * y + r
+ MOVD.P R7, 8(R1)
+ SUB $1, R0
+two:
+ TBZ $1, R0, loop
+ LDP.P 16(R2), (R5, R6)
+ MUL R3, R5, R10
+ UMULH R3, R5, R11
+ ADDS R4, R10
+ MUL R3, R6, R12
+ UMULH R3, R6, R13
+ ADCS R12, R11
+ ADC $0, R13, R4
+
+ STP.P (R10, R11), 16(R1)
+ SUB $2, R0
+loop:
+ CBZ R0, done
+ LDP.P 32(R2), (R5, R6)
+ LDP -16(R2), (R7, R8)
+
+ MUL R3, R5, R10
+ UMULH R3, R5, R11
+ ADDS R4, R10
+ MUL R3, R6, R12
+ UMULH R3, R6, R13
+ ADCS R11, R12
+
+ MUL R3, R7, R14
+ UMULH R3, R7, R15
+ ADCS R13, R14
+ MUL R3, R8, R16
+ UMULH R3, R8, R17
+ ADCS R15, R16
+ ADC $0, R17, R4
+
+ STP.P (R10, R12), 32(R1)
+ STP (R14, R16), -16(R1)
+ SUB $4, R0
+ B loop
+done:
+ MOVD R4, c+64(FP)
+ RET
+
+
+// func addMulVVW(z, x []Word, y Word) (c Word)
+TEXT ·addMulVVW(SB),NOSPLIT,$0
+ MOVD z+0(FP), R1
+ MOVD z_len+8(FP), R0
+ MOVD x+24(FP), R2
+ MOVD y+48(FP), R3
+ MOVD $0, R4
+
+ TBZ $0, R0, two
+
+ MOVD.P 8(R2), R5
+ MOVD (R1), R6
+
+ MUL R5, R3, R7
+ UMULH R5, R3, R8
+
+ ADDS R7, R6
+ ADC $0, R8, R4
+
+ MOVD.P R6, 8(R1)
+ SUB $1, R0
+
+two:
+ TBZ $1, R0, loop
+
+ LDP.P 16(R2), (R5, R10)
+ LDP (R1), (R6, R11)
+
+ MUL R10, R3, R13
+ UMULH R10, R3, R12
+
+ MUL R5, R3, R7
+ UMULH R5, R3, R8
+
+ ADDS R4, R6
+ ADCS R13, R11
+ ADC $0, R12
+
+ ADDS R7, R6
+ ADCS R8, R11
+ ADC $0, R12, R4
+
+ STP.P (R6, R11), 16(R1)
+ SUB $2, R0
+
+// The main loop of this code operates on a block of 4 words every iteration
+// performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9]
+// where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next
+// 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z.
+loop:
+ CBZ R0, done
+
+ LDP.P 16(R2), (R5, R6)
+ LDP.P 16(R2), (R7, R8)
+
+ LDP (R1), (R9, R10)
+ ADDS R4, R9
+ MUL R6, R3, R14
+ ADCS R14, R10
+ MUL R7, R3, R15
+ LDP 16(R1), (R11, R12)
+ ADCS R15, R11
+ MUL R8, R3, R16
+ ADCS R16, R12
+ UMULH R8, R3, R20
+ ADC $0, R20
+
+ MUL R5, R3, R13
+ ADDS R13, R9
+ UMULH R5, R3, R17
+ ADCS R17, R10
+ UMULH R6, R3, R21
+ STP.P (R9, R10), 16(R1)
+ ADCS R21, R11
+ UMULH R7, R3, R19
+ ADCS R19, R12
+ STP.P (R11, R12), 16(R1)
+ ADC $0, R20, R4
+
+ SUB $4, R0
+ B loop
+
+done:
+ MOVD R4, c+56(FP)
+ RET
+
+