summaryrefslogtreecommitdiffstats
path: root/src/math/big/arith_amd64.s
diff options
context:
space:
mode:
Diffstat (limited to 'src/math/big/arith_amd64.s')
-rw-r--r--src/math/big/arith_amd64.s516
1 files changed, 516 insertions, 0 deletions
diff --git a/src/math/big/arith_amd64.s b/src/math/big/arith_amd64.s
new file mode 100644
index 0000000..b1e914c
--- /dev/null
+++ b/src/math/big/arith_amd64.s
@@ -0,0 +1,516 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !math_big_pure_go
+// +build !math_big_pure_go
+
+#include "textflag.h"
+
+// This file provides fast assembly versions for the elementary
+// arithmetic operations on vectors implemented in arith.go.
+
+// The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
+// It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
+// This is faster than using rotate instructions.
+
+// func addVV(z, x, y []Word) (c Word)
+TEXT ·addVV(SB),NOSPLIT,$0
+ MOVQ z_len+8(FP), DI
+ MOVQ x+24(FP), R8
+ MOVQ y+48(FP), R9
+ MOVQ z+0(FP), R10
+
+ MOVQ $0, CX // c = 0
+ MOVQ $0, SI // i = 0
+
+ // s/JL/JMP/ below to disable the unrolled loop
+ SUBQ $4, DI // n -= 4
+ JL V1 // if n < 0 goto V1
+
+U1: // n >= 0
+ // regular loop body unrolled 4x
+ ADDQ CX, CX // restore CF
+ MOVQ 0(R8)(SI*8), R11
+ MOVQ 8(R8)(SI*8), R12
+ MOVQ 16(R8)(SI*8), R13
+ MOVQ 24(R8)(SI*8), R14
+ ADCQ 0(R9)(SI*8), R11
+ ADCQ 8(R9)(SI*8), R12
+ ADCQ 16(R9)(SI*8), R13
+ ADCQ 24(R9)(SI*8), R14
+ MOVQ R11, 0(R10)(SI*8)
+ MOVQ R12, 8(R10)(SI*8)
+ MOVQ R13, 16(R10)(SI*8)
+ MOVQ R14, 24(R10)(SI*8)
+ SBBQ CX, CX // save CF
+
+ ADDQ $4, SI // i += 4
+ SUBQ $4, DI // n -= 4
+ JGE U1 // if n >= 0 goto U1
+
+V1: ADDQ $4, DI // n += 4
+ JLE E1 // if n <= 0 goto E1
+
+L1: // n > 0
+ ADDQ CX, CX // restore CF
+ MOVQ 0(R8)(SI*8), R11
+ ADCQ 0(R9)(SI*8), R11
+ MOVQ R11, 0(R10)(SI*8)
+ SBBQ CX, CX // save CF
+
+ ADDQ $1, SI // i++
+ SUBQ $1, DI // n--
+ JG L1 // if n > 0 goto L1
+
+E1: NEGQ CX
+ MOVQ CX, c+72(FP) // return c
+ RET
+
+
+// func subVV(z, x, y []Word) (c Word)
+// (same as addVV except for SBBQ instead of ADCQ and label names)
+TEXT ·subVV(SB),NOSPLIT,$0
+ MOVQ z_len+8(FP), DI
+ MOVQ x+24(FP), R8
+ MOVQ y+48(FP), R9
+ MOVQ z+0(FP), R10
+
+ MOVQ $0, CX // c = 0
+ MOVQ $0, SI // i = 0
+
+ // s/JL/JMP/ below to disable the unrolled loop
+ SUBQ $4, DI // n -= 4
+ JL V2 // if n < 0 goto V2
+
+U2: // n >= 0
+ // regular loop body unrolled 4x
+ ADDQ CX, CX // restore CF
+ MOVQ 0(R8)(SI*8), R11
+ MOVQ 8(R8)(SI*8), R12
+ MOVQ 16(R8)(SI*8), R13
+ MOVQ 24(R8)(SI*8), R14
+ SBBQ 0(R9)(SI*8), R11
+ SBBQ 8(R9)(SI*8), R12
+ SBBQ 16(R9)(SI*8), R13
+ SBBQ 24(R9)(SI*8), R14
+ MOVQ R11, 0(R10)(SI*8)
+ MOVQ R12, 8(R10)(SI*8)
+ MOVQ R13, 16(R10)(SI*8)
+ MOVQ R14, 24(R10)(SI*8)
+ SBBQ CX, CX // save CF
+
+ ADDQ $4, SI // i += 4
+ SUBQ $4, DI // n -= 4
+ JGE U2 // if n >= 0 goto U2
+
+V2: ADDQ $4, DI // n += 4
+ JLE E2 // if n <= 0 goto E2
+
+L2: // n > 0
+ ADDQ CX, CX // restore CF
+ MOVQ 0(R8)(SI*8), R11
+ SBBQ 0(R9)(SI*8), R11
+ MOVQ R11, 0(R10)(SI*8)
+ SBBQ CX, CX // save CF
+
+ ADDQ $1, SI // i++
+ SUBQ $1, DI // n--
+ JG L2 // if n > 0 goto L2
+
+E2: NEGQ CX
+ MOVQ CX, c+72(FP) // return c
+ RET
+
+
+// func addVW(z, x []Word, y Word) (c Word)
+TEXT ·addVW(SB),NOSPLIT,$0
+ MOVQ z_len+8(FP), DI
+ CMPQ DI, $32
+ JG large
+ MOVQ x+24(FP), R8
+ MOVQ y+48(FP), CX // c = y
+ MOVQ z+0(FP), R10
+
+ MOVQ $0, SI // i = 0
+
+ // s/JL/JMP/ below to disable the unrolled loop
+ SUBQ $4, DI // n -= 4
+ JL V3 // if n < 4 goto V3
+
+U3: // n >= 0
+ // regular loop body unrolled 4x
+ MOVQ 0(R8)(SI*8), R11
+ MOVQ 8(R8)(SI*8), R12
+ MOVQ 16(R8)(SI*8), R13
+ MOVQ 24(R8)(SI*8), R14
+ ADDQ CX, R11
+ ADCQ $0, R12
+ ADCQ $0, R13
+ ADCQ $0, R14
+ SBBQ CX, CX // save CF
+ NEGQ CX
+ MOVQ R11, 0(R10)(SI*8)
+ MOVQ R12, 8(R10)(SI*8)
+ MOVQ R13, 16(R10)(SI*8)
+ MOVQ R14, 24(R10)(SI*8)
+
+ ADDQ $4, SI // i += 4
+ SUBQ $4, DI // n -= 4
+ JGE U3 // if n >= 0 goto U3
+
+V3: ADDQ $4, DI // n += 4
+ JLE E3 // if n <= 0 goto E3
+
+L3: // n > 0
+ ADDQ 0(R8)(SI*8), CX
+ MOVQ CX, 0(R10)(SI*8)
+ SBBQ CX, CX // save CF
+ NEGQ CX
+
+ ADDQ $1, SI // i++
+ SUBQ $1, DI // n--
+ JG L3 // if n > 0 goto L3
+
+E3: MOVQ CX, c+56(FP) // return c
+ RET
+large:
+ JMP ·addVWlarge(SB)
+
+
+// func subVW(z, x []Word, y Word) (c Word)
+// (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
+TEXT ·subVW(SB),NOSPLIT,$0
+ MOVQ z_len+8(FP), DI
+ CMPQ DI, $32
+ JG large
+ MOVQ x+24(FP), R8
+ MOVQ y+48(FP), CX // c = y
+ MOVQ z+0(FP), R10
+
+ MOVQ $0, SI // i = 0
+
+ // s/JL/JMP/ below to disable the unrolled loop
+ SUBQ $4, DI // n -= 4
+ JL V4 // if n < 4 goto V4
+
+U4: // n >= 0
+ // regular loop body unrolled 4x
+ MOVQ 0(R8)(SI*8), R11
+ MOVQ 8(R8)(SI*8), R12
+ MOVQ 16(R8)(SI*8), R13
+ MOVQ 24(R8)(SI*8), R14
+ SUBQ CX, R11
+ SBBQ $0, R12
+ SBBQ $0, R13
+ SBBQ $0, R14
+ SBBQ CX, CX // save CF
+ NEGQ CX
+ MOVQ R11, 0(R10)(SI*8)
+ MOVQ R12, 8(R10)(SI*8)
+ MOVQ R13, 16(R10)(SI*8)
+ MOVQ R14, 24(R10)(SI*8)
+
+ ADDQ $4, SI // i += 4
+ SUBQ $4, DI // n -= 4
+ JGE U4 // if n >= 0 goto U4
+
+V4: ADDQ $4, DI // n += 4
+ JLE E4 // if n <= 0 goto E4
+
+L4: // n > 0
+ MOVQ 0(R8)(SI*8), R11
+ SUBQ CX, R11
+ MOVQ R11, 0(R10)(SI*8)
+ SBBQ CX, CX // save CF
+ NEGQ CX
+
+ ADDQ $1, SI // i++
+ SUBQ $1, DI // n--
+ JG L4 // if n > 0 goto L4
+
+E4: MOVQ CX, c+56(FP) // return c
+ RET
+large:
+ JMP ·subVWlarge(SB)
+
+
+// func shlVU(z, x []Word, s uint) (c Word)
+TEXT ·shlVU(SB),NOSPLIT,$0
+ MOVQ z_len+8(FP), BX // i = z
+ SUBQ $1, BX // i--
+ JL X8b // i < 0 (n <= 0)
+
+ // n > 0
+ MOVQ z+0(FP), R10
+ MOVQ x+24(FP), R8
+ MOVQ s+48(FP), CX
+ MOVQ (R8)(BX*8), AX // w1 = x[n-1]
+ MOVQ $0, DX
+ SHLQ CX, AX, DX // w1>>ŝ
+ MOVQ DX, c+56(FP)
+
+ CMPQ BX, $0
+ JLE X8a // i <= 0
+
+ // i > 0
+L8: MOVQ AX, DX // w = w1
+ MOVQ -8(R8)(BX*8), AX // w1 = x[i-1]
+ SHLQ CX, AX, DX // w<<s | w1>>ŝ
+ MOVQ DX, (R10)(BX*8) // z[i] = w<<s | w1>>ŝ
+ SUBQ $1, BX // i--
+ JG L8 // i > 0
+
+ // i <= 0
+X8a: SHLQ CX, AX // w1<<s
+ MOVQ AX, (R10) // z[0] = w1<<s
+ RET
+
+X8b: MOVQ $0, c+56(FP)
+ RET
+
+
+// func shrVU(z, x []Word, s uint) (c Word)
+TEXT ·shrVU(SB),NOSPLIT,$0
+ MOVQ z_len+8(FP), R11
+ SUBQ $1, R11 // n--
+ JL X9b // n < 0 (n <= 0)
+
+ // n > 0
+ MOVQ z+0(FP), R10
+ MOVQ x+24(FP), R8
+ MOVQ s+48(FP), CX
+ MOVQ (R8), AX // w1 = x[0]
+ MOVQ $0, DX
+ SHRQ CX, AX, DX // w1<<ŝ
+ MOVQ DX, c+56(FP)
+
+ MOVQ $0, BX // i = 0
+ JMP E9
+
+ // i < n-1
+L9: MOVQ AX, DX // w = w1
+ MOVQ 8(R8)(BX*8), AX // w1 = x[i+1]
+ SHRQ CX, AX, DX // w>>s | w1<<ŝ
+ MOVQ DX, (R10)(BX*8) // z[i] = w>>s | w1<<ŝ
+ ADDQ $1, BX // i++
+
+E9: CMPQ BX, R11
+ JL L9 // i < n-1
+
+ // i >= n-1
+X9a: SHRQ CX, AX // w1>>s
+ MOVQ AX, (R10)(R11*8) // z[n-1] = w1>>s
+ RET
+
+X9b: MOVQ $0, c+56(FP)
+ RET
+
+
+// func mulAddVWW(z, x []Word, y, r Word) (c Word)
+TEXT ·mulAddVWW(SB),NOSPLIT,$0
+ MOVQ z+0(FP), R10
+ MOVQ x+24(FP), R8
+ MOVQ y+48(FP), R9
+ MOVQ r+56(FP), CX // c = r
+ MOVQ z_len+8(FP), R11
+ MOVQ $0, BX // i = 0
+
+ CMPQ R11, $4
+ JL E5
+
+U5: // i+4 <= n
+ // regular loop body unrolled 4x
+ MOVQ (0*8)(R8)(BX*8), AX
+ MULQ R9
+ ADDQ CX, AX
+ ADCQ $0, DX
+ MOVQ AX, (0*8)(R10)(BX*8)
+ MOVQ DX, CX
+ MOVQ (1*8)(R8)(BX*8), AX
+ MULQ R9
+ ADDQ CX, AX
+ ADCQ $0, DX
+ MOVQ AX, (1*8)(R10)(BX*8)
+ MOVQ DX, CX
+ MOVQ (2*8)(R8)(BX*8), AX
+ MULQ R9
+ ADDQ CX, AX
+ ADCQ $0, DX
+ MOVQ AX, (2*8)(R10)(BX*8)
+ MOVQ DX, CX
+ MOVQ (3*8)(R8)(BX*8), AX
+ MULQ R9
+ ADDQ CX, AX
+ ADCQ $0, DX
+ MOVQ AX, (3*8)(R10)(BX*8)
+ MOVQ DX, CX
+ ADDQ $4, BX // i += 4
+
+ LEAQ 4(BX), DX
+ CMPQ DX, R11
+ JLE U5
+ JMP E5
+
+L5: MOVQ (R8)(BX*8), AX
+ MULQ R9
+ ADDQ CX, AX
+ ADCQ $0, DX
+ MOVQ AX, (R10)(BX*8)
+ MOVQ DX, CX
+ ADDQ $1, BX // i++
+
+E5: CMPQ BX, R11 // i < n
+ JL L5
+
+ MOVQ CX, c+64(FP)
+ RET
+
+
+// func addMulVVW(z, x []Word, y Word) (c Word)
+TEXT ·addMulVVW(SB),NOSPLIT,$0
+ CMPB ·support_adx(SB), $1
+ JEQ adx
+ MOVQ z+0(FP), R10
+ MOVQ x+24(FP), R8
+ MOVQ y+48(FP), R9
+ MOVQ z_len+8(FP), R11
+ MOVQ $0, BX // i = 0
+ MOVQ $0, CX // c = 0
+ MOVQ R11, R12
+ ANDQ $-2, R12
+ CMPQ R11, $2
+ JAE A6
+ JMP E6
+
+A6:
+ MOVQ (R8)(BX*8), AX
+ MULQ R9
+ ADDQ (R10)(BX*8), AX
+ ADCQ $0, DX
+ ADDQ CX, AX
+ ADCQ $0, DX
+ MOVQ DX, CX
+ MOVQ AX, (R10)(BX*8)
+
+ MOVQ (8)(R8)(BX*8), AX
+ MULQ R9
+ ADDQ (8)(R10)(BX*8), AX
+ ADCQ $0, DX
+ ADDQ CX, AX
+ ADCQ $0, DX
+ MOVQ DX, CX
+ MOVQ AX, (8)(R10)(BX*8)
+
+ ADDQ $2, BX
+ CMPQ BX, R12
+ JL A6
+ JMP E6
+
+L6: MOVQ (R8)(BX*8), AX
+ MULQ R9
+ ADDQ CX, AX
+ ADCQ $0, DX
+ ADDQ AX, (R10)(BX*8)
+ ADCQ $0, DX
+ MOVQ DX, CX
+ ADDQ $1, BX // i++
+
+E6: CMPQ BX, R11 // i < n
+ JL L6
+
+ MOVQ CX, c+56(FP)
+ RET
+
+adx:
+ MOVQ z_len+8(FP), R11
+ MOVQ z+0(FP), R10
+ MOVQ x+24(FP), R8
+ MOVQ y+48(FP), DX
+ MOVQ $0, BX // i = 0
+ MOVQ $0, CX // carry
+ CMPQ R11, $8
+ JAE adx_loop_header
+ CMPQ BX, R11
+ JL adx_short
+ MOVQ CX, c+56(FP)
+ RET
+
+adx_loop_header:
+ MOVQ R11, R13
+ ANDQ $-8, R13
+adx_loop:
+ XORQ R9, R9 // unset flags
+ MULXQ (R8), SI, DI
+ ADCXQ CX,SI
+ ADOXQ (R10), SI
+ MOVQ SI,(R10)
+
+ MULXQ 8(R8), AX, CX
+ ADCXQ DI, AX
+ ADOXQ 8(R10), AX
+ MOVQ AX, 8(R10)
+
+ MULXQ 16(R8), SI, DI
+ ADCXQ CX, SI
+ ADOXQ 16(R10), SI
+ MOVQ SI, 16(R10)
+
+ MULXQ 24(R8), AX, CX
+ ADCXQ DI, AX
+ ADOXQ 24(R10), AX
+ MOVQ AX, 24(R10)
+
+ MULXQ 32(R8), SI, DI
+ ADCXQ CX, SI
+ ADOXQ 32(R10), SI
+ MOVQ SI, 32(R10)
+
+ MULXQ 40(R8), AX, CX
+ ADCXQ DI, AX
+ ADOXQ 40(R10), AX
+ MOVQ AX, 40(R10)
+
+ MULXQ 48(R8), SI, DI
+ ADCXQ CX, SI
+ ADOXQ 48(R10), SI
+ MOVQ SI, 48(R10)
+
+ MULXQ 56(R8), AX, CX
+ ADCXQ DI, AX
+ ADOXQ 56(R10), AX
+ MOVQ AX, 56(R10)
+
+ ADCXQ R9, CX
+ ADOXQ R9, CX
+
+ ADDQ $64, R8
+ ADDQ $64, R10
+ ADDQ $8, BX
+
+ CMPQ BX, R13
+ JL adx_loop
+ MOVQ z+0(FP), R10
+ MOVQ x+24(FP), R8
+ CMPQ BX, R11
+ JL adx_short
+ MOVQ CX, c+56(FP)
+ RET
+
+adx_short:
+ MULXQ (R8)(BX*8), SI, DI
+ ADDQ CX, SI
+ ADCQ $0, DI
+ ADDQ SI, (R10)(BX*8)
+ ADCQ $0, DI
+ MOVQ DI, CX
+ ADDQ $1, BX // i++
+
+ CMPQ BX, R11
+ JL adx_short
+
+ MOVQ CX, c+56(FP)
+ RET
+
+
+