summaryrefslogtreecommitdiffstats
path: root/src/crypto/elliptic/p256_asm_s390x.s
diff options
context:
space:
mode:
Diffstat (limited to 'src/crypto/elliptic/p256_asm_s390x.s')
-rw-r--r--src/crypto/elliptic/p256_asm_s390x.s2714
1 files changed, 2714 insertions, 0 deletions
diff --git a/src/crypto/elliptic/p256_asm_s390x.s b/src/crypto/elliptic/p256_asm_s390x.s
new file mode 100644
index 0000000..cf37e20
--- /dev/null
+++ b/src/crypto/elliptic/p256_asm_s390x.s
@@ -0,0 +1,2714 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+#include "go_asm.h"
+
+
+DATA p256ordK0<>+0x00(SB)/4, $0xee00bc4f
+DATA p256ord<>+0x00(SB)/8, $0xffffffff00000000
+DATA p256ord<>+0x08(SB)/8, $0xffffffffffffffff
+DATA p256ord<>+0x10(SB)/8, $0xbce6faada7179e84
+DATA p256ord<>+0x18(SB)/8, $0xf3b9cac2fc632551
+DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256
+DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256
+DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256
+DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256
+DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
+DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
+DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0 d1 d0 0
+DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0 d1 d0 0
+DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
+DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0
+DATA p256mul<>+0x00(SB)/8, $0xffffffff00000001 // P256
+DATA p256mul<>+0x08(SB)/8, $0x0000000000000000 // P256
+DATA p256mul<>+0x10(SB)/8, $0x00000000ffffffff // P256
+DATA p256mul<>+0x18(SB)/8, $0xffffffffffffffff // P256
+DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0 0 0 d0
+DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0 0 0 d0
+DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0 0 d1 d0
+DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0 0 d1 d0
+DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL 0 d1 d0 d1
+DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL 0 d1 d0 d1
+DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL 0 0 d1 d0
+DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL 0 0 d1 d0
+DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
+DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0
+DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL 0 d1 d0 0
+DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL 0 d1 d0 0
+DATA p256mul<>+0x80(SB)/8, $0x00000000fffffffe // (1*2^256)%P256
+DATA p256mul<>+0x88(SB)/8, $0xffffffffffffffff // (1*2^256)%P256
+DATA p256mul<>+0x90(SB)/8, $0xffffffff00000000 // (1*2^256)%P256
+DATA p256mul<>+0x98(SB)/8, $0x0000000000000001 // (1*2^256)%P256
+GLOBL p256ordK0<>(SB), 8, $4
+GLOBL p256ord<>(SB), 8, $32
+GLOBL p256<>(SB), 8, $80
+GLOBL p256mul<>(SB), 8, $160
+
+DATA p256vmsl<>+0x0(SB)/8, $0x0012131415161718
+DATA p256vmsl<>+0x8(SB)/8, $0x00191a1b1c1d1e1f
+DATA p256vmsl<>+0x10(SB)/8, $0x0012131415161718
+DATA p256vmsl<>+0x18(SB)/8, $0x000b0c0d0e0f1011
+DATA p256vmsl<>+0x20(SB)/8, $0x00191a1b1c1d1e1f
+DATA p256vmsl<>+0x28(SB)/8, $0x0012131415161718
+DATA p256vmsl<>+0x30(SB)/8, $0x000b0c0d0e0f1011
+DATA p256vmsl<>+0x38(SB)/8, $0x0012131415161718
+DATA p256vmsl<>+0x40(SB)/8, $0x000405060708090a
+DATA p256vmsl<>+0x48(SB)/8, $0x000b0c0d0e0f1011
+DATA p256vmsl<>+0x50(SB)/8, $0x000b0c0d0e0f1011
+DATA p256vmsl<>+0x58(SB)/8, $0x000405060708090a
+DATA p256vmsl<>+0x60(SB)/8, $0x1010101000010203
+DATA p256vmsl<>+0x68(SB)/8, $0x100405060708090a
+DATA p256vmsl<>+0x70(SB)/8, $0x100405060708090a
+DATA p256vmsl<>+0x78(SB)/8, $0x1010101000010203
+GLOBL p256vmsl<>(SB), 8, $128
+
+// ---------------------------------------
+// iff cond == 1 val <- -val
+// func p256NegCond(val *p256Point, cond int)
+#define P1ptr R1
+#define CPOOL R4
+
+#define Y1L V0
+#define Y1H V1
+#define T1L V2
+#define T1H V3
+
+#define PL V30
+#define PH V31
+
+#define ZER V4
+#define SEL1 V5
+#define CAR1 V6
+TEXT ·p256NegCond(SB), NOSPLIT, $0
+ MOVD val+0(FP), P1ptr
+
+ MOVD $p256mul<>+0x00(SB), CPOOL
+ VL 16(CPOOL), PL
+ VL 0(CPOOL), PH
+
+ VL 32(P1ptr), Y1H
+ VL 48(P1ptr), Y1L
+
+ VLREPG cond+8(FP), SEL1
+ VZERO ZER
+ VCEQG SEL1, ZER, SEL1
+
+ VSCBIQ Y1L, PL, CAR1
+ VSQ Y1L, PL, T1L
+ VSBIQ PH, Y1H, CAR1, T1H
+
+ VSEL Y1L, T1L, SEL1, Y1L
+ VSEL Y1H, T1H, SEL1, Y1H
+
+ VST Y1H, 32(P1ptr)
+ VST Y1L, 48(P1ptr)
+ RET
+
+#undef P1ptr
+#undef CPOOL
+#undef Y1L
+#undef Y1H
+#undef T1L
+#undef T1H
+#undef PL
+#undef PH
+#undef ZER
+#undef SEL1
+#undef CAR1
+
+// ---------------------------------------
+// if cond == 0 res <- b; else res <- a
+// func p256MovCond(res, a, b *p256Point, cond int)
+#define P3ptr R1
+#define P1ptr R2
+#define P2ptr R3
+
+#define X1L V0
+#define X1H V1
+#define Y1L V2
+#define Y1H V3
+#define Z1L V4
+#define Z1H V5
+#define X2L V6
+#define X2H V7
+#define Y2L V8
+#define Y2H V9
+#define Z2L V10
+#define Z2H V11
+
+#define ZER V18
+#define SEL1 V19
+TEXT ·p256MovCond(SB), NOSPLIT, $0
+ MOVD res+0(FP), P3ptr
+ MOVD a+8(FP), P1ptr
+ MOVD b+16(FP), P2ptr
+ VLREPG cond+24(FP), SEL1
+ VZERO ZER
+ VCEQG SEL1, ZER, SEL1
+
+ VL 0(P1ptr), X1H
+ VL 16(P1ptr), X1L
+ VL 32(P1ptr), Y1H
+ VL 48(P1ptr), Y1L
+ VL 64(P1ptr), Z1H
+ VL 80(P1ptr), Z1L
+
+ VL 0(P2ptr), X2H
+ VL 16(P2ptr), X2L
+ VL 32(P2ptr), Y2H
+ VL 48(P2ptr), Y2L
+ VL 64(P2ptr), Z2H
+ VL 80(P2ptr), Z2L
+
+ VSEL X2L, X1L, SEL1, X1L
+ VSEL X2H, X1H, SEL1, X1H
+ VSEL Y2L, Y1L, SEL1, Y1L
+ VSEL Y2H, Y1H, SEL1, Y1H
+ VSEL Z2L, Z1L, SEL1, Z1L
+ VSEL Z2H, Z1H, SEL1, Z1H
+
+ VST X1H, 0(P3ptr)
+ VST X1L, 16(P3ptr)
+ VST Y1H, 32(P3ptr)
+ VST Y1L, 48(P3ptr)
+ VST Z1H, 64(P3ptr)
+ VST Z1L, 80(P3ptr)
+
+ RET
+
+#undef P3ptr
+#undef P1ptr
+#undef P2ptr
+#undef X1L
+#undef X1H
+#undef Y1L
+#undef Y1H
+#undef Z1L
+#undef Z1H
+#undef X2L
+#undef X2H
+#undef Y2L
+#undef Y2H
+#undef Z2L
+#undef Z2H
+#undef ZER
+#undef SEL1
+
+// ---------------------------------------
+// Constant time table access
+// Indexed from 1 to 15, with -1 offset
+// (index 0 is implicitly point at infinity)
+// func p256Select(point *p256Point, table []p256Point, idx int)
+#define P3ptr R1
+#define P1ptr R2
+#define COUNT R4
+
+#define X1L V0
+#define X1H V1
+#define Y1L V2
+#define Y1H V3
+#define Z1L V4
+#define Z1H V5
+#define X2L V6
+#define X2H V7
+#define Y2L V8
+#define Y2H V9
+#define Z2L V10
+#define Z2H V11
+
+#define ONE V18
+#define IDX V19
+#define SEL1 V20
+#define SEL2 V21
+TEXT ·p256Select(SB), NOSPLIT, $0
+ MOVD point+0(FP), P3ptr
+ MOVD table+8(FP), P1ptr
+ VLREPB idx+(32+7)(FP), IDX
+ VREPIB $1, ONE
+ VREPIB $1, SEL2
+ MOVD $1, COUNT
+
+ VZERO X1H
+ VZERO X1L
+ VZERO Y1H
+ VZERO Y1L
+ VZERO Z1H
+ VZERO Z1L
+
+loop_select:
+ VL 0(P1ptr), X2H
+ VL 16(P1ptr), X2L
+ VL 32(P1ptr), Y2H
+ VL 48(P1ptr), Y2L
+ VL 64(P1ptr), Z2H
+ VL 80(P1ptr), Z2L
+
+ VCEQG SEL2, IDX, SEL1
+
+ VSEL X2L, X1L, SEL1, X1L
+ VSEL X2H, X1H, SEL1, X1H
+ VSEL Y2L, Y1L, SEL1, Y1L
+ VSEL Y2H, Y1H, SEL1, Y1H
+ VSEL Z2L, Z1L, SEL1, Z1L
+ VSEL Z2H, Z1H, SEL1, Z1H
+
+ VAB SEL2, ONE, SEL2
+ ADDW $1, COUNT
+ ADD $96, P1ptr
+ CMPW COUNT, $17
+ BLT loop_select
+
+ VST X1H, 0(P3ptr)
+ VST X1L, 16(P3ptr)
+ VST Y1H, 32(P3ptr)
+ VST Y1L, 48(P3ptr)
+ VST Z1H, 64(P3ptr)
+ VST Z1L, 80(P3ptr)
+ RET
+
+#undef P3ptr
+#undef P1ptr
+#undef COUNT
+#undef X1L
+#undef X1H
+#undef Y1L
+#undef Y1H
+#undef Z1L
+#undef Z1H
+#undef X2L
+#undef X2H
+#undef Y2L
+#undef Y2H
+#undef Z2L
+#undef Z2H
+#undef ONE
+#undef IDX
+#undef SEL1
+#undef SEL2
+
+// ---------------------------------------
+// Constant time table access
+// Indexed from 1 to 15, with -1 offset
+// (index 0 is implicitly point at infinity)
+// func p256SelectBase(point *p256Point, table []p256Point, idx int)
+#define P3ptr R1
+#define P1ptr R2
+#define COUNT R4
+
+#define X1L V0
+#define X1H V1
+#define Y1L V2
+#define Y1H V3
+#define Z1L V4
+#define Z1H V5
+#define X2L V6
+#define X2H V7
+#define Y2L V8
+#define Y2H V9
+#define Z2L V10
+#define Z2H V11
+
+#define ONE V18
+#define IDX V19
+#define SEL1 V20
+#define SEL2 V21
+TEXT ·p256SelectBase(SB), NOSPLIT, $0
+ MOVD point+0(FP), P3ptr
+ MOVD table+8(FP), P1ptr
+ VLREPB idx+(32+7)(FP), IDX
+ VREPIB $1, ONE
+ VREPIB $1, SEL2
+ MOVD $1, COUNT
+
+ VZERO X1H
+ VZERO X1L
+ VZERO Y1H
+ VZERO Y1L
+ VZERO Z1H
+ VZERO Z1L
+
+loop_select:
+ VL 0(P1ptr), X2H
+ VL 16(P1ptr), X2L
+ VL 32(P1ptr), Y2H
+ VL 48(P1ptr), Y2L
+ VL 64(P1ptr), Z2H
+ VL 80(P1ptr), Z2L
+
+ VCEQG SEL2, IDX, SEL1
+
+ VSEL X2L, X1L, SEL1, X1L
+ VSEL X2H, X1H, SEL1, X1H
+ VSEL Y2L, Y1L, SEL1, Y1L
+ VSEL Y2H, Y1H, SEL1, Y1H
+ VSEL Z2L, Z1L, SEL1, Z1L
+ VSEL Z2H, Z1H, SEL1, Z1H
+
+ VAB SEL2, ONE, SEL2
+ ADDW $1, COUNT
+ ADD $96, P1ptr
+ CMPW COUNT, $65
+ BLT loop_select
+
+ VST X1H, 0(P3ptr)
+ VST X1L, 16(P3ptr)
+ VST Y1H, 32(P3ptr)
+ VST Y1L, 48(P3ptr)
+ VST Z1H, 64(P3ptr)
+ VST Z1L, 80(P3ptr)
+ RET
+
+#undef P3ptr
+#undef P1ptr
+#undef COUNT
+#undef X1L
+#undef X1H
+#undef Y1L
+#undef Y1H
+#undef Z1L
+#undef Z1H
+#undef X2L
+#undef X2H
+#undef Y2L
+#undef Y2H
+#undef Z2L
+#undef Z2H
+#undef ONE
+#undef IDX
+#undef SEL1
+#undef SEL2
+
+// ---------------------------------------
+// func p256FromMont(res, in []byte)
+#define res_ptr R1
+#define x_ptr R2
+#define CPOOL R4
+
+#define T0 V0
+#define T1 V1
+#define T2 V2
+#define TT0 V3
+#define TT1 V4
+
+#define ZER V6
+#define SEL1 V7
+#define SEL2 V8
+#define CAR1 V9
+#define CAR2 V10
+#define RED1 V11
+#define RED2 V12
+#define PL V13
+#define PH V14
+
+TEXT ·p256FromMont(SB), NOSPLIT, $0
+ MOVD res+0(FP), res_ptr
+ MOVD in+24(FP), x_ptr
+
+ VZERO T2
+ VZERO ZER
+ MOVD $p256<>+0x00(SB), CPOOL
+ VL 16(CPOOL), PL
+ VL 0(CPOOL), PH
+ VL 48(CPOOL), SEL2
+ VL 64(CPOOL), SEL1
+
+ VL (1*16)(x_ptr), T0
+ VL (0*16)(x_ptr), T1
+
+ // First round
+ VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
+ VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
+ VSQ RED1, RED2, RED2 // Guaranteed not to underflow
+
+ VSLDB $8, T1, T0, T0
+ VSLDB $8, T2, T1, T1
+
+ VACCQ T0, RED1, CAR1
+ VAQ T0, RED1, T0
+ VACCCQ T1, RED2, CAR1, CAR2
+ VACQ T1, RED2, CAR1, T1
+ VAQ T2, CAR2, T2
+
+ // Second round
+ VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
+ VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
+ VSQ RED1, RED2, RED2 // Guaranteed not to underflow
+
+ VSLDB $8, T1, T0, T0
+ VSLDB $8, T2, T1, T1
+
+ VACCQ T0, RED1, CAR1
+ VAQ T0, RED1, T0
+ VACCCQ T1, RED2, CAR1, CAR2
+ VACQ T1, RED2, CAR1, T1
+ VAQ T2, CAR2, T2
+
+ // Third round
+ VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
+ VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
+ VSQ RED1, RED2, RED2 // Guaranteed not to underflow
+
+ VSLDB $8, T1, T0, T0
+ VSLDB $8, T2, T1, T1
+
+ VACCQ T0, RED1, CAR1
+ VAQ T0, RED1, T0
+ VACCCQ T1, RED2, CAR1, CAR2
+ VACQ T1, RED2, CAR1, T1
+ VAQ T2, CAR2, T2
+
+ // Last round
+ VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0
+ VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0
+ VSQ RED1, RED2, RED2 // Guaranteed not to underflow
+
+ VSLDB $8, T1, T0, T0
+ VSLDB $8, T2, T1, T1
+
+ VACCQ T0, RED1, CAR1
+ VAQ T0, RED1, T0
+ VACCCQ T1, RED2, CAR1, CAR2
+ VACQ T1, RED2, CAR1, T1
+ VAQ T2, CAR2, T2
+
+ // ---------------------------------------------------
+
+ VSCBIQ PL, T0, CAR1
+ VSQ PL, T0, TT0
+ VSBCBIQ T1, PH, CAR1, CAR2
+ VSBIQ T1, PH, CAR1, TT1
+ VSBIQ T2, ZER, CAR2, T2
+
+ // what output to use, TT1||TT0 or T1||T0?
+ VSEL T0, TT0, T2, T0
+ VSEL T1, TT1, T2, T1
+
+ VST T0, (1*16)(res_ptr)
+ VST T1, (0*16)(res_ptr)
+ RET
+
+#undef res_ptr
+#undef x_ptr
+#undef CPOOL
+#undef T0
+#undef T1
+#undef T2
+#undef TT0
+#undef TT1
+#undef ZER
+#undef SEL1
+#undef SEL2
+#undef CAR1
+#undef CAR2
+#undef RED1
+#undef RED2
+#undef PL
+#undef PH
+
+// ---------------------------------------
+// func p256OrdMul(res, in1, in2 []byte)
+#define res_ptr R1
+#define x_ptr R2
+#define y_ptr R3
+#define X0 V0
+#define X1 V1
+#define Y0 V2
+#define Y1 V3
+#define M0 V4
+#define M1 V5
+#define T0 V6
+#define T1 V7
+#define T2 V8
+#define YDIG V9
+
+#define ADD1 V16
+#define ADD1H V17
+#define ADD2 V18
+#define ADD2H V19
+#define RED1 V20
+#define RED1H V21
+#define RED2 V22
+#define RED2H V23
+#define CAR1 V24
+#define CAR1M V25
+
+#define MK0 V30
+#define K0 V31
+TEXT ·p256OrdMul(SB), NOSPLIT, $0
+ MOVD res+0(FP), res_ptr
+ MOVD in1+24(FP), x_ptr
+ MOVD in2+48(FP), y_ptr
+
+ VZERO T2
+ MOVD $p256ordK0<>+0x00(SB), R4
+
+ // VLEF $3, 0(R4), K0
+ WORD $0xE7F40000
+ BYTE $0x38
+ BYTE $0x03
+ MOVD $p256ord<>+0x00(SB), R4
+ VL 16(R4), M0
+ VL 0(R4), M1
+
+ VL (1*16)(x_ptr), X0
+ VL (0*16)(x_ptr), X1
+ VL (1*16)(y_ptr), Y0
+ VL (0*16)(y_ptr), Y1
+
+ // ---------------------------------------------------------------------------/
+ VREPF $3, Y0, YDIG
+ VMLF X0, YDIG, ADD1
+ VMLF ADD1, K0, MK0
+ VREPF $3, MK0, MK0
+
+ VMLF X1, YDIG, ADD2
+ VMLHF X0, YDIG, ADD1H
+ VMLHF X1, YDIG, ADD2H
+
+ VMALF M0, MK0, ADD1, RED1
+ VMALHF M0, MK0, ADD1, RED1H
+ VMALF M1, MK0, ADD2, RED2
+ VMALHF M1, MK0, ADD2, RED2H
+
+ VSLDB $12, RED2, RED1, RED1
+ VSLDB $12, T2, RED2, RED2
+
+ VACCQ RED1, ADD1H, CAR1
+ VAQ RED1, ADD1H, T0
+ VACCQ RED1H, T0, CAR1M
+ VAQ RED1H, T0, T0
+
+ // << ready for next MK0
+
+ VACQ RED2, ADD2H, CAR1, T1
+ VACCCQ RED2, ADD2H, CAR1, CAR1
+ VACCCQ RED2H, T1, CAR1M, T2
+ VACQ RED2H, T1, CAR1M, T1
+ VAQ CAR1, T2, T2
+
+ // ---------------------------------------------------
+/* *
+ * ---+--------+--------+
+ * T2| T1 | T0 |
+ * ---+--------+--------+
+ * *(add)*
+ * +--------+--------+
+ * | X1 | X0 |
+ * +--------+--------+
+ * *(mul)*
+ * +--------+--------+
+ * | YDIG | YDIG |
+ * +--------+--------+
+ * *(add)*
+ * +--------+--------+
+ * | M1 | M0 |
+ * +--------+--------+
+ * *(mul)*
+ * +--------+--------+
+ * | MK0 | MK0 |
+ * +--------+--------+
+ *
+ * ---------------------
+ *
+ * +--------+--------+
+ * | ADD2 | ADD1 |
+ * +--------+--------+
+ * +--------+--------+
+ * | ADD2H | ADD1H |
+ * +--------+--------+
+ * +--------+--------+
+ * | RED2 | RED1 |
+ * +--------+--------+
+ * +--------+--------+
+ * | RED2H | RED1H |
+ * +--------+--------+
+ */
+ VREPF $2, Y0, YDIG
+ VMALF X0, YDIG, T0, ADD1
+ VMLF ADD1, K0, MK0
+ VREPF $3, MK0, MK0
+
+ VMALF X1, YDIG, T1, ADD2
+ VMALHF X0, YDIG, T0, ADD1H
+ VMALHF X1, YDIG, T1, ADD2H
+
+ VMALF M0, MK0, ADD1, RED1
+ VMALHF M0, MK0, ADD1, RED1H
+ VMALF M1, MK0, ADD2, RED2
+ VMALHF M1, MK0, ADD2, RED2H
+
+ VSLDB $12, RED2, RED1, RED1
+ VSLDB $12, T2, RED2, RED2
+
+ VACCQ RED1, ADD1H, CAR1
+ VAQ RED1, ADD1H, T0
+ VACCQ RED1H, T0, CAR1M
+ VAQ RED1H, T0, T0
+
+ // << ready for next MK0
+
+ VACQ RED2, ADD2H, CAR1, T1
+ VACCCQ RED2, ADD2H, CAR1, CAR1
+ VACCCQ RED2H, T1, CAR1M, T2
+ VACQ RED2H, T1, CAR1M, T1
+ VAQ CAR1, T2, T2
+
+ // ---------------------------------------------------
+ VREPF $1, Y0, YDIG
+ VMALF X0, YDIG, T0, ADD1
+ VMLF ADD1, K0, MK0
+ VREPF $3, MK0, MK0
+
+ VMALF X1, YDIG, T1, ADD2
+ VMALHF X0, YDIG, T0, ADD1H
+ VMALHF X1, YDIG, T1, ADD2H
+
+ VMALF M0, MK0, ADD1, RED1
+ VMALHF M0, MK0, ADD1, RED1H
+ VMALF M1, MK0, ADD2, RED2
+ VMALHF M1, MK0, ADD2, RED2H
+
+ VSLDB $12, RED2, RED1, RED1
+ VSLDB $12, T2, RED2, RED2
+
+ VACCQ RED1, ADD1H, CAR1
+ VAQ RED1, ADD1H, T0
+ VACCQ RED1H, T0, CAR1M
+ VAQ RED1H, T0, T0
+
+ // << ready for next MK0
+
+ VACQ RED2, ADD2H, CAR1, T1
+ VACCCQ RED2, ADD2H, CAR1, CAR1
+ VACCCQ RED2H, T1, CAR1M, T2
+ VACQ RED2H, T1, CAR1M, T1
+ VAQ CAR1, T2, T2
+
+ // ---------------------------------------------------
+ VREPF $0, Y0, YDIG
+ VMALF X0, YDIG, T0, ADD1
+ VMLF ADD1, K0, MK0
+ VREPF $3, MK0, MK0
+
+ VMALF X1, YDIG, T1, ADD2
+ VMALHF X0, YDIG, T0, ADD1H
+ VMALHF X1, YDIG, T1, ADD2H
+
+ VMALF M0, MK0, ADD1, RED1
+ VMALHF M0, MK0, ADD1, RED1H
+ VMALF M1, MK0, ADD2, RED2
+ VMALHF M1, MK0, ADD2, RED2H
+
+ VSLDB $12, RED2, RED1, RED1
+ VSLDB $12, T2, RED2, RED2
+
+ VACCQ RED1, ADD1H, CAR1
+ VAQ RED1, ADD1H, T0
+ VACCQ RED1H, T0, CAR1M
+ VAQ RED1H, T0, T0
+
+ // << ready for next MK0
+
+ VACQ RED2, ADD2H, CAR1, T1
+ VACCCQ RED2, ADD2H, CAR1, CAR1
+ VACCCQ RED2H, T1, CAR1M, T2
+ VACQ RED2H, T1, CAR1M, T1
+ VAQ CAR1, T2, T2
+
+ // ---------------------------------------------------
+ VREPF $3, Y1, YDIG
+ VMALF X0, YDIG, T0, ADD1
+ VMLF ADD1, K0, MK0
+ VREPF $3, MK0, MK0
+
+ VMALF X1, YDIG, T1, ADD2
+ VMALHF X0, YDIG, T0, ADD1H
+ VMALHF X1, YDIG, T1, ADD2H
+
+ VMALF M0, MK0, ADD1, RED1
+ VMALHF M0, MK0, ADD1, RED1H
+ VMALF M1, MK0, ADD2, RED2
+ VMALHF M1, MK0, ADD2, RED2H
+
+ VSLDB $12, RED2, RED1, RED1
+ VSLDB $12, T2, RED2, RED2
+
+ VACCQ RED1, ADD1H, CAR1
+ VAQ RED1, ADD1H, T0
+ VACCQ RED1H, T0, CAR1M
+ VAQ RED1H, T0, T0
+
+ // << ready for next MK0
+
+ VACQ RED2, ADD2H, CAR1, T1
+ VACCCQ RED2, ADD2H, CAR1, CAR1
+ VACCCQ RED2H, T1, CAR1M, T2
+ VACQ RED2H, T1, CAR1M, T1
+ VAQ CAR1, T2, T2
+
+ // ---------------------------------------------------
+ VREPF $2, Y1, YDIG
+ VMALF X0, YDIG, T0, ADD1
+ VMLF ADD1, K0, MK0
+ VREPF $3, MK0, MK0
+
+ VMALF X1, YDIG, T1, ADD2
+ VMALHF X0, YDIG, T0, ADD1H
+ VMALHF X1, YDIG, T1, ADD2H
+
+ VMALF M0, MK0, ADD1, RED1
+ VMALHF M0, MK0, ADD1, RED1H
+ VMALF M1, MK0, ADD2, RED2
+ VMALHF M1, MK0, ADD2, RED2H
+
+ VSLDB $12, RED2, RED1, RED1
+ VSLDB $12, T2, RED2, RED2
+
+ VACCQ RED1, ADD1H, CAR1
+ VAQ RED1, ADD1H, T0
+ VACCQ RED1H, T0, CAR1M
+ VAQ RED1H, T0, T0
+
+ // << ready for next MK0
+
+ VACQ RED2, ADD2H, CAR1, T1
+ VACCCQ RED2, ADD2H, CAR1, CAR1
+ VACCCQ RED2H, T1, CAR1M, T2
+ VACQ RED2H, T1, CAR1M, T1
+ VAQ CAR1, T2, T2
+
+ // ---------------------------------------------------
+ VREPF $1, Y1, YDIG
+ VMALF X0, YDIG, T0, ADD1
+ VMLF ADD1, K0, MK0
+ VREPF $3, MK0, MK0
+
+ VMALF X1, YDIG, T1, ADD2
+ VMALHF X0, YDIG, T0, ADD1H
+ VMALHF X1, YDIG, T1, ADD2H
+
+ VMALF M0, MK0, ADD1, RED1
+ VMALHF M0, MK0, ADD1, RED1H
+ VMALF M1, MK0, ADD2, RED2
+ VMALHF M1, MK0, ADD2, RED2H
+
+ VSLDB $12, RED2, RED1, RED1
+ VSLDB $12, T2, RED2, RED2
+
+ VACCQ RED1, ADD1H, CAR1
+ VAQ RED1, ADD1H, T0
+ VACCQ RED1H, T0, CAR1M
+ VAQ RED1H, T0, T0
+
+ // << ready for next MK0
+
+ VACQ RED2, ADD2H, CAR1, T1
+ VACCCQ RED2, ADD2H, CAR1, CAR1
+ VACCCQ RED2H, T1, CAR1M, T2
+ VACQ RED2H, T1, CAR1M, T1
+ VAQ CAR1, T2, T2
+
+ // ---------------------------------------------------
+ VREPF $0, Y1, YDIG
+ VMALF X0, YDIG, T0, ADD1
+ VMLF ADD1, K0, MK0
+ VREPF $3, MK0, MK0
+
+ VMALF X1, YDIG, T1, ADD2
+ VMALHF X0, YDIG, T0, ADD1H
+ VMALHF X1, YDIG, T1, ADD2H
+
+ VMALF M0, MK0, ADD1, RED1
+ VMALHF M0, MK0, ADD1, RED1H
+ VMALF M1, MK0, ADD2, RED2
+ VMALHF M1, MK0, ADD2, RED2H
+
+ VSLDB $12, RED2, RED1, RED1
+ VSLDB $12, T2, RED2, RED2
+
+ VACCQ RED1, ADD1H, CAR1
+ VAQ RED1, ADD1H, T0
+ VACCQ RED1H, T0, CAR1M
+ VAQ RED1H, T0, T0
+
+ // << ready for next MK0
+
+ VACQ RED2, ADD2H, CAR1, T1
+ VACCCQ RED2, ADD2H, CAR1, CAR1
+ VACCCQ RED2H, T1, CAR1M, T2
+ VACQ RED2H, T1, CAR1M, T1
+ VAQ CAR1, T2, T2
+
+ // ---------------------------------------------------
+
+ VZERO RED1
+ VSCBIQ M0, T0, CAR1
+ VSQ M0, T0, ADD1
+ VSBCBIQ T1, M1, CAR1, CAR1M
+ VSBIQ T1, M1, CAR1, ADD2
+ VSBIQ T2, RED1, CAR1M, T2
+
+ // what output to use, ADD2||ADD1 or T1||T0?
+ VSEL T0, ADD1, T2, T0
+ VSEL T1, ADD2, T2, T1
+
+ VST T0, (1*16)(res_ptr)
+ VST T1, (0*16)(res_ptr)
+ RET
+
+#undef res_ptr
+#undef x_ptr
+#undef y_ptr
+#undef X0
+#undef X1
+#undef Y0
+#undef Y1
+#undef M0
+#undef M1
+#undef T0
+#undef T1
+#undef T2
+#undef YDIG
+
+#undef ADD1
+#undef ADD1H
+#undef ADD2
+#undef ADD2H
+#undef RED1
+#undef RED1H
+#undef RED2
+#undef RED2H
+#undef CAR1
+#undef CAR1M
+
+#undef MK0
+#undef K0
+
+// ---------------------------------------
+// p256MulInternalVX
+// V0-V3,V30,V31 - Not Modified
+// V4-V15 - Volatile
+
+#define CPOOL R4
+
+// Parameters
+#define X0 V0 // Not modified
+#define X1 V1 // Not modified
+#define Y0 V2 // Not modified
+#define Y1 V3 // Not modified
+#define T0 V4
+#define T1 V5
+#define P0 V30 // Not modified
+#define P1 V31 // Not modified
+
+// Temporaries
+#define YDIG V6 // Overloaded with CAR2, ZER
+#define ADD1H V7 // Overloaded with ADD3H
+#define ADD2H V8 // Overloaded with ADD4H
+#define ADD3 V9 // Overloaded with SEL2,SEL5
+#define ADD4 V10 // Overloaded with SEL3,SEL6
+#define RED1 V11 // Overloaded with CAR2
+#define RED2 V12
+#define RED3 V13 // Overloaded with SEL1
+#define T2 V14
+// Overloaded temporaries
+#define ADD1 V4 // Overloaded with T0
+#define ADD2 V5 // Overloaded with T1
+#define ADD3H V7 // Overloaded with ADD1H
+#define ADD4H V8 // Overloaded with ADD2H
+#define ZER V6 // Overloaded with YDIG, CAR2
+#define CAR1 V6 // Overloaded with YDIG, ZER
+#define CAR2 V11 // Overloaded with RED1
+// Constant Selects
+#define SEL1 V13 // Overloaded with RED3
+#define SEL2 V9 // Overloaded with ADD3,SEL5
+#define SEL3 V10 // Overloaded with ADD4,SEL6
+#define SEL4 V6 // Overloaded with YDIG,CAR2,ZER
+#define SEL5 V9 // Overloaded with ADD3,SEL2
+#define SEL6 V10 // Overloaded with ADD4,SEL3
+
+/* *
+ * To follow the flow of bits, for your own sanity a stiff drink, need you shall.
+ * Of a single round, a 'helpful' picture, here is. Meaning, column position has.
+ * With you, SIMD be...
+ *
+ * +--------+--------+
+ * +--------| RED2 | RED1 |
+ * | +--------+--------+
+ * | ---+--------+--------+
+ * | +---- T2| T1 | T0 |--+
+ * | | ---+--------+--------+ |
+ * | | |
+ * | | ======================= |
+ * | | |
+ * | | +--------+--------+<-+
+ * | +-------| ADD2 | ADD1 |--|-----+
+ * | | +--------+--------+ | |
+ * | | +--------+--------+<---+ |
+ * | | | ADD2H | ADD1H |--+ |
+ * | | +--------+--------+ | |
+ * | | +--------+--------+<-+ |
+ * | | | ADD4 | ADD3 |--|-+ |
+ * | | +--------+--------+ | | |
+ * | | +--------+--------+<---+ | |
+ * | | | ADD4H | ADD3H |------|-+ |(+vzero)
+ * | | +--------+--------+ | | V
+ * | | ------------------------ | | +--------+
+ * | | | | | RED3 | [d0 0 0 d0]
+ * | | | | +--------+
+ * | +---->+--------+--------+ | | |
+ * (T2[1w]||ADD2[4w]||ADD1[3w]) +--------| T1 | T0 | | | |
+ * | +--------+--------+ | | |
+ * +---->---+--------+--------+ | | |
+ * T2| T1 | T0 |----+ | |
+ * ---+--------+--------+ | | |
+ * ---+--------+--------+<---+ | |
+ * +--- T2| T1 | T0 |----------+
+ * | ---+--------+--------+ | |
+ * | +--------+--------+<-------------+
+ * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0]
+ * | +--------+--------+ | | |
+ * | +--------+<----------------------+
+ * | | RED3 |--------------+ | [0 0 d1 d0]
+ * | +--------+ | |
+ * +--->+--------+--------+ | |
+ * | T1 | T0 |--------+
+ * +--------+--------+ | |
+ * --------------------------- | |
+ * | |
+ * +--------+--------+<----+ |
+ * | RED2 | RED1 | |
+ * +--------+--------+ |
+ * ---+--------+--------+<-------+
+ * T2| T1 | T0 | (H1P-H1P-H00RRAY!)
+ * ---+--------+--------+
+ *
+ * *Mi obra de arte de siglo XXI @vpaprots
+ *
+ *
+ * First group is special, doesn't get the two inputs:
+ * +--------+--------+<-+
+ * +-------| ADD2 | ADD1 |--|-----+
+ * | +--------+--------+ | |
+ * | +--------+--------+<---+ |
+ * | | ADD2H | ADD1H |--+ |
+ * | +--------+--------+ | |
+ * | +--------+--------+<-+ |
+ * | | ADD4 | ADD3 |--|-+ |
+ * | +--------+--------+ | | |
+ * | +--------+--------+<---+ | |
+ * | | ADD4H | ADD3H |------|-+ |(+vzero)
+ * | +--------+--------+ | | V
+ * | ------------------------ | | +--------+
+ * | | | | RED3 | [d0 0 0 d0]
+ * | | | +--------+
+ * +---->+--------+--------+ | | |
+ * (T2[1w]||ADD2[4w]||ADD1[3w]) | T1 | T0 |----+ | |
+ * +--------+--------+ | | |
+ * ---+--------+--------+<---+ | |
+ * +--- T2| T1 | T0 |----------+
+ * | ---+--------+--------+ | |
+ * | +--------+--------+<-------------+
+ * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0]
+ * | +--------+--------+ | | |
+ * | +--------+<----------------------+
+ * | | RED3 |--------------+ | [0 0 d1 d0]
+ * | +--------+ | |
+ * +--->+--------+--------+ | |
+ * | T1 | T0 |--------+
+ * +--------+--------+ | |
+ * --------------------------- | |
+ * | |
+ * +--------+--------+<----+ |
+ * | RED2 | RED1 | |
+ * +--------+--------+ |
+ * ---+--------+--------+<-------+
+ * T2| T1 | T0 | (H1P-H1P-H00RRAY!)
+ * ---+--------+--------+
+ *
+ * Last 'group' needs to RED2||RED1 shifted less
+ */
+TEXT ·p256MulInternalVX(SB), NOSPLIT, $0-0
+ VL 32(CPOOL), SEL1
+ VL 48(CPOOL), SEL2
+ VL 64(CPOOL), SEL3
+ VL 80(CPOOL), SEL4
+
+ // ---------------------------------------------------
+
+ VREPF $3, Y0, YDIG
+ VMLHF X0, YDIG, ADD1H
+ VMLHF X1, YDIG, ADD2H
+ VMLF X0, YDIG, ADD1
+ VMLF X1, YDIG, ADD2
+
+ VREPF $2, Y0, YDIG
+ VMALF X0, YDIG, ADD1H, ADD3
+ VMALF X1, YDIG, ADD2H, ADD4
+ VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
+ VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
+
+ VZERO ZER
+ VL 32(CPOOL), SEL1
+ VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
+
+ VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
+ VSLDB $12, ZER, ADD2, T1 // ADD2 Free
+
+ VACCQ T0, ADD3, CAR1
+ VAQ T0, ADD3, T0 // ADD3 Free
+ VACCCQ T1, ADD4, CAR1, T2
+ VACQ T1, ADD4, CAR1, T1 // ADD4 Free
+
+ VL 48(CPOOL), SEL2
+ VL 64(CPOOL), SEL3
+ VL 80(CPOOL), SEL4
+ VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
+ VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
+ VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
+ VSQ RED3, RED2, RED2 // Guaranteed not to underflow
+
+ VSLDB $12, T1, T0, T0
+ VSLDB $12, T2, T1, T1
+
+ VACCQ T0, ADD3H, CAR1
+ VAQ T0, ADD3H, T0
+ VACCCQ T1, ADD4H, CAR1, T2
+ VACQ T1, ADD4H, CAR1, T1
+
+ // ---------------------------------------------------
+
+ VREPF $1, Y0, YDIG
+ VMALHF X0, YDIG, T0, ADD1H
+ VMALHF X1, YDIG, T1, ADD2H
+ VMALF X0, YDIG, T0, ADD1 // T0 Free->ADD1
+ VMALF X1, YDIG, T1, ADD2 // T1 Free->ADD2
+
+ VREPF $0, Y0, YDIG
+ VMALF X0, YDIG, ADD1H, ADD3
+ VMALF X1, YDIG, ADD2H, ADD4
+ VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H
+ VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER
+
+ VZERO ZER
+ VL 32(CPOOL), SEL1
+ VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
+
+ VSLDB $12, ADD2, ADD1, T0 // ADD1 Free->T0
+ VSLDB $12, T2, ADD2, T1 // ADD2 Free->T1, T2 Free
+
+ VACCQ T0, RED1, CAR1
+ VAQ T0, RED1, T0
+ VACCCQ T1, RED2, CAR1, T2
+ VACQ T1, RED2, CAR1, T1
+
+ VACCQ T0, ADD3, CAR1
+ VAQ T0, ADD3, T0
+ VACCCQ T1, ADD4, CAR1, CAR2
+ VACQ T1, ADD4, CAR1, T1
+ VAQ T2, CAR2, T2
+
+ VL 48(CPOOL), SEL2
+ VL 64(CPOOL), SEL3
+ VL 80(CPOOL), SEL4
+ VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
+ VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
+ VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
+ VSQ RED3, RED2, RED2 // Guaranteed not to underflow
+
+ VSLDB $12, T1, T0, T0
+ VSLDB $12, T2, T1, T1
+
+ VACCQ T0, ADD3H, CAR1
+ VAQ T0, ADD3H, T0
+ VACCCQ T1, ADD4H, CAR1, T2
+ VACQ T1, ADD4H, CAR1, T1
+
+ // ---------------------------------------------------
+
+ VREPF $3, Y1, YDIG
+ VMALHF X0, YDIG, T0, ADD1H
+ VMALHF X1, YDIG, T1, ADD2H
+ VMALF X0, YDIG, T0, ADD1
+ VMALF X1, YDIG, T1, ADD2
+
+ VREPF $2, Y1, YDIG
+ VMALF X0, YDIG, ADD1H, ADD3
+ VMALF X1, YDIG, ADD2H, ADD4
+ VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free
+ VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free
+
+ VZERO ZER
+ VL 32(CPOOL), SEL1
+ VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
+
+ VSLDB $12, ADD2, ADD1, T0 // ADD1 Free
+ VSLDB $12, T2, ADD2, T1 // ADD2 Free
+
+ VACCQ T0, RED1, CAR1
+ VAQ T0, RED1, T0
+ VACCCQ T1, RED2, CAR1, T2
+ VACQ T1, RED2, CAR1, T1
+
+ VACCQ T0, ADD3, CAR1
+ VAQ T0, ADD3, T0
+ VACCCQ T1, ADD4, CAR1, CAR2
+ VACQ T1, ADD4, CAR1, T1
+ VAQ T2, CAR2, T2
+
+ VL 48(CPOOL), SEL2
+ VL 64(CPOOL), SEL3
+ VL 80(CPOOL), SEL4
+ VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0]
+ VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1]
+ VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0]
+ VSQ RED3, RED2, RED2 // Guaranteed not to underflow
+
+ VSLDB $12, T1, T0, T0
+ VSLDB $12, T2, T1, T1
+
+ VACCQ T0, ADD3H, CAR1
+ VAQ T0, ADD3H, T0
+ VACCCQ T1, ADD4H, CAR1, T2
+ VACQ T1, ADD4H, CAR1, T1
+
+ // ---------------------------------------------------
+
+ VREPF $1, Y1, YDIG
+ VMALHF X0, YDIG, T0, ADD1H
+ VMALHF X1, YDIG, T1, ADD2H
+ VMALF X0, YDIG, T0, ADD1
+ VMALF X1, YDIG, T1, ADD2
+
+ VREPF $0, Y1, YDIG
+ VMALF X0, YDIG, ADD1H, ADD3
+ VMALF X1, YDIG, ADD2H, ADD4
+ VMALHF X0, YDIG, ADD1H, ADD3H
+ VMALHF X1, YDIG, ADD2H, ADD4H
+
+ VZERO ZER
+ VL 32(CPOOL), SEL1
+ VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0]
+
+ VSLDB $12, ADD2, ADD1, T0
+ VSLDB $12, T2, ADD2, T1
+
+ VACCQ T0, RED1, CAR1
+ VAQ T0, RED1, T0
+ VACCCQ T1, RED2, CAR1, T2
+ VACQ T1, RED2, CAR1, T1
+
+ VACCQ T0, ADD3, CAR1
+ VAQ T0, ADD3, T0
+ VACCCQ T1, ADD4, CAR1, CAR2
+ VACQ T1, ADD4, CAR1, T1
+ VAQ T2, CAR2, T2
+
+ VL 96(CPOOL), SEL5
+ VL 112(CPOOL), SEL6
+ VPERM T0, RED3, SEL5, RED2 // [d1 d0 d1 d0]
+ VPERM T0, RED3, SEL6, RED1 // [ 0 d1 d0 0]
+ VSQ RED1, RED2, RED2 // Guaranteed not to underflow
+
+ VSLDB $12, T1, T0, T0
+ VSLDB $12, T2, T1, T1
+
+ VACCQ T0, ADD3H, CAR1
+ VAQ T0, ADD3H, T0
+ VACCCQ T1, ADD4H, CAR1, T2
+ VACQ T1, ADD4H, CAR1, T1
+
+ VACCQ T0, RED1, CAR1
+ VAQ T0, RED1, T0
+ VACCCQ T1, RED2, CAR1, CAR2
+ VACQ T1, RED2, CAR1, T1
+ VAQ T2, CAR2, T2
+
+ // ---------------------------------------------------
+
+ VZERO RED3
+ VSCBIQ P0, T0, CAR1
+ VSQ P0, T0, ADD1H
+ VSBCBIQ T1, P1, CAR1, CAR2
+ VSBIQ T1, P1, CAR1, ADD2H
+ VSBIQ T2, RED3, CAR2, T2
+
+ // what output to use, ADD2H||ADD1H or T1||T0?
+ VSEL T0, ADD1H, T2, T0
+ VSEL T1, ADD2H, T2, T1
+ RET
+
+#undef CPOOL
+
+#undef X0
+#undef X1
+#undef Y0
+#undef Y1
+#undef T0
+#undef T1
+#undef P0
+#undef P1
+
+#undef SEL1
+#undef SEL2
+#undef SEL3
+#undef SEL4
+#undef SEL5
+#undef SEL6
+
+#undef YDIG
+#undef ADD1H
+#undef ADD2H
+#undef ADD3
+#undef ADD4
+#undef RED1
+#undef RED2
+#undef RED3
+#undef T2
+#undef ADD1
+#undef ADD2
+#undef ADD3H
+#undef ADD4H
+#undef ZER
+#undef CAR1
+#undef CAR2
+
+// ---------------------------------------
+// p256MulInternalVMSL
+// V0-V3,V30,V31 - Not Modified
+// V4-V14 - Volatile
+
+#define CPOOL R4
+#define SCRATCH R9
+
+// Parameters
+#define X0 V0 // Not modified
+#define X1 V1 // Not modified
+#define Y0 V2 // Not modified
+#define Y1 V3 // Not modified
+#define T0 V4
+#define T1 V5
+#define T2 V6
+#define P0 V30 // Not modified
+#define P1 V31 // Not modified
+
+// input: d0
+// output: h0, h1
+// temp: TEMP, ZERO, BORROW
+#define OBSERVATION3(d0, h0, h1, TEMP, ZERO, BORROW) \
+ VZERO ZERO \
+ VSLDB $4, d0, ZERO, h0 \
+ VLR h0, BORROW \
+ VSLDB $12, ZERO, h0, TEMP \
+ VSQ TEMP, h0, h0 \
+ VSLDB $12, d0, BORROW, h1 \
+ VSLDB $8, ZERO, BORROW, TEMP \
+ VAQ TEMP, h0, h0 \
+
+#define OBSERVATION3A(d2, h0, h1, TEMP, ZERO) \
+ VZERO ZERO \
+ VSLDB $8, d2, ZERO, TEMP \
+ VSLDB $8, d2, TEMP, h0 \
+ VSLDB $12, ZERO, TEMP, h1 \
+ VSQ h1, h0, h0 \
+
+TEXT ·p256MulInternalVMSL(SB), NOFRAME|NOSPLIT, $0-0
+ VSTM V16, V19, (SCRATCH)
+
+ MOVD $p256vmsl<>+0x00(SB), CPOOL
+
+ // Divide input1 into 5 limbs
+ VGBM $0x007f, V14
+ VZERO V12
+ VSLDB $2, X1, X0, V13
+ VSLDB $2, Y1, Y0, V8
+ VSLDB $4, V12, X1, V11 // V11(X1): 4 bytes limb
+ VSLDB $4, V12, Y1, V6 // V6: 4 bytes limb
+
+ VN V14, X0, V5 // V5: first 7 bytes limb
+ VN V14, Y0, V10 // V10: first 7 bytes limb
+ VN V14, V13, V13 // v13: third 7 bytes limb
+ VN V14, V8, V8 // V8: third 7 bytes limb
+
+ VMSLG V10, V5, V12, V10 // v10: l10 x l5 (column 1)
+ VMSLG V8, V5, V12, V8 // v8: l8 x l5
+ VMSLG V6, V13, V12, V13 // v13: l6 x l3
+ VMSLG V6, V11, V12, V11 // v11: l6 x l1 (column 9)
+ VMSLG V6, V5, V12, V6 // v6: l6 x l5
+
+ MOVD $p256vmsl<>+0x00(SB), CPOOL
+ VGBM $0x7f7f, V14
+
+ VL 0(CPOOL), V4
+ VL 16(CPOOL), V7
+ VL 32(CPOOL), V9
+ VL 48(CPOOL), V5
+ VLM 64(CPOOL), V16, V19
+
+ VPERM V12, X0, V4, V4 // v4: limb4 | limb5
+ VPERM Y1, Y0, V7, V7
+ VPERM V12, Y0, V9, V9 // v9: limb10 | limb9
+ VPERM X1, X0, V5, V5
+ VPERM X1, X0, V16, V16
+ VPERM Y1, Y0, V17, V17
+ VPERM X1, V12, V18, V18 // v18: limb1 | limb2
+ VPERM Y1, V12, V19, V19 // v19: limb7 | limb6
+ VN V14, V7, V7 // v7: limb9 | limb8
+ VN V14, V5, V5 // v5: limb3 | limb4
+ VN V14, V16, V16 // v16: limb2 | limb3
+ VN V14, V17, V17 // v17: limb8 | limb7
+
+ VMSLG V9, V4, V12, V14 // v14: l10 x l4 + l9 x l5 (column 2)
+ VMSLG V9, V5, V8, V8 // v8: l10 x l9 + l3 x l4 + l8 x l5 (column 3)
+ VMSLG V9, V16, V12, V16 // v16: l10 x l9 + l2 x l3
+ VMSLG V9, V18, V12, V9 // v9: l10 x l1 + l9 x l2
+ VMSLG V7, V18, V12, V7 // v7: l9 x l1 + l8 x l2
+ VMSLG V17, V4, V16, V16 // v16: l8 x l4 + l7 x l5 + l10 x l9 + l2 x l3 (column 4)
+ VMSLG V17, V5, V9, V9 // v9: l10 x l1 + l9 x l2 + l8 x l3 + l7 x l4
+ VMSLG V17, V18, V12, V17 // v18: l8 x l1 + l7 x l2
+ VMSLG V19, V5, V7, V7 // v7: l9 x l1 + l8 x l2 + l7 x l3 + l6 x l4 (column 6)
+ VMSLG V19, V18, V12, V19 // v19: l7 x l1 + l6 x l2 (column 8)
+ VAQ V9, V6, V9 // v9: l10 x l1 + l9 x l2 + l8 x l3 + l7 x l4 + l6 x l5 (column 5)
+ VAQ V17, V13, V13 // v13: l8 x l1 + l7 x l2 + l6 x l3 (column 7)
+
+ VSLDB $9, V12, V10, V4
+ VSLDB $9, V12, V7, V5
+ VAQ V4, V14, V14
+ VAQ V5, V13, V13
+
+ VSLDB $9, V12, V14, V4
+ VSLDB $9, V12, V13, V5
+ VAQ V4, V8, V8
+ VAQ V5, V19, V19
+
+ VSLDB $9, V12, V8, V4
+ VSLDB $9, V12, V19, V5
+ VAQ V4, V16, V16
+ VAQ V5, V11, V11
+
+ VSLDB $9, V12, V16, V4
+ VAQ V4, V9, V17
+
+ VGBM $0x007f, V4
+ VGBM $0x00ff, V5
+
+ VN V10, V4, V10
+ VN V14, V4, V14
+ VN V8, V4, V8
+ VN V16, V4, V16
+ VN V17, V4, V9
+ VN V7, V4, V7
+ VN V13, V4, V13
+ VN V19, V4, V19
+ VN V11, V5, V11
+
+ VSLDB $7, V14, V14, V14
+ VSLDB $14, V8, V12, V4
+ VSLDB $14, V12, V8, V8
+ VSLDB $5, V16, V16, V16
+ VSLDB $12, V9, V12, V5
+
+ VO V14, V10, V10
+ VO V8, V16, V16
+ VO V4, V10, V10 // first rightmost 128bits of the multiplication result
+ VO V5, V16, V16 // second rightmost 128bits of the multiplication result
+
+ // adjust v7, v13, v19, v11
+ VSLDB $7, V13, V13, V13
+ VSLDB $14, V19, V12, V4
+ VSLDB $14, V12, V19, V19
+ VSLDB $5, V11, V12, V5
+ VO V13, V7, V7
+ VO V4, V7, V7
+ VO V19, V5, V11
+
+ VSLDB $9, V12, V17, V14
+ VSLDB $12, V12, V9, V9
+ VACCQ V7, V14, V13
+ VAQ V7, V14, V7
+ VAQ V11, V13, V11
+
+ // First reduction, 96 bits
+ VSLDB $4, V16, V10, T0
+ VSLDB $4, V12, V16, T1
+ VSLDB $3, V11, V7, V11 // fourth rightmost 128bits of the multiplication result
+ VSLDB $3, V7, V12, V7
+ OBSERVATION3(V10, V8, T2, V17, V18, V19)// results V8 | T2
+ VO V7, V9, V7 // third rightmost 128bits of the multiplication result
+ VACCQ T0, T2, V9
+ VAQ T0, T2, T2
+ VACQ T1, V8, V9, V8
+
+ // Second reduction 96 bits
+ VSLDB $4, V8, T2, T0
+ VSLDB $4, V12, V8, T1
+ OBSERVATION3(T2, V9, V8, V17, V18, V19)// results V9 | V8
+ VACCQ T0, V8, T2
+ VAQ T0, V8, V8
+ VACQ T1, V9, T2, V9
+
+ // Third reduction 64 bits
+ VSLDB $8, V9, V8, T0
+ VSLDB $8, V12, V9, T1
+ OBSERVATION3A(V8, V14, V13, V17, V18)// results V14 | V13
+ VACCQ T0, V13, V12
+ VAQ T0, V13, V13
+ VACQ T1, V14, V12, V14
+ VACCQ V13, V7, V12
+ VAQ V13, V7, T0
+ VACCCQ V14, V11, V12, T2
+ VACQ V14, V11, V12, T1 // results T2 | T1 | T0
+
+ // ---------------------------------------------------
+ MOVD $p256mul<>+0x00(SB), CPOOL
+
+ VZERO V12
+ VSCBIQ P0, T0, V8
+ VSQ P0, T0, V7
+ VSBCBIQ T1, P1, V8, V10
+ VSBIQ T1, P1, V8, V9
+ VSBIQ T2, V12, V10, T2
+
+ // what output to use, V9||V7 or T1||T0?
+ VSEL T0, V7, T2, T0
+ VSEL T1, V9, T2, T1
+
+ VLM (SCRATCH), V16, V19
+
+ RET
+
+// ---------------------------------------
+// p256SqrInternalVMSL
+// V0-V1,V30,V31 - Not Modified
+// V4-V14 - Volatile
+
+TEXT ·p256SqrInternalVMSL(SB), NOFRAME|NOSPLIT, $0-0
+ VSTM V16, V18, (SCRATCH)
+
+ MOVD $p256vmsl<>+0x00(SB), CPOOL
+ // Divide input into limbs
+ VGBM $0x007f, V14
+ VZERO V12
+ VSLDB $2, X1, X0, V13
+ VSLDB $4, V12, X1, V11 // V11(X1): 4 bytes limb
+
+ VN V14, X0, V10 // V10: first 7 bytes limb
+ VN V14, V13, V13 // v13: third 7 bytes limb
+
+ VMSLG V10, V10, V12, V10 // v10: l10 x l5 (column 1)
+ VMSLG V13, V13, V12, V13 // v13: l8 x l3
+ VMSLG V11, V11, V12, V11 // v11: l6 x l1 (column 9)
+
+ MOVD $p256vmsl<>+0x00(SB), CPOOL
+ VGBM $0x7f7f, V14
+
+ VL 0(CPOOL), V4
+ VL 16(CPOOL), V7
+ VL 32(CPOOL), V9
+ VL 48(CPOOL), V5
+ VLM 64(CPOOL), V16, V18
+ VL 112(CPOOL), V8
+
+ VPERM V12, X0, V4, V4 // v4: limb4 | limb5
+ VPERM X1, X0, V7, V7
+ VPERM V12, X0, V9, V9 // v9: limb10 | limb9
+ VPERM X1, X0, V5, V5
+ VPERM X1, X0, V16, V16
+ VPERM X1, X0, V17, V17
+ VPERM X1, V12, V18, V18 // v18: limb1 | limb2
+ VPERM X1, V12, V8, V8 // v8: limb7 | limb6
+ VN V14, V7, V7 // v7: limb9 | limb8
+ VN V14, V5, V5 // v5: limb3 | limb4
+ VN V14, V16, V16 // v16: limb2 | limb3
+ VN V14, V17, V17 // v17: limb8 | limb7
+
+ VMSLEOG V9, V18, V13, V6 // v6: l10 x l1 + l9 x l2 + l8 x l3 + l7 x l4 + l6 x l5 (column 5)
+ VMSLG V9, V4, V12, V14 // v14: l10 x l4 + l9 x l5 (column 2)
+ VMSLEOG V9, V16, V12, V16 // v16: l10 x l2 + l9 x l3 + l8 x l4 + l7 x l5 (column 4)
+ VMSLEOG V7, V18, V12, V7 // v7: l9 x l1 + l8 x l2 (column 6)
+ VMSLEG V17, V18, V12, V13 // v13: l8 x l1 + l7 x l2 + l6 x l3 (column 7)
+ VMSLG V8, V18, V12, V8 // v8: l7 x l1 + l6 x l2 (column 8)
+ VMSLEG V9, V5, V12, V18 // v18: l10 x l3 + l9 x l4 + l8 x l5 (column 3)
+
+ VSLDB $9, V12, V10, V4
+ VSLDB $9, V12, V7, V5
+ VAQ V4, V14, V14
+ VAQ V5, V13, V13
+
+ VSLDB $9, V12, V14, V4
+ VSLDB $9, V12, V13, V5
+ VAQ V4, V18, V18
+ VAQ V5, V8, V8
+
+ VSLDB $9, V12, V18, V4
+ VSLDB $9, V12, V8, V5
+ VAQ V4, V16, V16
+ VAQ V5, V11, V11
+
+ VSLDB $9, V12, V16, V4
+ VAQ V4, V6, V17
+
+ VGBM $0x007f, V4
+ VGBM $0x00ff, V5
+
+ VN V10, V4, V10
+ VN V14, V4, V14
+ VN V18, V4, V18
+ VN V16, V4, V16
+ VN V17, V4, V9
+ VN V7, V4, V7
+ VN V13, V4, V13
+ VN V8, V4, V8
+ VN V11, V5, V11
+
+ VSLDB $7, V14, V14, V14
+ VSLDB $14, V18, V12, V4
+ VSLDB $14, V12, V18, V18
+ VSLDB $5, V16, V16, V16
+ VSLDB $12, V9, V12, V5
+
+ VO V14, V10, V10
+ VO V18, V16, V16
+ VO V4, V10, V10 // first rightmost 128bits of the multiplication result
+ VO V5, V16, V16 // second rightmost 128bits of the multiplication result
+
+ // adjust v7, v13, v8, v11
+ VSLDB $7, V13, V13, V13
+ VSLDB $14, V8, V12, V4
+ VSLDB $14, V12, V8, V8
+ VSLDB $5, V11, V12, V5
+ VO V13, V7, V7
+ VO V4, V7, V7
+ VO V8, V5, V11
+
+ VSLDB $9, V12, V17, V14
+ VSLDB $12, V12, V9, V9
+ VACCQ V7, V14, V13
+ VAQ V7, V14, V7
+ VAQ V11, V13, V11
+
+ // First reduction, 96 bits
+ VSLDB $4, V16, V10, T0
+ VSLDB $4, V12, V16, T1
+ VSLDB $3, V11, V7, V11 // fourth rightmost 128bits of the multiplication result
+ VSLDB $3, V7, V12, V7
+ OBSERVATION3(V10, V8, T2, V16, V17, V18)// results V8 | T2
+ VO V7, V9, V7 // third rightmost 128bits of the multiplication result
+ VACCQ T0, T2, V9
+ VAQ T0, T2, T2
+ VACQ T1, V8, V9, V8
+
+ // Second reduction 96 bits
+ VSLDB $4, V8, T2, T0
+ VSLDB $4, V12, V8, T1
+ OBSERVATION3(T2, V9, V8, V16, V17, V18)// results V9 | V8
+ VACCQ T0, V8, T2
+ VAQ T0, V8, V8
+ VACQ T1, V9, T2, V9
+
+ // Third reduction 64 bits
+ VSLDB $8, V9, V8, T0
+ VSLDB $8, V12, V9, T1
+ OBSERVATION3A(V8, V14, V13, V17, V18)// results V14 | V13
+ VACCQ T0, V13, V12
+ VAQ T0, V13, V13
+ VACQ T1, V14, V12, V14
+ VACCQ V13, V7, V12
+ VAQ V13, V7, T0
+ VACCCQ V14, V11, V12, T2
+ VACQ V14, V11, V12, T1 // results T2 | T1 | T0
+
+ // ---------------------------------------------------
+ MOVD $p256mul<>+0x00(SB), CPOOL
+
+ VZERO V12
+ VSCBIQ P0, T0, V8
+ VSQ P0, T0, V7
+ VSBCBIQ T1, P1, V8, V10
+ VSBIQ T1, P1, V8, V9
+ VSBIQ T2, V12, V10, T2
+
+ // what output to use, V9||V7 or T1||T0?
+ VSEL T0, V7, T2, T0
+ VSEL T1, V9, T2, T1
+
+ VLM (SCRATCH), V16, V18
+ RET
+
+
+
+#undef CPOOL
+#undef SCRATCH
+#undef X0
+#undef X1
+#undef Y0
+#undef Y1
+#undef T0
+#undef T1
+#undef T2
+#undef P0
+#undef P1
+
+#define SCRATCH R9
+
+TEXT p256MulInternal<>(SB),NOSPLIT,$64-0
+ MOVD $scratch-64(SP), SCRATCH
+ MOVD ·p256MulInternalFacility+0x00(SB),R7
+ CALL (R7)
+ RET
+
+TEXT ·p256MulInternalTrampolineSetup(SB),NOSPLIT|NOFRAME, $0
+ MOVBZ internal∕cpu·S390X+const_offsetS390xHasVE1(SB), R0
+ MOVD $·p256MulInternalFacility+0x00(SB), R7
+ MOVD $·p256MulInternalVX(SB), R8
+ CMPBEQ R0, $0, novmsl // VE1 facility = 1, VMSL supported
+ MOVD $·p256MulInternalVMSL(SB), R8
+novmsl:
+ MOVD R8, 0(R7)
+ BR (R8)
+
+GLOBL ·p256MulInternalFacility+0x00(SB), NOPTR, $8
+DATA ·p256MulInternalFacility+0x00(SB)/8, $·p256MulInternalTrampolineSetup(SB)
+
+// Parameters
+#define X0 V0
+#define X1 V1
+#define Y0 V2
+#define Y1 V3
+
+TEXT ·p256SqrInternalVX(SB), NOFRAME|NOSPLIT, $0
+ VLR X0, Y0
+ VLR X1, Y1
+ BR ·p256MulInternalVX(SB)
+
+#undef X0
+#undef X1
+#undef Y0
+#undef Y1
+
+
+TEXT p256SqrInternal<>(SB),NOSPLIT,$48-0
+ MOVD $scratch-48(SP), SCRATCH
+ MOVD ·p256SqrInternalFacility+0x00(SB),R7
+ CALL (R7)
+ RET
+
+TEXT ·p256SqrInternalTrampolineSetup(SB),NOSPLIT|NOFRAME, $0
+ MOVBZ internal∕cpu·S390X+const_offsetS390xHasVE1(SB), R0
+ MOVD $·p256SqrInternalFacility+0x00(SB), R7
+ MOVD $·p256SqrInternalVX(SB), R8
+ CMPBEQ R0, $0, novmsl // VE1 facility = 1, VMSL supported
+ MOVD $·p256SqrInternalVMSL(SB), R8
+novmsl:
+ MOVD R8, 0(R7)
+ BR (R8)
+
+
+GLOBL ·p256SqrInternalFacility+0x00(SB), NOPTR, $8
+DATA ·p256SqrInternalFacility+0x00(SB)/8, $·p256SqrInternalTrampolineSetup(SB)
+
+#undef SCRATCH
+
+
+#define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \
+ VZERO ZER \
+ VSCBIQ Y0, X0, CAR1 \
+ VSQ Y0, X0, T0 \
+ VSBCBIQ X1, Y1, CAR1, SEL1 \
+ VSBIQ X1, Y1, CAR1, T1 \
+ VSQ SEL1, ZER, SEL1 \
+ \
+ VACCQ T0, PL, CAR1 \
+ VAQ T0, PL, TT0 \
+ VACQ T1, PH, CAR1, TT1 \
+ \
+ VSEL T0, TT0, SEL1, T0 \
+ VSEL T1, TT1, SEL1, T1 \
+
+#define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \
+ VACCQ X0, Y0, CAR1 \
+ VAQ X0, Y0, T0 \
+ VACCCQ X1, Y1, CAR1, T2 \
+ VACQ X1, Y1, CAR1, T1 \
+ \
+ VZERO ZER \
+ VSCBIQ PL, T0, CAR1 \
+ VSQ PL, T0, TT0 \
+ VSBCBIQ T1, PH, CAR1, CAR2 \
+ VSBIQ T1, PH, CAR1, TT1 \
+ VSBIQ T2, ZER, CAR2, SEL1 \
+ \
+ VSEL T0, TT0, SEL1, T0 \
+ VSEL T1, TT1, SEL1, T1
+
+#define p256HalfInternal(T1, T0, X1, X0) \
+ VZERO ZER \
+ VSBIQ ZER, ZER, X0, SEL1 \
+ \
+ VACCQ X0, PL, CAR1 \
+ VAQ X0, PL, T0 \
+ VACCCQ X1, PH, CAR1, T2 \
+ VACQ X1, PH, CAR1, T1 \
+ \
+ VSEL X0, T0, SEL1, T0 \
+ VSEL X1, T1, SEL1, T1 \
+ VSEL ZER, T2, SEL1, T2 \
+ \
+ VSLDB $15, T2, ZER, TT1 \
+ VSLDB $15, T1, ZER, TT0 \
+ VREPIB $1, SEL1 \
+ VSRL SEL1, T0, T0 \
+ VSRL SEL1, T1, T1 \
+ VREPIB $7, SEL1 \
+ VSL SEL1, TT0, TT0 \
+ VSL SEL1, TT1, TT1 \
+ VO T0, TT0, T0 \
+ VO T1, TT1, T1
+
+// ---------------------------------------
+// func p256MulAsm(res, in1, in2 []byte)
+#define res_ptr R1
+#define x_ptr R2
+#define y_ptr R3
+#define CPOOL R4
+
+// Parameters
+#define X0 V0
+#define X1 V1
+#define Y0 V2
+#define Y1 V3
+#define T0 V4
+#define T1 V5
+
+// Constants
+#define P0 V30
+#define P1 V31
+TEXT ·p256MulAsm(SB), NOSPLIT, $0
+ MOVD res+0(FP), res_ptr
+ MOVD in1+24(FP), x_ptr
+ MOVD in2+48(FP), y_ptr
+
+ VL (1*16)(x_ptr), X0
+ VL (0*16)(x_ptr), X1
+ VL (1*16)(y_ptr), Y0
+ VL (0*16)(y_ptr), Y1
+
+ MOVD $p256mul<>+0x00(SB), CPOOL
+ VL 16(CPOOL), P0
+ VL 0(CPOOL), P1
+
+ CALL p256MulInternal<>(SB)
+
+ VST T0, (1*16)(res_ptr)
+ VST T1, (0*16)(res_ptr)
+ RET
+
+#undef res_ptr
+#undef x_ptr
+#undef y_ptr
+#undef CPOOL
+
+#undef X0
+#undef X1
+#undef Y0
+#undef Y1
+#undef T0
+#undef T1
+#undef P0
+#undef P1
+
+// ---------------------------------------
+// func p256SqrAsm(res, in1 []byte)
+#define res_ptr R1
+#define x_ptr R2
+#define y_ptr R3
+#define CPOOL R4
+
+// Parameters
+#define X0 V0
+#define X1 V1
+#define T0 V4
+#define T1 V5
+
+// Constants
+#define P0 V30
+#define P1 V31
+TEXT ·p256SqrAsm(SB), NOSPLIT, $0
+ MOVD res+0(FP), res_ptr
+ MOVD in1+24(FP), x_ptr
+
+ VL (1*16)(x_ptr), X0
+ VL (0*16)(x_ptr), X1
+
+ MOVD $p256mul<>+0x00(SB), CPOOL
+ VL 16(CPOOL), P0
+ VL 0(CPOOL), P1
+
+ CALL p256SqrInternal<>(SB)
+
+ VST T0, (1*16)(res_ptr)
+ VST T1, (0*16)(res_ptr)
+ RET
+
+#undef res_ptr
+#undef x_ptr
+#undef y_ptr
+#undef CPOOL
+
+#undef X0
+#undef X1
+#undef T0
+#undef T1
+#undef P0
+#undef P1
+
+
+// Point add with P2 being affine point
+// If sign == 1 -> P2 = -P2
+// If sel == 0 -> P3 = P1
+// if zero == 0 -> P3 = P2
+// p256PointAddAffineAsm(P3, P1, P2 *p256Point, sign, sel, zero int)
+#define P3ptr R1
+#define P1ptr R2
+#define P2ptr R3
+#define CPOOL R4
+
+// Temporaries in REGs
+#define Y2L V15
+#define Y2H V16
+#define T1L V17
+#define T1H V18
+#define T2L V19
+#define T2H V20
+#define T3L V21
+#define T3H V22
+#define T4L V23
+#define T4H V24
+
+// Temps for Sub and Add
+#define TT0 V11
+#define TT1 V12
+#define T2 V13
+
+// p256MulAsm Parameters
+#define X0 V0
+#define X1 V1
+#define Y0 V2
+#define Y1 V3
+#define T0 V4
+#define T1 V5
+
+#define PL V30
+#define PH V31
+
+// Names for zero/sel selects
+#define X1L V0
+#define X1H V1
+#define Y1L V2 // p256MulAsmParmY
+#define Y1H V3 // p256MulAsmParmY
+#define Z1L V4
+#define Z1H V5
+#define X2L V0
+#define X2H V1
+#define Z2L V4
+#define Z2H V5
+#define X3L V17 // T1L
+#define X3H V18 // T1H
+#define Y3L V21 // T3L
+#define Y3H V22 // T3H
+#define Z3L V28
+#define Z3H V29
+
+#define ZER V6
+#define SEL1 V7
+#define CAR1 V8
+#define CAR2 V9
+/* *
+ * Three operand formula:
+ * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
+ * T1 = Z1²
+ * T2 = T1*Z1
+ * T1 = T1*X2
+ * T2 = T2*Y2
+ * T1 = T1-X1
+ * T2 = T2-Y1
+ * Z3 = Z1*T1
+ * T3 = T1²
+ * T4 = T3*T1
+ * T3 = T3*X1
+ * T1 = 2*T3
+ * X3 = T2²
+ * X3 = X3-T1
+ * X3 = X3-T4
+ * T3 = T3-X3
+ * T3 = T3*T2
+ * T4 = T4*Y1
+ * Y3 = T3-T4
+
+ * Three operand formulas, but with MulInternal X,Y used to store temps
+X=Z1; Y=Z1; MUL;T- // T1 = Z1² T1
+X=T ; Y- ; MUL;T2=T // T2 = T1*Z1 T1 T2
+X- ; Y=X2; MUL;T1=T // T1 = T1*X2 T1 T2
+X=T2; Y=Y2; MUL;T- // T2 = T2*Y2 T1 T2
+SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2
+SUB(Y<T1-X1) // T1 = T1-X1 T1 T2
+X=Z1; Y- ; MUL;Z3:=T// Z3 = Z1*T1 T2
+X=Y; Y- ; MUL;X=T // T3 = T1*T1 T2
+X- ; Y- ; MUL;T4=T // T4 = T3*T1 T2 T4
+X- ; Y=X1; MUL;T3=T // T3 = T3*X1 T2 T3 T4
+ADD(T1<T+T) // T1 = T3+T3 T1 T2 T3 T4
+X=T2; Y=T2; MUL;T- // X3 = T2*T2 T1 T2 T3 T4
+SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4
+SUB(T<T-T4) X3:=T // X3 = X3-T4 T2 T3 T4
+SUB(X<T3-T) // T3 = T3-X3 T2 T3 T4
+X- ; Y- ; MUL;T3=T // T3 = T3*T2 T2 T3 T4
+X=T4; Y=Y1; MUL;T- // T4 = T4*Y1 T3 T4
+SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4
+
+ */
+TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0
+ MOVD P3+0(FP), P3ptr
+ MOVD P1+8(FP), P1ptr
+ MOVD P2+16(FP), P2ptr
+
+ MOVD $p256mul<>+0x00(SB), CPOOL
+ VL 16(CPOOL), PL
+ VL 0(CPOOL), PH
+
+ // if (sign == 1) {
+ // Y2 = fromBig(new(big.Int).Mod(new(big.Int).Sub(p256.P, new(big.Int).SetBytes(Y2)), p256.P)) // Y2 = P-Y2
+ // }
+
+ VL 32(P2ptr), Y2H
+ VL 48(P2ptr), Y2L
+
+ VLREPG sign+24(FP), SEL1
+ VZERO ZER
+ VCEQG SEL1, ZER, SEL1
+
+ VSCBIQ Y2L, PL, CAR1
+ VSQ Y2L, PL, T1L
+ VSBIQ PH, Y2H, CAR1, T1H
+
+ VSEL Y2L, T1L, SEL1, Y2L
+ VSEL Y2H, T1H, SEL1, Y2H
+
+/* *
+ * Three operand formula:
+ * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
+ */
+ // X=Z1; Y=Z1; MUL; T- // T1 = Z1² T1
+ VL 64(P1ptr), X1 // Z1H
+ VL 80(P1ptr), X0 // Z1L
+ VLR X0, Y0
+ VLR X1, Y1
+ CALL p256SqrInternal<>(SB)
+
+ // X=T ; Y- ; MUL; T2=T // T2 = T1*Z1 T1 T2
+ VLR T0, X0
+ VLR T1, X1
+ CALL p256MulInternal<>(SB)
+ VLR T0, T2L
+ VLR T1, T2H
+
+ // X- ; Y=X2; MUL; T1=T // T1 = T1*X2 T1 T2
+ VL 0(P2ptr), Y1 // X2H
+ VL 16(P2ptr), Y0 // X2L
+ CALL p256MulInternal<>(SB)
+ VLR T0, T1L
+ VLR T1, T1H
+
+ // X=T2; Y=Y2; MUL; T- // T2 = T2*Y2 T1 T2
+ VLR T2L, X0
+ VLR T2H, X1
+ VLR Y2L, Y0
+ VLR Y2H, Y1
+ CALL p256MulInternal<>(SB)
+
+ // SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2
+ VL 32(P1ptr), Y1H
+ VL 48(P1ptr), Y1L
+ p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L)
+
+ // SUB(Y<T1-X1) // T1 = T1-X1 T1 T2
+ VL 0(P1ptr), X1H
+ VL 16(P1ptr), X1L
+ p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L)
+
+ // X=Z1; Y- ; MUL; Z3:=T// Z3 = Z1*T1 T2
+ VL 64(P1ptr), X1 // Z1H
+ VL 80(P1ptr), X0 // Z1L
+ CALL p256MulInternal<>(SB)
+
+ // VST T1, 64(P3ptr)
+ // VST T0, 80(P3ptr)
+ VLR T0, Z3L
+ VLR T1, Z3H
+
+ // X=Y; Y- ; MUL; X=T // T3 = T1*T1 T2
+ VLR Y0, X0
+ VLR Y1, X1
+ CALL p256SqrInternal<>(SB)
+ VLR T0, X0
+ VLR T1, X1
+
+ // X- ; Y- ; MUL; T4=T // T4 = T3*T1 T2 T4
+ CALL p256MulInternal<>(SB)
+ VLR T0, T4L
+ VLR T1, T4H
+
+ // X- ; Y=X1; MUL; T3=T // T3 = T3*X1 T2 T3 T4
+ VL 0(P1ptr), Y1 // X1H
+ VL 16(P1ptr), Y0 // X1L
+ CALL p256MulInternal<>(SB)
+ VLR T0, T3L
+ VLR T1, T3H
+
+ // ADD(T1<T+T) // T1 = T3+T3 T1 T2 T3 T4
+ p256AddInternal(T1H,T1L, T1,T0,T1,T0)
+
+ // X=T2; Y=T2; MUL; T- // X3 = T2*T2 T1 T2 T3 T4
+ VLR T2L, X0
+ VLR T2H, X1
+ VLR T2L, Y0
+ VLR T2H, Y1
+ CALL p256SqrInternal<>(SB)
+
+ // SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4 (T1 = X3)
+ p256SubInternal(T1,T0,T1,T0,T1H,T1L)
+
+ // SUB(T<T-T4) X3:=T // X3 = X3-T4 T2 T3 T4
+ p256SubInternal(T1,T0,T1,T0,T4H,T4L)
+ VLR T0, X3L
+ VLR T1, X3H
+
+ // SUB(X<T3-T) // T3 = T3-X3 T2 T3 T4
+ p256SubInternal(X1,X0,T3H,T3L,T1,T0)
+
+ // X- ; Y- ; MUL; T3=T // T3 = T3*T2 T2 T3 T4
+ CALL p256MulInternal<>(SB)
+ VLR T0, T3L
+ VLR T1, T3H
+
+ // X=T4; Y=Y1; MUL; T- // T4 = T4*Y1 T3 T4
+ VLR T4L, X0
+ VLR T4H, X1
+ VL 32(P1ptr), Y1 // Y1H
+ VL 48(P1ptr), Y0 // Y1L
+ CALL p256MulInternal<>(SB)
+
+ // SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4 (T3 = Y3)
+ p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0)
+
+ // if (sel == 0) {
+ // copy(P3.x[:], X1)
+ // copy(P3.y[:], Y1)
+ // copy(P3.z[:], Z1)
+ // }
+
+ VL 0(P1ptr), X1H
+ VL 16(P1ptr), X1L
+
+ // Y1 already loaded, left over from addition
+ VL 64(P1ptr), Z1H
+ VL 80(P1ptr), Z1L
+
+ VLREPG sel+32(FP), SEL1
+ VZERO ZER
+ VCEQG SEL1, ZER, SEL1
+
+ VSEL X1L, X3L, SEL1, X3L
+ VSEL X1H, X3H, SEL1, X3H
+ VSEL Y1L, Y3L, SEL1, Y3L
+ VSEL Y1H, Y3H, SEL1, Y3H
+ VSEL Z1L, Z3L, SEL1, Z3L
+ VSEL Z1H, Z3H, SEL1, Z3H
+
+ // if (zero == 0) {
+ // copy(P3.x[:], X2)
+ // copy(P3.y[:], Y2)
+ // copy(P3.z[:], []byte{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ // 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01}) //(p256.z*2^256)%p
+ // }
+ VL 0(P2ptr), X2H
+ VL 16(P2ptr), X2L
+
+ // Y2 already loaded
+ VL 128(CPOOL), Z2H
+ VL 144(CPOOL), Z2L
+
+ VLREPG zero+40(FP), SEL1
+ VZERO ZER
+ VCEQG SEL1, ZER, SEL1
+
+ VSEL X2L, X3L, SEL1, X3L
+ VSEL X2H, X3H, SEL1, X3H
+ VSEL Y2L, Y3L, SEL1, Y3L
+ VSEL Y2H, Y3H, SEL1, Y3H
+ VSEL Z2L, Z3L, SEL1, Z3L
+ VSEL Z2H, Z3H, SEL1, Z3H
+
+ // All done, store out the result!!!
+ VST X3H, 0(P3ptr)
+ VST X3L, 16(P3ptr)
+ VST Y3H, 32(P3ptr)
+ VST Y3L, 48(P3ptr)
+ VST Z3H, 64(P3ptr)
+ VST Z3L, 80(P3ptr)
+
+ RET
+
+#undef P3ptr
+#undef P1ptr
+#undef P2ptr
+#undef CPOOL
+
+#undef Y2L
+#undef Y2H
+#undef T1L
+#undef T1H
+#undef T2L
+#undef T2H
+#undef T3L
+#undef T3H
+#undef T4L
+#undef T4H
+
+#undef TT0
+#undef TT1
+#undef T2
+
+#undef X0
+#undef X1
+#undef Y0
+#undef Y1
+#undef T0
+#undef T1
+
+#undef PL
+#undef PH
+
+#undef X1L
+#undef X1H
+#undef Y1L
+#undef Y1H
+#undef Z1L
+#undef Z1H
+#undef X2L
+#undef X2H
+#undef Z2L
+#undef Z2H
+#undef X3L
+#undef X3H
+#undef Y3L
+#undef Y3H
+#undef Z3L
+#undef Z3H
+
+#undef ZER
+#undef SEL1
+#undef CAR1
+#undef CAR2
+
+// p256PointDoubleAsm(P3, P1 *p256Point)
+// https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl
+// https://www.hyperelliptic.org/EFD/g1p/auto-shortw.html
+// https://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html
+#define P3ptr R1
+#define P1ptr R2
+#define CPOOL R4
+
+// Temporaries in REGs
+#define X3L V15
+#define X3H V16
+#define Y3L V17
+#define Y3H V18
+#define T1L V19
+#define T1H V20
+#define T2L V21
+#define T2H V22
+#define T3L V23
+#define T3H V24
+
+#define X1L V6
+#define X1H V7
+#define Y1L V8
+#define Y1H V9
+#define Z1L V10
+#define Z1H V11
+
+// Temps for Sub and Add
+#define TT0 V11
+#define TT1 V12
+#define T2 V13
+
+// p256MulAsm Parameters
+#define X0 V0
+#define X1 V1
+#define Y0 V2
+#define Y1 V3
+#define T0 V4
+#define T1 V5
+
+#define PL V30
+#define PH V31
+
+#define Z3L V23
+#define Z3H V24
+
+#define ZER V26
+#define SEL1 V27
+#define CAR1 V28
+#define CAR2 V29
+/*
+ * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv
+ * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3.
+ * Source: 2004 Hankerson–Menezes–Vanstone, page 91.
+ * A = 3(X₁-Z₁²)×(X₁+Z₁²)
+ * B = 2Y₁
+ * Z₃ = B×Z₁
+ * C = B²
+ * D = C×X₁
+ * X₃ = A²-2D
+ * Y₃ = (D-X₃)×A-C²/2
+ *
+ * Three-operand formula:
+ * T1 = Z1²
+ * T2 = X1-T1
+ * T1 = X1+T1
+ * T2 = T2*T1
+ * T2 = 3*T2
+ * Y3 = 2*Y1
+ * Z3 = Y3*Z1
+ * Y3 = Y3²
+ * T3 = Y3*X1
+ * Y3 = Y3²
+ * Y3 = half*Y3
+ * X3 = T2²
+ * T1 = 2*T3
+ * X3 = X3-T1
+ * T1 = T3-X3
+ * T1 = T1*T2
+ * Y3 = T1-Y3
+ */
+
+TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0
+ MOVD P3+0(FP), P3ptr
+ MOVD P1+8(FP), P1ptr
+
+ MOVD $p256mul<>+0x00(SB), CPOOL
+ VL 16(CPOOL), PL
+ VL 0(CPOOL), PH
+
+ // X=Z1; Y=Z1; MUL; T- // T1 = Z1²
+ VL 64(P1ptr), X1 // Z1H
+ VL 80(P1ptr), X0 // Z1L
+ VLR X0, Y0
+ VLR X1, Y1
+ CALL p256SqrInternal<>(SB)
+
+ // SUB(X<X1-T) // T2 = X1-T1
+ VL 0(P1ptr), X1H
+ VL 16(P1ptr), X1L
+ p256SubInternal(X1,X0,X1H,X1L,T1,T0)
+
+ // ADD(Y<X1+T) // T1 = X1+T1
+ p256AddInternal(Y1,Y0,X1H,X1L,T1,T0)
+
+ // X- ; Y- ; MUL; T- // T2 = T2*T1
+ CALL p256MulInternal<>(SB)
+
+ // ADD(T2<T+T); ADD(T2<T2+T) // T2 = 3*T2
+ p256AddInternal(T2H,T2L,T1,T0,T1,T0)
+ p256AddInternal(T2H,T2L,T2H,T2L,T1,T0)
+
+ // ADD(X<Y1+Y1) // Y3 = 2*Y1
+ VL 32(P1ptr), Y1H
+ VL 48(P1ptr), Y1L
+ p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L)
+
+ // X- ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1
+ VL 64(P1ptr), Y1 // Z1H
+ VL 80(P1ptr), Y0 // Z1L
+ CALL p256MulInternal<>(SB)
+ VST T1, 64(P3ptr)
+ VST T0, 80(P3ptr)
+
+ // X- ; Y=X ; MUL; T- // Y3 = Y3²
+ VLR X0, Y0
+ VLR X1, Y1
+ CALL p256SqrInternal<>(SB)
+
+ // X=T ; Y=X1; MUL; T3=T // T3 = Y3*X1
+ VLR T0, X0
+ VLR T1, X1
+ VL 0(P1ptr), Y1
+ VL 16(P1ptr), Y0
+ CALL p256MulInternal<>(SB)
+ VLR T0, T3L
+ VLR T1, T3H
+
+ // X- ; Y=X ; MUL; T- // Y3 = Y3²
+ VLR X0, Y0
+ VLR X1, Y1
+ CALL p256SqrInternal<>(SB)
+
+ // HAL(Y3<T) // Y3 = half*Y3
+ p256HalfInternal(Y3H,Y3L, T1,T0)
+
+ // X=T2; Y=T2; MUL; T- // X3 = T2²
+ VLR T2L, X0
+ VLR T2H, X1
+ VLR T2L, Y0
+ VLR T2H, Y1
+ CALL p256SqrInternal<>(SB)
+
+ // ADD(T1<T3+T3) // T1 = 2*T3
+ p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L)
+
+ // SUB(X3<T-T1) X3:=X3 // X3 = X3-T1
+ p256SubInternal(X3H,X3L,T1,T0,T1H,T1L)
+ VST X3H, 0(P3ptr)
+ VST X3L, 16(P3ptr)
+
+ // SUB(X<T3-X3) // T1 = T3-X3
+ p256SubInternal(X1,X0,T3H,T3L,X3H,X3L)
+
+ // X- ; Y- ; MUL; T- // T1 = T1*T2
+ CALL p256MulInternal<>(SB)
+
+ // SUB(Y3<T-Y3) // Y3 = T1-Y3
+ p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L)
+
+ VST Y3H, 32(P3ptr)
+ VST Y3L, 48(P3ptr)
+ RET
+
+#undef P3ptr
+#undef P1ptr
+#undef CPOOL
+#undef X3L
+#undef X3H
+#undef Y3L
+#undef Y3H
+#undef T1L
+#undef T1H
+#undef T2L
+#undef T2H
+#undef T3L
+#undef T3H
+#undef X1L
+#undef X1H
+#undef Y1L
+#undef Y1H
+#undef Z1L
+#undef Z1H
+#undef TT0
+#undef TT1
+#undef T2
+#undef X0
+#undef X1
+#undef Y0
+#undef Y1
+#undef T0
+#undef T1
+#undef PL
+#undef PH
+#undef Z3L
+#undef Z3H
+#undef ZER
+#undef SEL1
+#undef CAR1
+#undef CAR2
+
+// p256PointAddAsm(P3, P1, P2 *p256Point)
+#define P3ptr R1
+#define P1ptr R2
+#define P2ptr R3
+#define CPOOL R4
+#define ISZERO R5
+#define TRUE R6
+
+// Temporaries in REGs
+#define T1L V16
+#define T1H V17
+#define T2L V18
+#define T2H V19
+#define U1L V20
+#define U1H V21
+#define S1L V22
+#define S1H V23
+#define HL V24
+#define HH V25
+#define RL V26
+#define RH V27
+
+// Temps for Sub and Add
+#define ZER V6
+#define SEL1 V7
+#define CAR1 V8
+#define CAR2 V9
+#define TT0 V11
+#define TT1 V12
+#define T2 V13
+
+// p256MulAsm Parameters
+#define X0 V0
+#define X1 V1
+#define Y0 V2
+#define Y1 V3
+#define T0 V4
+#define T1 V5
+
+#define PL V30
+#define PH V31
+/*
+ * https://delta.cs.cinvestav.mx/~francisco/arith/julio.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields"
+ *
+ * A = X₁×Z₂²
+ * B = Y₁×Z₂³
+ * C = X₂×Z₁²-A
+ * D = Y₂×Z₁³-B
+ * X₃ = D² - 2A×C² - C³
+ * Y₃ = D×(A×C² - X₃) - B×C³
+ * Z₃ = Z₁×Z₂×C
+ *
+ * Three-operand formula (adopted): https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2
+ * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R
+ *
+ * T1 = Z1*Z1
+ * T2 = Z2*Z2
+ * U1 = X1*T2
+ * H = X2*T1
+ * H = H-U1
+ * Z3 = Z1*Z2
+ * Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
+ *
+ * S1 = Z2*T2
+ * S1 = Y1*S1
+ * R = Z1*T1
+ * R = Y2*R
+ * R = R-S1
+ *
+ * T1 = H*H
+ * T2 = H*T1
+ * U1 = U1*T1
+ *
+ * X3 = R*R
+ * X3 = X3-T2
+ * T1 = 2*U1
+ * X3 = X3-T1 << store-out X3 result reg
+ *
+ * T2 = S1*T2
+ * Y3 = U1-X3
+ * Y3 = R*Y3
+ * Y3 = Y3-T2 << store-out Y3 result reg
+
+ // X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1
+ // X- ; Y=T ; MUL; R=T // R = Z1*T1
+ // X=X2; Y- ; MUL; H=T // H = X2*T1
+ // X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2
+ // X- ; Y=T ; MUL; S1=T // S1 = Z2*T2
+ // X=X1; Y- ; MUL; U1=T // U1 = X1*T2
+ // SUB(H<H-T) // H = H-U1
+ // X=Z1; Y=Z2; MUL; T- // Z3 = Z1*Z2
+ // X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array
+ // X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
+ // X=Y2; Y=R ; MUL; T- // R = Y2*R
+ // SUB(R<T-S1) // R = R-S1
+ // X=H ; Y=H ; MUL; T- // T1 = H*H
+ // X- ; Y=T ; MUL; T2=T // T2 = H*T1
+ // X=U1; Y- ; MUL; U1=T // U1 = U1*T1
+ // X=R ; Y=R ; MUL; T- // X3 = R*R
+ // SUB(T<T-T2) // X3 = X3-T2
+ // ADD(X<U1+U1) // T1 = 2*U1
+ // SUB(T<T-X) X3:=T // X3 = X3-T1 << store-out X3 result reg
+ // SUB(Y<U1-T) // Y3 = U1-X3
+ // X=R ; Y- ; MUL; U1=T // Y3 = R*Y3
+ // X=S1; Y=T2; MUL; T- // T2 = S1*T2
+ // SUB(T<U1-T); Y3:=T // Y3 = Y3-T2 << store-out Y3 result reg
+ */
+TEXT ·p256PointAddAsm(SB), NOSPLIT, $0
+ MOVD P3+0(FP), P3ptr
+ MOVD P1+8(FP), P1ptr
+ MOVD P2+16(FP), P2ptr
+
+ MOVD $p256mul<>+0x00(SB), CPOOL
+ VL 16(CPOOL), PL
+ VL 0(CPOOL), PH
+
+ // X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1
+ VL 64(P1ptr), X1 // Z1H
+ VL 80(P1ptr), X0 // Z1L
+ VLR X0, Y0
+ VLR X1, Y1
+ CALL p256SqrInternal<>(SB)
+
+ // X- ; Y=T ; MUL; R=T // R = Z1*T1
+ VLR T0, Y0
+ VLR T1, Y1
+ CALL p256MulInternal<>(SB)
+ VLR T0, RL
+ VLR T1, RH
+
+ // X=X2; Y- ; MUL; H=T // H = X2*T1
+ VL 0(P2ptr), X1 // X2H
+ VL 16(P2ptr), X0 // X2L
+ CALL p256MulInternal<>(SB)
+ VLR T0, HL
+ VLR T1, HH
+
+ // X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2
+ VL 64(P2ptr), X1 // Z2H
+ VL 80(P2ptr), X0 // Z2L
+ VLR X0, Y0
+ VLR X1, Y1
+ CALL p256SqrInternal<>(SB)
+
+ // X- ; Y=T ; MUL; S1=T // S1 = Z2*T2
+ VLR T0, Y0
+ VLR T1, Y1
+ CALL p256MulInternal<>(SB)
+ VLR T0, S1L
+ VLR T1, S1H
+
+ // X=X1; Y- ; MUL; U1=T // U1 = X1*T2
+ VL 0(P1ptr), X1 // X1H
+ VL 16(P1ptr), X0 // X1L
+ CALL p256MulInternal<>(SB)
+ VLR T0, U1L
+ VLR T1, U1H
+
+ // SUB(H<H-T) // H = H-U1
+ p256SubInternal(HH,HL,HH,HL,T1,T0)
+
+ // if H == 0 or H^P == 0 then ret=1 else ret=0
+ // clobbers T1H and T1L
+ MOVD $0, ISZERO
+ MOVD $1, TRUE
+ VZERO ZER
+ VO HL, HH, T1H
+ VCEQGS ZER, T1H, T1H
+ MOVDEQ TRUE, ISZERO
+ VX HL, PL, T1L
+ VX HH, PH, T1H
+ VO T1L, T1H, T1H
+ VCEQGS ZER, T1H, T1H
+ MOVDEQ TRUE, ISZERO
+ MOVD ISZERO, ret+24(FP)
+
+ // X=Z1; Y=Z2; MUL; T- // Z3 = Z1*Z2
+ VL 64(P1ptr), X1 // Z1H
+ VL 80(P1ptr), X0 // Z1L
+ VL 64(P2ptr), Y1 // Z2H
+ VL 80(P2ptr), Y0 // Z2L
+ CALL p256MulInternal<>(SB)
+
+ // X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H
+ VLR T0, X0
+ VLR T1, X1
+ VLR HL, Y0
+ VLR HH, Y1
+ CALL p256MulInternal<>(SB)
+ VST T1, 64(P3ptr)
+ VST T0, 80(P3ptr)
+
+ // X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1
+ VL 32(P1ptr), X1
+ VL 48(P1ptr), X0
+ VLR S1L, Y0
+ VLR S1H, Y1
+ CALL p256MulInternal<>(SB)
+ VLR T0, S1L
+ VLR T1, S1H
+
+ // X=Y2; Y=R ; MUL; T- // R = Y2*R
+ VL 32(P2ptr), X1
+ VL 48(P2ptr), X0
+ VLR RL, Y0
+ VLR RH, Y1
+ CALL p256MulInternal<>(SB)
+
+ // SUB(R<T-S1) // R = T-S1
+ p256SubInternal(RH,RL,T1,T0,S1H,S1L)
+
+ // if R == 0 or R^P == 0 then ret=ret else ret=0
+ // clobbers T1H and T1L
+ MOVD $0, ISZERO
+ MOVD $1, TRUE
+ VZERO ZER
+ VO RL, RH, T1H
+ VCEQGS ZER, T1H, T1H
+ MOVDEQ TRUE, ISZERO
+ VX RL, PL, T1L
+ VX RH, PH, T1H
+ VO T1L, T1H, T1H
+ VCEQGS ZER, T1H, T1H
+ MOVDEQ TRUE, ISZERO
+ AND ret+24(FP), ISZERO
+ MOVD ISZERO, ret+24(FP)
+
+ // X=H ; Y=H ; MUL; T- // T1 = H*H
+ VLR HL, X0
+ VLR HH, X1
+ VLR HL, Y0
+ VLR HH, Y1
+ CALL p256SqrInternal<>(SB)
+
+ // X- ; Y=T ; MUL; T2=T // T2 = H*T1
+ VLR T0, Y0
+ VLR T1, Y1
+ CALL p256MulInternal<>(SB)
+ VLR T0, T2L
+ VLR T1, T2H
+
+ // X=U1; Y- ; MUL; U1=T // U1 = U1*T1
+ VLR U1L, X0
+ VLR U1H, X1
+ CALL p256MulInternal<>(SB)
+ VLR T0, U1L
+ VLR T1, U1H
+
+ // X=R ; Y=R ; MUL; T- // X3 = R*R
+ VLR RL, X0
+ VLR RH, X1
+ VLR RL, Y0
+ VLR RH, Y1
+ CALL p256SqrInternal<>(SB)
+
+ // SUB(T<T-T2) // X3 = X3-T2
+ p256SubInternal(T1,T0,T1,T0,T2H,T2L)
+
+ // ADD(X<U1+U1) // T1 = 2*U1
+ p256AddInternal(X1,X0,U1H,U1L,U1H,U1L)
+
+ // SUB(T<T-X) X3:=T // X3 = X3-T1 << store-out X3 result reg
+ p256SubInternal(T1,T0,T1,T0,X1,X0)
+ VST T1, 0(P3ptr)
+ VST T0, 16(P3ptr)
+
+ // SUB(Y<U1-T) // Y3 = U1-X3
+ p256SubInternal(Y1,Y0,U1H,U1L,T1,T0)
+
+ // X=R ; Y- ; MUL; U1=T // Y3 = R*Y3
+ VLR RL, X0
+ VLR RH, X1
+ CALL p256MulInternal<>(SB)
+ VLR T0, U1L
+ VLR T1, U1H
+
+ // X=S1; Y=T2; MUL; T- // T2 = S1*T2
+ VLR S1L, X0
+ VLR S1H, X1
+ VLR T2L, Y0
+ VLR T2H, Y1
+ CALL p256MulInternal<>(SB)
+
+ // SUB(T<U1-T); Y3:=T // Y3 = Y3-T2 << store-out Y3 result reg
+ p256SubInternal(T1,T0,U1H,U1L,T1,T0)
+ VST T1, 32(P3ptr)
+ VST T0, 48(P3ptr)
+
+ RET