summaryrefslogtreecommitdiffstats
path: root/src/crypto/elliptic/p256_s390x.go
diff options
context:
space:
mode:
Diffstat (limited to 'src/crypto/elliptic/p256_s390x.go')
-rw-r--r--src/crypto/elliptic/p256_s390x.go576
1 files changed, 576 insertions, 0 deletions
diff --git a/src/crypto/elliptic/p256_s390x.go b/src/crypto/elliptic/p256_s390x.go
new file mode 100644
index 0000000..735e9f5
--- /dev/null
+++ b/src/crypto/elliptic/p256_s390x.go
@@ -0,0 +1,576 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build s390x
+
+package elliptic
+
+import (
+ "crypto/subtle"
+ "internal/cpu"
+ "math/big"
+ "unsafe"
+)
+
+const (
+ offsetS390xHasVX = unsafe.Offsetof(cpu.S390X.HasVX)
+ offsetS390xHasVE1 = unsafe.Offsetof(cpu.S390X.HasVXE)
+)
+
+type p256CurveFast struct {
+ *CurveParams
+}
+
+type p256Point struct {
+ x [32]byte
+ y [32]byte
+ z [32]byte
+}
+
+var (
+ p256 Curve
+ p256PreFast *[37][64]p256Point
+)
+
+//go:noescape
+func p256MulInternalTrampolineSetup()
+
+//go:noescape
+func p256SqrInternalTrampolineSetup()
+
+//go:noescape
+func p256MulInternalVX()
+
+//go:noescape
+func p256MulInternalVMSL()
+
+//go:noescape
+func p256SqrInternalVX()
+
+//go:noescape
+func p256SqrInternalVMSL()
+
+func initP256Arch() {
+ if cpu.S390X.HasVX {
+ p256 = p256CurveFast{p256Params}
+ initTable()
+ return
+ }
+
+ // No vector support, use pure Go implementation.
+ p256 = p256Curve{p256Params}
+ return
+}
+
+func (curve p256CurveFast) Params() *CurveParams {
+ return curve.CurveParams
+}
+
+// Functions implemented in p256_asm_s390x.s
+// Montgomery multiplication modulo P256
+//
+//go:noescape
+func p256SqrAsm(res, in1 []byte)
+
+//go:noescape
+func p256MulAsm(res, in1, in2 []byte)
+
+// Montgomery square modulo P256
+func p256Sqr(res, in []byte) {
+ p256SqrAsm(res, in)
+}
+
+// Montgomery multiplication by 1
+//
+//go:noescape
+func p256FromMont(res, in []byte)
+
+// iff cond == 1 val <- -val
+//
+//go:noescape
+func p256NegCond(val *p256Point, cond int)
+
+// if cond == 0 res <- b; else res <- a
+//
+//go:noescape
+func p256MovCond(res, a, b *p256Point, cond int)
+
+// Constant time table access
+//
+//go:noescape
+func p256Select(point *p256Point, table []p256Point, idx int)
+
+//go:noescape
+func p256SelectBase(point *p256Point, table []p256Point, idx int)
+
+// Montgomery multiplication modulo Ord(G)
+//
+//go:noescape
+func p256OrdMul(res, in1, in2 []byte)
+
+// Montgomery square modulo Ord(G), repeated n times
+func p256OrdSqr(res, in []byte, n int) {
+ copy(res, in)
+ for i := 0; i < n; i += 1 {
+ p256OrdMul(res, res, res)
+ }
+}
+
+// Point add with P2 being affine point
+// If sign == 1 -> P2 = -P2
+// If sel == 0 -> P3 = P1
+// if zero == 0 -> P3 = P2
+//
+//go:noescape
+func p256PointAddAffineAsm(P3, P1, P2 *p256Point, sign, sel, zero int)
+
+// Point add
+//
+//go:noescape
+func p256PointAddAsm(P3, P1, P2 *p256Point) int
+
+//go:noescape
+func p256PointDoubleAsm(P3, P1 *p256Point)
+
+func (curve p256CurveFast) Inverse(k *big.Int) *big.Int {
+ if k.Cmp(p256Params.N) >= 0 {
+ // This should never happen.
+ reducedK := new(big.Int).Mod(k, p256Params.N)
+ k = reducedK
+ }
+
+ // table will store precomputed powers of x. The 32 bytes at index
+ // i store x^(i+1).
+ var table [15][32]byte
+
+ x := fromBig(k)
+ // This code operates in the Montgomery domain where R = 2^256 mod n
+ // and n is the order of the scalar field. (See initP256 for the
+ // value.) Elements in the Montgomery domain take the form a×R and
+ // multiplication of x and y in the calculates (x × y × R^-1) mod n. RR
+ // is R×R mod n thus the Montgomery multiplication x and RR gives x×R,
+ // i.e. converts x into the Montgomery domain. Stored in BigEndian form
+ RR := []byte{0x66, 0xe1, 0x2d, 0x94, 0xf3, 0xd9, 0x56, 0x20, 0x28, 0x45, 0xb2, 0x39, 0x2b, 0x6b, 0xec, 0x59,
+ 0x46, 0x99, 0x79, 0x9c, 0x49, 0xbd, 0x6f, 0xa6, 0x83, 0x24, 0x4c, 0x95, 0xbe, 0x79, 0xee, 0xa2}
+
+ p256OrdMul(table[0][:], x, RR)
+
+ // Prepare the table, no need in constant time access, because the
+ // power is not a secret. (Entry 0 is never used.)
+ for i := 2; i < 16; i += 2 {
+ p256OrdSqr(table[i-1][:], table[(i/2)-1][:], 1)
+ p256OrdMul(table[i][:], table[i-1][:], table[0][:])
+ }
+
+ copy(x, table[14][:]) // f
+
+ p256OrdSqr(x[0:32], x[0:32], 4)
+ p256OrdMul(x[0:32], x[0:32], table[14][:]) // ff
+ t := make([]byte, 32)
+ copy(t, x)
+
+ p256OrdSqr(x, x, 8)
+ p256OrdMul(x, x, t) // ffff
+ copy(t, x)
+
+ p256OrdSqr(x, x, 16)
+ p256OrdMul(x, x, t) // ffffffff
+ copy(t, x)
+
+ p256OrdSqr(x, x, 64) // ffffffff0000000000000000
+ p256OrdMul(x, x, t) // ffffffff00000000ffffffff
+ p256OrdSqr(x, x, 32) // ffffffff00000000ffffffff00000000
+ p256OrdMul(x, x, t) // ffffffff00000000ffffffffffffffff
+
+ // Remaining 32 windows
+ expLo := [32]byte{0xb, 0xc, 0xe, 0x6, 0xf, 0xa, 0xa, 0xd, 0xa, 0x7, 0x1, 0x7, 0x9, 0xe, 0x8, 0x4,
+ 0xf, 0x3, 0xb, 0x9, 0xc, 0xa, 0xc, 0x2, 0xf, 0xc, 0x6, 0x3, 0x2, 0x5, 0x4, 0xf}
+ for i := 0; i < 32; i++ {
+ p256OrdSqr(x, x, 4)
+ p256OrdMul(x, x, table[expLo[i]-1][:])
+ }
+
+ // Multiplying by one in the Montgomery domain converts a Montgomery
+ // value out of the domain.
+ one := []byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}
+ p256OrdMul(x, x, one)
+
+ return new(big.Int).SetBytes(x)
+}
+
+// fromBig converts a *big.Int into a format used by this code.
+func fromBig(big *big.Int) []byte {
+ // This could be done a lot more efficiently...
+ res := big.Bytes()
+ if 32 == len(res) {
+ return res
+ }
+ t := make([]byte, 32)
+ offset := 32 - len(res)
+ for i := len(res) - 1; i >= 0; i-- {
+ t[i+offset] = res[i]
+ }
+ return t
+}
+
+// p256GetMultiplier makes sure byte array will have 32 byte elements, If the scalar
+// is equal or greater than the order of the group, it's reduced modulo that order.
+func p256GetMultiplier(in []byte) []byte {
+ n := new(big.Int).SetBytes(in)
+
+ if n.Cmp(p256Params.N) >= 0 {
+ n.Mod(n, p256Params.N)
+ }
+ return fromBig(n)
+}
+
+// p256MulAsm operates in a Montgomery domain with R = 2^256 mod p, where p is the
+// underlying field of the curve. (See initP256 for the value.) Thus rr here is
+// R×R mod p. See comment in Inverse about how this is used.
+var rr = []byte{0x00, 0x00, 0x00, 0x04, 0xff, 0xff, 0xff, 0xfd, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe,
+ 0xff, 0xff, 0xff, 0xfb, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03}
+
+// (This is one, in the Montgomery domain.)
+var one = []byte{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01}
+
+func maybeReduceModP(in *big.Int) *big.Int {
+ if in.Cmp(p256Params.P) < 0 {
+ return in
+ }
+ return new(big.Int).Mod(in, p256Params.P)
+}
+
+func (curve p256CurveFast) CombinedMult(bigX, bigY *big.Int, baseScalar, scalar []byte) (x, y *big.Int) {
+ var r1, r2 p256Point
+ scalarReduced := p256GetMultiplier(baseScalar)
+ r1IsInfinity := scalarIsZero(scalarReduced)
+ r1.p256BaseMult(scalarReduced)
+
+ copy(r2.x[:], fromBig(maybeReduceModP(bigX)))
+ copy(r2.y[:], fromBig(maybeReduceModP(bigY)))
+ copy(r2.z[:], one)
+ p256MulAsm(r2.x[:], r2.x[:], rr[:])
+ p256MulAsm(r2.y[:], r2.y[:], rr[:])
+
+ scalarReduced = p256GetMultiplier(scalar)
+ r2IsInfinity := scalarIsZero(scalarReduced)
+ r2.p256ScalarMult(p256GetMultiplier(scalar))
+
+ var sum, double p256Point
+ pointsEqual := p256PointAddAsm(&sum, &r1, &r2)
+ p256PointDoubleAsm(&double, &r1)
+ p256MovCond(&sum, &double, &sum, pointsEqual)
+ p256MovCond(&sum, &r1, &sum, r2IsInfinity)
+ p256MovCond(&sum, &r2, &sum, r1IsInfinity)
+ return sum.p256PointToAffine()
+}
+
+func (curve p256CurveFast) ScalarBaseMult(scalar []byte) (x, y *big.Int) {
+ var r p256Point
+ r.p256BaseMult(p256GetMultiplier(scalar))
+ return r.p256PointToAffine()
+}
+
+func (curve p256CurveFast) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) {
+ var r p256Point
+ copy(r.x[:], fromBig(maybeReduceModP(bigX)))
+ copy(r.y[:], fromBig(maybeReduceModP(bigY)))
+ copy(r.z[:], one)
+ p256MulAsm(r.x[:], r.x[:], rr[:])
+ p256MulAsm(r.y[:], r.y[:], rr[:])
+ r.p256ScalarMult(p256GetMultiplier(scalar))
+ return r.p256PointToAffine()
+}
+
+// scalarIsZero returns 1 if scalar represents the zero value, and zero
+// otherwise.
+func scalarIsZero(scalar []byte) int {
+ b := byte(0)
+ for _, s := range scalar {
+ b |= s
+ }
+ return subtle.ConstantTimeByteEq(b, 0)
+}
+
+func (p *p256Point) p256PointToAffine() (x, y *big.Int) {
+ zInv := make([]byte, 32)
+ zInvSq := make([]byte, 32)
+
+ p256Inverse(zInv, p.z[:])
+ p256Sqr(zInvSq, zInv)
+ p256MulAsm(zInv, zInv, zInvSq)
+
+ p256MulAsm(zInvSq, p.x[:], zInvSq)
+ p256MulAsm(zInv, p.y[:], zInv)
+
+ p256FromMont(zInvSq, zInvSq)
+ p256FromMont(zInv, zInv)
+
+ return new(big.Int).SetBytes(zInvSq), new(big.Int).SetBytes(zInv)
+}
+
+// p256Inverse sets out to in^-1 mod p.
+func p256Inverse(out, in []byte) {
+ var stack [6 * 32]byte
+ p2 := stack[32*0 : 32*0+32]
+ p4 := stack[32*1 : 32*1+32]
+ p8 := stack[32*2 : 32*2+32]
+ p16 := stack[32*3 : 32*3+32]
+ p32 := stack[32*4 : 32*4+32]
+
+ p256Sqr(out, in)
+ p256MulAsm(p2, out, in) // 3*p
+
+ p256Sqr(out, p2)
+ p256Sqr(out, out)
+ p256MulAsm(p4, out, p2) // f*p
+
+ p256Sqr(out, p4)
+ p256Sqr(out, out)
+ p256Sqr(out, out)
+ p256Sqr(out, out)
+ p256MulAsm(p8, out, p4) // ff*p
+
+ p256Sqr(out, p8)
+
+ for i := 0; i < 7; i++ {
+ p256Sqr(out, out)
+ }
+ p256MulAsm(p16, out, p8) // ffff*p
+
+ p256Sqr(out, p16)
+ for i := 0; i < 15; i++ {
+ p256Sqr(out, out)
+ }
+ p256MulAsm(p32, out, p16) // ffffffff*p
+
+ p256Sqr(out, p32)
+
+ for i := 0; i < 31; i++ {
+ p256Sqr(out, out)
+ }
+ p256MulAsm(out, out, in)
+
+ for i := 0; i < 32*4; i++ {
+ p256Sqr(out, out)
+ }
+ p256MulAsm(out, out, p32)
+
+ for i := 0; i < 32; i++ {
+ p256Sqr(out, out)
+ }
+ p256MulAsm(out, out, p32)
+
+ for i := 0; i < 16; i++ {
+ p256Sqr(out, out)
+ }
+ p256MulAsm(out, out, p16)
+
+ for i := 0; i < 8; i++ {
+ p256Sqr(out, out)
+ }
+ p256MulAsm(out, out, p8)
+
+ p256Sqr(out, out)
+ p256Sqr(out, out)
+ p256Sqr(out, out)
+ p256Sqr(out, out)
+ p256MulAsm(out, out, p4)
+
+ p256Sqr(out, out)
+ p256Sqr(out, out)
+ p256MulAsm(out, out, p2)
+
+ p256Sqr(out, out)
+ p256Sqr(out, out)
+ p256MulAsm(out, out, in)
+}
+
+func boothW5(in uint) (int, int) {
+ var s uint = ^((in >> 5) - 1)
+ var d uint = (1 << 6) - in - 1
+ d = (d & s) | (in & (^s))
+ d = (d >> 1) + (d & 1)
+ return int(d), int(s & 1)
+}
+
+func boothW7(in uint) (int, int) {
+ var s uint = ^((in >> 7) - 1)
+ var d uint = (1 << 8) - in - 1
+ d = (d & s) | (in & (^s))
+ d = (d >> 1) + (d & 1)
+ return int(d), int(s & 1)
+}
+
+func initTable() {
+ p256PreFast = new([37][64]p256Point) //z coordinate not used
+ basePoint := p256Point{
+ x: [32]byte{0x18, 0x90, 0x5f, 0x76, 0xa5, 0x37, 0x55, 0xc6, 0x79, 0xfb, 0x73, 0x2b, 0x77, 0x62, 0x25, 0x10,
+ 0x75, 0xba, 0x95, 0xfc, 0x5f, 0xed, 0xb6, 0x01, 0x79, 0xe7, 0x30, 0xd4, 0x18, 0xa9, 0x14, 0x3c}, //(p256.x*2^256)%p
+ y: [32]byte{0x85, 0x71, 0xff, 0x18, 0x25, 0x88, 0x5d, 0x85, 0xd2, 0xe8, 0x86, 0x88, 0xdd, 0x21, 0xf3, 0x25,
+ 0x8b, 0x4a, 0xb8, 0xe4, 0xba, 0x19, 0xe4, 0x5c, 0xdd, 0xf2, 0x53, 0x57, 0xce, 0x95, 0x56, 0x0a}, //(p256.y*2^256)%p
+ z: [32]byte{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01}, //(p256.z*2^256)%p
+ }
+
+ t1 := new(p256Point)
+ t2 := new(p256Point)
+ *t2 = basePoint
+
+ zInv := make([]byte, 32)
+ zInvSq := make([]byte, 32)
+ for j := 0; j < 64; j++ {
+ *t1 = *t2
+ for i := 0; i < 37; i++ {
+ // The window size is 7 so we need to double 7 times.
+ if i != 0 {
+ for k := 0; k < 7; k++ {
+ p256PointDoubleAsm(t1, t1)
+ }
+ }
+ // Convert the point to affine form. (Its values are
+ // still in Montgomery form however.)
+ p256Inverse(zInv, t1.z[:])
+ p256Sqr(zInvSq, zInv)
+ p256MulAsm(zInv, zInv, zInvSq)
+
+ p256MulAsm(t1.x[:], t1.x[:], zInvSq)
+ p256MulAsm(t1.y[:], t1.y[:], zInv)
+
+ copy(t1.z[:], basePoint.z[:])
+ // Update the table entry
+ copy(p256PreFast[i][j].x[:], t1.x[:])
+ copy(p256PreFast[i][j].y[:], t1.y[:])
+ }
+ if j == 0 {
+ p256PointDoubleAsm(t2, &basePoint)
+ } else {
+ p256PointAddAsm(t2, t2, &basePoint)
+ }
+ }
+}
+
+func (p *p256Point) p256BaseMult(scalar []byte) {
+ wvalue := (uint(scalar[31]) << 1) & 0xff
+ sel, sign := boothW7(uint(wvalue))
+ p256SelectBase(p, p256PreFast[0][:], sel)
+ p256NegCond(p, sign)
+
+ copy(p.z[:], one[:])
+ var t0 p256Point
+
+ copy(t0.z[:], one[:])
+
+ index := uint(6)
+ zero := sel
+
+ for i := 1; i < 37; i++ {
+ if index < 247 {
+ wvalue = ((uint(scalar[31-index/8]) >> (index % 8)) + (uint(scalar[31-index/8-1]) << (8 - (index % 8)))) & 0xff
+ } else {
+ wvalue = (uint(scalar[31-index/8]) >> (index % 8)) & 0xff
+ }
+ index += 7
+ sel, sign = boothW7(uint(wvalue))
+ p256SelectBase(&t0, p256PreFast[i][:], sel)
+ p256PointAddAffineAsm(p, p, &t0, sign, sel, zero)
+ zero |= sel
+ }
+}
+
+func (p *p256Point) p256ScalarMult(scalar []byte) {
+ // precomp is a table of precomputed points that stores powers of p
+ // from p^1 to p^16.
+ var precomp [16]p256Point
+ var t0, t1, t2, t3 p256Point
+
+ // Prepare the table
+ *&precomp[0] = *p
+
+ p256PointDoubleAsm(&t0, p)
+ p256PointDoubleAsm(&t1, &t0)
+ p256PointDoubleAsm(&t2, &t1)
+ p256PointDoubleAsm(&t3, &t2)
+ *&precomp[1] = t0 // 2
+ *&precomp[3] = t1 // 4
+ *&precomp[7] = t2 // 8
+ *&precomp[15] = t3 // 16
+
+ p256PointAddAsm(&t0, &t0, p)
+ p256PointAddAsm(&t1, &t1, p)
+ p256PointAddAsm(&t2, &t2, p)
+ *&precomp[2] = t0 // 3
+ *&precomp[4] = t1 // 5
+ *&precomp[8] = t2 // 9
+
+ p256PointDoubleAsm(&t0, &t0)
+ p256PointDoubleAsm(&t1, &t1)
+ *&precomp[5] = t0 // 6
+ *&precomp[9] = t1 // 10
+
+ p256PointAddAsm(&t2, &t0, p)
+ p256PointAddAsm(&t1, &t1, p)
+ *&precomp[6] = t2 // 7
+ *&precomp[10] = t1 // 11
+
+ p256PointDoubleAsm(&t0, &t0)
+ p256PointDoubleAsm(&t2, &t2)
+ *&precomp[11] = t0 // 12
+ *&precomp[13] = t2 // 14
+
+ p256PointAddAsm(&t0, &t0, p)
+ p256PointAddAsm(&t2, &t2, p)
+ *&precomp[12] = t0 // 13
+ *&precomp[14] = t2 // 15
+
+ // Start scanning the window from top bit
+ index := uint(254)
+ var sel, sign int
+
+ wvalue := (uint(scalar[31-index/8]) >> (index % 8)) & 0x3f
+ sel, _ = boothW5(uint(wvalue))
+ p256Select(p, precomp[:], sel)
+ zero := sel
+
+ for index > 4 {
+ index -= 5
+ p256PointDoubleAsm(p, p)
+ p256PointDoubleAsm(p, p)
+ p256PointDoubleAsm(p, p)
+ p256PointDoubleAsm(p, p)
+ p256PointDoubleAsm(p, p)
+
+ if index < 247 {
+ wvalue = ((uint(scalar[31-index/8]) >> (index % 8)) + (uint(scalar[31-index/8-1]) << (8 - (index % 8)))) & 0x3f
+ } else {
+ wvalue = (uint(scalar[31-index/8]) >> (index % 8)) & 0x3f
+ }
+
+ sel, sign = boothW5(uint(wvalue))
+
+ p256Select(&t0, precomp[:], sel)
+ p256NegCond(&t0, sign)
+ p256PointAddAsm(&t1, p, &t0)
+ p256MovCond(&t1, &t1, p, sel)
+ p256MovCond(p, &t1, &t0, zero)
+ zero |= sel
+ }
+
+ p256PointDoubleAsm(p, p)
+ p256PointDoubleAsm(p, p)
+ p256PointDoubleAsm(p, p)
+ p256PointDoubleAsm(p, p)
+ p256PointDoubleAsm(p, p)
+
+ wvalue = (uint(scalar[31]) << 1) & 0x3f
+ sel, sign = boothW5(uint(wvalue))
+
+ p256Select(&t0, precomp[:], sel)
+ p256NegCond(&t0, sign)
+ p256PointAddAsm(&t1, p, &t0)
+ p256MovCond(&t1, &t1, p, sel)
+ p256MovCond(p, &t1, &t0, zero)
+}