diff options
Diffstat (limited to 'src/crypto/sha1')
-rw-r--r-- | src/crypto/sha1/boring.go | 25 | ||||
-rw-r--r-- | src/crypto/sha1/example_test.go | 42 | ||||
-rw-r--r-- | src/crypto/sha1/fallback_test.go | 34 | ||||
-rw-r--r-- | src/crypto/sha1/issue15617_test.go | 27 | ||||
-rw-r--r-- | src/crypto/sha1/notboring.go | 20 | ||||
-rw-r--r-- | src/crypto/sha1/sha1.go | 264 | ||||
-rw-r--r-- | src/crypto/sha1/sha1_test.go | 274 | ||||
-rw-r--r-- | src/crypto/sha1/sha1block.go | 83 | ||||
-rw-r--r-- | src/crypto/sha1/sha1block_386.s | 233 | ||||
-rw-r--r-- | src/crypto/sha1/sha1block_amd64.go | 34 | ||||
-rw-r--r-- | src/crypto/sha1/sha1block_amd64.s | 1500 | ||||
-rw-r--r-- | src/crypto/sha1/sha1block_arm.s | 217 | ||||
-rw-r--r-- | src/crypto/sha1/sha1block_arm64.go | 26 | ||||
-rw-r--r-- | src/crypto/sha1/sha1block_arm64.s | 152 | ||||
-rw-r--r-- | src/crypto/sha1/sha1block_decl.go | 11 | ||||
-rw-r--r-- | src/crypto/sha1/sha1block_generic.go | 11 | ||||
-rw-r--r-- | src/crypto/sha1/sha1block_s390x.go | 9 | ||||
-rw-r--r-- | src/crypto/sha1/sha1block_s390x.s | 20 |
18 files changed, 2982 insertions, 0 deletions
diff --git a/src/crypto/sha1/boring.go b/src/crypto/sha1/boring.go new file mode 100644 index 0000000..b5786d1 --- /dev/null +++ b/src/crypto/sha1/boring.go @@ -0,0 +1,25 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Extra indirection here so that when building go_bootstrap +// cmd/internal/boring is not even imported, so that we don't +// have to maintain changes to cmd/dist's deps graph. + +//go:build !cmd_go_bootstrap && cgo +// +build !cmd_go_bootstrap,cgo + +package sha1 + +import ( + "crypto/internal/boring" + "hash" +) + +const boringEnabled = boring.Enabled + +func boringNewSHA1() hash.Hash { return boring.NewSHA1() } + +func boringUnreachable() { boring.Unreachable() } + +func boringSHA1(p []byte) [20]byte { return boring.SHA1(p) } diff --git a/src/crypto/sha1/example_test.go b/src/crypto/sha1/example_test.go new file mode 100644 index 0000000..499055c --- /dev/null +++ b/src/crypto/sha1/example_test.go @@ -0,0 +1,42 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +package sha1_test + +import ( + "crypto/sha1" + "fmt" + "io" + "log" + "os" +) + +func ExampleNew() { + h := sha1.New() + io.WriteString(h, "His money is twice tainted:") + io.WriteString(h, " 'taint yours and 'taint mine.") + fmt.Printf("% x", h.Sum(nil)) + // Output: 59 7f 6a 54 00 10 f9 4c 15 d7 18 06 a9 9a 2c 87 10 e7 47 bd +} + +func ExampleSum() { + data := []byte("This page intentionally left blank.") + fmt.Printf("% x", sha1.Sum(data)) + // Output: af 06 49 23 bb f2 30 15 96 aa c4 c2 73 ba 32 17 8e bc 4a 96 +} + +func ExampleNew_file() { + f, err := os.Open("file.txt") + if err != nil { + log.Fatal(err) + } + defer f.Close() + + h := sha1.New() + if _, err := io.Copy(h, f); err != nil { + log.Fatal(err) + } + + fmt.Printf("% x", h.Sum(nil)) +} diff --git a/src/crypto/sha1/fallback_test.go b/src/crypto/sha1/fallback_test.go new file mode 100644 index 0000000..45d1f57 --- /dev/null +++ b/src/crypto/sha1/fallback_test.go @@ -0,0 +1,34 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build s390x + +package sha1 + +import ( + "fmt" + "io" + "testing" +) + +// Tests the fallback code path in case the optimized asm +// implementation cannot be used. +// See also TestBlockGeneric. 
+func TestGenericPath(t *testing.T) { + if useAsm == false { + t.Skipf("assembly implementation unavailable") + } + useAsm = false + defer func() { useAsm = true }() + c := New() + in := "ΑΒΓΔΕϜΖΗΘΙΚΛΜΝΞΟΠϺϘΡΣΤΥΦΧΨΩ" + gold := "0f58c2bb130f8182375f325c18342215255387e5" + if _, err := io.WriteString(c, in); err != nil { + t.Fatalf("could not write to c: %v", err) + } + out := fmt.Sprintf("%x", c.Sum(nil)) + if out != gold { + t.Fatalf("mismatch: got %s, wanted %s", out, gold) + } +} diff --git a/src/crypto/sha1/issue15617_test.go b/src/crypto/sha1/issue15617_test.go new file mode 100644 index 0000000..116c78f --- /dev/null +++ b/src/crypto/sha1/issue15617_test.go @@ -0,0 +1,27 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build amd64 && (linux || darwin) + +package sha1_test + +import ( + "crypto/sha1" + "syscall" + "testing" +) + +func TestOutOfBoundsRead(t *testing.T) { + const pageSize = 4 << 10 + data, err := syscall.Mmap(0, 0, 2*pageSize, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_ANON|syscall.MAP_PRIVATE) + if err != nil { + panic(err) + } + if err := syscall.Mprotect(data[pageSize:], syscall.PROT_NONE); err != nil { + panic(err) + } + for i := 0; i < pageSize; i++ { + sha1.Sum(data[pageSize-i : pageSize]) + } +} diff --git a/src/crypto/sha1/notboring.go b/src/crypto/sha1/notboring.go new file mode 100644 index 0000000..42ef879 --- /dev/null +++ b/src/crypto/sha1/notboring.go @@ -0,0 +1,20 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +//go:build cmd_go_bootstrap || !cgo +// +build cmd_go_bootstrap !cgo + +package sha1 + +import ( + "hash" +) + +const boringEnabled = false + +func boringNewSHA1() hash.Hash { panic("boringcrypto: not available") } + +func boringUnreachable() {} + +func boringSHA1([]byte) [20]byte { panic("boringcrypto: not available") } diff --git a/src/crypto/sha1/sha1.go b/src/crypto/sha1/sha1.go new file mode 100644 index 0000000..43ab72a --- /dev/null +++ b/src/crypto/sha1/sha1.go @@ -0,0 +1,264 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package sha1 implements the SHA-1 hash algorithm as defined in RFC 3174. +// +// SHA-1 is cryptographically broken and should not be used for secure +// applications. +package sha1 + +import ( + "crypto" + "encoding/binary" + "errors" + "hash" +) + +func init() { + crypto.RegisterHash(crypto.SHA1, New) +} + +// The size of a SHA-1 checksum in bytes. +const Size = 20 + +// The blocksize of SHA-1 in bytes. +const BlockSize = 64 + +const ( + chunk = 64 + init0 = 0x67452301 + init1 = 0xEFCDAB89 + init2 = 0x98BADCFE + init3 = 0x10325476 + init4 = 0xC3D2E1F0 +) + +// digest represents the partial evaluation of a checksum. +type digest struct { + h [5]uint32 + x [chunk]byte + nx int + len uint64 +} + +const ( + magic = "sha\x01" + marshaledSize = len(magic) + 5*4 + chunk + 8 +) + +func (d *digest) MarshalBinary() ([]byte, error) { + b := make([]byte, 0, marshaledSize) + b = append(b, magic...) + b = binary.BigEndian.AppendUint32(b, d.h[0]) + b = binary.BigEndian.AppendUint32(b, d.h[1]) + b = binary.BigEndian.AppendUint32(b, d.h[2]) + b = binary.BigEndian.AppendUint32(b, d.h[3]) + b = binary.BigEndian.AppendUint32(b, d.h[4]) + b = append(b, d.x[:d.nx]...) 
+ b = b[:len(b)+len(d.x)-d.nx] // already zero + b = binary.BigEndian.AppendUint64(b, d.len) + return b, nil +} + +func (d *digest) UnmarshalBinary(b []byte) error { + if len(b) < len(magic) || string(b[:len(magic)]) != magic { + return errors.New("crypto/sha1: invalid hash state identifier") + } + if len(b) != marshaledSize { + return errors.New("crypto/sha1: invalid hash state size") + } + b = b[len(magic):] + b, d.h[0] = consumeUint32(b) + b, d.h[1] = consumeUint32(b) + b, d.h[2] = consumeUint32(b) + b, d.h[3] = consumeUint32(b) + b, d.h[4] = consumeUint32(b) + b = b[copy(d.x[:], b):] + b, d.len = consumeUint64(b) + d.nx = int(d.len % chunk) + return nil +} + +func consumeUint64(b []byte) ([]byte, uint64) { + _ = b[7] + x := uint64(b[7]) | uint64(b[6])<<8 | uint64(b[5])<<16 | uint64(b[4])<<24 | + uint64(b[3])<<32 | uint64(b[2])<<40 | uint64(b[1])<<48 | uint64(b[0])<<56 + return b[8:], x +} + +func consumeUint32(b []byte) ([]byte, uint32) { + _ = b[3] + x := uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24 + return b[4:], x +} + +func (d *digest) Reset() { + d.h[0] = init0 + d.h[1] = init1 + d.h[2] = init2 + d.h[3] = init3 + d.h[4] = init4 + d.nx = 0 + d.len = 0 +} + +// New returns a new hash.Hash computing the SHA1 checksum. The Hash also +// implements encoding.BinaryMarshaler and encoding.BinaryUnmarshaler to +// marshal and unmarshal the internal state of the hash. 
+func New() hash.Hash { + if boringEnabled { + return boringNewSHA1() + } + d := new(digest) + d.Reset() + return d +} + +func (d *digest) Size() int { return Size } + +func (d *digest) BlockSize() int { return BlockSize } + +func (d *digest) Write(p []byte) (nn int, err error) { + boringUnreachable() + nn = len(p) + d.len += uint64(nn) + if d.nx > 0 { + n := copy(d.x[d.nx:], p) + d.nx += n + if d.nx == chunk { + block(d, d.x[:]) + d.nx = 0 + } + p = p[n:] + } + if len(p) >= chunk { + n := len(p) &^ (chunk - 1) + block(d, p[:n]) + p = p[n:] + } + if len(p) > 0 { + d.nx = copy(d.x[:], p) + } + return +} + +func (d *digest) Sum(in []byte) []byte { + boringUnreachable() + // Make a copy of d so that caller can keep writing and summing. + d0 := *d + hash := d0.checkSum() + return append(in, hash[:]...) +} + +func (d *digest) checkSum() [Size]byte { + len := d.len + // Padding. Add a 1 bit and 0 bits until 56 bytes mod 64. + var tmp [64 + 8]byte // padding + length buffer + tmp[0] = 0x80 + var t uint64 + if len%64 < 56 { + t = 56 - len%64 + } else { + t = 64 + 56 - len%64 + } + + // Length in bits. + len <<= 3 + padlen := tmp[:t+8] + binary.BigEndian.PutUint64(padlen[t:], len) + d.Write(padlen) + + if d.nx != 0 { + panic("d.nx != 0") + } + + var digest [Size]byte + + binary.BigEndian.PutUint32(digest[0:], d.h[0]) + binary.BigEndian.PutUint32(digest[4:], d.h[1]) + binary.BigEndian.PutUint32(digest[8:], d.h[2]) + binary.BigEndian.PutUint32(digest[12:], d.h[3]) + binary.BigEndian.PutUint32(digest[16:], d.h[4]) + + return digest +} + +// ConstantTimeSum computes the same result of Sum() but in constant time +func (d *digest) ConstantTimeSum(in []byte) []byte { + d0 := *d + hash := d0.constSum() + return append(in, hash[:]...) 
+} + +func (d *digest) constSum() [Size]byte { + var length [8]byte + l := d.len << 3 + for i := uint(0); i < 8; i++ { + length[i] = byte(l >> (56 - 8*i)) + } + + nx := byte(d.nx) + t := nx - 56 // if nx < 56 then the MSB of t is one + mask1b := byte(int8(t) >> 7) // mask1b is 0xFF iff one block is enough + + separator := byte(0x80) // gets reset to 0x00 once used + for i := byte(0); i < chunk; i++ { + mask := byte(int8(i-nx) >> 7) // 0x00 after the end of data + + // if we reached the end of the data, replace with 0x80 or 0x00 + d.x[i] = (^mask & separator) | (mask & d.x[i]) + + // zero the separator once used + separator &= mask + + if i >= 56 { + // we might have to write the length here if all fit in one block + d.x[i] |= mask1b & length[i-56] + } + } + + // compress, and only keep the digest if all fit in one block + block(d, d.x[:]) + + var digest [Size]byte + for i, s := range d.h { + digest[i*4] = mask1b & byte(s>>24) + digest[i*4+1] = mask1b & byte(s>>16) + digest[i*4+2] = mask1b & byte(s>>8) + digest[i*4+3] = mask1b & byte(s) + } + + for i := byte(0); i < chunk; i++ { + // second block, it's always past the end of data, might start with 0x80 + if i < 56 { + d.x[i] = separator + separator = 0 + } else { + d.x[i] = length[i-56] + } + } + + // compress, and only keep the digest if we actually needed the second block + block(d, d.x[:]) + + for i, s := range d.h { + digest[i*4] |= ^mask1b & byte(s>>24) + digest[i*4+1] |= ^mask1b & byte(s>>16) + digest[i*4+2] |= ^mask1b & byte(s>>8) + digest[i*4+3] |= ^mask1b & byte(s) + } + + return digest +} + +// Sum returns the SHA-1 checksum of the data. +func Sum(data []byte) [Size]byte { + if boringEnabled { + return boringSHA1(data) + } + var d digest + d.Reset() + d.Write(data) + return d.checkSum() +} diff --git a/src/crypto/sha1/sha1_test.go b/src/crypto/sha1/sha1_test.go new file mode 100644 index 0000000..85ed126 --- /dev/null +++ b/src/crypto/sha1/sha1_test.go @@ -0,0 +1,274 @@ +// Copyright 2009 The Go Authors. 
All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// SHA-1 hash algorithm. See RFC 3174. + +package sha1 + +import ( + "bytes" + "crypto/internal/boring" + "crypto/rand" + "encoding" + "fmt" + "hash" + "io" + "testing" +) + +type sha1Test struct { + out string + in string + halfState string // marshaled hash state after first half of in written, used by TestGoldenMarshal +} + +var golden = []sha1Test{ + {"76245dbf96f661bd221046197ab8b9f063f11bad", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n", "sha\x01\v\xa0)I\xdeq(8h\x9ev\xe5\x88[\xf8\x81\x17\xba4Daaaaaaaaaaaaaaaaaaaaaa\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x96"}, + {"da39a3ee5e6b4b0d3255bfef95601890afd80709", "", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"}, + {"86f7e437faa5a7fce15d1ddcb9eaeaea377667b8", "a", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"}, + {"da23614e02469a0d7c7bd1bdab5c9c474b1904dc", "ab", 
"sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"}, + {"a9993e364706816aba3e25717850c26c9cd0d89d", "abc", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"}, + {"81fe8bfe87576c3ecb22426f8e57847382917acf", "abcd", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0ab\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"}, + {"03de6c570bfe24bfc328ccd7ca46b76eadaf4334", "abcde", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0ab\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"}, + {"1f8ac10f23c5b5bc1167bda84b833e5c057a77d2", "abcdef", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0abc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03"}, + 
{"2fb5e13419fc89246865e7a324f476ec624e8740", "abcdefg", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0abc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03"}, + {"425af12a0743502b322e93a015bcf868e324d56a", "abcdefgh", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0abcd\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04"}, + {"c63b19f1e4c8b5f76b25c49b8b87f57d8e4872a1", "abcdefghi", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0abcd\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04"}, + {"d68c19a0a345b7eab78d5e11e991c026ec60db63", "abcdefghij", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0abcde\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05"}, + {"ebf81ddcbe5bf13aaabdc4d65354fdf2044f38a7", "Discard medicine more than two years old.", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0Discard medicine 
mor\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x14"}, + {"e5dea09392dd886ca63531aaa00571dc07554bb6", "He who has a shady past knows that nice guys finish last.", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0He who has a shady past know\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1c"}, + {"45988f7234467b94e3e9494434c96ee3609d8f8f", "I wouldn't marry him with a ten foot pole.", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0I wouldn't marry him \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x15"}, + {"55dee037eb7460d5a692d1ce11330b260e40c988", "Free! Free!/A trip/to Mars/for 900/empty jars/Burma Shave", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0Free! Free!/A trip/to Mars/f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1c"}, + {"b7bc5fb91080c7de6b582ea281f8a396d7c0aee8", "The days of the digital watch are numbered. 
-Tom Stoppard", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0The days of the digital watch\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1d"}, + {"c3aed9358f7c77f523afe86135f06b95b3999797", "Nepal premier won't resign.", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0Nepal premier\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\r"}, + {"6e29d302bf6e3a5e4305ff318d983197d6906bb9", "For every action there is an equal and opposite government program.", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0For every action there is an equa\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00!"}, + {"597f6a540010f94c15d71806a99a2c8710e747bd", "His money is twice tainted: 'taint yours and 'taint mine.", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0His money is twice tainted: \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1c"}, + {"6859733b2590a8a091cecf50086febc5ceef1e80", "There is no reason for any individual to have a computer in their home. -Ken Olsen, 1977", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0There is no reason for any individual to hav\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00,"}, + {"514b2630ec089b8aee18795fc0cf1f4860cdacad", "It's a tiny change to the code and not completely disgusting. 
- Bob Manchek", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0It's a tiny change to the code and no\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00%"}, + {"c5ca0d4a7b6676fc7aa72caa41cc3d5df567ed69", "size: a.out: bad magic", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0size: a.out\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\f"}, + {"74c51fa9a04eadc8c1bbeaa7fc442f834b90a00a", "The major problem is with sendmail. -Mark Horton", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0The major problem is wit\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x18"}, + {"0b4c4ce5f52c3ad2821852a8dc00217fa18b8b66", "Give me a rock, paper and scissors and I will move the world. 
CCFestoon", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0Give me a rock, paper and scissors a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00$"}, + {"3ae7937dd790315beb0f48330e8642237c61550a", "If the enemy is within range, then so are you.", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0If the enemy is within \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x17"}, + {"410a2b296df92b9a47412b13281df8f830a9f44b", "It's well we cannot hear the screams/That we create in others' dreams.", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0It's well we cannot hear the scream\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00#"}, + {"841e7c85ca1adcddbdd0187f1289acb5c642f7f5", "You remind me of a TV show, but that's all right: I watch it anyway.", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0You remind me of a TV show, but th\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\""}, + {"163173b825d03b952601376b25212df66763e1db", "C is as portable as Stonehedge!!", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0C is as portable\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10"}, + {"32b0377f2687eb88e22106f133c586ab314d5279", "Even if I could be Shakespeare, I think I should still choose to be Faraday. - A. 
Huxley", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0Even if I could be Shakespeare, I think I sh\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00,"}, + {"0885aaf99b569542fd165fa44e322718f4a984e0", "The fugacity of a constituent in a mixture of gases at a given temperature is proportional to its mole fraction. Lewis-Randall Rule", "sha\x01x}\xf4\r\xeb\xf2\x10\x87\xe8[\xb2JA$D\xb7\u063ax8em\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00B"}, + {"6627d6904d71420b0bf3886ab629623538689f45", "How can you write a big system without C++? -Paul Glick", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0How can you write a big syst\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1c"}, +} + +func TestGolden(t *testing.T) { + for i := 0; i < len(golden); i++ { + g := golden[i] + s := fmt.Sprintf("%x", Sum([]byte(g.in))) + if s != g.out { + t.Fatalf("Sum function: sha1(%s) = %s want %s", g.in, s, g.out) + } + c := New() + for j := 0; j < 4; j++ { + var sum []byte + switch j { + case 0, 1: + io.WriteString(c, g.in) + sum = c.Sum(nil) + case 2: + io.WriteString(c, g.in[0:len(g.in)/2]) + c.Sum(nil) + io.WriteString(c, g.in[len(g.in)/2:]) + sum = c.Sum(nil) + case 3: + if boring.Enabled { + continue + } + io.WriteString(c, g.in[0:len(g.in)/2]) + c.(*digest).ConstantTimeSum(nil) + io.WriteString(c, g.in[len(g.in)/2:]) + sum = c.(*digest).ConstantTimeSum(nil) + } + s := fmt.Sprintf("%x", sum) + if s != g.out { + t.Fatalf("sha1[%d](%s) = %s want %s", j, g.in, s, g.out) + } + c.Reset() + } + } +} + +func TestGoldenMarshal(t *testing.T) { + 
h := New() + h2 := New() + for _, g := range golden { + h.Reset() + h2.Reset() + + io.WriteString(h, g.in[:len(g.in)/2]) + + state, err := h.(encoding.BinaryMarshaler).MarshalBinary() + if err != nil { + t.Errorf("could not marshal: %v", err) + continue + } + + if string(state) != g.halfState { + t.Errorf("sha1(%q) state = %+q, want %+q", g.in, state, g.halfState) + continue + } + + if err := h2.(encoding.BinaryUnmarshaler).UnmarshalBinary(state); err != nil { + t.Errorf("could not unmarshal: %v", err) + continue + } + + io.WriteString(h, g.in[len(g.in)/2:]) + io.WriteString(h2, g.in[len(g.in)/2:]) + + if actual, actual2 := h.Sum(nil), h2.Sum(nil); !bytes.Equal(actual, actual2) { + t.Errorf("sha1(%q) = 0x%x != marshaled 0x%x", g.in, actual, actual2) + } + } +} + +func TestSize(t *testing.T) { + c := New() + if got := c.Size(); got != Size { + t.Errorf("Size = %d; want %d", got, Size) + } +} + +func TestBlockSize(t *testing.T) { + c := New() + if got := c.BlockSize(); got != BlockSize { + t.Errorf("BlockSize = %d; want %d", got, BlockSize) + } +} + +// Tests that blockGeneric (pure Go) and block (in assembly for some architectures) match. +func TestBlockGeneric(t *testing.T) { + if boring.Enabled { + t.Skip("BoringCrypto doesn't expose digest") + } + for i := 1; i < 30; i++ { // arbitrary factor + gen, asm := New().(*digest), New().(*digest) + buf := make([]byte, BlockSize*i) + rand.Read(buf) + blockGeneric(gen, buf) + block(asm, buf) + if *gen != *asm { + t.Errorf("For %#v block and blockGeneric resulted in different states", buf) + } + } +} + +// Tests for unmarshaling hashes that have hashed a large amount of data +// The initial hash generation is omitted from the test, because it takes a long time. +// The test contains some already-generated states, and their expected sums +// Tests a problem that is outlined in GitHub issue #29543 +// The problem is triggered when an amount of data has been hashed for which +// the data length has a 1 in the 32nd bit. 
When casted to int, this changes +// the sign of the value, and causes the modulus operation to return a +// different result. +type unmarshalTest struct { + state string + sum string +} + +var largeUnmarshalTests = []unmarshalTest{ + // Data length: 7_102_415_735 + { + state: "sha\x01\x13\xbc\xfe\x83\x8c\xbd\xdfP\x1f\xd8ڿ<\x9eji8t\xe1\xa5@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuv\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\xa7VCw", + sum: "bc6245c9959cc33e1c2592e5c9ea9b5d0431246c", + }, + // Data length: 6_565_544_823 + { + state: "sha\x01m;\x16\xa6R\xbe@\xa9nĈ\xf9S\x03\x00B\xc2\xdcv\xcf@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuv\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x87VCw", + sum: "8f2d1c0e4271768f35feb918bfe21ea1387a2072", + }, +} + +func safeSum(h hash.Hash) (sum []byte, err error) { + defer func() { + if r := recover(); r != nil { + err = fmt.Errorf("sum panic: %v", r) + } + }() + + return h.Sum(nil), nil +} + +func TestLargeHashes(t *testing.T) { + for i, test := range largeUnmarshalTests { + + h := New() + if err := h.(encoding.BinaryUnmarshaler).UnmarshalBinary([]byte(test.state)); err != nil { + t.Errorf("test %d could not unmarshal: %v", i, err) + continue + } + + sum, err := safeSum(h) + if err != nil { + t.Errorf("test %d could not sum: %v", i, err) + continue + } + + if fmt.Sprintf("%x", sum) != test.sum { + t.Errorf("test %d sum mismatch: expect %s got %x", i, test.sum, sum) + } + } +} + +func TestAllocations(t *testing.T) { + if boring.Enabled { + t.Skip("BoringCrypto doesn't allocate the same way as stdlib") + } + in := []byte("hello, world!") + out := make([]byte, 0, Size) + h := New() + n := int(testing.AllocsPerRun(10, func() { + h.Reset() + h.Write(in) + out = h.Sum(out[:0]) + })) + if n > 0 { + t.Errorf("allocs = %d, want 0", n) + } +} + +var bench = New() +var buf = make([]byte, 8192) + +func benchmarkSize(b *testing.B, size int) { + sum := make([]byte, bench.Size()) + b.Run("New", func(b 
*testing.B) { + b.ReportAllocs() + b.SetBytes(int64(size)) + for i := 0; i < b.N; i++ { + bench.Reset() + bench.Write(buf[:size]) + bench.Sum(sum[:0]) + } + }) + b.Run("Sum", func(b *testing.B) { + b.ReportAllocs() + b.SetBytes(int64(size)) + for i := 0; i < b.N; i++ { + Sum(buf[:size]) + } + }) +} + +func BenchmarkHash8Bytes(b *testing.B) { + benchmarkSize(b, 8) +} + +func BenchmarkHash320Bytes(b *testing.B) { + benchmarkSize(b, 320) +} + +func BenchmarkHash1K(b *testing.B) { + benchmarkSize(b, 1024) +} + +func BenchmarkHash8K(b *testing.B) { + benchmarkSize(b, 8192) +} diff --git a/src/crypto/sha1/sha1block.go b/src/crypto/sha1/sha1block.go new file mode 100644 index 0000000..321d343 --- /dev/null +++ b/src/crypto/sha1/sha1block.go @@ -0,0 +1,83 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package sha1 + +import ( + "math/bits" +) + +const ( + _K0 = 0x5A827999 + _K1 = 0x6ED9EBA1 + _K2 = 0x8F1BBCDC + _K3 = 0xCA62C1D6 +) + +// blockGeneric is a portable, pure Go version of the SHA-1 block step. +// It's used by sha1block_generic.go and tests. +func blockGeneric(dig *digest, p []byte) { + var w [16]uint32 + + h0, h1, h2, h3, h4 := dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4] + for len(p) >= chunk { + // Can interlace the computation of w with the + // rounds below if needed for speed. + for i := 0; i < 16; i++ { + j := i * 4 + w[i] = uint32(p[j])<<24 | uint32(p[j+1])<<16 | uint32(p[j+2])<<8 | uint32(p[j+3]) + } + + a, b, c, d, e := h0, h1, h2, h3, h4 + + // Each of the four 20-iteration rounds + // differs only in the computation of f and + // the choice of K (_K0, _K1, etc). 
+ i := 0 + for ; i < 16; i++ { + f := b&c | (^b)&d + t := bits.RotateLeft32(a, 5) + f + e + w[i&0xf] + _K0 + a, b, c, d, e = t, a, bits.RotateLeft32(b, 30), c, d + } + for ; i < 20; i++ { + tmp := w[(i-3)&0xf] ^ w[(i-8)&0xf] ^ w[(i-14)&0xf] ^ w[(i)&0xf] + w[i&0xf] = tmp<<1 | tmp>>(32-1) + + f := b&c | (^b)&d + t := bits.RotateLeft32(a, 5) + f + e + w[i&0xf] + _K0 + a, b, c, d, e = t, a, bits.RotateLeft32(b, 30), c, d + } + for ; i < 40; i++ { + tmp := w[(i-3)&0xf] ^ w[(i-8)&0xf] ^ w[(i-14)&0xf] ^ w[(i)&0xf] + w[i&0xf] = tmp<<1 | tmp>>(32-1) + f := b ^ c ^ d + t := bits.RotateLeft32(a, 5) + f + e + w[i&0xf] + _K1 + a, b, c, d, e = t, a, bits.RotateLeft32(b, 30), c, d + } + for ; i < 60; i++ { + tmp := w[(i-3)&0xf] ^ w[(i-8)&0xf] ^ w[(i-14)&0xf] ^ w[(i)&0xf] + w[i&0xf] = tmp<<1 | tmp>>(32-1) + f := ((b | c) & d) | (b & c) + t := bits.RotateLeft32(a, 5) + f + e + w[i&0xf] + _K2 + a, b, c, d, e = t, a, bits.RotateLeft32(b, 30), c, d + } + for ; i < 80; i++ { + tmp := w[(i-3)&0xf] ^ w[(i-8)&0xf] ^ w[(i-14)&0xf] ^ w[(i)&0xf] + w[i&0xf] = tmp<<1 | tmp>>(32-1) + f := b ^ c ^ d + t := bits.RotateLeft32(a, 5) + f + e + w[i&0xf] + _K3 + a, b, c, d, e = t, a, bits.RotateLeft32(b, 30), c, d + } + + h0 += a + h1 += b + h2 += c + h3 += d + h4 += e + + p = p[chunk:] + } + + dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4] = h0, h1, h2, h3, h4 +} diff --git a/src/crypto/sha1/sha1block_386.s b/src/crypto/sha1/sha1block_386.s new file mode 100644 index 0000000..34d023d --- /dev/null +++ b/src/crypto/sha1/sha1block_386.s @@ -0,0 +1,233 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// SHA-1 block routine. See sha1block.go for Go equivalent. +// +// There are 80 rounds of 4 types: +// - rounds 0-15 are type 1 and load data (ROUND1 macro). +// - rounds 16-19 are type 1 and do not load data (ROUND1x macro). 
+// - rounds 20-39 are type 2 and do not load data (ROUND2 macro). +// - rounds 40-59 are type 3 and do not load data (ROUND3 macro). +// - rounds 60-79 are type 4 and do not load data (ROUND4 macro). +// +// Each round loads or shuffles the data, then computes a per-round +// function of b, c, d, and then mixes the result into and rotates the +// five registers a, b, c, d, e holding the intermediate results. +// +// The register rotation is implemented by rotating the arguments to +// the round macros instead of by explicit move instructions. + +// Like sha1block_amd64.s, but we keep the data and limit pointers on the stack. +// To free up the word pointer (R10 on amd64, DI here), we add it to e during +// LOAD/SHUFFLE instead of during MIX. +// +// The stack holds the intermediate word array - 16 uint32s - at 0(SP) up to 64(SP). +// The saved a, b, c, d, e (R11 through R15 on amd64) are at 64(SP) up to 84(SP). +// The saved limit pointer (DI on amd64) is at 84(SP). +// The saved data pointer (SI on amd64) is at 88(SP). 
+ +#define LOAD(index, e) \ + MOVL 88(SP), SI; \ + MOVL (index*4)(SI), DI; \ + BSWAPL DI; \ + MOVL DI, (index*4)(SP); \ + ADDL DI, e + +#define SHUFFLE(index, e) \ + MOVL (((index)&0xf)*4)(SP), DI; \ + XORL (((index-3)&0xf)*4)(SP), DI; \ + XORL (((index-8)&0xf)*4)(SP), DI; \ + XORL (((index-14)&0xf)*4)(SP), DI; \ + ROLL $1, DI; \ + MOVL DI, (((index)&0xf)*4)(SP); \ + ADDL DI, e + +#define FUNC1(a, b, c, d, e) \ + MOVL d, DI; \ + XORL c, DI; \ + ANDL b, DI; \ + XORL d, DI + +#define FUNC2(a, b, c, d, e) \ + MOVL b, DI; \ + XORL c, DI; \ + XORL d, DI + +#define FUNC3(a, b, c, d, e) \ + MOVL b, SI; \ + ORL c, SI; \ + ANDL d, SI; \ + MOVL b, DI; \ + ANDL c, DI; \ + ORL SI, DI + +#define FUNC4 FUNC2 + +#define MIX(a, b, c, d, e, const) \ + ROLL $30, b; \ + ADDL DI, e; \ + MOVL a, SI; \ + ROLL $5, SI; \ + LEAL const(e)(SI*1), e + +#define ROUND1(a, b, c, d, e, index) \ + LOAD(index, e); \ + FUNC1(a, b, c, d, e); \ + MIX(a, b, c, d, e, 0x5A827999) + +#define ROUND1x(a, b, c, d, e, index) \ + SHUFFLE(index, e); \ + FUNC1(a, b, c, d, e); \ + MIX(a, b, c, d, e, 0x5A827999) + +#define ROUND2(a, b, c, d, e, index) \ + SHUFFLE(index, e); \ + FUNC2(a, b, c, d, e); \ + MIX(a, b, c, d, e, 0x6ED9EBA1) + +#define ROUND3(a, b, c, d, e, index) \ + SHUFFLE(index, e); \ + FUNC3(a, b, c, d, e); \ + MIX(a, b, c, d, e, 0x8F1BBCDC) + +#define ROUND4(a, b, c, d, e, index) \ + SHUFFLE(index, e); \ + FUNC4(a, b, c, d, e); \ + MIX(a, b, c, d, e, 0xCA62C1D6) + +// func block(dig *digest, p []byte) +TEXT ·block(SB),NOSPLIT,$92-16 + MOVL dig+0(FP), BP + MOVL p+4(FP), SI + MOVL p_len+8(FP), DX + SHRL $6, DX + SHLL $6, DX + + LEAL (SI)(DX*1), DI + MOVL (0*4)(BP), AX + MOVL (1*4)(BP), BX + MOVL (2*4)(BP), CX + MOVL (3*4)(BP), DX + MOVL (4*4)(BP), BP + + CMPL SI, DI + JEQ end + + MOVL DI, 84(SP) + +loop: + MOVL SI, 88(SP) + + MOVL AX, 64(SP) + MOVL BX, 68(SP) + MOVL CX, 72(SP) + MOVL DX, 76(SP) + MOVL BP, 80(SP) + + ROUND1(AX, BX, CX, DX, BP, 0) + ROUND1(BP, AX, BX, CX, DX, 1) + ROUND1(DX, BP, AX, BX, 
CX, 2) + ROUND1(CX, DX, BP, AX, BX, 3) + ROUND1(BX, CX, DX, BP, AX, 4) + ROUND1(AX, BX, CX, DX, BP, 5) + ROUND1(BP, AX, BX, CX, DX, 6) + ROUND1(DX, BP, AX, BX, CX, 7) + ROUND1(CX, DX, BP, AX, BX, 8) + ROUND1(BX, CX, DX, BP, AX, 9) + ROUND1(AX, BX, CX, DX, BP, 10) + ROUND1(BP, AX, BX, CX, DX, 11) + ROUND1(DX, BP, AX, BX, CX, 12) + ROUND1(CX, DX, BP, AX, BX, 13) + ROUND1(BX, CX, DX, BP, AX, 14) + ROUND1(AX, BX, CX, DX, BP, 15) + + ROUND1x(BP, AX, BX, CX, DX, 16) + ROUND1x(DX, BP, AX, BX, CX, 17) + ROUND1x(CX, DX, BP, AX, BX, 18) + ROUND1x(BX, CX, DX, BP, AX, 19) + + ROUND2(AX, BX, CX, DX, BP, 20) + ROUND2(BP, AX, BX, CX, DX, 21) + ROUND2(DX, BP, AX, BX, CX, 22) + ROUND2(CX, DX, BP, AX, BX, 23) + ROUND2(BX, CX, DX, BP, AX, 24) + ROUND2(AX, BX, CX, DX, BP, 25) + ROUND2(BP, AX, BX, CX, DX, 26) + ROUND2(DX, BP, AX, BX, CX, 27) + ROUND2(CX, DX, BP, AX, BX, 28) + ROUND2(BX, CX, DX, BP, AX, 29) + ROUND2(AX, BX, CX, DX, BP, 30) + ROUND2(BP, AX, BX, CX, DX, 31) + ROUND2(DX, BP, AX, BX, CX, 32) + ROUND2(CX, DX, BP, AX, BX, 33) + ROUND2(BX, CX, DX, BP, AX, 34) + ROUND2(AX, BX, CX, DX, BP, 35) + ROUND2(BP, AX, BX, CX, DX, 36) + ROUND2(DX, BP, AX, BX, CX, 37) + ROUND2(CX, DX, BP, AX, BX, 38) + ROUND2(BX, CX, DX, BP, AX, 39) + + ROUND3(AX, BX, CX, DX, BP, 40) + ROUND3(BP, AX, BX, CX, DX, 41) + ROUND3(DX, BP, AX, BX, CX, 42) + ROUND3(CX, DX, BP, AX, BX, 43) + ROUND3(BX, CX, DX, BP, AX, 44) + ROUND3(AX, BX, CX, DX, BP, 45) + ROUND3(BP, AX, BX, CX, DX, 46) + ROUND3(DX, BP, AX, BX, CX, 47) + ROUND3(CX, DX, BP, AX, BX, 48) + ROUND3(BX, CX, DX, BP, AX, 49) + ROUND3(AX, BX, CX, DX, BP, 50) + ROUND3(BP, AX, BX, CX, DX, 51) + ROUND3(DX, BP, AX, BX, CX, 52) + ROUND3(CX, DX, BP, AX, BX, 53) + ROUND3(BX, CX, DX, BP, AX, 54) + ROUND3(AX, BX, CX, DX, BP, 55) + ROUND3(BP, AX, BX, CX, DX, 56) + ROUND3(DX, BP, AX, BX, CX, 57) + ROUND3(CX, DX, BP, AX, BX, 58) + ROUND3(BX, CX, DX, BP, AX, 59) + + ROUND4(AX, BX, CX, DX, BP, 60) + ROUND4(BP, AX, BX, CX, DX, 61) + ROUND4(DX, BP, AX, BX, CX, 62) + 
ROUND4(CX, DX, BP, AX, BX, 63) + ROUND4(BX, CX, DX, BP, AX, 64) + ROUND4(AX, BX, CX, DX, BP, 65) + ROUND4(BP, AX, BX, CX, DX, 66) + ROUND4(DX, BP, AX, BX, CX, 67) + ROUND4(CX, DX, BP, AX, BX, 68) + ROUND4(BX, CX, DX, BP, AX, 69) + ROUND4(AX, BX, CX, DX, BP, 70) + ROUND4(BP, AX, BX, CX, DX, 71) + ROUND4(DX, BP, AX, BX, CX, 72) + ROUND4(CX, DX, BP, AX, BX, 73) + ROUND4(BX, CX, DX, BP, AX, 74) + ROUND4(AX, BX, CX, DX, BP, 75) + ROUND4(BP, AX, BX, CX, DX, 76) + ROUND4(DX, BP, AX, BX, CX, 77) + ROUND4(CX, DX, BP, AX, BX, 78) + ROUND4(BX, CX, DX, BP, AX, 79) + + ADDL 64(SP), AX + ADDL 68(SP), BX + ADDL 72(SP), CX + ADDL 76(SP), DX + ADDL 80(SP), BP + + MOVL 88(SP), SI + ADDL $64, SI + CMPL SI, 84(SP) + JB loop + +end: + MOVL dig+0(FP), DI + MOVL AX, (0*4)(DI) + MOVL BX, (1*4)(DI) + MOVL CX, (2*4)(DI) + MOVL DX, (3*4)(DI) + MOVL BP, (4*4)(DI) + RET diff --git a/src/crypto/sha1/sha1block_amd64.go b/src/crypto/sha1/sha1block_amd64.go new file mode 100644 index 0000000..039813d --- /dev/null +++ b/src/crypto/sha1/sha1block_amd64.go @@ -0,0 +1,34 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package sha1 + +import "internal/cpu" + +//go:noescape +func blockAVX2(dig *digest, p []byte) + +//go:noescape +func blockAMD64(dig *digest, p []byte) + +var useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI1 && cpu.X86.HasBMI2 + +func block(dig *digest, p []byte) { + if useAVX2 && len(p) >= 256 { + // blockAVX2 calculates sha1 for 2 block per iteration + // it also interleaves precalculation for next block. + // So it may read up-to 192 bytes past end of p + // We may add checks inside blockAVX2, but this will + // just turn it into a copy of blockAMD64, + // so call it directly, instead. 
+ safeLen := len(p) - 128 + if safeLen%128 != 0 { + safeLen -= 64 + } + blockAVX2(dig, p[:safeLen]) + blockAMD64(dig, p[safeLen:]) + } else { + blockAMD64(dig, p) + } +} diff --git a/src/crypto/sha1/sha1block_amd64.s b/src/crypto/sha1/sha1block_amd64.s new file mode 100644 index 0000000..42f03fb --- /dev/null +++ b/src/crypto/sha1/sha1block_amd64.s @@ -0,0 +1,1500 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// AVX2 version by Intel, same algorithm as code in Linux kernel: +// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha1_avx2_x86_64_asm.S +// Authors: +// Ilya Albrekht <ilya.albrekht@intel.com> +// Maxim Locktyukhin <maxim.locktyukhin@intel.com> +// Ronen Zohar <ronen.zohar@intel.com> +// Chandramouli Narayanan <mouli@linux.intel.com> + + +#include "textflag.h" + +// SHA-1 block routine. See sha1block.go for Go equivalent. +// +// There are 80 rounds of 4 types: +// - rounds 0-15 are type 1 and load data (ROUND1 macro). +// - rounds 16-19 are type 1 and do not load data (ROUND1x macro). +// - rounds 20-39 are type 2 and do not load data (ROUND2 macro). +// - rounds 40-59 are type 3 and do not load data (ROUND3 macro). +// - rounds 60-79 are type 4 and do not load data (ROUND4 macro). +// +// Each round loads or shuffles the data, then computes a per-round +// function of b, c, d, and then mixes the result into and rotates the +// five registers a, b, c, d, e holding the intermediate results. +// +// The register rotation is implemented by rotating the arguments to +// the round macros instead of by explicit move instructions. 
+ +#define LOAD(index) \ + MOVL (index*4)(SI), R10; \ + BSWAPL R10; \ + MOVL R10, (index*4)(SP) + +#define SHUFFLE(index) \ + MOVL (((index)&0xf)*4)(SP), R10; \ + XORL (((index-3)&0xf)*4)(SP), R10; \ + XORL (((index-8)&0xf)*4)(SP), R10; \ + XORL (((index-14)&0xf)*4)(SP), R10; \ + ROLL $1, R10; \ + MOVL R10, (((index)&0xf)*4)(SP) + +#define FUNC1(a, b, c, d, e) \ + MOVL d, R9; \ + XORL c, R9; \ + ANDL b, R9; \ + XORL d, R9 + +#define FUNC2(a, b, c, d, e) \ + MOVL b, R9; \ + XORL c, R9; \ + XORL d, R9 + +#define FUNC3(a, b, c, d, e) \ + MOVL b, R8; \ + ORL c, R8; \ + ANDL d, R8; \ + MOVL b, R9; \ + ANDL c, R9; \ + ORL R8, R9 + +#define FUNC4 FUNC2 + +#define MIX(a, b, c, d, e, const) \ + ROLL $30, b; \ + ADDL R9, e; \ + MOVL a, R8; \ + ROLL $5, R8; \ + LEAL const(e)(R10*1), e; \ + ADDL R8, e + +#define ROUND1(a, b, c, d, e, index) \ + LOAD(index); \ + FUNC1(a, b, c, d, e); \ + MIX(a, b, c, d, e, 0x5A827999) + +#define ROUND1x(a, b, c, d, e, index) \ + SHUFFLE(index); \ + FUNC1(a, b, c, d, e); \ + MIX(a, b, c, d, e, 0x5A827999) + +#define ROUND2(a, b, c, d, e, index) \ + SHUFFLE(index); \ + FUNC2(a, b, c, d, e); \ + MIX(a, b, c, d, e, 0x6ED9EBA1) + +#define ROUND3(a, b, c, d, e, index) \ + SHUFFLE(index); \ + FUNC3(a, b, c, d, e); \ + MIX(a, b, c, d, e, 0x8F1BBCDC) + +#define ROUND4(a, b, c, d, e, index) \ + SHUFFLE(index); \ + FUNC4(a, b, c, d, e); \ + MIX(a, b, c, d, e, 0xCA62C1D6) + +TEXT ·blockAMD64(SB),NOSPLIT,$64-32 + MOVQ dig+0(FP), BP + MOVQ p_base+8(FP), SI + MOVQ p_len+16(FP), DX + SHRQ $6, DX + SHLQ $6, DX + + LEAQ (SI)(DX*1), DI + MOVL (0*4)(BP), AX + MOVL (1*4)(BP), BX + MOVL (2*4)(BP), CX + MOVL (3*4)(BP), DX + MOVL (4*4)(BP), BP + + CMPQ SI, DI + JEQ end + +loop: + MOVL AX, R11 + MOVL BX, R12 + MOVL CX, R13 + MOVL DX, R14 + MOVL BP, R15 + + ROUND1(AX, BX, CX, DX, BP, 0) + ROUND1(BP, AX, BX, CX, DX, 1) + ROUND1(DX, BP, AX, BX, CX, 2) + ROUND1(CX, DX, BP, AX, BX, 3) + ROUND1(BX, CX, DX, BP, AX, 4) + ROUND1(AX, BX, CX, DX, BP, 5) + ROUND1(BP, AX, BX, CX, 
DX, 6) + ROUND1(DX, BP, AX, BX, CX, 7) + ROUND1(CX, DX, BP, AX, BX, 8) + ROUND1(BX, CX, DX, BP, AX, 9) + ROUND1(AX, BX, CX, DX, BP, 10) + ROUND1(BP, AX, BX, CX, DX, 11) + ROUND1(DX, BP, AX, BX, CX, 12) + ROUND1(CX, DX, BP, AX, BX, 13) + ROUND1(BX, CX, DX, BP, AX, 14) + ROUND1(AX, BX, CX, DX, BP, 15) + + ROUND1x(BP, AX, BX, CX, DX, 16) + ROUND1x(DX, BP, AX, BX, CX, 17) + ROUND1x(CX, DX, BP, AX, BX, 18) + ROUND1x(BX, CX, DX, BP, AX, 19) + + ROUND2(AX, BX, CX, DX, BP, 20) + ROUND2(BP, AX, BX, CX, DX, 21) + ROUND2(DX, BP, AX, BX, CX, 22) + ROUND2(CX, DX, BP, AX, BX, 23) + ROUND2(BX, CX, DX, BP, AX, 24) + ROUND2(AX, BX, CX, DX, BP, 25) + ROUND2(BP, AX, BX, CX, DX, 26) + ROUND2(DX, BP, AX, BX, CX, 27) + ROUND2(CX, DX, BP, AX, BX, 28) + ROUND2(BX, CX, DX, BP, AX, 29) + ROUND2(AX, BX, CX, DX, BP, 30) + ROUND2(BP, AX, BX, CX, DX, 31) + ROUND2(DX, BP, AX, BX, CX, 32) + ROUND2(CX, DX, BP, AX, BX, 33) + ROUND2(BX, CX, DX, BP, AX, 34) + ROUND2(AX, BX, CX, DX, BP, 35) + ROUND2(BP, AX, BX, CX, DX, 36) + ROUND2(DX, BP, AX, BX, CX, 37) + ROUND2(CX, DX, BP, AX, BX, 38) + ROUND2(BX, CX, DX, BP, AX, 39) + + ROUND3(AX, BX, CX, DX, BP, 40) + ROUND3(BP, AX, BX, CX, DX, 41) + ROUND3(DX, BP, AX, BX, CX, 42) + ROUND3(CX, DX, BP, AX, BX, 43) + ROUND3(BX, CX, DX, BP, AX, 44) + ROUND3(AX, BX, CX, DX, BP, 45) + ROUND3(BP, AX, BX, CX, DX, 46) + ROUND3(DX, BP, AX, BX, CX, 47) + ROUND3(CX, DX, BP, AX, BX, 48) + ROUND3(BX, CX, DX, BP, AX, 49) + ROUND3(AX, BX, CX, DX, BP, 50) + ROUND3(BP, AX, BX, CX, DX, 51) + ROUND3(DX, BP, AX, BX, CX, 52) + ROUND3(CX, DX, BP, AX, BX, 53) + ROUND3(BX, CX, DX, BP, AX, 54) + ROUND3(AX, BX, CX, DX, BP, 55) + ROUND3(BP, AX, BX, CX, DX, 56) + ROUND3(DX, BP, AX, BX, CX, 57) + ROUND3(CX, DX, BP, AX, BX, 58) + ROUND3(BX, CX, DX, BP, AX, 59) + + ROUND4(AX, BX, CX, DX, BP, 60) + ROUND4(BP, AX, BX, CX, DX, 61) + ROUND4(DX, BP, AX, BX, CX, 62) + ROUND4(CX, DX, BP, AX, BX, 63) + ROUND4(BX, CX, DX, BP, AX, 64) + ROUND4(AX, BX, CX, DX, BP, 65) + ROUND4(BP, AX, BX, CX, DX, 66) + 
ROUND4(DX, BP, AX, BX, CX, 67) + ROUND4(CX, DX, BP, AX, BX, 68) + ROUND4(BX, CX, DX, BP, AX, 69) + ROUND4(AX, BX, CX, DX, BP, 70) + ROUND4(BP, AX, BX, CX, DX, 71) + ROUND4(DX, BP, AX, BX, CX, 72) + ROUND4(CX, DX, BP, AX, BX, 73) + ROUND4(BX, CX, DX, BP, AX, 74) + ROUND4(AX, BX, CX, DX, BP, 75) + ROUND4(BP, AX, BX, CX, DX, 76) + ROUND4(DX, BP, AX, BX, CX, 77) + ROUND4(CX, DX, BP, AX, BX, 78) + ROUND4(BX, CX, DX, BP, AX, 79) + + ADDL R11, AX + ADDL R12, BX + ADDL R13, CX + ADDL R14, DX + ADDL R15, BP + + ADDQ $64, SI + CMPQ SI, DI + JB loop + +end: + MOVQ dig+0(FP), DI + MOVL AX, (0*4)(DI) + MOVL BX, (1*4)(DI) + MOVL CX, (2*4)(DI) + MOVL DX, (3*4)(DI) + MOVL BP, (4*4)(DI) + RET + + +// This is the implementation using AVX2, BMI1 and BMI2. It is based on: +// "SHA-1 implementation with Intel(R) AVX2 instruction set extensions" +// From http://software.intel.com/en-us/articles +// (look for improving-the-performance-of-the-secure-hash-algorithm-1) +// This implementation is 2x unrolled, and interleaves vector instructions, +// used to precompute W, with scalar computation of current round +// for optimal scheduling. + +// Trivial helper macros. 
+#define UPDATE_HASH(A,TB,C,D,E) \ + ADDL (R9), A \ + MOVL A, (R9) \ + ADDL 4(R9), TB \ + MOVL TB, 4(R9) \ + ADDL 8(R9), C \ + MOVL C, 8(R9) \ + ADDL 12(R9), D \ + MOVL D, 12(R9) \ + ADDL 16(R9), E \ + MOVL E, 16(R9) + + + +// Helper macros for PRECALC, which does precomputations +#define PRECALC_0(OFFSET) \ + VMOVDQU OFFSET(R10),X0 + +#define PRECALC_1(OFFSET) \ + VINSERTI128 $1, OFFSET(R13), Y0, Y0 + +#define PRECALC_2(YREG) \ + VPSHUFB Y10, Y0, YREG + +#define PRECALC_4(YREG,K_OFFSET) \ + VPADDD K_OFFSET(R8), YREG, Y0 + +#define PRECALC_7(OFFSET) \ + VMOVDQU Y0, (OFFSET*2)(R14) + + +// Message scheduling pre-compute for rounds 0-15 +// R13 is a pointer to even 64-byte block +// R10 is a pointer to odd 64-byte block +// R14 is a pointer to temp buffer +// X0 is used as temp register +// YREG is clobbered as part of computation +// OFFSET chooses 16 byte chunk within a block +// R8 is a pointer to constants block +// K_OFFSET chooses K constants relevant to this round +// X10 holds swap mask +#define PRECALC_00_15(OFFSET,YREG) \ + PRECALC_0(OFFSET) \ + PRECALC_1(OFFSET) \ + PRECALC_2(YREG) \ + PRECALC_4(YREG,0x0) \ + PRECALC_7(OFFSET) + + +// Helper macros for PRECALC_16_31 +#define PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \ + VPALIGNR $8, REG_SUB_16, REG_SUB_12, REG \ // w[i-14] + VPSRLDQ $4, REG_SUB_4, Y0 // w[i-3] + +#define PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \ + VPXOR REG_SUB_8, REG, REG \ + VPXOR REG_SUB_16, Y0, Y0 + +#define PRECALC_18(REG) \ + VPXOR Y0, REG, REG \ + VPSLLDQ $12, REG, Y9 + +#define PRECALC_19(REG) \ + VPSLLD $1, REG, Y0 \ + VPSRLD $31, REG, REG + +#define PRECALC_20(REG) \ + VPOR REG, Y0, Y0 \ + VPSLLD $2, Y9, REG + +#define PRECALC_21(REG) \ + VPSRLD $30, Y9, Y9 \ + VPXOR REG, Y0, Y0 + +#define PRECALC_23(REG,K_OFFSET,OFFSET) \ + VPXOR Y9, Y0, REG \ + VPADDD K_OFFSET(R8), REG, Y0 \ + VMOVDQU Y0, (OFFSET)(R14) + +// Message scheduling pre-compute for rounds 16-31 +// calculating last 32 w[i] values in 8 XMM registers +// 
pre-calculate K+w[i] values and store to mem +// for later load by ALU add instruction. +// "brute force" vectorization for rounds 16-31 only +// due to w[i]->w[i-3] dependency. +// clobbers 5 input ymm registers REG_SUB* +// uses X0 and X9 as temp registers +// As always, R8 is a pointer to constants block +// and R14 is a pointer to temp buffer +#define PRECALC_16_31(REG,REG_SUB_4,REG_SUB_8,REG_SUB_12,REG_SUB_16,K_OFFSET,OFFSET) \ + PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \ + PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \ + PRECALC_18(REG) \ + PRECALC_19(REG) \ + PRECALC_20(REG) \ + PRECALC_21(REG) \ + PRECALC_23(REG,K_OFFSET,OFFSET) + + +// Helper macros for PRECALC_32_79 +#define PRECALC_32(REG_SUB_8,REG_SUB_4) \ + VPALIGNR $8, REG_SUB_8, REG_SUB_4, Y0 + +#define PRECALC_33(REG_SUB_28,REG) \ + VPXOR REG_SUB_28, REG, REG + +#define PRECALC_34(REG_SUB_16) \ + VPXOR REG_SUB_16, Y0, Y0 + +#define PRECALC_35(REG) \ + VPXOR Y0, REG, REG + +#define PRECALC_36(REG) \ + VPSLLD $2, REG, Y0 + +#define PRECALC_37(REG) \ + VPSRLD $30, REG, REG \ + VPOR REG, Y0, REG + +#define PRECALC_39(REG,K_OFFSET,OFFSET) \ + VPADDD K_OFFSET(R8), REG, Y0 \ + VMOVDQU Y0, (OFFSET)(R14) + +// Message scheduling pre-compute for rounds 32-79 +// In SHA-1 specification we have: +// w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 +// Which is the same as: +// w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 +// This allows for more efficient vectorization, +// since w[i]->w[i-3] dependency is broken +#define PRECALC_32_79(REG,REG_SUB_4,REG_SUB_8,REG_SUB_16,REG_SUB_28,K_OFFSET,OFFSET) \ + PRECALC_32(REG_SUB_8,REG_SUB_4) \ + PRECALC_33(REG_SUB_28,REG) \ + PRECALC_34(REG_SUB_16) \ + PRECALC_35(REG) \ + PRECALC_36(REG) \ + PRECALC_37(REG) \ + PRECALC_39(REG,K_OFFSET,OFFSET) + +#define PRECALC \ + PRECALC_00_15(0,Y15) \ + PRECALC_00_15(0x10,Y14) \ + PRECALC_00_15(0x20,Y13) \ + PRECALC_00_15(0x30,Y12) \ + PRECALC_16_31(Y8,Y12,Y13,Y14,Y15,0,0x80) \ + PRECALC_16_31(Y7,Y8,Y12,Y13,Y14,0x20,0xa0) \ + 
PRECALC_16_31(Y5,Y7,Y8,Y12,Y13,0x20,0xc0) \ + PRECALC_16_31(Y3,Y5,Y7,Y8,Y12,0x20,0xe0) \ + PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x20,0x100) \ + PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x20,0x120) \ + PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x40,0x140) \ + PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x40,0x160) \ + PRECALC_32_79(Y8,Y12,Y13,Y15,Y7,0x40,0x180) \ + PRECALC_32_79(Y7,Y8,Y12,Y14,Y5,0x40,0x1a0) \ + PRECALC_32_79(Y5,Y7,Y8,Y13,Y3,0x40,0x1c0) \ + PRECALC_32_79(Y3,Y5,Y7,Y12,Y15,0x60,0x1e0) \ + PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x60,0x200) \ + PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x60,0x220) \ + PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x60,0x240) \ + PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x60,0x260) + +// Macros calculating individual rounds have general form +// CALC_ROUND_PRE + PRECALC_ROUND + CALC_ROUND_POST +// CALC_ROUND_{PRE,POST} macros follow + +#define CALC_F1_PRE(OFFSET,REG_A,REG_B,REG_C,REG_E) \ + ADDL OFFSET(R15),REG_E \ + ANDNL REG_C,REG_A,BP \ + LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round + RORXL $0x1b, REG_A, R12 \ + RORXL $2, REG_A, REG_B // for next round + +// Calculate F for the next round +#define CALC_F1_POST(REG_A,REG_B,REG_E) \ + ANDL REG_B,REG_A \ // b&c + XORL BP, REG_A \ // F1 = (b&c) ^ (~b&d) + LEAL (REG_E)(R12*1), REG_E // E += A >>> 5 + + +// Registers are cyclically rotated DX -> AX -> DI -> SI -> BX -> CX +#define CALC_0 \ + MOVL SI, BX \ // Precalculating first round + RORXL $2, SI, SI \ + ANDNL AX, BX, BP \ + ANDL DI, BX \ + XORL BP, BX \ + CALC_F1_PRE(0x0,CX,BX,DI,DX) \ + PRECALC_0(0x80) \ + CALC_F1_POST(CX,SI,DX) + +#define CALC_1 \ + CALC_F1_PRE(0x4,DX,CX,SI,AX) \ + PRECALC_1(0x80) \ + CALC_F1_POST(DX,BX,AX) + +#define CALC_2 \ + CALC_F1_PRE(0x8,AX,DX,BX,DI) \ + PRECALC_2(Y15) \ + CALC_F1_POST(AX,CX,DI) + +#define CALC_3 \ + CALC_F1_PRE(0xc,DI,AX,CX,SI) \ + CALC_F1_POST(DI,DX,SI) + +#define CALC_4 \ + CALC_F1_PRE(0x20,SI,DI,DX,BX) \ + PRECALC_4(Y15,0x0) \ + CALC_F1_POST(SI,AX,BX) + +#define CALC_5 \ + CALC_F1_PRE(0x24,BX,SI,AX,CX) \ + CALC_F1_POST(BX,DI,CX) + 
+#define CALC_6 \ + CALC_F1_PRE(0x28,CX,BX,DI,DX) \ + CALC_F1_POST(CX,SI,DX) + +#define CALC_7 \ + CALC_F1_PRE(0x2c,DX,CX,SI,AX) \ + PRECALC_7(0x0) \ + CALC_F1_POST(DX,BX,AX) + +#define CALC_8 \ + CALC_F1_PRE(0x40,AX,DX,BX,DI) \ + PRECALC_0(0x90) \ + CALC_F1_POST(AX,CX,DI) + +#define CALC_9 \ + CALC_F1_PRE(0x44,DI,AX,CX,SI) \ + PRECALC_1(0x90) \ + CALC_F1_POST(DI,DX,SI) + +#define CALC_10 \ + CALC_F1_PRE(0x48,SI,DI,DX,BX) \ + PRECALC_2(Y14) \ + CALC_F1_POST(SI,AX,BX) + +#define CALC_11 \ + CALC_F1_PRE(0x4c,BX,SI,AX,CX) \ + CALC_F1_POST(BX,DI,CX) + +#define CALC_12 \ + CALC_F1_PRE(0x60,CX,BX,DI,DX) \ + PRECALC_4(Y14,0x0) \ + CALC_F1_POST(CX,SI,DX) + +#define CALC_13 \ + CALC_F1_PRE(0x64,DX,CX,SI,AX) \ + CALC_F1_POST(DX,BX,AX) + +#define CALC_14 \ + CALC_F1_PRE(0x68,AX,DX,BX,DI) \ + CALC_F1_POST(AX,CX,DI) + +#define CALC_15 \ + CALC_F1_PRE(0x6c,DI,AX,CX,SI) \ + PRECALC_7(0x10) \ + CALC_F1_POST(DI,DX,SI) + +#define CALC_16 \ + CALC_F1_PRE(0x80,SI,DI,DX,BX) \ + PRECALC_0(0xa0) \ + CALC_F1_POST(SI,AX,BX) + +#define CALC_17 \ + CALC_F1_PRE(0x84,BX,SI,AX,CX) \ + PRECALC_1(0xa0) \ + CALC_F1_POST(BX,DI,CX) + +#define CALC_18 \ + CALC_F1_PRE(0x88,CX,BX,DI,DX) \ + PRECALC_2(Y13) \ + CALC_F1_POST(CX,SI,DX) + + +#define CALC_F2_PRE(OFFSET,REG_A,REG_B,REG_E) \ + ADDL OFFSET(R15),REG_E \ + LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round + RORXL $0x1b, REG_A, R12 \ + RORXL $2, REG_A, REG_B // for next round + +#define CALC_F2_POST(REG_A,REG_B,REG_C,REG_E) \ + XORL REG_B, REG_A \ + ADDL R12, REG_E \ + XORL REG_C, REG_A + +#define CALC_19 \ + CALC_F2_PRE(0x8c,DX,CX,AX) \ + CALC_F2_POST(DX,BX,SI,AX) + +#define CALC_20 \ + CALC_F2_PRE(0xa0,AX,DX,DI) \ + PRECALC_4(Y13,0x0) \ + CALC_F2_POST(AX,CX,BX,DI) + +#define CALC_21 \ + CALC_F2_PRE(0xa4,DI,AX,SI) \ + CALC_F2_POST(DI,DX,CX,SI) + +#define CALC_22 \ + CALC_F2_PRE(0xa8,SI,DI,BX) \ + CALC_F2_POST(SI,AX,DX,BX) + +#define CALC_23 \ + CALC_F2_PRE(0xac,BX,SI,CX) \ + PRECALC_7(0x20) \ + CALC_F2_POST(BX,DI,AX,CX) + +#define 
CALC_24 \ + CALC_F2_PRE(0xc0,CX,BX,DX) \ + PRECALC_0(0xb0) \ + CALC_F2_POST(CX,SI,DI,DX) + +#define CALC_25 \ + CALC_F2_PRE(0xc4,DX,CX,AX) \ + PRECALC_1(0xb0) \ + CALC_F2_POST(DX,BX,SI,AX) + +#define CALC_26 \ + CALC_F2_PRE(0xc8,AX,DX,DI) \ + PRECALC_2(Y12) \ + CALC_F2_POST(AX,CX,BX,DI) + +#define CALC_27 \ + CALC_F2_PRE(0xcc,DI,AX,SI) \ + CALC_F2_POST(DI,DX,CX,SI) + +#define CALC_28 \ + CALC_F2_PRE(0xe0,SI,DI,BX) \ + PRECALC_4(Y12,0x0) \ + CALC_F2_POST(SI,AX,DX,BX) + +#define CALC_29 \ + CALC_F2_PRE(0xe4,BX,SI,CX) \ + CALC_F2_POST(BX,DI,AX,CX) + +#define CALC_30 \ + CALC_F2_PRE(0xe8,CX,BX,DX) \ + CALC_F2_POST(CX,SI,DI,DX) + +#define CALC_31 \ + CALC_F2_PRE(0xec,DX,CX,AX) \ + PRECALC_7(0x30) \ + CALC_F2_POST(DX,BX,SI,AX) + +#define CALC_32 \ + CALC_F2_PRE(0x100,AX,DX,DI) \ + PRECALC_16(Y15,Y14,Y12,Y8) \ + CALC_F2_POST(AX,CX,BX,DI) + +#define CALC_33 \ + CALC_F2_PRE(0x104,DI,AX,SI) \ + PRECALC_17(Y15,Y13,Y8) \ + CALC_F2_POST(DI,DX,CX,SI) + +#define CALC_34 \ + CALC_F2_PRE(0x108,SI,DI,BX) \ + PRECALC_18(Y8) \ + CALC_F2_POST(SI,AX,DX,BX) + +#define CALC_35 \ + CALC_F2_PRE(0x10c,BX,SI,CX) \ + PRECALC_19(Y8) \ + CALC_F2_POST(BX,DI,AX,CX) + +#define CALC_36 \ + CALC_F2_PRE(0x120,CX,BX,DX) \ + PRECALC_20(Y8) \ + CALC_F2_POST(CX,SI,DI,DX) + +#define CALC_37 \ + CALC_F2_PRE(0x124,DX,CX,AX) \ + PRECALC_21(Y8) \ + CALC_F2_POST(DX,BX,SI,AX) + +#define CALC_38 \ + CALC_F2_PRE(0x128,AX,DX,DI) \ + CALC_F2_POST(AX,CX,BX,DI) + + +#define CALC_F3_PRE(OFFSET,REG_E) \ + ADDL OFFSET(R15),REG_E + +#define CALC_F3_POST(REG_A,REG_B,REG_C,REG_E,REG_TB) \ + LEAL (REG_E)(REG_TB*1), REG_E \ // Add F from the previous round + MOVL REG_B, BP \ + ORL REG_A, BP \ + RORXL $0x1b, REG_A, R12 \ + RORXL $2, REG_A, REG_TB \ + ANDL REG_C, BP \ // Calculate F for the next round + ANDL REG_B, REG_A \ + ORL BP, REG_A \ + ADDL R12, REG_E + +#define CALC_39 \ + CALC_F3_PRE(0x12c,SI) \ + PRECALC_23(Y8,0x0,0x80) \ + CALC_F3_POST(DI,DX,CX,SI,AX) + +#define CALC_40 \ + CALC_F3_PRE(0x140,BX) \ + 
PRECALC_16(Y14,Y13,Y8,Y7) \ + CALC_F3_POST(SI,AX,DX,BX,DI) + +#define CALC_41 \ + CALC_F3_PRE(0x144,CX) \ + PRECALC_17(Y14,Y12,Y7) \ + CALC_F3_POST(BX,DI,AX,CX,SI) + +#define CALC_42 \ + CALC_F3_PRE(0x148,DX) \ + PRECALC_18(Y7) \ + CALC_F3_POST(CX,SI,DI,DX,BX) + +#define CALC_43 \ + CALC_F3_PRE(0x14c,AX) \ + PRECALC_19(Y7) \ + CALC_F3_POST(DX,BX,SI,AX,CX) + +#define CALC_44 \ + CALC_F3_PRE(0x160,DI) \ + PRECALC_20(Y7) \ + CALC_F3_POST(AX,CX,BX,DI,DX) + +#define CALC_45 \ + CALC_F3_PRE(0x164,SI) \ + PRECALC_21(Y7) \ + CALC_F3_POST(DI,DX,CX,SI,AX) + +#define CALC_46 \ + CALC_F3_PRE(0x168,BX) \ + CALC_F3_POST(SI,AX,DX,BX,DI) + +#define CALC_47 \ + CALC_F3_PRE(0x16c,CX) \ + VPXOR Y9, Y0, Y7 \ + VPADDD 0x20(R8), Y7, Y0 \ + VMOVDQU Y0, 0xa0(R14) \ + CALC_F3_POST(BX,DI,AX,CX,SI) + +#define CALC_48 \ + CALC_F3_PRE(0x180,DX) \ + PRECALC_16(Y13,Y12,Y7,Y5) \ + CALC_F3_POST(CX,SI,DI,DX,BX) + +#define CALC_49 \ + CALC_F3_PRE(0x184,AX) \ + PRECALC_17(Y13,Y8,Y5) \ + CALC_F3_POST(DX,BX,SI,AX,CX) + +#define CALC_50 \ + CALC_F3_PRE(0x188,DI) \ + PRECALC_18(Y5) \ + CALC_F3_POST(AX,CX,BX,DI,DX) + +#define CALC_51 \ + CALC_F3_PRE(0x18c,SI) \ + PRECALC_19(Y5) \ + CALC_F3_POST(DI,DX,CX,SI,AX) + +#define CALC_52 \ + CALC_F3_PRE(0x1a0,BX) \ + PRECALC_20(Y5) \ + CALC_F3_POST(SI,AX,DX,BX,DI) + +#define CALC_53 \ + CALC_F3_PRE(0x1a4,CX) \ + PRECALC_21(Y5) \ + CALC_F3_POST(BX,DI,AX,CX,SI) + +#define CALC_54 \ + CALC_F3_PRE(0x1a8,DX) \ + CALC_F3_POST(CX,SI,DI,DX,BX) + +#define CALC_55 \ + CALC_F3_PRE(0x1ac,AX) \ + PRECALC_23(Y5,0x20,0xc0) \ + CALC_F3_POST(DX,BX,SI,AX,CX) + +#define CALC_56 \ + CALC_F3_PRE(0x1c0,DI) \ + PRECALC_16(Y12,Y8,Y5,Y3) \ + CALC_F3_POST(AX,CX,BX,DI,DX) + +#define CALC_57 \ + CALC_F3_PRE(0x1c4,SI) \ + PRECALC_17(Y12,Y7,Y3) \ + CALC_F3_POST(DI,DX,CX,SI,AX) + +#define CALC_58 \ + CALC_F3_PRE(0x1c8,BX) \ + PRECALC_18(Y3) \ + CALC_F3_POST(SI,AX,DX,BX,DI) + +#define CALC_59 \ + CALC_F2_PRE(0x1cc,BX,SI,CX) \ + PRECALC_19(Y3) \ + CALC_F2_POST(BX,DI,AX,CX) + +#define CALC_60 \ + 
CALC_F2_PRE(0x1e0,CX,BX,DX) \ + PRECALC_20(Y3) \ + CALC_F2_POST(CX,SI,DI,DX) + +#define CALC_61 \ + CALC_F2_PRE(0x1e4,DX,CX,AX) \ + PRECALC_21(Y3) \ + CALC_F2_POST(DX,BX,SI,AX) + +#define CALC_62 \ + CALC_F2_PRE(0x1e8,AX,DX,DI) \ + CALC_F2_POST(AX,CX,BX,DI) + +#define CALC_63 \ + CALC_F2_PRE(0x1ec,DI,AX,SI) \ + PRECALC_23(Y3,0x20,0xe0) \ + CALC_F2_POST(DI,DX,CX,SI) + +#define CALC_64 \ + CALC_F2_PRE(0x200,SI,DI,BX) \ + PRECALC_32(Y5,Y3) \ + CALC_F2_POST(SI,AX,DX,BX) + +#define CALC_65 \ + CALC_F2_PRE(0x204,BX,SI,CX) \ + PRECALC_33(Y14,Y15) \ + CALC_F2_POST(BX,DI,AX,CX) + +#define CALC_66 \ + CALC_F2_PRE(0x208,CX,BX,DX) \ + PRECALC_34(Y8) \ + CALC_F2_POST(CX,SI,DI,DX) + +#define CALC_67 \ + CALC_F2_PRE(0x20c,DX,CX,AX) \ + PRECALC_35(Y15) \ + CALC_F2_POST(DX,BX,SI,AX) + +#define CALC_68 \ + CALC_F2_PRE(0x220,AX,DX,DI) \ + PRECALC_36(Y15) \ + CALC_F2_POST(AX,CX,BX,DI) + +#define CALC_69 \ + CALC_F2_PRE(0x224,DI,AX,SI) \ + PRECALC_37(Y15) \ + CALC_F2_POST(DI,DX,CX,SI) + +#define CALC_70 \ + CALC_F2_PRE(0x228,SI,DI,BX) \ + CALC_F2_POST(SI,AX,DX,BX) + +#define CALC_71 \ + CALC_F2_PRE(0x22c,BX,SI,CX) \ + PRECALC_39(Y15,0x20,0x100) \ + CALC_F2_POST(BX,DI,AX,CX) + +#define CALC_72 \ + CALC_F2_PRE(0x240,CX,BX,DX) \ + PRECALC_32(Y3,Y15) \ + CALC_F2_POST(CX,SI,DI,DX) + +#define CALC_73 \ + CALC_F2_PRE(0x244,DX,CX,AX) \ + PRECALC_33(Y13,Y14) \ + CALC_F2_POST(DX,BX,SI,AX) + +#define CALC_74 \ + CALC_F2_PRE(0x248,AX,DX,DI) \ + PRECALC_34(Y7) \ + CALC_F2_POST(AX,CX,BX,DI) + +#define CALC_75 \ + CALC_F2_PRE(0x24c,DI,AX,SI) \ + PRECALC_35(Y14) \ + CALC_F2_POST(DI,DX,CX,SI) + +#define CALC_76 \ + CALC_F2_PRE(0x260,SI,DI,BX) \ + PRECALC_36(Y14) \ + CALC_F2_POST(SI,AX,DX,BX) + +#define CALC_77 \ + CALC_F2_PRE(0x264,BX,SI,CX) \ + PRECALC_37(Y14) \ + CALC_F2_POST(BX,DI,AX,CX) + +#define CALC_78 \ + CALC_F2_PRE(0x268,CX,BX,DX) \ + CALC_F2_POST(CX,SI,DI,DX) + +#define CALC_79 \ + ADDL 0x26c(R15), AX \ + LEAL (AX)(CX*1), AX \ + RORXL $0x1b, DX, R12 \ + PRECALC_39(Y14,0x20,0x120) \ + ADDL 
R12, AX + +// Similar to CALC_0 +#define CALC_80 \ + MOVL CX, DX \ + RORXL $2, CX, CX \ + ANDNL SI, DX, BP \ + ANDL BX, DX \ + XORL BP, DX \ + CALC_F1_PRE(0x10,AX,DX,BX,DI) \ + PRECALC_32(Y15,Y14) \ + CALC_F1_POST(AX,CX,DI) + +#define CALC_81 \ + CALC_F1_PRE(0x14,DI,AX,CX,SI) \ + PRECALC_33(Y12,Y13) \ + CALC_F1_POST(DI,DX,SI) + +#define CALC_82 \ + CALC_F1_PRE(0x18,SI,DI,DX,BX) \ + PRECALC_34(Y5) \ + CALC_F1_POST(SI,AX,BX) + +#define CALC_83 \ + CALC_F1_PRE(0x1c,BX,SI,AX,CX) \ + PRECALC_35(Y13) \ + CALC_F1_POST(BX,DI,CX) + +#define CALC_84 \ + CALC_F1_PRE(0x30,CX,BX,DI,DX) \ + PRECALC_36(Y13) \ + CALC_F1_POST(CX,SI,DX) + +#define CALC_85 \ + CALC_F1_PRE(0x34,DX,CX,SI,AX) \ + PRECALC_37(Y13) \ + CALC_F1_POST(DX,BX,AX) + +#define CALC_86 \ + CALC_F1_PRE(0x38,AX,DX,BX,DI) \ + CALC_F1_POST(AX,CX,DI) + +#define CALC_87 \ + CALC_F1_PRE(0x3c,DI,AX,CX,SI) \ + PRECALC_39(Y13,0x40,0x140) \ + CALC_F1_POST(DI,DX,SI) + +#define CALC_88 \ + CALC_F1_PRE(0x50,SI,DI,DX,BX) \ + PRECALC_32(Y14,Y13) \ + CALC_F1_POST(SI,AX,BX) + +#define CALC_89 \ + CALC_F1_PRE(0x54,BX,SI,AX,CX) \ + PRECALC_33(Y8,Y12) \ + CALC_F1_POST(BX,DI,CX) + +#define CALC_90 \ + CALC_F1_PRE(0x58,CX,BX,DI,DX) \ + PRECALC_34(Y3) \ + CALC_F1_POST(CX,SI,DX) + +#define CALC_91 \ + CALC_F1_PRE(0x5c,DX,CX,SI,AX) \ + PRECALC_35(Y12) \ + CALC_F1_POST(DX,BX,AX) + +#define CALC_92 \ + CALC_F1_PRE(0x70,AX,DX,BX,DI) \ + PRECALC_36(Y12) \ + CALC_F1_POST(AX,CX,DI) + +#define CALC_93 \ + CALC_F1_PRE(0x74,DI,AX,CX,SI) \ + PRECALC_37(Y12) \ + CALC_F1_POST(DI,DX,SI) + +#define CALC_94 \ + CALC_F1_PRE(0x78,SI,DI,DX,BX) \ + CALC_F1_POST(SI,AX,BX) + +#define CALC_95 \ + CALC_F1_PRE(0x7c,BX,SI,AX,CX) \ + PRECALC_39(Y12,0x40,0x160) \ + CALC_F1_POST(BX,DI,CX) + +#define CALC_96 \ + CALC_F1_PRE(0x90,CX,BX,DI,DX) \ + PRECALC_32(Y13,Y12) \ + CALC_F1_POST(CX,SI,DX) + +#define CALC_97 \ + CALC_F1_PRE(0x94,DX,CX,SI,AX) \ + PRECALC_33(Y7,Y8) \ + CALC_F1_POST(DX,BX,AX) + +#define CALC_98 \ + CALC_F1_PRE(0x98,AX,DX,BX,DI) \ + PRECALC_34(Y15) \ + 
CALC_F1_POST(AX,CX,DI) + +#define CALC_99 \ + CALC_F2_PRE(0x9c,DI,AX,SI) \ + PRECALC_35(Y8) \ + CALC_F2_POST(DI,DX,CX,SI) + +#define CALC_100 \ + CALC_F2_PRE(0xb0,SI,DI,BX) \ + PRECALC_36(Y8) \ + CALC_F2_POST(SI,AX,DX,BX) + +#define CALC_101 \ + CALC_F2_PRE(0xb4,BX,SI,CX) \ + PRECALC_37(Y8) \ + CALC_F2_POST(BX,DI,AX,CX) + +#define CALC_102 \ + CALC_F2_PRE(0xb8,CX,BX,DX) \ + CALC_F2_POST(CX,SI,DI,DX) + +#define CALC_103 \ + CALC_F2_PRE(0xbc,DX,CX,AX) \ + PRECALC_39(Y8,0x40,0x180) \ + CALC_F2_POST(DX,BX,SI,AX) + +#define CALC_104 \ + CALC_F2_PRE(0xd0,AX,DX,DI) \ + PRECALC_32(Y12,Y8) \ + CALC_F2_POST(AX,CX,BX,DI) + +#define CALC_105 \ + CALC_F2_PRE(0xd4,DI,AX,SI) \ + PRECALC_33(Y5,Y7) \ + CALC_F2_POST(DI,DX,CX,SI) + +#define CALC_106 \ + CALC_F2_PRE(0xd8,SI,DI,BX) \ + PRECALC_34(Y14) \ + CALC_F2_POST(SI,AX,DX,BX) + +#define CALC_107 \ + CALC_F2_PRE(0xdc,BX,SI,CX) \ + PRECALC_35(Y7) \ + CALC_F2_POST(BX,DI,AX,CX) + +#define CALC_108 \ + CALC_F2_PRE(0xf0,CX,BX,DX) \ + PRECALC_36(Y7) \ + CALC_F2_POST(CX,SI,DI,DX) + +#define CALC_109 \ + CALC_F2_PRE(0xf4,DX,CX,AX) \ + PRECALC_37(Y7) \ + CALC_F2_POST(DX,BX,SI,AX) + +#define CALC_110 \ + CALC_F2_PRE(0xf8,AX,DX,DI) \ + CALC_F2_POST(AX,CX,BX,DI) + +#define CALC_111 \ + CALC_F2_PRE(0xfc,DI,AX,SI) \ + PRECALC_39(Y7,0x40,0x1a0) \ + CALC_F2_POST(DI,DX,CX,SI) + +#define CALC_112 \ + CALC_F2_PRE(0x110,SI,DI,BX) \ + PRECALC_32(Y8,Y7) \ + CALC_F2_POST(SI,AX,DX,BX) + +#define CALC_113 \ + CALC_F2_PRE(0x114,BX,SI,CX) \ + PRECALC_33(Y3,Y5) \ + CALC_F2_POST(BX,DI,AX,CX) + +#define CALC_114 \ + CALC_F2_PRE(0x118,CX,BX,DX) \ + PRECALC_34(Y13) \ + CALC_F2_POST(CX,SI,DI,DX) + +#define CALC_115 \ + CALC_F2_PRE(0x11c,DX,CX,AX) \ + PRECALC_35(Y5) \ + CALC_F2_POST(DX,BX,SI,AX) + +#define CALC_116 \ + CALC_F2_PRE(0x130,AX,DX,DI) \ + PRECALC_36(Y5) \ + CALC_F2_POST(AX,CX,BX,DI) + +#define CALC_117 \ + CALC_F2_PRE(0x134,DI,AX,SI) \ + PRECALC_37(Y5) \ + CALC_F2_POST(DI,DX,CX,SI) + +#define CALC_118 \ + CALC_F2_PRE(0x138,SI,DI,BX) \ + 
CALC_F2_POST(SI,AX,DX,BX) + +#define CALC_119 \ + CALC_F3_PRE(0x13c,CX) \ + PRECALC_39(Y5,0x40,0x1c0) \ + CALC_F3_POST(BX,DI,AX,CX,SI) + +#define CALC_120 \ + CALC_F3_PRE(0x150,DX) \ + PRECALC_32(Y7,Y5) \ + CALC_F3_POST(CX,SI,DI,DX,BX) + +#define CALC_121 \ + CALC_F3_PRE(0x154,AX) \ + PRECALC_33(Y15,Y3) \ + CALC_F3_POST(DX,BX,SI,AX,CX) + +#define CALC_122 \ + CALC_F3_PRE(0x158,DI) \ + PRECALC_34(Y12) \ + CALC_F3_POST(AX,CX,BX,DI,DX) + +#define CALC_123 \ + CALC_F3_PRE(0x15c,SI) \ + PRECALC_35(Y3) \ + CALC_F3_POST(DI,DX,CX,SI,AX) + +#define CALC_124 \ + CALC_F3_PRE(0x170,BX) \ + PRECALC_36(Y3) \ + CALC_F3_POST(SI,AX,DX,BX,DI) + +#define CALC_125 \ + CALC_F3_PRE(0x174,CX) \ + PRECALC_37(Y3) \ + CALC_F3_POST(BX,DI,AX,CX,SI) + +#define CALC_126 \ + CALC_F3_PRE(0x178,DX) \ + CALC_F3_POST(CX,SI,DI,DX,BX) + +#define CALC_127 \ + CALC_F3_PRE(0x17c,AX) \ + PRECALC_39(Y3,0x60,0x1e0) \ + CALC_F3_POST(DX,BX,SI,AX,CX) + +#define CALC_128 \ + CALC_F3_PRE(0x190,DI) \ + PRECALC_32(Y5,Y3) \ + CALC_F3_POST(AX,CX,BX,DI,DX) + +#define CALC_129 \ + CALC_F3_PRE(0x194,SI) \ + PRECALC_33(Y14,Y15) \ + CALC_F3_POST(DI,DX,CX,SI,AX) + +#define CALC_130 \ + CALC_F3_PRE(0x198,BX) \ + PRECALC_34(Y8) \ + CALC_F3_POST(SI,AX,DX,BX,DI) + +#define CALC_131 \ + CALC_F3_PRE(0x19c,CX) \ + PRECALC_35(Y15) \ + CALC_F3_POST(BX,DI,AX,CX,SI) + +#define CALC_132 \ + CALC_F3_PRE(0x1b0,DX) \ + PRECALC_36(Y15) \ + CALC_F3_POST(CX,SI,DI,DX,BX) + +#define CALC_133 \ + CALC_F3_PRE(0x1b4,AX) \ + PRECALC_37(Y15) \ + CALC_F3_POST(DX,BX,SI,AX,CX) + +#define CALC_134 \ + CALC_F3_PRE(0x1b8,DI) \ + CALC_F3_POST(AX,CX,BX,DI,DX) + +#define CALC_135 \ + CALC_F3_PRE(0x1bc,SI) \ + PRECALC_39(Y15,0x60,0x200) \ + CALC_F3_POST(DI,DX,CX,SI,AX) + +#define CALC_136 \ + CALC_F3_PRE(0x1d0,BX) \ + PRECALC_32(Y3,Y15) \ + CALC_F3_POST(SI,AX,DX,BX,DI) + +#define CALC_137 \ + CALC_F3_PRE(0x1d4,CX) \ + PRECALC_33(Y13,Y14) \ + CALC_F3_POST(BX,DI,AX,CX,SI) + +#define CALC_138 \ + CALC_F3_PRE(0x1d8,DX) \ + PRECALC_34(Y7) \ + 
CALC_F3_POST(CX,SI,DI,DX,BX) + +#define CALC_139 \ + CALC_F2_PRE(0x1dc,DX,CX,AX) \ + PRECALC_35(Y14) \ + CALC_F2_POST(DX,BX,SI,AX) + +#define CALC_140 \ + CALC_F2_PRE(0x1f0,AX,DX,DI) \ + PRECALC_36(Y14) \ + CALC_F2_POST(AX,CX,BX,DI) + +#define CALC_141 \ + CALC_F2_PRE(0x1f4,DI,AX,SI) \ + PRECALC_37(Y14) \ + CALC_F2_POST(DI,DX,CX,SI) + +#define CALC_142 \ + CALC_F2_PRE(0x1f8,SI,DI,BX) \ + CALC_F2_POST(SI,AX,DX,BX) + +#define CALC_143 \ + CALC_F2_PRE(0x1fc,BX,SI,CX) \ + PRECALC_39(Y14,0x60,0x220) \ + CALC_F2_POST(BX,DI,AX,CX) + +#define CALC_144 \ + CALC_F2_PRE(0x210,CX,BX,DX) \ + PRECALC_32(Y15,Y14) \ + CALC_F2_POST(CX,SI,DI,DX) + +#define CALC_145 \ + CALC_F2_PRE(0x214,DX,CX,AX) \ + PRECALC_33(Y12,Y13) \ + CALC_F2_POST(DX,BX,SI,AX) + +#define CALC_146 \ + CALC_F2_PRE(0x218,AX,DX,DI) \ + PRECALC_34(Y5) \ + CALC_F2_POST(AX,CX,BX,DI) + +#define CALC_147 \ + CALC_F2_PRE(0x21c,DI,AX,SI) \ + PRECALC_35(Y13) \ + CALC_F2_POST(DI,DX,CX,SI) + +#define CALC_148 \ + CALC_F2_PRE(0x230,SI,DI,BX) \ + PRECALC_36(Y13) \ + CALC_F2_POST(SI,AX,DX,BX) + +#define CALC_149 \ + CALC_F2_PRE(0x234,BX,SI,CX) \ + PRECALC_37(Y13) \ + CALC_F2_POST(BX,DI,AX,CX) + +#define CALC_150 \ + CALC_F2_PRE(0x238,CX,BX,DX) \ + CALC_F2_POST(CX,SI,DI,DX) + +#define CALC_151 \ + CALC_F2_PRE(0x23c,DX,CX,AX) \ + PRECALC_39(Y13,0x60,0x240) \ + CALC_F2_POST(DX,BX,SI,AX) + +#define CALC_152 \ + CALC_F2_PRE(0x250,AX,DX,DI) \ + PRECALC_32(Y14,Y13) \ + CALC_F2_POST(AX,CX,BX,DI) + +#define CALC_153 \ + CALC_F2_PRE(0x254,DI,AX,SI) \ + PRECALC_33(Y8,Y12) \ + CALC_F2_POST(DI,DX,CX,SI) + +#define CALC_154 \ + CALC_F2_PRE(0x258,SI,DI,BX) \ + PRECALC_34(Y3) \ + CALC_F2_POST(SI,AX,DX,BX) + +#define CALC_155 \ + CALC_F2_PRE(0x25c,BX,SI,CX) \ + PRECALC_35(Y12) \ + CALC_F2_POST(BX,DI,AX,CX) + +#define CALC_156 \ + CALC_F2_PRE(0x270,CX,BX,DX) \ + PRECALC_36(Y12) \ + CALC_F2_POST(CX,SI,DI,DX) + +#define CALC_157 \ + CALC_F2_PRE(0x274,DX,CX,AX) \ + PRECALC_37(Y12) \ + CALC_F2_POST(DX,BX,SI,AX) + +#define CALC_158 \ + 
CALC_F2_PRE(0x278,AX,DX,DI) \ + CALC_F2_POST(AX,CX,BX,DI) + +#define CALC_159 \ + ADDL 0x27c(R15),SI \ + LEAL (SI)(AX*1), SI \ + RORXL $0x1b, DI, R12 \ + PRECALC_39(Y12,0x60,0x260) \ + ADDL R12, SI + + + +#define CALC \ + MOVL (R9), CX \ + MOVL 4(R9), SI \ + MOVL 8(R9), DI \ + MOVL 12(R9), AX \ + MOVL 16(R9), DX \ + MOVQ SP, R14 \ + LEAQ (2*4*80+32)(SP), R15 \ + PRECALC \ // Precalc WK for first 2 blocks + XCHGQ R15, R14 \ +loop: \ // this loop is unrolled + CMPQ R10, R8 \ // we use R8 value (set below) as a signal of the last block + JNE begin \ + VZEROUPPER \ + RET \ +begin: \ + CALC_0 \ + CALC_1 \ + CALC_2 \ + CALC_3 \ + CALC_4 \ + CALC_5 \ + CALC_6 \ + CALC_7 \ + CALC_8 \ + CALC_9 \ + CALC_10 \ + CALC_11 \ + CALC_12 \ + CALC_13 \ + CALC_14 \ + CALC_15 \ + CALC_16 \ + CALC_17 \ + CALC_18 \ + CALC_19 \ + CALC_20 \ + CALC_21 \ + CALC_22 \ + CALC_23 \ + CALC_24 \ + CALC_25 \ + CALC_26 \ + CALC_27 \ + CALC_28 \ + CALC_29 \ + CALC_30 \ + CALC_31 \ + CALC_32 \ + CALC_33 \ + CALC_34 \ + CALC_35 \ + CALC_36 \ + CALC_37 \ + CALC_38 \ + CALC_39 \ + CALC_40 \ + CALC_41 \ + CALC_42 \ + CALC_43 \ + CALC_44 \ + CALC_45 \ + CALC_46 \ + CALC_47 \ + CALC_48 \ + CALC_49 \ + CALC_50 \ + CALC_51 \ + CALC_52 \ + CALC_53 \ + CALC_54 \ + CALC_55 \ + CALC_56 \ + CALC_57 \ + CALC_58 \ + CALC_59 \ + ADDQ $128, R10 \ // move to next even-64-byte block + CMPQ R10, R11 \ // is current block the last one? + CMOVQCC R8, R10 \ // signal the last iteration smartly + CALC_60 \ + CALC_61 \ + CALC_62 \ + CALC_63 \ + CALC_64 \ + CALC_65 \ + CALC_66 \ + CALC_67 \ + CALC_68 \ + CALC_69 \ + CALC_70 \ + CALC_71 \ + CALC_72 \ + CALC_73 \ + CALC_74 \ + CALC_75 \ + CALC_76 \ + CALC_77 \ + CALC_78 \ + CALC_79 \ + UPDATE_HASH(AX,DX,BX,SI,DI) \ + CMPQ R10, R8 \ // is current block the last one? 
+ JE loop\ + MOVL DX, CX \ + CALC_80 \ + CALC_81 \ + CALC_82 \ + CALC_83 \ + CALC_84 \ + CALC_85 \ + CALC_86 \ + CALC_87 \ + CALC_88 \ + CALC_89 \ + CALC_90 \ + CALC_91 \ + CALC_92 \ + CALC_93 \ + CALC_94 \ + CALC_95 \ + CALC_96 \ + CALC_97 \ + CALC_98 \ + CALC_99 \ + CALC_100 \ + CALC_101 \ + CALC_102 \ + CALC_103 \ + CALC_104 \ + CALC_105 \ + CALC_106 \ + CALC_107 \ + CALC_108 \ + CALC_109 \ + CALC_110 \ + CALC_111 \ + CALC_112 \ + CALC_113 \ + CALC_114 \ + CALC_115 \ + CALC_116 \ + CALC_117 \ + CALC_118 \ + CALC_119 \ + CALC_120 \ + CALC_121 \ + CALC_122 \ + CALC_123 \ + CALC_124 \ + CALC_125 \ + CALC_126 \ + CALC_127 \ + CALC_128 \ + CALC_129 \ + CALC_130 \ + CALC_131 \ + CALC_132 \ + CALC_133 \ + CALC_134 \ + CALC_135 \ + CALC_136 \ + CALC_137 \ + CALC_138 \ + CALC_139 \ + ADDQ $128, R13 \ //move to next even-64-byte block + CMPQ R13, R11 \ //is current block the last one? + CMOVQCC R8, R10 \ + CALC_140 \ + CALC_141 \ + CALC_142 \ + CALC_143 \ + CALC_144 \ + CALC_145 \ + CALC_146 \ + CALC_147 \ + CALC_148 \ + CALC_149 \ + CALC_150 \ + CALC_151 \ + CALC_152 \ + CALC_153 \ + CALC_154 \ + CALC_155 \ + CALC_156 \ + CALC_157 \ + CALC_158 \ + CALC_159 \ + UPDATE_HASH(SI,DI,DX,CX,BX) \ + MOVL SI, R12 \ //Reset state for AVX2 reg permutation + MOVL DI, SI \ + MOVL DX, DI \ + MOVL BX, DX \ + MOVL CX, AX \ + MOVL R12, CX \ + XCHGQ R15, R14 \ + JMP loop + + + +TEXT ·blockAVX2(SB),$1408-32 + + MOVQ dig+0(FP), DI + MOVQ p_base+8(FP), SI + MOVQ p_len+16(FP), DX + SHRQ $6, DX + SHLQ $6, DX + + MOVQ $K_XMM_AR<>(SB), R8 + + MOVQ DI, R9 + MOVQ SI, R10 + LEAQ 64(SI), R13 + + ADDQ SI, DX + ADDQ $64, DX + MOVQ DX, R11 + + CMPQ R13, R11 + CMOVQCC R8, R13 + + VMOVDQU BSWAP_SHUFB_CTL<>(SB), Y10 + + CALC // RET is inside macros + +DATA K_XMM_AR<>+0x00(SB)/4,$0x5a827999 +DATA K_XMM_AR<>+0x04(SB)/4,$0x5a827999 +DATA K_XMM_AR<>+0x08(SB)/4,$0x5a827999 +DATA K_XMM_AR<>+0x0c(SB)/4,$0x5a827999 +DATA K_XMM_AR<>+0x10(SB)/4,$0x5a827999 +DATA K_XMM_AR<>+0x14(SB)/4,$0x5a827999 +DATA 
K_XMM_AR<>+0x18(SB)/4,$0x5a827999 +DATA K_XMM_AR<>+0x1c(SB)/4,$0x5a827999 +DATA K_XMM_AR<>+0x20(SB)/4,$0x6ed9eba1 +DATA K_XMM_AR<>+0x24(SB)/4,$0x6ed9eba1 +DATA K_XMM_AR<>+0x28(SB)/4,$0x6ed9eba1 +DATA K_XMM_AR<>+0x2c(SB)/4,$0x6ed9eba1 +DATA K_XMM_AR<>+0x30(SB)/4,$0x6ed9eba1 +DATA K_XMM_AR<>+0x34(SB)/4,$0x6ed9eba1 +DATA K_XMM_AR<>+0x38(SB)/4,$0x6ed9eba1 +DATA K_XMM_AR<>+0x3c(SB)/4,$0x6ed9eba1 +DATA K_XMM_AR<>+0x40(SB)/4,$0x8f1bbcdc +DATA K_XMM_AR<>+0x44(SB)/4,$0x8f1bbcdc +DATA K_XMM_AR<>+0x48(SB)/4,$0x8f1bbcdc +DATA K_XMM_AR<>+0x4c(SB)/4,$0x8f1bbcdc +DATA K_XMM_AR<>+0x50(SB)/4,$0x8f1bbcdc +DATA K_XMM_AR<>+0x54(SB)/4,$0x8f1bbcdc +DATA K_XMM_AR<>+0x58(SB)/4,$0x8f1bbcdc +DATA K_XMM_AR<>+0x5c(SB)/4,$0x8f1bbcdc +DATA K_XMM_AR<>+0x60(SB)/4,$0xca62c1d6 +DATA K_XMM_AR<>+0x64(SB)/4,$0xca62c1d6 +DATA K_XMM_AR<>+0x68(SB)/4,$0xca62c1d6 +DATA K_XMM_AR<>+0x6c(SB)/4,$0xca62c1d6 +DATA K_XMM_AR<>+0x70(SB)/4,$0xca62c1d6 +DATA K_XMM_AR<>+0x74(SB)/4,$0xca62c1d6 +DATA K_XMM_AR<>+0x78(SB)/4,$0xca62c1d6 +DATA K_XMM_AR<>+0x7c(SB)/4,$0xca62c1d6 +GLOBL K_XMM_AR<>(SB),RODATA,$128 + +DATA BSWAP_SHUFB_CTL<>+0x00(SB)/4,$0x00010203 +DATA BSWAP_SHUFB_CTL<>+0x04(SB)/4,$0x04050607 +DATA BSWAP_SHUFB_CTL<>+0x08(SB)/4,$0x08090a0b +DATA BSWAP_SHUFB_CTL<>+0x0c(SB)/4,$0x0c0d0e0f +DATA BSWAP_SHUFB_CTL<>+0x10(SB)/4,$0x00010203 +DATA BSWAP_SHUFB_CTL<>+0x14(SB)/4,$0x04050607 +DATA BSWAP_SHUFB_CTL<>+0x18(SB)/4,$0x08090a0b +DATA BSWAP_SHUFB_CTL<>+0x1c(SB)/4,$0x0c0d0e0f +GLOBL BSWAP_SHUFB_CTL<>(SB),RODATA,$32 diff --git a/src/crypto/sha1/sha1block_arm.s b/src/crypto/sha1/sha1block_arm.s new file mode 100644 index 0000000..2236533 --- /dev/null +++ b/src/crypto/sha1/sha1block_arm.s @@ -0,0 +1,217 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// +// ARM version of sha1block.go + +#include "textflag.h" + +// SHA-1 block routine. See sha1block.go for Go equivalent. 
+// +// There are 80 rounds of 4 types: +// - rounds 0-15 are type 1 and load data (ROUND1 macro). +// - rounds 16-19 are type 1 and do not load data (ROUND1x macro). +// - rounds 20-39 are type 2 and do not load data (ROUND2 macro). +// - rounds 40-59 are type 3 and do not load data (ROUND3 macro). +// - rounds 60-79 are type 4 and do not load data (ROUND4 macro). +// +// Each round loads or shuffles the data, then computes a per-round +// function of b, c, d, and then mixes the result into and rotates the +// five registers a, b, c, d, e holding the intermediate results. +// +// The register rotation is implemented by rotating the arguments to +// the round macros instead of by explicit move instructions. + +// Register definitions +#define Rdata R0 // Pointer to incoming data +#define Rconst R1 // Current constant for SHA round +#define Ra R2 // SHA-1 accumulator +#define Rb R3 // SHA-1 accumulator +#define Rc R4 // SHA-1 accumulator +#define Rd R5 // SHA-1 accumulator +#define Re R6 // SHA-1 accumulator +#define Rt0 R7 // Temporary +#define Rt1 R8 // Temporary +// r9, r10 are forbidden +// r11 is OK provided you check the assembler that no synthetic instructions use it +#define Rt2 R11 // Temporary +#define Rctr R12 // loop counter +#define Rw R14 // point to w buffer + +// func block(dig *digest, p []byte) +// 0(FP) is *digest +// 4(FP) is p.array (struct Slice) +// 8(FP) is p.len +//12(FP) is p.cap +// +// Stack frame +#define p_end end-4(SP) // pointer to the end of data +#define p_data data-8(SP) // current data pointer (unused?) +#define w_buf buf-(8+4*80)(SP) //80 words temporary buffer w uint32[80] +#define saved abcde-(8+4*80+4*5)(SP) // saved sha1 registers a,b,c,d,e - these must be last (unused?) 
+// Total size +4 for saved LR is 352 + + // w[i] = p[j]<<24 | p[j+1]<<16 | p[j+2]<<8 | p[j+3] + // e += w[i] +#define LOAD(Re) \ + MOVBU 2(Rdata), Rt0 ; \ + MOVBU 3(Rdata), Rt1 ; \ + MOVBU 1(Rdata), Rt2 ; \ + ORR Rt0<<8, Rt1, Rt0 ; \ + MOVBU.P 4(Rdata), Rt1 ; \ + ORR Rt2<<16, Rt0, Rt0 ; \ + ORR Rt1<<24, Rt0, Rt0 ; \ + MOVW.P Rt0, 4(Rw) ; \ + ADD Rt0, Re, Re + + // tmp := w[(i-3)&0xf] ^ w[(i-8)&0xf] ^ w[(i-14)&0xf] ^ w[(i)&0xf] + // w[i&0xf] = tmp<<1 | tmp>>(32-1) + // e += w[i&0xf] +#define SHUFFLE(Re) \ + MOVW (-16*4)(Rw), Rt0 ; \ + MOVW (-14*4)(Rw), Rt1 ; \ + MOVW (-8*4)(Rw), Rt2 ; \ + EOR Rt0, Rt1, Rt0 ; \ + MOVW (-3*4)(Rw), Rt1 ; \ + EOR Rt2, Rt0, Rt0 ; \ + EOR Rt0, Rt1, Rt0 ; \ + MOVW Rt0@>(32-1), Rt0 ; \ + MOVW.P Rt0, 4(Rw) ; \ + ADD Rt0, Re, Re + + // t1 = (b & c) | ((~b) & d) +#define FUNC1(Ra, Rb, Rc, Rd, Re) \ + MVN Rb, Rt1 ; \ + AND Rb, Rc, Rt0 ; \ + AND Rd, Rt1, Rt1 ; \ + ORR Rt0, Rt1, Rt1 + + // t1 = b ^ c ^ d +#define FUNC2(Ra, Rb, Rc, Rd, Re) \ + EOR Rb, Rc, Rt1 ; \ + EOR Rd, Rt1, Rt1 + + // t1 = (b & c) | (b & d) | (c & d) = + // t1 = (b & c) | ((b | c) & d) +#define FUNC3(Ra, Rb, Rc, Rd, Re) \ + ORR Rb, Rc, Rt0 ; \ + AND Rb, Rc, Rt1 ; \ + AND Rd, Rt0, Rt0 ; \ + ORR Rt0, Rt1, Rt1 + +#define FUNC4 FUNC2 + + // a5 := a<<5 | a>>(32-5) + // b = b<<30 | b>>(32-30) + // e = a5 + t1 + e + const +#define MIX(Ra, Rb, Rc, Rd, Re) \ + ADD Rt1, Re, Re ; \ + MOVW Rb@>(32-30), Rb ; \ + ADD Ra@>(32-5), Re, Re ; \ + ADD Rconst, Re, Re + +#define ROUND1(Ra, Rb, Rc, Rd, Re) \ + LOAD(Re) ; \ + FUNC1(Ra, Rb, Rc, Rd, Re) ; \ + MIX(Ra, Rb, Rc, Rd, Re) + +#define ROUND1x(Ra, Rb, Rc, Rd, Re) \ + SHUFFLE(Re) ; \ + FUNC1(Ra, Rb, Rc, Rd, Re) ; \ + MIX(Ra, Rb, Rc, Rd, Re) + +#define ROUND2(Ra, Rb, Rc, Rd, Re) \ + SHUFFLE(Re) ; \ + FUNC2(Ra, Rb, Rc, Rd, Re) ; \ + MIX(Ra, Rb, Rc, Rd, Re) + +#define ROUND3(Ra, Rb, Rc, Rd, Re) \ + SHUFFLE(Re) ; \ + FUNC3(Ra, Rb, Rc, Rd, Re) ; \ + MIX(Ra, Rb, Rc, Rd, Re) + +#define ROUND4(Ra, Rb, Rc, Rd, Re) \ + SHUFFLE(Re) ; \ + FUNC4(Ra, Rb, Rc, 
Rd, Re) ; \ + MIX(Ra, Rb, Rc, Rd, Re) + + +// func block(dig *digest, p []byte) +TEXT ·block(SB), 0, $352-16 + MOVW p+4(FP), Rdata // pointer to the data + MOVW p_len+8(FP), Rt0 // number of bytes + ADD Rdata, Rt0 + MOVW Rt0, p_end // pointer to end of data + + // Load up initial SHA-1 accumulator + MOVW dig+0(FP), Rt0 + MOVM.IA (Rt0), [Ra,Rb,Rc,Rd,Re] + +loop: + // Save registers at SP+4 onwards + MOVM.IB [Ra,Rb,Rc,Rd,Re], (R13) + + MOVW $w_buf, Rw + MOVW $0x5A827999, Rconst + MOVW $3, Rctr +loop1: ROUND1(Ra, Rb, Rc, Rd, Re) + ROUND1(Re, Ra, Rb, Rc, Rd) + ROUND1(Rd, Re, Ra, Rb, Rc) + ROUND1(Rc, Rd, Re, Ra, Rb) + ROUND1(Rb, Rc, Rd, Re, Ra) + SUB.S $1, Rctr + BNE loop1 + + ROUND1(Ra, Rb, Rc, Rd, Re) + ROUND1x(Re, Ra, Rb, Rc, Rd) + ROUND1x(Rd, Re, Ra, Rb, Rc) + ROUND1x(Rc, Rd, Re, Ra, Rb) + ROUND1x(Rb, Rc, Rd, Re, Ra) + + MOVW $0x6ED9EBA1, Rconst + MOVW $4, Rctr +loop2: ROUND2(Ra, Rb, Rc, Rd, Re) + ROUND2(Re, Ra, Rb, Rc, Rd) + ROUND2(Rd, Re, Ra, Rb, Rc) + ROUND2(Rc, Rd, Re, Ra, Rb) + ROUND2(Rb, Rc, Rd, Re, Ra) + SUB.S $1, Rctr + BNE loop2 + + MOVW $0x8F1BBCDC, Rconst + MOVW $4, Rctr +loop3: ROUND3(Ra, Rb, Rc, Rd, Re) + ROUND3(Re, Ra, Rb, Rc, Rd) + ROUND3(Rd, Re, Ra, Rb, Rc) + ROUND3(Rc, Rd, Re, Ra, Rb) + ROUND3(Rb, Rc, Rd, Re, Ra) + SUB.S $1, Rctr + BNE loop3 + + MOVW $0xCA62C1D6, Rconst + MOVW $4, Rctr +loop4: ROUND4(Ra, Rb, Rc, Rd, Re) + ROUND4(Re, Ra, Rb, Rc, Rd) + ROUND4(Rd, Re, Ra, Rb, Rc) + ROUND4(Rc, Rd, Re, Ra, Rb) + ROUND4(Rb, Rc, Rd, Re, Ra) + SUB.S $1, Rctr + BNE loop4 + + // Accumulate - restoring registers from SP+4 + MOVM.IB (R13), [Rt0,Rt1,Rt2,Rctr,Rw] + ADD Rt0, Ra + ADD Rt1, Rb + ADD Rt2, Rc + ADD Rctr, Rd + ADD Rw, Re + + MOVW p_end, Rt0 + CMP Rt0, Rdata + BLO loop + + // Save final SHA-1 accumulator + MOVW dig+0(FP), Rt0 + MOVM.IA [Ra,Rb,Rc,Rd,Re], (Rt0) + + RET diff --git a/src/crypto/sha1/sha1block_arm64.go b/src/crypto/sha1/sha1block_arm64.go new file mode 100644 index 0000000..08d3df0 --- /dev/null +++ b/src/crypto/sha1/sha1block_arm64.go @@ 
-0,0 +1,26 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package sha1 + +import "internal/cpu" + +var k = []uint32{ + 0x5A827999, + 0x6ED9EBA1, + 0x8F1BBCDC, + 0xCA62C1D6, +} + +//go:noescape +func sha1block(h []uint32, p []byte, k []uint32) + +func block(dig *digest, p []byte) { + if !cpu.ARM64.HasSHA1 { + blockGeneric(dig, p) + } else { + h := dig.h[:] + sha1block(h, p, k) + } +} diff --git a/src/crypto/sha1/sha1block_arm64.s b/src/crypto/sha1/sha1block_arm64.s new file mode 100644 index 0000000..d568384 --- /dev/null +++ b/src/crypto/sha1/sha1block_arm64.s @@ -0,0 +1,152 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +#define HASHUPDATECHOOSE \ + SHA1C V16.S4, V1, V2 \ + SHA1H V3, V1 \ + VMOV V2.B16, V3.B16 + +#define HASHUPDATEPARITY \ + SHA1P V16.S4, V1, V2 \ + SHA1H V3, V1 \ + VMOV V2.B16, V3.B16 + +#define HASHUPDATEMAJ \ + SHA1M V16.S4, V1, V2 \ + SHA1H V3, V1 \ + VMOV V2.B16, V3.B16 + +// func sha1block(h []uint32, p []byte, k []uint32) +TEXT ·sha1block(SB),NOSPLIT,$0 + MOVD h_base+0(FP), R0 // hash value first address + MOVD p_base+24(FP), R1 // message first address + MOVD k_base+48(FP), R2 // k constants first address + MOVD p_len+32(FP), R3 // message length + VLD1.P 16(R0), [V0.S4] + FMOVS (R0), F20 + SUB $16, R0, R0 + +blockloop: + + VLD1.P 16(R1), [V4.B16] // load message + VLD1.P 16(R1), [V5.B16] + VLD1.P 16(R1), [V6.B16] + VLD1.P 16(R1), [V7.B16] + VLD1 (R2), [V19.S4] // load the four round constants k[0..3] (one per lane; broadcast by the VDUPs) + VMOV V0.B16, V2.B16 + VMOV V20.S[0], V1 + VMOV V2.B16, V3.B16 + VDUP V19.S[0], V17.S4 + VREV32 V4.B16, V4.B16 // prepare for using message in Byte format + VREV32 V5.B16, V5.B16 + VREV32 V6.B16, V6.B16 + VREV32 V7.B16, V7.B16 + + + VDUP V19.S[1], V18.S4 + VADD V17.S4, V4.S4, V16.S4 + SHA1SU0 
V6.S4, V5.S4, V4.S4 + HASHUPDATECHOOSE + SHA1SU1 V7.S4, V4.S4 + + VADD V17.S4, V5.S4, V16.S4 + SHA1SU0 V7.S4, V6.S4, V5.S4 + HASHUPDATECHOOSE + SHA1SU1 V4.S4, V5.S4 + VADD V17.S4, V6.S4, V16.S4 + SHA1SU0 V4.S4, V7.S4, V6.S4 + HASHUPDATECHOOSE + SHA1SU1 V5.S4, V6.S4 + + VADD V17.S4, V7.S4, V16.S4 + SHA1SU0 V5.S4, V4.S4, V7.S4 + HASHUPDATECHOOSE + SHA1SU1 V6.S4, V7.S4 + + VADD V17.S4, V4.S4, V16.S4 + SHA1SU0 V6.S4, V5.S4, V4.S4 + HASHUPDATECHOOSE + SHA1SU1 V7.S4, V4.S4 + + VDUP V19.S[2], V17.S4 + VADD V18.S4, V5.S4, V16.S4 + SHA1SU0 V7.S4, V6.S4, V5.S4 + HASHUPDATEPARITY + SHA1SU1 V4.S4, V5.S4 + + VADD V18.S4, V6.S4, V16.S4 + SHA1SU0 V4.S4, V7.S4, V6.S4 + HASHUPDATEPARITY + SHA1SU1 V5.S4, V6.S4 + + VADD V18.S4, V7.S4, V16.S4 + SHA1SU0 V5.S4, V4.S4, V7.S4 + HASHUPDATEPARITY + SHA1SU1 V6.S4, V7.S4 + + VADD V18.S4, V4.S4, V16.S4 + SHA1SU0 V6.S4, V5.S4, V4.S4 + HASHUPDATEPARITY + SHA1SU1 V7.S4, V4.S4 + + VADD V18.S4, V5.S4, V16.S4 + SHA1SU0 V7.S4, V6.S4, V5.S4 + HASHUPDATEPARITY + SHA1SU1 V4.S4, V5.S4 + + VDUP V19.S[3], V18.S4 + VADD V17.S4, V6.S4, V16.S4 + SHA1SU0 V4.S4, V7.S4, V6.S4 + HASHUPDATEMAJ + SHA1SU1 V5.S4, V6.S4 + + VADD V17.S4, V7.S4, V16.S4 + SHA1SU0 V5.S4, V4.S4, V7.S4 + HASHUPDATEMAJ + SHA1SU1 V6.S4, V7.S4 + + VADD V17.S4, V4.S4, V16.S4 + SHA1SU0 V6.S4, V5.S4, V4.S4 + HASHUPDATEMAJ + SHA1SU1 V7.S4, V4.S4 + + VADD V17.S4, V5.S4, V16.S4 + SHA1SU0 V7.S4, V6.S4, V5.S4 + HASHUPDATEMAJ + SHA1SU1 V4.S4, V5.S4 + + VADD V17.S4, V6.S4, V16.S4 + SHA1SU0 V4.S4, V7.S4, V6.S4 + HASHUPDATEMAJ + SHA1SU1 V5.S4, V6.S4 + + VADD V18.S4, V7.S4, V16.S4 + SHA1SU0 V5.S4, V4.S4, V7.S4 + HASHUPDATEPARITY + SHA1SU1 V6.S4, V7.S4 + + VADD V18.S4, V4.S4, V16.S4 + HASHUPDATEPARITY + + VADD V18.S4, V5.S4, V16.S4 + HASHUPDATEPARITY + + VADD V18.S4, V6.S4, V16.S4 + HASHUPDATEPARITY + + VADD V18.S4, V7.S4, V16.S4 + HASHUPDATEPARITY + + SUB $64, R3, R3 // remaining message length -= 64 bytes; the CBNZ below loops until it reaches zero + VADD V2.S4, V0.S4, V0.S4 + VADD V1.S4, V20.S4, V20.S4 + CBNZ R3, blockloop + 
+sha1ret: + + VST1.P [V0.S4], 16(R0) // store hash value H(dcba) + FMOVS F20, (R0) // store hash value H(e) + RET diff --git a/src/crypto/sha1/sha1block_decl.go b/src/crypto/sha1/sha1block_decl.go new file mode 100644 index 0000000..518a4b6 --- /dev/null +++ b/src/crypto/sha1/sha1block_decl.go @@ -0,0 +1,11 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build arm || 386 || s390x + +package sha1 + +//go:noescape + +func block(dig *digest, p []byte) diff --git a/src/crypto/sha1/sha1block_generic.go b/src/crypto/sha1/sha1block_generic.go new file mode 100644 index 0000000..ba35155 --- /dev/null +++ b/src/crypto/sha1/sha1block_generic.go @@ -0,0 +1,11 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !amd64 && !386 && !arm && !s390x && !arm64 + +package sha1 + +func block(dig *digest, p []byte) { + blockGeneric(dig, p) +} diff --git a/src/crypto/sha1/sha1block_s390x.go b/src/crypto/sha1/sha1block_s390x.go new file mode 100644 index 0000000..446bf5d --- /dev/null +++ b/src/crypto/sha1/sha1block_s390x.go @@ -0,0 +1,9 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package sha1 + +import "internal/cpu" + +var useAsm = cpu.S390X.HasSHA1 diff --git a/src/crypto/sha1/sha1block_s390x.s b/src/crypto/sha1/sha1block_s390x.s new file mode 100644 index 0000000..6ba6883 --- /dev/null +++ b/src/crypto/sha1/sha1block_s390x.s @@ -0,0 +1,20 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "textflag.h" + +// func block(dig *digest, p []byte) +TEXT ·block(SB), NOSPLIT|NOFRAME, $0-32 + MOVBZ ·useAsm(SB), R4 + LMG dig+0(FP), R1, R3 // R1 = dig, R2 = &p[0], R3 = len(p) + MOVBZ $1, R0 // SHA-1 function code + CMPBEQ R4, $0, generic + +loop: + WORD $0xB93E0002 // KIMD R2 + BVS loop // continue if interrupted + RET + +generic: + BR ·blockGeneric(SB) |