summaryrefslogtreecommitdiffstats
path: root/src/crypto/sha1
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-28 13:15:26 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-28 13:15:26 +0000
commit82539ad8d59729fb45b0bb0edda8f2bddb719eb1 (patch)
tree58f0b58e6f44f0e04d4a6373132cf426fa835fa7 /src/crypto/sha1
parentInitial commit. (diff)
downloadgolang-1.17-upstream.tar.xz
golang-1.17-upstream.zip
Adding upstream version 1.17.13.upstream/1.17.13upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/crypto/sha1')
-rw-r--r--src/crypto/sha1/example_test.go42
-rw-r--r--src/crypto/sha1/fallback_test.go35
-rw-r--r--src/crypto/sha1/issue15617_test.go29
-rw-r--r--src/crypto/sha1/sha1.go266
-rw-r--r--src/crypto/sha1/sha1_test.go240
-rw-r--r--src/crypto/sha1/sha1block.go83
-rw-r--r--src/crypto/sha1/sha1block_386.s233
-rw-r--r--src/crypto/sha1/sha1block_amd64.go34
-rw-r--r--src/crypto/sha1/sha1block_amd64.s1500
-rw-r--r--src/crypto/sha1/sha1block_arm.s217
-rw-r--r--src/crypto/sha1/sha1block_arm64.go26
-rw-r--r--src/crypto/sha1/sha1block_arm64.s152
-rw-r--r--src/crypto/sha1/sha1block_decl.go12
-rw-r--r--src/crypto/sha1/sha1block_generic.go10
-rw-r--r--src/crypto/sha1/sha1block_s390x.go9
-rw-r--r--src/crypto/sha1/sha1block_s390x.s20
16 files changed, 2908 insertions, 0 deletions
diff --git a/src/crypto/sha1/example_test.go b/src/crypto/sha1/example_test.go
new file mode 100644
index 0000000..499055c
--- /dev/null
+++ b/src/crypto/sha1/example_test.go
@@ -0,0 +1,42 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sha1_test
+
+import (
+ "crypto/sha1"
+ "fmt"
+ "io"
+ "log"
+ "os"
+)
+
+func ExampleNew() {
+ h := sha1.New()
+ io.WriteString(h, "His money is twice tainted:")
+ io.WriteString(h, " 'taint yours and 'taint mine.")
+ fmt.Printf("% x", h.Sum(nil))
+ // Output: 59 7f 6a 54 00 10 f9 4c 15 d7 18 06 a9 9a 2c 87 10 e7 47 bd
+}
+
+func ExampleSum() {
+ data := []byte("This page intentionally left blank.")
+ fmt.Printf("% x", sha1.Sum(data))
+ // Output: af 06 49 23 bb f2 30 15 96 aa c4 c2 73 ba 32 17 8e bc 4a 96
+}
+
+func ExampleNew_file() {
+ f, err := os.Open("file.txt")
+ if err != nil {
+ log.Fatal(err)
+ }
+ defer f.Close()
+
+ h := sha1.New()
+ if _, err := io.Copy(h, f); err != nil {
+ log.Fatal(err)
+ }
+
+ fmt.Printf("% x", h.Sum(nil))
+}
diff --git a/src/crypto/sha1/fallback_test.go b/src/crypto/sha1/fallback_test.go
new file mode 100644
index 0000000..4bb8b33
--- /dev/null
+++ b/src/crypto/sha1/fallback_test.go
@@ -0,0 +1,35 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build s390x
+// +build s390x
+
+package sha1
+
+import (
+ "fmt"
+ "io"
+ "testing"
+)
+
+// Tests the fallback code path in case the optimized asm
+// implementation cannot be used.
+// See also TestBlockGeneric.
+func TestGenericPath(t *testing.T) {
+ if useAsm == false {
+ t.Skipf("assembly implementation unavailable")
+ }
+ useAsm = false
+ defer func() { useAsm = true }()
+ c := New()
+ in := "ΑΒΓΔΕϜΖΗΘΙΚΛΜΝΞΟΠϺϘΡΣΤΥΦΧΨΩ"
+ gold := "0f58c2bb130f8182375f325c18342215255387e5"
+ if _, err := io.WriteString(c, in); err != nil {
+ t.Fatalf("could not write to c: %v", err)
+ }
+ out := fmt.Sprintf("%x", c.Sum(nil))
+ if out != gold {
+ t.Fatalf("mismatch: got %s, wanted %s", out, gold)
+ }
+}
diff --git a/src/crypto/sha1/issue15617_test.go b/src/crypto/sha1/issue15617_test.go
new file mode 100644
index 0000000..436f78c
--- /dev/null
+++ b/src/crypto/sha1/issue15617_test.go
@@ -0,0 +1,29 @@
+//go:build amd64 && (linux || darwin)
+// +build amd64
+// +build linux darwin
+
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sha1_test
+
+import (
+ "crypto/sha1"
+ "syscall"
+ "testing"
+)
+
+func TestOutOfBoundsRead(t *testing.T) {
+ const pageSize = 4 << 10
+ data, err := syscall.Mmap(0, 0, 2*pageSize, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_ANON|syscall.MAP_PRIVATE)
+ if err != nil {
+ panic(err)
+ }
+ if err := syscall.Mprotect(data[pageSize:], syscall.PROT_NONE); err != nil {
+ panic(err)
+ }
+ for i := 0; i < pageSize; i++ {
+ sha1.Sum(data[pageSize-i : pageSize])
+ }
+}
diff --git a/src/crypto/sha1/sha1.go b/src/crypto/sha1/sha1.go
new file mode 100644
index 0000000..286a59d
--- /dev/null
+++ b/src/crypto/sha1/sha1.go
@@ -0,0 +1,266 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package sha1 implements the SHA-1 hash algorithm as defined in RFC 3174.
+//
+// SHA-1 is cryptographically broken and should not be used for secure
+// applications.
+package sha1
+
+import (
+ "crypto"
+ "encoding/binary"
+ "errors"
+ "hash"
+)
+
+func init() {
+ crypto.RegisterHash(crypto.SHA1, New)
+}
+
+// The size of a SHA-1 checksum in bytes.
+const Size = 20
+
+// The blocksize of SHA-1 in bytes.
+const BlockSize = 64
+
+const (
+ chunk = 64
+ init0 = 0x67452301
+ init1 = 0xEFCDAB89
+ init2 = 0x98BADCFE
+ init3 = 0x10325476
+ init4 = 0xC3D2E1F0
+)
+
+// digest represents the partial evaluation of a checksum.
+type digest struct {
+ h [5]uint32
+ x [chunk]byte
+ nx int
+ len uint64
+}
+
+const (
+ magic = "sha\x01"
+ marshaledSize = len(magic) + 5*4 + chunk + 8
+)
+
+func (d *digest) MarshalBinary() ([]byte, error) {
+ b := make([]byte, 0, marshaledSize)
+ b = append(b, magic...)
+ b = appendUint32(b, d.h[0])
+ b = appendUint32(b, d.h[1])
+ b = appendUint32(b, d.h[2])
+ b = appendUint32(b, d.h[3])
+ b = appendUint32(b, d.h[4])
+ b = append(b, d.x[:d.nx]...)
+ b = b[:len(b)+len(d.x)-int(d.nx)] // already zero
+ b = appendUint64(b, d.len)
+ return b, nil
+}
+
+func (d *digest) UnmarshalBinary(b []byte) error {
+ if len(b) < len(magic) || string(b[:len(magic)]) != magic {
+ return errors.New("crypto/sha1: invalid hash state identifier")
+ }
+ if len(b) != marshaledSize {
+ return errors.New("crypto/sha1: invalid hash state size")
+ }
+ b = b[len(magic):]
+ b, d.h[0] = consumeUint32(b)
+ b, d.h[1] = consumeUint32(b)
+ b, d.h[2] = consumeUint32(b)
+ b, d.h[3] = consumeUint32(b)
+ b, d.h[4] = consumeUint32(b)
+ b = b[copy(d.x[:], b):]
+ b, d.len = consumeUint64(b)
+ d.nx = int(d.len % chunk)
+ return nil
+}
+
+func appendUint64(b []byte, x uint64) []byte {
+ var a [8]byte
+ binary.BigEndian.PutUint64(a[:], x)
+ return append(b, a[:]...)
+}
+
+func appendUint32(b []byte, x uint32) []byte {
+ var a [4]byte
+ binary.BigEndian.PutUint32(a[:], x)
+ return append(b, a[:]...)
+}
+
+func consumeUint64(b []byte) ([]byte, uint64) {
+ _ = b[7]
+ x := uint64(b[7]) | uint64(b[6])<<8 | uint64(b[5])<<16 | uint64(b[4])<<24 |
+ uint64(b[3])<<32 | uint64(b[2])<<40 | uint64(b[1])<<48 | uint64(b[0])<<56
+ return b[8:], x
+}
+
+func consumeUint32(b []byte) ([]byte, uint32) {
+ _ = b[3]
+ x := uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24
+ return b[4:], x
+}
+
+func (d *digest) Reset() {
+ d.h[0] = init0
+ d.h[1] = init1
+ d.h[2] = init2
+ d.h[3] = init3
+ d.h[4] = init4
+ d.nx = 0
+ d.len = 0
+}
+
+// New returns a new hash.Hash computing the SHA1 checksum. The Hash also
+// implements encoding.BinaryMarshaler and encoding.BinaryUnmarshaler to
+// marshal and unmarshal the internal state of the hash.
+func New() hash.Hash {
+ d := new(digest)
+ d.Reset()
+ return d
+}
+
+func (d *digest) Size() int { return Size }
+
+func (d *digest) BlockSize() int { return BlockSize }
+
+func (d *digest) Write(p []byte) (nn int, err error) {
+ nn = len(p)
+ d.len += uint64(nn)
+ if d.nx > 0 {
+ n := copy(d.x[d.nx:], p)
+ d.nx += n
+ if d.nx == chunk {
+ block(d, d.x[:])
+ d.nx = 0
+ }
+ p = p[n:]
+ }
+ if len(p) >= chunk {
+ n := len(p) &^ (chunk - 1)
+ block(d, p[:n])
+ p = p[n:]
+ }
+ if len(p) > 0 {
+ d.nx = copy(d.x[:], p)
+ }
+ return
+}
+
+func (d *digest) Sum(in []byte) []byte {
+ // Make a copy of d so that caller can keep writing and summing.
+ d0 := *d
+ hash := d0.checkSum()
+ return append(in, hash[:]...)
+}
+
+func (d *digest) checkSum() [Size]byte {
+ len := d.len
+ // Padding. Add a 1 bit and 0 bits until 56 bytes mod 64.
+ var tmp [64]byte
+ tmp[0] = 0x80
+ if len%64 < 56 {
+ d.Write(tmp[0 : 56-len%64])
+ } else {
+ d.Write(tmp[0 : 64+56-len%64])
+ }
+
+ // Length in bits.
+ len <<= 3
+ binary.BigEndian.PutUint64(tmp[:], len)
+ d.Write(tmp[0:8])
+
+ if d.nx != 0 {
+ panic("d.nx != 0")
+ }
+
+ var digest [Size]byte
+
+ binary.BigEndian.PutUint32(digest[0:], d.h[0])
+ binary.BigEndian.PutUint32(digest[4:], d.h[1])
+ binary.BigEndian.PutUint32(digest[8:], d.h[2])
+ binary.BigEndian.PutUint32(digest[12:], d.h[3])
+ binary.BigEndian.PutUint32(digest[16:], d.h[4])
+
+ return digest
+}
+
+// ConstantTimeSum computes the same result of Sum() but in constant time
+func (d *digest) ConstantTimeSum(in []byte) []byte {
+ d0 := *d
+ hash := d0.constSum()
+ return append(in, hash[:]...)
+}
+
+func (d *digest) constSum() [Size]byte {
+ var length [8]byte
+ l := d.len << 3
+ for i := uint(0); i < 8; i++ {
+ length[i] = byte(l >> (56 - 8*i))
+ }
+
+ nx := byte(d.nx)
+ t := nx - 56 // if nx < 56 then the MSB of t is one
+ mask1b := byte(int8(t) >> 7) // mask1b is 0xFF iff one block is enough
+
+ separator := byte(0x80) // gets reset to 0x00 once used
+ for i := byte(0); i < chunk; i++ {
+ mask := byte(int8(i-nx) >> 7) // 0x00 after the end of data
+
+ // if we reached the end of the data, replace with 0x80 or 0x00
+ d.x[i] = (^mask & separator) | (mask & d.x[i])
+
+ // zero the separator once used
+ separator &= mask
+
+ if i >= 56 {
+ // we might have to write the length here if all fit in one block
+ d.x[i] |= mask1b & length[i-56]
+ }
+ }
+
+ // compress, and only keep the digest if all fit in one block
+ block(d, d.x[:])
+
+ var digest [Size]byte
+ for i, s := range d.h {
+ digest[i*4] = mask1b & byte(s>>24)
+ digest[i*4+1] = mask1b & byte(s>>16)
+ digest[i*4+2] = mask1b & byte(s>>8)
+ digest[i*4+3] = mask1b & byte(s)
+ }
+
+ for i := byte(0); i < chunk; i++ {
+ // second block, it's always past the end of data, might start with 0x80
+ if i < 56 {
+ d.x[i] = separator
+ separator = 0
+ } else {
+ d.x[i] = length[i-56]
+ }
+ }
+
+ // compress, and only keep the digest if we actually needed the second block
+ block(d, d.x[:])
+
+ for i, s := range d.h {
+ digest[i*4] |= ^mask1b & byte(s>>24)
+ digest[i*4+1] |= ^mask1b & byte(s>>16)
+ digest[i*4+2] |= ^mask1b & byte(s>>8)
+ digest[i*4+3] |= ^mask1b & byte(s)
+ }
+
+ return digest
+}
+
+// Sum returns the SHA-1 checksum of the data.
+func Sum(data []byte) [Size]byte {
+ var d digest
+ d.Reset()
+ d.Write(data)
+ return d.checkSum()
+}
diff --git a/src/crypto/sha1/sha1_test.go b/src/crypto/sha1/sha1_test.go
new file mode 100644
index 0000000..c3e6010
--- /dev/null
+++ b/src/crypto/sha1/sha1_test.go
@@ -0,0 +1,240 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// SHA-1 hash algorithm. See RFC 3174.
+
+package sha1
+
+import (
+ "bytes"
+ "crypto/rand"
+ "encoding"
+ "fmt"
+ "hash"
+ "io"
+ "testing"
+)
+
+type sha1Test struct {
+ out string
+ in string
+ halfState string // marshaled hash state after first half of in written, used by TestGoldenMarshal
+}
+
+var golden = []sha1Test{
+ {"76245dbf96f661bd221046197ab8b9f063f11bad", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n", "sha\x01\v\xa0)I\xdeq(8h\x9ev\xe5\x88[\xf8\x81\x17\xba4Daaaaaaaaaaaaaaaaaaaaaa\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x96"},
+ {"da39a3ee5e6b4b0d3255bfef95601890afd80709", "", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"},
+ {"86f7e437faa5a7fce15d1ddcb9eaeaea377667b8", "a", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"},
+ {"da23614e02469a0d7c7bd1bdab5c9c474b1904dc", "ab", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"},
+ {"a9993e364706816aba3e25717850c26c9cd0d89d", "abc", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"},
+ {"81fe8bfe87576c3ecb22426f8e57847382917acf", "abcd", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0ab\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"},
+ {"03de6c570bfe24bfc328ccd7ca46b76eadaf4334", "abcde", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0ab\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"},
+ {"1f8ac10f23c5b5bc1167bda84b833e5c057a77d2", "abcdef", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0abc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03"},
+ {"2fb5e13419fc89246865e7a324f476ec624e8740", "abcdefg", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0abc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03"},
+ {"425af12a0743502b322e93a015bcf868e324d56a", "abcdefgh", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0abcd\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04"},
+ {"c63b19f1e4c8b5f76b25c49b8b87f57d8e4872a1", "abcdefghi", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0abcd\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04"},
+ {"d68c19a0a345b7eab78d5e11e991c026ec60db63", "abcdefghij", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0abcde\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05"},
+ {"ebf81ddcbe5bf13aaabdc4d65354fdf2044f38a7", "Discard medicine more than two years old.", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0Discard medicine mor\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x14"},
+ {"e5dea09392dd886ca63531aaa00571dc07554bb6", "He who has a shady past knows that nice guys finish last.", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0He who has a shady past know\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1c"},
+ {"45988f7234467b94e3e9494434c96ee3609d8f8f", "I wouldn't marry him with a ten foot pole.", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0I wouldn't marry him \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x15"},
+ {"55dee037eb7460d5a692d1ce11330b260e40c988", "Free! Free!/A trip/to Mars/for 900/empty jars/Burma Shave", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0Free! Free!/A trip/to Mars/f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1c"},
+ {"b7bc5fb91080c7de6b582ea281f8a396d7c0aee8", "The days of the digital watch are numbered. -Tom Stoppard", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0The days of the digital watch\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1d"},
+ {"c3aed9358f7c77f523afe86135f06b95b3999797", "Nepal premier won't resign.", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0Nepal premier\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\r"},
+ {"6e29d302bf6e3a5e4305ff318d983197d6906bb9", "For every action there is an equal and opposite government program.", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0For every action there is an equa\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00!"},
+ {"597f6a540010f94c15d71806a99a2c8710e747bd", "His money is twice tainted: 'taint yours and 'taint mine.", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0His money is twice tainted: \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1c"},
+ {"6859733b2590a8a091cecf50086febc5ceef1e80", "There is no reason for any individual to have a computer in their home. -Ken Olsen, 1977", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0There is no reason for any individual to hav\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00,"},
+ {"514b2630ec089b8aee18795fc0cf1f4860cdacad", "It's a tiny change to the code and not completely disgusting. - Bob Manchek", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0It's a tiny change to the code and no\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00%"},
+ {"c5ca0d4a7b6676fc7aa72caa41cc3d5df567ed69", "size: a.out: bad magic", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0size: a.out\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\f"},
+ {"74c51fa9a04eadc8c1bbeaa7fc442f834b90a00a", "The major problem is with sendmail. -Mark Horton", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0The major problem is wit\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x18"},
+ {"0b4c4ce5f52c3ad2821852a8dc00217fa18b8b66", "Give me a rock, paper and scissors and I will move the world. CCFestoon", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0Give me a rock, paper and scissors a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00$"},
+ {"3ae7937dd790315beb0f48330e8642237c61550a", "If the enemy is within range, then so are you.", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0If the enemy is within \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x17"},
+ {"410a2b296df92b9a47412b13281df8f830a9f44b", "It's well we cannot hear the screams/That we create in others' dreams.", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0It's well we cannot hear the scream\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00#"},
+ {"841e7c85ca1adcddbdd0187f1289acb5c642f7f5", "You remind me of a TV show, but that's all right: I watch it anyway.", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0You remind me of a TV show, but th\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\""},
+ {"163173b825d03b952601376b25212df66763e1db", "C is as portable as Stonehedge!!", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0C is as portable\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10"},
+ {"32b0377f2687eb88e22106f133c586ab314d5279", "Even if I could be Shakespeare, I think I should still choose to be Faraday. - A. Huxley", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0Even if I could be Shakespeare, I think I sh\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00,"},
+ {"0885aaf99b569542fd165fa44e322718f4a984e0", "The fugacity of a constituent in a mixture of gases at a given temperature is proportional to its mole fraction. Lewis-Randall Rule", "sha\x01x}\xf4\r\xeb\xf2\x10\x87\xe8[\xb2JA$D\xb7\u063ax8em\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00B"},
+ {"6627d6904d71420b0bf3886ab629623538689f45", "How can you write a big system without C++? -Paul Glick", "sha\x01gE#\x01\xef\u036b\x89\x98\xba\xdc\xfe\x102Tv\xc3\xd2\xe1\xf0How can you write a big syst\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1c"},
+}
+
+func TestGolden(t *testing.T) {
+ for i := 0; i < len(golden); i++ {
+ g := golden[i]
+ s := fmt.Sprintf("%x", Sum([]byte(g.in)))
+ if s != g.out {
+ t.Fatalf("Sum function: sha1(%s) = %s want %s", g.in, s, g.out)
+ }
+ c := New()
+ for j := 0; j < 4; j++ {
+ var sum []byte
+ switch j {
+ case 0, 1:
+ io.WriteString(c, g.in)
+ sum = c.Sum(nil)
+ case 2:
+ io.WriteString(c, g.in[0:len(g.in)/2])
+ c.Sum(nil)
+ io.WriteString(c, g.in[len(g.in)/2:])
+ sum = c.Sum(nil)
+ case 3:
+ io.WriteString(c, g.in[0:len(g.in)/2])
+ c.(*digest).ConstantTimeSum(nil)
+ io.WriteString(c, g.in[len(g.in)/2:])
+ sum = c.(*digest).ConstantTimeSum(nil)
+ }
+ s := fmt.Sprintf("%x", sum)
+ if s != g.out {
+ t.Fatalf("sha1[%d](%s) = %s want %s", j, g.in, s, g.out)
+ }
+ c.Reset()
+ }
+ }
+}
+
+func TestGoldenMarshal(t *testing.T) {
+ h := New()
+ h2 := New()
+ for _, g := range golden {
+ h.Reset()
+ h2.Reset()
+
+ io.WriteString(h, g.in[:len(g.in)/2])
+
+ state, err := h.(encoding.BinaryMarshaler).MarshalBinary()
+ if err != nil {
+ t.Errorf("could not marshal: %v", err)
+ continue
+ }
+
+ if string(state) != g.halfState {
+ t.Errorf("sha1(%q) state = %+q, want %+q", g.in, state, g.halfState)
+ continue
+ }
+
+ if err := h2.(encoding.BinaryUnmarshaler).UnmarshalBinary(state); err != nil {
+ t.Errorf("could not unmarshal: %v", err)
+ continue
+ }
+
+ io.WriteString(h, g.in[len(g.in)/2:])
+ io.WriteString(h2, g.in[len(g.in)/2:])
+
+ if actual, actual2 := h.Sum(nil), h2.Sum(nil); !bytes.Equal(actual, actual2) {
+ t.Errorf("sha1(%q) = 0x%x != marshaled 0x%x", g.in, actual, actual2)
+ }
+ }
+}
+
+func TestSize(t *testing.T) {
+ c := New()
+ if got := c.Size(); got != Size {
+ t.Errorf("Size = %d; want %d", got, Size)
+ }
+}
+
+func TestBlockSize(t *testing.T) {
+ c := New()
+ if got := c.BlockSize(); got != BlockSize {
+ t.Errorf("BlockSize = %d; want %d", got, BlockSize)
+ }
+}
+
+// Tests that blockGeneric (pure Go) and block (in assembly for some architectures) match.
+func TestBlockGeneric(t *testing.T) {
+ for i := 1; i < 30; i++ { // arbitrary factor
+ gen, asm := New().(*digest), New().(*digest)
+ buf := make([]byte, BlockSize*i)
+ rand.Read(buf)
+ blockGeneric(gen, buf)
+ block(asm, buf)
+ if *gen != *asm {
+ t.Errorf("For %#v block and blockGeneric resulted in different states", buf)
+ }
+ }
+}
+
+// Tests for unmarshaling hashes that have hashed a large amount of data
+// The initial hash generation is omitted from the test, because it takes a long time.
+// The test contains some already-generated states, and their expected sums
+// Tests a problem that is outlined in GitHub issue #29543
+// The problem is triggered when an amount of data has been hashed for which
+// the data length has a 1 in the 32nd bit. When casted to int, this changes
+// the sign of the value, and causes the modulus operation to return a
+// different result.
+type unmarshalTest struct {
+ state string
+ sum string
+}
+
+var largeUnmarshalTests = []unmarshalTest{
+ // Data length: 7_102_415_735
+ {
+ state: "sha\x01\x13\xbc\xfe\x83\x8c\xbd\xdfP\x1f\xd8ڿ<\x9eji8t\xe1\xa5@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuv\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\xa7VCw",
+ sum: "bc6245c9959cc33e1c2592e5c9ea9b5d0431246c",
+ },
+ // Data length: 6_565_544_823
+ {
+ state: "sha\x01m;\x16\xa6R\xbe@\xa9nĈ\xf9S\x03\x00B\xc2\xdcv\xcf@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuv\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x87VCw",
+ sum: "8f2d1c0e4271768f35feb918bfe21ea1387a2072",
+ },
+}
+
+func safeSum(h hash.Hash) (sum []byte, err error) {
+ defer func() {
+ if r := recover(); r != nil {
+ err = fmt.Errorf("sum panic: %v", r)
+ }
+ }()
+
+ return h.Sum(nil), nil
+}
+
+func TestLargeHashes(t *testing.T) {
+ for i, test := range largeUnmarshalTests {
+
+ h := New()
+ if err := h.(encoding.BinaryUnmarshaler).UnmarshalBinary([]byte(test.state)); err != nil {
+ t.Errorf("test %d could not unmarshal: %v", i, err)
+ continue
+ }
+
+ sum, err := safeSum(h)
+ if err != nil {
+ t.Errorf("test %d could not sum: %v", i, err)
+ continue
+ }
+
+ if fmt.Sprintf("%x", sum) != test.sum {
+ t.Errorf("test %d sum mismatch: expect %s got %x", i, test.sum, sum)
+ }
+ }
+}
+
+var bench = New()
+var buf = make([]byte, 8192)
+
+func benchmarkSize(b *testing.B, size int) {
+ b.SetBytes(int64(size))
+ sum := make([]byte, bench.Size())
+ for i := 0; i < b.N; i++ {
+ bench.Reset()
+ bench.Write(buf[:size])
+ bench.Sum(sum[:0])
+ }
+}
+
+func BenchmarkHash8Bytes(b *testing.B) {
+ benchmarkSize(b, 8)
+}
+
+func BenchmarkHash320Bytes(b *testing.B) {
+ benchmarkSize(b, 320)
+}
+
+func BenchmarkHash1K(b *testing.B) {
+ benchmarkSize(b, 1024)
+}
+
+func BenchmarkHash8K(b *testing.B) {
+ benchmarkSize(b, 8192)
+}
diff --git a/src/crypto/sha1/sha1block.go b/src/crypto/sha1/sha1block.go
new file mode 100644
index 0000000..321d343
--- /dev/null
+++ b/src/crypto/sha1/sha1block.go
@@ -0,0 +1,83 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sha1
+
+import (
+ "math/bits"
+)
+
+const (
+ _K0 = 0x5A827999
+ _K1 = 0x6ED9EBA1
+ _K2 = 0x8F1BBCDC
+ _K3 = 0xCA62C1D6
+)
+
+// blockGeneric is a portable, pure Go version of the SHA-1 block step.
+// It's used by sha1block_generic.go and tests.
+func blockGeneric(dig *digest, p []byte) {
+ var w [16]uint32
+
+ h0, h1, h2, h3, h4 := dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4]
+ for len(p) >= chunk {
+ // Can interlace the computation of w with the
+ // rounds below if needed for speed.
+ for i := 0; i < 16; i++ {
+ j := i * 4
+ w[i] = uint32(p[j])<<24 | uint32(p[j+1])<<16 | uint32(p[j+2])<<8 | uint32(p[j+3])
+ }
+
+ a, b, c, d, e := h0, h1, h2, h3, h4
+
+ // Each of the four 20-iteration rounds
+ // differs only in the computation of f and
+ // the choice of K (_K0, _K1, etc).
+ i := 0
+ for ; i < 16; i++ {
+ f := b&c | (^b)&d
+ t := bits.RotateLeft32(a, 5) + f + e + w[i&0xf] + _K0
+ a, b, c, d, e = t, a, bits.RotateLeft32(b, 30), c, d
+ }
+ for ; i < 20; i++ {
+ tmp := w[(i-3)&0xf] ^ w[(i-8)&0xf] ^ w[(i-14)&0xf] ^ w[(i)&0xf]
+ w[i&0xf] = tmp<<1 | tmp>>(32-1)
+
+ f := b&c | (^b)&d
+ t := bits.RotateLeft32(a, 5) + f + e + w[i&0xf] + _K0
+ a, b, c, d, e = t, a, bits.RotateLeft32(b, 30), c, d
+ }
+ for ; i < 40; i++ {
+ tmp := w[(i-3)&0xf] ^ w[(i-8)&0xf] ^ w[(i-14)&0xf] ^ w[(i)&0xf]
+ w[i&0xf] = tmp<<1 | tmp>>(32-1)
+ f := b ^ c ^ d
+ t := bits.RotateLeft32(a, 5) + f + e + w[i&0xf] + _K1
+ a, b, c, d, e = t, a, bits.RotateLeft32(b, 30), c, d
+ }
+ for ; i < 60; i++ {
+ tmp := w[(i-3)&0xf] ^ w[(i-8)&0xf] ^ w[(i-14)&0xf] ^ w[(i)&0xf]
+ w[i&0xf] = tmp<<1 | tmp>>(32-1)
+ f := ((b | c) & d) | (b & c)
+ t := bits.RotateLeft32(a, 5) + f + e + w[i&0xf] + _K2
+ a, b, c, d, e = t, a, bits.RotateLeft32(b, 30), c, d
+ }
+ for ; i < 80; i++ {
+ tmp := w[(i-3)&0xf] ^ w[(i-8)&0xf] ^ w[(i-14)&0xf] ^ w[(i)&0xf]
+ w[i&0xf] = tmp<<1 | tmp>>(32-1)
+ f := b ^ c ^ d
+ t := bits.RotateLeft32(a, 5) + f + e + w[i&0xf] + _K3
+ a, b, c, d, e = t, a, bits.RotateLeft32(b, 30), c, d
+ }
+
+ h0 += a
+ h1 += b
+ h2 += c
+ h3 += d
+ h4 += e
+
+ p = p[chunk:]
+ }
+
+ dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4] = h0, h1, h2, h3, h4
+}
diff --git a/src/crypto/sha1/sha1block_386.s b/src/crypto/sha1/sha1block_386.s
new file mode 100644
index 0000000..34d023d
--- /dev/null
+++ b/src/crypto/sha1/sha1block_386.s
@@ -0,0 +1,233 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// SHA-1 block routine. See sha1block.go for Go equivalent.
+//
+// There are 80 rounds of 4 types:
+// - rounds 0-15 are type 1 and load data (ROUND1 macro).
+// - rounds 16-19 are type 1 and do not load data (ROUND1x macro).
+// - rounds 20-39 are type 2 and do not load data (ROUND2 macro).
+// - rounds 40-59 are type 3 and do not load data (ROUND3 macro).
+// - rounds 60-79 are type 4 and do not load data (ROUND4 macro).
+//
+// Each round loads or shuffles the data, then computes a per-round
+// function of b, c, d, and then mixes the result into and rotates the
+// five registers a, b, c, d, e holding the intermediate results.
+//
+// The register rotation is implemented by rotating the arguments to
+// the round macros instead of by explicit move instructions.
+
+// Like sha1block_amd64.s, but we keep the data and limit pointers on the stack.
+// To free up the word pointer (R10 on amd64, DI here), we add it to e during
+// LOAD/SHUFFLE instead of during MIX.
+//
+// The stack holds the intermediate word array - 16 uint32s - at 0(SP) up to 64(SP).
+// The saved a, b, c, d, e (R11 through R15 on amd64) are at 64(SP) up to 84(SP).
+// The saved limit pointer (DI on amd64) is at 84(SP).
+// The saved data pointer (SI on amd64) is at 88(SP).
+
+#define LOAD(index, e) \
+ MOVL 88(SP), SI; \
+ MOVL (index*4)(SI), DI; \
+ BSWAPL DI; \
+ MOVL DI, (index*4)(SP); \
+ ADDL DI, e
+
+#define SHUFFLE(index, e) \
+ MOVL (((index)&0xf)*4)(SP), DI; \
+ XORL (((index-3)&0xf)*4)(SP), DI; \
+ XORL (((index-8)&0xf)*4)(SP), DI; \
+ XORL (((index-14)&0xf)*4)(SP), DI; \
+ ROLL $1, DI; \
+ MOVL DI, (((index)&0xf)*4)(SP); \
+ ADDL DI, e
+
+#define FUNC1(a, b, c, d, e) \
+ MOVL d, DI; \
+ XORL c, DI; \
+ ANDL b, DI; \
+ XORL d, DI
+
+#define FUNC2(a, b, c, d, e) \
+ MOVL b, DI; \
+ XORL c, DI; \
+ XORL d, DI
+
+#define FUNC3(a, b, c, d, e) \
+ MOVL b, SI; \
+ ORL c, SI; \
+ ANDL d, SI; \
+ MOVL b, DI; \
+ ANDL c, DI; \
+ ORL SI, DI
+
+#define FUNC4 FUNC2
+
+#define MIX(a, b, c, d, e, const) \
+ ROLL $30, b; \
+ ADDL DI, e; \
+ MOVL a, SI; \
+ ROLL $5, SI; \
+ LEAL const(e)(SI*1), e
+
+#define ROUND1(a, b, c, d, e, index) \
+ LOAD(index, e); \
+ FUNC1(a, b, c, d, e); \
+ MIX(a, b, c, d, e, 0x5A827999)
+
+#define ROUND1x(a, b, c, d, e, index) \
+ SHUFFLE(index, e); \
+ FUNC1(a, b, c, d, e); \
+ MIX(a, b, c, d, e, 0x5A827999)
+
+#define ROUND2(a, b, c, d, e, index) \
+ SHUFFLE(index, e); \
+ FUNC2(a, b, c, d, e); \
+ MIX(a, b, c, d, e, 0x6ED9EBA1)
+
+#define ROUND3(a, b, c, d, e, index) \
+ SHUFFLE(index, e); \
+ FUNC3(a, b, c, d, e); \
+ MIX(a, b, c, d, e, 0x8F1BBCDC)
+
+#define ROUND4(a, b, c, d, e, index) \
+ SHUFFLE(index, e); \
+ FUNC4(a, b, c, d, e); \
+ MIX(a, b, c, d, e, 0xCA62C1D6)
+
+// func block(dig *digest, p []byte)
+TEXT ·block(SB),NOSPLIT,$92-16
+ MOVL dig+0(FP), BP
+ MOVL p+4(FP), SI
+ MOVL p_len+8(FP), DX
+ SHRL $6, DX
+ SHLL $6, DX
+
+ LEAL (SI)(DX*1), DI
+ MOVL (0*4)(BP), AX
+ MOVL (1*4)(BP), BX
+ MOVL (2*4)(BP), CX
+ MOVL (3*4)(BP), DX
+ MOVL (4*4)(BP), BP
+
+ CMPL SI, DI
+ JEQ end
+
+ MOVL DI, 84(SP)
+
+loop:
+ MOVL SI, 88(SP)
+
+ MOVL AX, 64(SP)
+ MOVL BX, 68(SP)
+ MOVL CX, 72(SP)
+ MOVL DX, 76(SP)
+ MOVL BP, 80(SP)
+
+ ROUND1(AX, BX, CX, DX, BP, 0)
+ ROUND1(BP, AX, BX, CX, DX, 1)
+ ROUND1(DX, BP, AX, BX, CX, 2)
+ ROUND1(CX, DX, BP, AX, BX, 3)
+ ROUND1(BX, CX, DX, BP, AX, 4)
+ ROUND1(AX, BX, CX, DX, BP, 5)
+ ROUND1(BP, AX, BX, CX, DX, 6)
+ ROUND1(DX, BP, AX, BX, CX, 7)
+ ROUND1(CX, DX, BP, AX, BX, 8)
+ ROUND1(BX, CX, DX, BP, AX, 9)
+ ROUND1(AX, BX, CX, DX, BP, 10)
+ ROUND1(BP, AX, BX, CX, DX, 11)
+ ROUND1(DX, BP, AX, BX, CX, 12)
+ ROUND1(CX, DX, BP, AX, BX, 13)
+ ROUND1(BX, CX, DX, BP, AX, 14)
+ ROUND1(AX, BX, CX, DX, BP, 15)
+
+ ROUND1x(BP, AX, BX, CX, DX, 16)
+ ROUND1x(DX, BP, AX, BX, CX, 17)
+ ROUND1x(CX, DX, BP, AX, BX, 18)
+ ROUND1x(BX, CX, DX, BP, AX, 19)
+
+ ROUND2(AX, BX, CX, DX, BP, 20)
+ ROUND2(BP, AX, BX, CX, DX, 21)
+ ROUND2(DX, BP, AX, BX, CX, 22)
+ ROUND2(CX, DX, BP, AX, BX, 23)
+ ROUND2(BX, CX, DX, BP, AX, 24)
+ ROUND2(AX, BX, CX, DX, BP, 25)
+ ROUND2(BP, AX, BX, CX, DX, 26)
+ ROUND2(DX, BP, AX, BX, CX, 27)
+ ROUND2(CX, DX, BP, AX, BX, 28)
+ ROUND2(BX, CX, DX, BP, AX, 29)
+ ROUND2(AX, BX, CX, DX, BP, 30)
+ ROUND2(BP, AX, BX, CX, DX, 31)
+ ROUND2(DX, BP, AX, BX, CX, 32)
+ ROUND2(CX, DX, BP, AX, BX, 33)
+ ROUND2(BX, CX, DX, BP, AX, 34)
+ ROUND2(AX, BX, CX, DX, BP, 35)
+ ROUND2(BP, AX, BX, CX, DX, 36)
+ ROUND2(DX, BP, AX, BX, CX, 37)
+ ROUND2(CX, DX, BP, AX, BX, 38)
+ ROUND2(BX, CX, DX, BP, AX, 39)
+
+ ROUND3(AX, BX, CX, DX, BP, 40)
+ ROUND3(BP, AX, BX, CX, DX, 41)
+ ROUND3(DX, BP, AX, BX, CX, 42)
+ ROUND3(CX, DX, BP, AX, BX, 43)
+ ROUND3(BX, CX, DX, BP, AX, 44)
+ ROUND3(AX, BX, CX, DX, BP, 45)
+ ROUND3(BP, AX, BX, CX, DX, 46)
+ ROUND3(DX, BP, AX, BX, CX, 47)
+ ROUND3(CX, DX, BP, AX, BX, 48)
+ ROUND3(BX, CX, DX, BP, AX, 49)
+ ROUND3(AX, BX, CX, DX, BP, 50)
+ ROUND3(BP, AX, BX, CX, DX, 51)
+ ROUND3(DX, BP, AX, BX, CX, 52)
+ ROUND3(CX, DX, BP, AX, BX, 53)
+ ROUND3(BX, CX, DX, BP, AX, 54)
+ ROUND3(AX, BX, CX, DX, BP, 55)
+ ROUND3(BP, AX, BX, CX, DX, 56)
+ ROUND3(DX, BP, AX, BX, CX, 57)
+ ROUND3(CX, DX, BP, AX, BX, 58)
+ ROUND3(BX, CX, DX, BP, AX, 59)
+
+ ROUND4(AX, BX, CX, DX, BP, 60)
+ ROUND4(BP, AX, BX, CX, DX, 61)
+ ROUND4(DX, BP, AX, BX, CX, 62)
+ ROUND4(CX, DX, BP, AX, BX, 63)
+ ROUND4(BX, CX, DX, BP, AX, 64)
+ ROUND4(AX, BX, CX, DX, BP, 65)
+ ROUND4(BP, AX, BX, CX, DX, 66)
+ ROUND4(DX, BP, AX, BX, CX, 67)
+ ROUND4(CX, DX, BP, AX, BX, 68)
+ ROUND4(BX, CX, DX, BP, AX, 69)
+ ROUND4(AX, BX, CX, DX, BP, 70)
+ ROUND4(BP, AX, BX, CX, DX, 71)
+ ROUND4(DX, BP, AX, BX, CX, 72)
+ ROUND4(CX, DX, BP, AX, BX, 73)
+ ROUND4(BX, CX, DX, BP, AX, 74)
+ ROUND4(AX, BX, CX, DX, BP, 75)
+ ROUND4(BP, AX, BX, CX, DX, 76)
+ ROUND4(DX, BP, AX, BX, CX, 77)
+ ROUND4(CX, DX, BP, AX, BX, 78)
+ ROUND4(BX, CX, DX, BP, AX, 79)
+
+ ADDL 64(SP), AX
+ ADDL 68(SP), BX
+ ADDL 72(SP), CX
+ ADDL 76(SP), DX
+ ADDL 80(SP), BP
+
+ MOVL 88(SP), SI
+ ADDL $64, SI
+ CMPL SI, 84(SP)
+ JB loop
+
+end:
+ MOVL dig+0(FP), DI
+ MOVL AX, (0*4)(DI)
+ MOVL BX, (1*4)(DI)
+ MOVL CX, (2*4)(DI)
+ MOVL DX, (3*4)(DI)
+ MOVL BP, (4*4)(DI)
+ RET
diff --git a/src/crypto/sha1/sha1block_amd64.go b/src/crypto/sha1/sha1block_amd64.go
new file mode 100644
index 0000000..039813d
--- /dev/null
+++ b/src/crypto/sha1/sha1block_amd64.go
@@ -0,0 +1,34 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sha1
+
+import "internal/cpu"
+
+//go:noescape
+func blockAVX2(dig *digest, p []byte)
+
+//go:noescape
+func blockAMD64(dig *digest, p []byte)
+
+var useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI1 && cpu.X86.HasBMI2
+
+func block(dig *digest, p []byte) {
+ if useAVX2 && len(p) >= 256 {
+ // blockAVX2 calculates sha1 for 2 block per iteration
+ // it also interleaves precalculation for next block.
+ // So it may read up-to 192 bytes past end of p
+ // We may add checks inside blockAVX2, but this will
+ // just turn it into a copy of blockAMD64,
+ // so call it directly, instead.
+ safeLen := len(p) - 128
+ if safeLen%128 != 0 {
+ safeLen -= 64
+ }
+ blockAVX2(dig, p[:safeLen])
+ blockAMD64(dig, p[safeLen:])
+ } else {
+ blockAMD64(dig, p)
+ }
+}
diff --git a/src/crypto/sha1/sha1block_amd64.s b/src/crypto/sha1/sha1block_amd64.s
new file mode 100644
index 0000000..42f03fb
--- /dev/null
+++ b/src/crypto/sha1/sha1block_amd64.s
@@ -0,0 +1,1500 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// AVX2 version by Intel, same algorithm as code in Linux kernel:
+// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha1_avx2_x86_64_asm.S
+// Authors:
+// Ilya Albrekht <ilya.albrekht@intel.com>
+// Maxim Locktyukhin <maxim.locktyukhin@intel.com>
+// Ronen Zohar <ronen.zohar@intel.com>
+// Chandramouli Narayanan <mouli@linux.intel.com>
+
+
+#include "textflag.h"
+
+// SHA-1 block routine. See sha1block.go for Go equivalent.
+//
+// There are 80 rounds of 4 types:
+// - rounds 0-15 are type 1 and load data (ROUND1 macro).
+// - rounds 16-19 are type 1 and do not load data (ROUND1x macro).
+// - rounds 20-39 are type 2 and do not load data (ROUND2 macro).
+// - rounds 40-59 are type 3 and do not load data (ROUND3 macro).
+// - rounds 60-79 are type 4 and do not load data (ROUND4 macro).
+//
+// Each round loads or shuffles the data, then computes a per-round
+// function of b, c, d, and then mixes the result into and rotates the
+// five registers a, b, c, d, e holding the intermediate results.
+//
+// The register rotation is implemented by rotating the arguments to
+// the round macros instead of by explicit move instructions.
+
+#define LOAD(index) \
+ MOVL (index*4)(SI), R10; \
+ BSWAPL R10; \
+ MOVL R10, (index*4)(SP)
+
+#define SHUFFLE(index) \
+ MOVL (((index)&0xf)*4)(SP), R10; \
+ XORL (((index-3)&0xf)*4)(SP), R10; \
+ XORL (((index-8)&0xf)*4)(SP), R10; \
+ XORL (((index-14)&0xf)*4)(SP), R10; \
+ ROLL $1, R10; \
+ MOVL R10, (((index)&0xf)*4)(SP)
+
+#define FUNC1(a, b, c, d, e) \
+ MOVL d, R9; \
+ XORL c, R9; \
+ ANDL b, R9; \
+ XORL d, R9
+
+#define FUNC2(a, b, c, d, e) \
+ MOVL b, R9; \
+ XORL c, R9; \
+ XORL d, R9
+
+#define FUNC3(a, b, c, d, e) \
+ MOVL b, R8; \
+ ORL c, R8; \
+ ANDL d, R8; \
+ MOVL b, R9; \
+ ANDL c, R9; \
+ ORL R8, R9
+
+#define FUNC4 FUNC2
+
+#define MIX(a, b, c, d, e, const) \
+ ROLL $30, b; \
+ ADDL R9, e; \
+ MOVL a, R8; \
+ ROLL $5, R8; \
+ LEAL const(e)(R10*1), e; \
+ ADDL R8, e
+
+#define ROUND1(a, b, c, d, e, index) \
+ LOAD(index); \
+ FUNC1(a, b, c, d, e); \
+ MIX(a, b, c, d, e, 0x5A827999)
+
+#define ROUND1x(a, b, c, d, e, index) \
+ SHUFFLE(index); \
+ FUNC1(a, b, c, d, e); \
+ MIX(a, b, c, d, e, 0x5A827999)
+
+#define ROUND2(a, b, c, d, e, index) \
+ SHUFFLE(index); \
+ FUNC2(a, b, c, d, e); \
+ MIX(a, b, c, d, e, 0x6ED9EBA1)
+
+#define ROUND3(a, b, c, d, e, index) \
+ SHUFFLE(index); \
+ FUNC3(a, b, c, d, e); \
+ MIX(a, b, c, d, e, 0x8F1BBCDC)
+
+#define ROUND4(a, b, c, d, e, index) \
+ SHUFFLE(index); \
+ FUNC4(a, b, c, d, e); \
+ MIX(a, b, c, d, e, 0xCA62C1D6)
+
+TEXT ·blockAMD64(SB),NOSPLIT,$64-32
+ MOVQ dig+0(FP), BP
+ MOVQ p_base+8(FP), SI
+ MOVQ p_len+16(FP), DX
+ SHRQ $6, DX
+ SHLQ $6, DX
+
+ LEAQ (SI)(DX*1), DI
+ MOVL (0*4)(BP), AX
+ MOVL (1*4)(BP), BX
+ MOVL (2*4)(BP), CX
+ MOVL (3*4)(BP), DX
+ MOVL (4*4)(BP), BP
+
+ CMPQ SI, DI
+ JEQ end
+
+loop:
+ MOVL AX, R11
+ MOVL BX, R12
+ MOVL CX, R13
+ MOVL DX, R14
+ MOVL BP, R15
+
+ ROUND1(AX, BX, CX, DX, BP, 0)
+ ROUND1(BP, AX, BX, CX, DX, 1)
+ ROUND1(DX, BP, AX, BX, CX, 2)
+ ROUND1(CX, DX, BP, AX, BX, 3)
+ ROUND1(BX, CX, DX, BP, AX, 4)
+ ROUND1(AX, BX, CX, DX, BP, 5)
+ ROUND1(BP, AX, BX, CX, DX, 6)
+ ROUND1(DX, BP, AX, BX, CX, 7)
+ ROUND1(CX, DX, BP, AX, BX, 8)
+ ROUND1(BX, CX, DX, BP, AX, 9)
+ ROUND1(AX, BX, CX, DX, BP, 10)
+ ROUND1(BP, AX, BX, CX, DX, 11)
+ ROUND1(DX, BP, AX, BX, CX, 12)
+ ROUND1(CX, DX, BP, AX, BX, 13)
+ ROUND1(BX, CX, DX, BP, AX, 14)
+ ROUND1(AX, BX, CX, DX, BP, 15)
+
+ ROUND1x(BP, AX, BX, CX, DX, 16)
+ ROUND1x(DX, BP, AX, BX, CX, 17)
+ ROUND1x(CX, DX, BP, AX, BX, 18)
+ ROUND1x(BX, CX, DX, BP, AX, 19)
+
+ ROUND2(AX, BX, CX, DX, BP, 20)
+ ROUND2(BP, AX, BX, CX, DX, 21)
+ ROUND2(DX, BP, AX, BX, CX, 22)
+ ROUND2(CX, DX, BP, AX, BX, 23)
+ ROUND2(BX, CX, DX, BP, AX, 24)
+ ROUND2(AX, BX, CX, DX, BP, 25)
+ ROUND2(BP, AX, BX, CX, DX, 26)
+ ROUND2(DX, BP, AX, BX, CX, 27)
+ ROUND2(CX, DX, BP, AX, BX, 28)
+ ROUND2(BX, CX, DX, BP, AX, 29)
+ ROUND2(AX, BX, CX, DX, BP, 30)
+ ROUND2(BP, AX, BX, CX, DX, 31)
+ ROUND2(DX, BP, AX, BX, CX, 32)
+ ROUND2(CX, DX, BP, AX, BX, 33)
+ ROUND2(BX, CX, DX, BP, AX, 34)
+ ROUND2(AX, BX, CX, DX, BP, 35)
+ ROUND2(BP, AX, BX, CX, DX, 36)
+ ROUND2(DX, BP, AX, BX, CX, 37)
+ ROUND2(CX, DX, BP, AX, BX, 38)
+ ROUND2(BX, CX, DX, BP, AX, 39)
+
+ ROUND3(AX, BX, CX, DX, BP, 40)
+ ROUND3(BP, AX, BX, CX, DX, 41)
+ ROUND3(DX, BP, AX, BX, CX, 42)
+ ROUND3(CX, DX, BP, AX, BX, 43)
+ ROUND3(BX, CX, DX, BP, AX, 44)
+ ROUND3(AX, BX, CX, DX, BP, 45)
+ ROUND3(BP, AX, BX, CX, DX, 46)
+ ROUND3(DX, BP, AX, BX, CX, 47)
+ ROUND3(CX, DX, BP, AX, BX, 48)
+ ROUND3(BX, CX, DX, BP, AX, 49)
+ ROUND3(AX, BX, CX, DX, BP, 50)
+ ROUND3(BP, AX, BX, CX, DX, 51)
+ ROUND3(DX, BP, AX, BX, CX, 52)
+ ROUND3(CX, DX, BP, AX, BX, 53)
+ ROUND3(BX, CX, DX, BP, AX, 54)
+ ROUND3(AX, BX, CX, DX, BP, 55)
+ ROUND3(BP, AX, BX, CX, DX, 56)
+ ROUND3(DX, BP, AX, BX, CX, 57)
+ ROUND3(CX, DX, BP, AX, BX, 58)
+ ROUND3(BX, CX, DX, BP, AX, 59)
+
+ ROUND4(AX, BX, CX, DX, BP, 60)
+ ROUND4(BP, AX, BX, CX, DX, 61)
+ ROUND4(DX, BP, AX, BX, CX, 62)
+ ROUND4(CX, DX, BP, AX, BX, 63)
+ ROUND4(BX, CX, DX, BP, AX, 64)
+ ROUND4(AX, BX, CX, DX, BP, 65)
+ ROUND4(BP, AX, BX, CX, DX, 66)
+ ROUND4(DX, BP, AX, BX, CX, 67)
+ ROUND4(CX, DX, BP, AX, BX, 68)
+ ROUND4(BX, CX, DX, BP, AX, 69)
+ ROUND4(AX, BX, CX, DX, BP, 70)
+ ROUND4(BP, AX, BX, CX, DX, 71)
+ ROUND4(DX, BP, AX, BX, CX, 72)
+ ROUND4(CX, DX, BP, AX, BX, 73)
+ ROUND4(BX, CX, DX, BP, AX, 74)
+ ROUND4(AX, BX, CX, DX, BP, 75)
+ ROUND4(BP, AX, BX, CX, DX, 76)
+ ROUND4(DX, BP, AX, BX, CX, 77)
+ ROUND4(CX, DX, BP, AX, BX, 78)
+ ROUND4(BX, CX, DX, BP, AX, 79)
+
+ ADDL R11, AX
+ ADDL R12, BX
+ ADDL R13, CX
+ ADDL R14, DX
+ ADDL R15, BP
+
+ ADDQ $64, SI
+ CMPQ SI, DI
+ JB loop
+
+end:
+ MOVQ dig+0(FP), DI
+ MOVL AX, (0*4)(DI)
+ MOVL BX, (1*4)(DI)
+ MOVL CX, (2*4)(DI)
+ MOVL DX, (3*4)(DI)
+ MOVL BP, (4*4)(DI)
+ RET
+
+
+// This is the implementation using AVX2, BMI1 and BMI2. It is based on:
+// "SHA-1 implementation with Intel(R) AVX2 instruction set extensions"
+// From http://software.intel.com/en-us/articles
+// (look for improving-the-performance-of-the-secure-hash-algorithm-1)
+// This implementation is 2x unrolled, and interleaves vector instructions,
+// used to precompute W, with scalar computation of current round
+// for optimal scheduling.
+
+// Trivial helper macros.
+#define UPDATE_HASH(A,TB,C,D,E) \
+ ADDL (R9), A \
+ MOVL A, (R9) \
+ ADDL 4(R9), TB \
+ MOVL TB, 4(R9) \
+ ADDL 8(R9), C \
+ MOVL C, 8(R9) \
+ ADDL 12(R9), D \
+ MOVL D, 12(R9) \
+ ADDL 16(R9), E \
+ MOVL E, 16(R9)
+
+
+
+// Helper macros for PRECALC, which does precomputations
+#define PRECALC_0(OFFSET) \
+ VMOVDQU OFFSET(R10),X0
+
+#define PRECALC_1(OFFSET) \
+ VINSERTI128 $1, OFFSET(R13), Y0, Y0
+
+#define PRECALC_2(YREG) \
+ VPSHUFB Y10, Y0, YREG
+
+#define PRECALC_4(YREG,K_OFFSET) \
+ VPADDD K_OFFSET(R8), YREG, Y0
+
+#define PRECALC_7(OFFSET) \
+ VMOVDQU Y0, (OFFSET*2)(R14)
+
+
+// Message scheduling pre-compute for rounds 0-15
+// R13 is a pointer to even 64-byte block
+// R10 is a pointer to odd 64-byte block
+// R14 is a pointer to temp buffer
+// X0 is used as temp register
+// YREG is clobbered as part of computation
+// OFFSET chooses 16 byte chunk within a block
+// R8 is a pointer to constants block
+// K_OFFSET chooses K constants relevant to this round
+// X10 holds swap mask
+#define PRECALC_00_15(OFFSET,YREG) \
+ PRECALC_0(OFFSET) \
+ PRECALC_1(OFFSET) \
+ PRECALC_2(YREG) \
+ PRECALC_4(YREG,0x0) \
+ PRECALC_7(OFFSET)
+
+
+// Helper macros for PRECALC_16_31
+#define PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \
+ VPALIGNR $8, REG_SUB_16, REG_SUB_12, REG \ // w[i-14]
+ VPSRLDQ $4, REG_SUB_4, Y0 // w[i-3]
+
+#define PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \
+ VPXOR REG_SUB_8, REG, REG \
+ VPXOR REG_SUB_16, Y0, Y0
+
+#define PRECALC_18(REG) \
+ VPXOR Y0, REG, REG \
+ VPSLLDQ $12, REG, Y9
+
+#define PRECALC_19(REG) \
+ VPSLLD $1, REG, Y0 \
+ VPSRLD $31, REG, REG
+
+#define PRECALC_20(REG) \
+ VPOR REG, Y0, Y0 \
+ VPSLLD $2, Y9, REG
+
+#define PRECALC_21(REG) \
+ VPSRLD $30, Y9, Y9 \
+ VPXOR REG, Y0, Y0
+
+#define PRECALC_23(REG,K_OFFSET,OFFSET) \
+ VPXOR Y9, Y0, REG \
+ VPADDD K_OFFSET(R8), REG, Y0 \
+ VMOVDQU Y0, (OFFSET)(R14)
+
+// Message scheduling pre-compute for rounds 16-31
+// calculating last 32 w[i] values in 8 XMM registers
+// pre-calculate K+w[i] values and store to mem
+// for later load by ALU add instruction.
+// "brute force" vectorization for rounds 16-31 only
+// due to w[i]->w[i-3] dependency.
+// clobbers 5 input ymm registers REG_SUB*
+// uses X0 and X9 as temp registers
+// As always, R8 is a pointer to constants block
+// and R14 is a pointer to temp buffer
+#define PRECALC_16_31(REG,REG_SUB_4,REG_SUB_8,REG_SUB_12,REG_SUB_16,K_OFFSET,OFFSET) \
+ PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \
+ PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \
+ PRECALC_18(REG) \
+ PRECALC_19(REG) \
+ PRECALC_20(REG) \
+ PRECALC_21(REG) \
+ PRECALC_23(REG,K_OFFSET,OFFSET)
+
+
+// Helper macros for PRECALC_32_79
+#define PRECALC_32(REG_SUB_8,REG_SUB_4) \
+ VPALIGNR $8, REG_SUB_8, REG_SUB_4, Y0
+
+#define PRECALC_33(REG_SUB_28,REG) \
+ VPXOR REG_SUB_28, REG, REG
+
+#define PRECALC_34(REG_SUB_16) \
+ VPXOR REG_SUB_16, Y0, Y0
+
+#define PRECALC_35(REG) \
+ VPXOR Y0, REG, REG
+
+#define PRECALC_36(REG) \
+ VPSLLD $2, REG, Y0
+
+#define PRECALC_37(REG) \
+ VPSRLD $30, REG, REG \
+ VPOR REG, Y0, REG
+
+#define PRECALC_39(REG,K_OFFSET,OFFSET) \
+ VPADDD K_OFFSET(R8), REG, Y0 \
+ VMOVDQU Y0, (OFFSET)(R14)
+
+// Message scheduling pre-compute for rounds 32-79
+// In SHA-1 specification we have:
+// w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
+// Which is the same as:
+// w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
+// This allows for more efficient vectorization,
+// since w[i]->w[i-3] dependency is broken
+#define PRECALC_32_79(REG,REG_SUB_4,REG_SUB_8,REG_SUB_16,REG_SUB_28,K_OFFSET,OFFSET) \
+ PRECALC_32(REG_SUB_8,REG_SUB_4) \
+ PRECALC_33(REG_SUB_28,REG) \
+ PRECALC_34(REG_SUB_16) \
+ PRECALC_35(REG) \
+ PRECALC_36(REG) \
+ PRECALC_37(REG) \
+ PRECALC_39(REG,K_OFFSET,OFFSET)
+
+#define PRECALC \
+ PRECALC_00_15(0,Y15) \
+ PRECALC_00_15(0x10,Y14) \
+ PRECALC_00_15(0x20,Y13) \
+ PRECALC_00_15(0x30,Y12) \
+ PRECALC_16_31(Y8,Y12,Y13,Y14,Y15,0,0x80) \
+ PRECALC_16_31(Y7,Y8,Y12,Y13,Y14,0x20,0xa0) \
+ PRECALC_16_31(Y5,Y7,Y8,Y12,Y13,0x20,0xc0) \
+ PRECALC_16_31(Y3,Y5,Y7,Y8,Y12,0x20,0xe0) \
+ PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x20,0x100) \
+ PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x20,0x120) \
+ PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x40,0x140) \
+ PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x40,0x160) \
+ PRECALC_32_79(Y8,Y12,Y13,Y15,Y7,0x40,0x180) \
+ PRECALC_32_79(Y7,Y8,Y12,Y14,Y5,0x40,0x1a0) \
+ PRECALC_32_79(Y5,Y7,Y8,Y13,Y3,0x40,0x1c0) \
+ PRECALC_32_79(Y3,Y5,Y7,Y12,Y15,0x60,0x1e0) \
+ PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x60,0x200) \
+ PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x60,0x220) \
+ PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x60,0x240) \
+ PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x60,0x260)
+
+// Macros calculating individual rounds have general form
+// CALC_ROUND_PRE + PRECALC_ROUND + CALC_ROUND_POST
+// CALC_ROUND_{PRE,POST} macros follow
+
+#define CALC_F1_PRE(OFFSET,REG_A,REG_B,REG_C,REG_E) \
+ ADDL OFFSET(R15),REG_E \
+ ANDNL REG_C,REG_A,BP \
+ LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round
+ RORXL $0x1b, REG_A, R12 \
+ RORXL $2, REG_A, REG_B // for next round
+
+// Calculate F for the next round
+#define CALC_F1_POST(REG_A,REG_B,REG_E) \
+ ANDL REG_B,REG_A \ // b&c
+ XORL BP, REG_A \ // F1 = (b&c) ^ (~b&d)
+ LEAL (REG_E)(R12*1), REG_E // E += A >>> 5
+
+
+// Registers are cyclically rotated DX -> AX -> DI -> SI -> BX -> CX
+#define CALC_0 \
+ MOVL SI, BX \ // Precalculating first round
+ RORXL $2, SI, SI \
+ ANDNL AX, BX, BP \
+ ANDL DI, BX \
+ XORL BP, BX \
+ CALC_F1_PRE(0x0,CX,BX,DI,DX) \
+ PRECALC_0(0x80) \
+ CALC_F1_POST(CX,SI,DX)
+
+#define CALC_1 \
+ CALC_F1_PRE(0x4,DX,CX,SI,AX) \
+ PRECALC_1(0x80) \
+ CALC_F1_POST(DX,BX,AX)
+
+#define CALC_2 \
+ CALC_F1_PRE(0x8,AX,DX,BX,DI) \
+ PRECALC_2(Y15) \
+ CALC_F1_POST(AX,CX,DI)
+
+#define CALC_3 \
+ CALC_F1_PRE(0xc,DI,AX,CX,SI) \
+ CALC_F1_POST(DI,DX,SI)
+
+#define CALC_4 \
+ CALC_F1_PRE(0x20,SI,DI,DX,BX) \
+ PRECALC_4(Y15,0x0) \
+ CALC_F1_POST(SI,AX,BX)
+
+#define CALC_5 \
+ CALC_F1_PRE(0x24,BX,SI,AX,CX) \
+ CALC_F1_POST(BX,DI,CX)
+
+#define CALC_6 \
+ CALC_F1_PRE(0x28,CX,BX,DI,DX) \
+ CALC_F1_POST(CX,SI,DX)
+
+#define CALC_7 \
+ CALC_F1_PRE(0x2c,DX,CX,SI,AX) \
+ PRECALC_7(0x0) \
+ CALC_F1_POST(DX,BX,AX)
+
+#define CALC_8 \
+ CALC_F1_PRE(0x40,AX,DX,BX,DI) \
+ PRECALC_0(0x90) \
+ CALC_F1_POST(AX,CX,DI)
+
+#define CALC_9 \
+ CALC_F1_PRE(0x44,DI,AX,CX,SI) \
+ PRECALC_1(0x90) \
+ CALC_F1_POST(DI,DX,SI)
+
+#define CALC_10 \
+ CALC_F1_PRE(0x48,SI,DI,DX,BX) \
+ PRECALC_2(Y14) \
+ CALC_F1_POST(SI,AX,BX)
+
+#define CALC_11 \
+ CALC_F1_PRE(0x4c,BX,SI,AX,CX) \
+ CALC_F1_POST(BX,DI,CX)
+
+#define CALC_12 \
+ CALC_F1_PRE(0x60,CX,BX,DI,DX) \
+ PRECALC_4(Y14,0x0) \
+ CALC_F1_POST(CX,SI,DX)
+
+#define CALC_13 \
+ CALC_F1_PRE(0x64,DX,CX,SI,AX) \
+ CALC_F1_POST(DX,BX,AX)
+
+#define CALC_14 \
+ CALC_F1_PRE(0x68,AX,DX,BX,DI) \
+ CALC_F1_POST(AX,CX,DI)
+
+#define CALC_15 \
+ CALC_F1_PRE(0x6c,DI,AX,CX,SI) \
+ PRECALC_7(0x10) \
+ CALC_F1_POST(DI,DX,SI)
+
+#define CALC_16 \
+ CALC_F1_PRE(0x80,SI,DI,DX,BX) \
+ PRECALC_0(0xa0) \
+ CALC_F1_POST(SI,AX,BX)
+
+#define CALC_17 \
+ CALC_F1_PRE(0x84,BX,SI,AX,CX) \
+ PRECALC_1(0xa0) \
+ CALC_F1_POST(BX,DI,CX)
+
+#define CALC_18 \
+ CALC_F1_PRE(0x88,CX,BX,DI,DX) \
+ PRECALC_2(Y13) \
+ CALC_F1_POST(CX,SI,DX)
+
+
+#define CALC_F2_PRE(OFFSET,REG_A,REG_B,REG_E) \
+ ADDL OFFSET(R15),REG_E \
+ LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round
+ RORXL $0x1b, REG_A, R12 \
+ RORXL $2, REG_A, REG_B // for next round
+
+#define CALC_F2_POST(REG_A,REG_B,REG_C,REG_E) \
+ XORL REG_B, REG_A \
+ ADDL R12, REG_E \
+ XORL REG_C, REG_A
+
+#define CALC_19 \
+ CALC_F2_PRE(0x8c,DX,CX,AX) \
+ CALC_F2_POST(DX,BX,SI,AX)
+
+#define CALC_20 \
+ CALC_F2_PRE(0xa0,AX,DX,DI) \
+ PRECALC_4(Y13,0x0) \
+ CALC_F2_POST(AX,CX,BX,DI)
+
+#define CALC_21 \
+ CALC_F2_PRE(0xa4,DI,AX,SI) \
+ CALC_F2_POST(DI,DX,CX,SI)
+
+#define CALC_22 \
+ CALC_F2_PRE(0xa8,SI,DI,BX) \
+ CALC_F2_POST(SI,AX,DX,BX)
+
+#define CALC_23 \
+ CALC_F2_PRE(0xac,BX,SI,CX) \
+ PRECALC_7(0x20) \
+ CALC_F2_POST(BX,DI,AX,CX)
+
+#define CALC_24 \
+ CALC_F2_PRE(0xc0,CX,BX,DX) \
+ PRECALC_0(0xb0) \
+ CALC_F2_POST(CX,SI,DI,DX)
+
+#define CALC_25 \
+ CALC_F2_PRE(0xc4,DX,CX,AX) \
+ PRECALC_1(0xb0) \
+ CALC_F2_POST(DX,BX,SI,AX)
+
+#define CALC_26 \
+ CALC_F2_PRE(0xc8,AX,DX,DI) \
+ PRECALC_2(Y12) \
+ CALC_F2_POST(AX,CX,BX,DI)
+
+#define CALC_27 \
+ CALC_F2_PRE(0xcc,DI,AX,SI) \
+ CALC_F2_POST(DI,DX,CX,SI)
+
+#define CALC_28 \
+ CALC_F2_PRE(0xe0,SI,DI,BX) \
+ PRECALC_4(Y12,0x0) \
+ CALC_F2_POST(SI,AX,DX,BX)
+
+#define CALC_29 \
+ CALC_F2_PRE(0xe4,BX,SI,CX) \
+ CALC_F2_POST(BX,DI,AX,CX)
+
+#define CALC_30 \
+ CALC_F2_PRE(0xe8,CX,BX,DX) \
+ CALC_F2_POST(CX,SI,DI,DX)
+
+#define CALC_31 \
+ CALC_F2_PRE(0xec,DX,CX,AX) \
+ PRECALC_7(0x30) \
+ CALC_F2_POST(DX,BX,SI,AX)
+
+#define CALC_32 \
+ CALC_F2_PRE(0x100,AX,DX,DI) \
+ PRECALC_16(Y15,Y14,Y12,Y8) \
+ CALC_F2_POST(AX,CX,BX,DI)
+
+#define CALC_33 \
+ CALC_F2_PRE(0x104,DI,AX,SI) \
+ PRECALC_17(Y15,Y13,Y8) \
+ CALC_F2_POST(DI,DX,CX,SI)
+
+#define CALC_34 \
+ CALC_F2_PRE(0x108,SI,DI,BX) \
+ PRECALC_18(Y8) \
+ CALC_F2_POST(SI,AX,DX,BX)
+
+#define CALC_35 \
+ CALC_F2_PRE(0x10c,BX,SI,CX) \
+ PRECALC_19(Y8) \
+ CALC_F2_POST(BX,DI,AX,CX)
+
+#define CALC_36 \
+ CALC_F2_PRE(0x120,CX,BX,DX) \
+ PRECALC_20(Y8) \
+ CALC_F2_POST(CX,SI,DI,DX)
+
+#define CALC_37 \
+ CALC_F2_PRE(0x124,DX,CX,AX) \
+ PRECALC_21(Y8) \
+ CALC_F2_POST(DX,BX,SI,AX)
+
+#define CALC_38 \
+ CALC_F2_PRE(0x128,AX,DX,DI) \
+ CALC_F2_POST(AX,CX,BX,DI)
+
+
+#define CALC_F3_PRE(OFFSET,REG_E) \
+ ADDL OFFSET(R15),REG_E
+
+#define CALC_F3_POST(REG_A,REG_B,REG_C,REG_E,REG_TB) \
+ LEAL (REG_E)(REG_TB*1), REG_E \ // Add F from the previous round
+ MOVL REG_B, BP \
+ ORL REG_A, BP \
+ RORXL $0x1b, REG_A, R12 \
+ RORXL $2, REG_A, REG_TB \
+ ANDL REG_C, BP \ // Calculate F for the next round
+ ANDL REG_B, REG_A \
+ ORL BP, REG_A \
+ ADDL R12, REG_E
+
+#define CALC_39 \
+ CALC_F3_PRE(0x12c,SI) \
+ PRECALC_23(Y8,0x0,0x80) \
+ CALC_F3_POST(DI,DX,CX,SI,AX)
+
+#define CALC_40 \
+ CALC_F3_PRE(0x140,BX) \
+ PRECALC_16(Y14,Y13,Y8,Y7) \
+ CALC_F3_POST(SI,AX,DX,BX,DI)
+
+#define CALC_41 \
+ CALC_F3_PRE(0x144,CX) \
+ PRECALC_17(Y14,Y12,Y7) \
+ CALC_F3_POST(BX,DI,AX,CX,SI)
+
+#define CALC_42 \
+ CALC_F3_PRE(0x148,DX) \
+ PRECALC_18(Y7) \
+ CALC_F3_POST(CX,SI,DI,DX,BX)
+
+#define CALC_43 \
+ CALC_F3_PRE(0x14c,AX) \
+ PRECALC_19(Y7) \
+ CALC_F3_POST(DX,BX,SI,AX,CX)
+
+#define CALC_44 \
+ CALC_F3_PRE(0x160,DI) \
+ PRECALC_20(Y7) \
+ CALC_F3_POST(AX,CX,BX,DI,DX)
+
+#define CALC_45 \
+ CALC_F3_PRE(0x164,SI) \
+ PRECALC_21(Y7) \
+ CALC_F3_POST(DI,DX,CX,SI,AX)
+
+#define CALC_46 \
+ CALC_F3_PRE(0x168,BX) \
+ CALC_F3_POST(SI,AX,DX,BX,DI)
+
+#define CALC_47 \
+ CALC_F3_PRE(0x16c,CX) \
+ VPXOR Y9, Y0, Y7 \
+ VPADDD 0x20(R8), Y7, Y0 \
+ VMOVDQU Y0, 0xa0(R14) \
+ CALC_F3_POST(BX,DI,AX,CX,SI)
+
+#define CALC_48 \
+ CALC_F3_PRE(0x180,DX) \
+ PRECALC_16(Y13,Y12,Y7,Y5) \
+ CALC_F3_POST(CX,SI,DI,DX,BX)
+
+#define CALC_49 \
+ CALC_F3_PRE(0x184,AX) \
+ PRECALC_17(Y13,Y8,Y5) \
+ CALC_F3_POST(DX,BX,SI,AX,CX)
+
+#define CALC_50 \
+ CALC_F3_PRE(0x188,DI) \
+ PRECALC_18(Y5) \
+ CALC_F3_POST(AX,CX,BX,DI,DX)
+
+#define CALC_51 \
+ CALC_F3_PRE(0x18c,SI) \
+ PRECALC_19(Y5) \
+ CALC_F3_POST(DI,DX,CX,SI,AX)
+
+#define CALC_52 \
+ CALC_F3_PRE(0x1a0,BX) \
+ PRECALC_20(Y5) \
+ CALC_F3_POST(SI,AX,DX,BX,DI)
+
+#define CALC_53 \
+ CALC_F3_PRE(0x1a4,CX) \
+ PRECALC_21(Y5) \
+ CALC_F3_POST(BX,DI,AX,CX,SI)
+
+#define CALC_54 \
+ CALC_F3_PRE(0x1a8,DX) \
+ CALC_F3_POST(CX,SI,DI,DX,BX)
+
+#define CALC_55 \
+ CALC_F3_PRE(0x1ac,AX) \
+ PRECALC_23(Y5,0x20,0xc0) \
+ CALC_F3_POST(DX,BX,SI,AX,CX)
+
+#define CALC_56 \
+ CALC_F3_PRE(0x1c0,DI) \
+ PRECALC_16(Y12,Y8,Y5,Y3) \
+ CALC_F3_POST(AX,CX,BX,DI,DX)
+
+#define CALC_57 \
+ CALC_F3_PRE(0x1c4,SI) \
+ PRECALC_17(Y12,Y7,Y3) \
+ CALC_F3_POST(DI,DX,CX,SI,AX)
+
+#define CALC_58 \
+ CALC_F3_PRE(0x1c8,BX) \
+ PRECALC_18(Y3) \
+ CALC_F3_POST(SI,AX,DX,BX,DI)
+
+#define CALC_59 \
+ CALC_F2_PRE(0x1cc,BX,SI,CX) \
+ PRECALC_19(Y3) \
+ CALC_F2_POST(BX,DI,AX,CX)
+
+#define CALC_60 \
+ CALC_F2_PRE(0x1e0,CX,BX,DX) \
+ PRECALC_20(Y3) \
+ CALC_F2_POST(CX,SI,DI,DX)
+
+#define CALC_61 \
+ CALC_F2_PRE(0x1e4,DX,CX,AX) \
+ PRECALC_21(Y3) \
+ CALC_F2_POST(DX,BX,SI,AX)
+
+#define CALC_62 \
+ CALC_F2_PRE(0x1e8,AX,DX,DI) \
+ CALC_F2_POST(AX,CX,BX,DI)
+
+#define CALC_63 \
+ CALC_F2_PRE(0x1ec,DI,AX,SI) \
+ PRECALC_23(Y3,0x20,0xe0) \
+ CALC_F2_POST(DI,DX,CX,SI)
+
+#define CALC_64 \
+ CALC_F2_PRE(0x200,SI,DI,BX) \
+ PRECALC_32(Y5,Y3) \
+ CALC_F2_POST(SI,AX,DX,BX)
+
+#define CALC_65 \
+ CALC_F2_PRE(0x204,BX,SI,CX) \
+ PRECALC_33(Y14,Y15) \
+ CALC_F2_POST(BX,DI,AX,CX)
+
+#define CALC_66 \
+ CALC_F2_PRE(0x208,CX,BX,DX) \
+ PRECALC_34(Y8) \
+ CALC_F2_POST(CX,SI,DI,DX)
+
+#define CALC_67 \
+ CALC_F2_PRE(0x20c,DX,CX,AX) \
+ PRECALC_35(Y15) \
+ CALC_F2_POST(DX,BX,SI,AX)
+
+#define CALC_68 \
+ CALC_F2_PRE(0x220,AX,DX,DI) \
+ PRECALC_36(Y15) \
+ CALC_F2_POST(AX,CX,BX,DI)
+
+#define CALC_69 \
+ CALC_F2_PRE(0x224,DI,AX,SI) \
+ PRECALC_37(Y15) \
+ CALC_F2_POST(DI,DX,CX,SI)
+
+#define CALC_70 \
+ CALC_F2_PRE(0x228,SI,DI,BX) \
+ CALC_F2_POST(SI,AX,DX,BX)
+
+#define CALC_71 \
+ CALC_F2_PRE(0x22c,BX,SI,CX) \
+ PRECALC_39(Y15,0x20,0x100) \
+ CALC_F2_POST(BX,DI,AX,CX)
+
+#define CALC_72 \
+ CALC_F2_PRE(0x240,CX,BX,DX) \
+ PRECALC_32(Y3,Y15) \
+ CALC_F2_POST(CX,SI,DI,DX)
+
+#define CALC_73 \
+ CALC_F2_PRE(0x244,DX,CX,AX) \
+ PRECALC_33(Y13,Y14) \
+ CALC_F2_POST(DX,BX,SI,AX)
+
+#define CALC_74 \
+ CALC_F2_PRE(0x248,AX,DX,DI) \
+ PRECALC_34(Y7) \
+ CALC_F2_POST(AX,CX,BX,DI)
+
+#define CALC_75 \
+ CALC_F2_PRE(0x24c,DI,AX,SI) \
+ PRECALC_35(Y14) \
+ CALC_F2_POST(DI,DX,CX,SI)
+
+#define CALC_76 \
+ CALC_F2_PRE(0x260,SI,DI,BX) \
+ PRECALC_36(Y14) \
+ CALC_F2_POST(SI,AX,DX,BX)
+
+#define CALC_77 \
+ CALC_F2_PRE(0x264,BX,SI,CX) \
+ PRECALC_37(Y14) \
+ CALC_F2_POST(BX,DI,AX,CX)
+
+#define CALC_78 \
+ CALC_F2_PRE(0x268,CX,BX,DX) \
+ CALC_F2_POST(CX,SI,DI,DX)
+
+#define CALC_79 \
+ ADDL 0x26c(R15), AX \
+ LEAL (AX)(CX*1), AX \
+ RORXL $0x1b, DX, R12 \
+ PRECALC_39(Y14,0x20,0x120) \
+ ADDL R12, AX
+
+// Similar to CALC_0
+#define CALC_80 \
+ MOVL CX, DX \
+ RORXL $2, CX, CX \
+ ANDNL SI, DX, BP \
+ ANDL BX, DX \
+ XORL BP, DX \
+ CALC_F1_PRE(0x10,AX,DX,BX,DI) \
+ PRECALC_32(Y15,Y14) \
+ CALC_F1_POST(AX,CX,DI)
+
+#define CALC_81 \
+ CALC_F1_PRE(0x14,DI,AX,CX,SI) \
+ PRECALC_33(Y12,Y13) \
+ CALC_F1_POST(DI,DX,SI)
+
+#define CALC_82 \
+ CALC_F1_PRE(0x18,SI,DI,DX,BX) \
+ PRECALC_34(Y5) \
+ CALC_F1_POST(SI,AX,BX)
+
+#define CALC_83 \
+ CALC_F1_PRE(0x1c,BX,SI,AX,CX) \
+ PRECALC_35(Y13) \
+ CALC_F1_POST(BX,DI,CX)
+
+#define CALC_84 \
+ CALC_F1_PRE(0x30,CX,BX,DI,DX) \
+ PRECALC_36(Y13) \
+ CALC_F1_POST(CX,SI,DX)
+
+#define CALC_85 \
+ CALC_F1_PRE(0x34,DX,CX,SI,AX) \
+ PRECALC_37(Y13) \
+ CALC_F1_POST(DX,BX,AX)
+
+#define CALC_86 \
+ CALC_F1_PRE(0x38,AX,DX,BX,DI) \
+ CALC_F1_POST(AX,CX,DI)
+
+#define CALC_87 \
+ CALC_F1_PRE(0x3c,DI,AX,CX,SI) \
+ PRECALC_39(Y13,0x40,0x140) \
+ CALC_F1_POST(DI,DX,SI)
+
+#define CALC_88 \
+ CALC_F1_PRE(0x50,SI,DI,DX,BX) \
+ PRECALC_32(Y14,Y13) \
+ CALC_F1_POST(SI,AX,BX)
+
+#define CALC_89 \
+ CALC_F1_PRE(0x54,BX,SI,AX,CX) \
+ PRECALC_33(Y8,Y12) \
+ CALC_F1_POST(BX,DI,CX)
+
+#define CALC_90 \
+ CALC_F1_PRE(0x58,CX,BX,DI,DX) \
+ PRECALC_34(Y3) \
+ CALC_F1_POST(CX,SI,DX)
+
+#define CALC_91 \
+ CALC_F1_PRE(0x5c,DX,CX,SI,AX) \
+ PRECALC_35(Y12) \
+ CALC_F1_POST(DX,BX,AX)
+
+#define CALC_92 \
+ CALC_F1_PRE(0x70,AX,DX,BX,DI) \
+ PRECALC_36(Y12) \
+ CALC_F1_POST(AX,CX,DI)
+
+#define CALC_93 \
+ CALC_F1_PRE(0x74,DI,AX,CX,SI) \
+ PRECALC_37(Y12) \
+ CALC_F1_POST(DI,DX,SI)
+
+#define CALC_94 \
+ CALC_F1_PRE(0x78,SI,DI,DX,BX) \
+ CALC_F1_POST(SI,AX,BX)
+
+#define CALC_95 \
+ CALC_F1_PRE(0x7c,BX,SI,AX,CX) \
+ PRECALC_39(Y12,0x40,0x160) \
+ CALC_F1_POST(BX,DI,CX)
+
+#define CALC_96 \
+ CALC_F1_PRE(0x90,CX,BX,DI,DX) \
+ PRECALC_32(Y13,Y12) \
+ CALC_F1_POST(CX,SI,DX)
+
+#define CALC_97 \
+ CALC_F1_PRE(0x94,DX,CX,SI,AX) \
+ PRECALC_33(Y7,Y8) \
+ CALC_F1_POST(DX,BX,AX)
+
+#define CALC_98 \
+ CALC_F1_PRE(0x98,AX,DX,BX,DI) \
+ PRECALC_34(Y15) \
+ CALC_F1_POST(AX,CX,DI)
+
+#define CALC_99 \
+ CALC_F2_PRE(0x9c,DI,AX,SI) \
+ PRECALC_35(Y8) \
+ CALC_F2_POST(DI,DX,CX,SI)
+
+#define CALC_100 \
+ CALC_F2_PRE(0xb0,SI,DI,BX) \
+ PRECALC_36(Y8) \
+ CALC_F2_POST(SI,AX,DX,BX)
+
+#define CALC_101 \
+ CALC_F2_PRE(0xb4,BX,SI,CX) \
+ PRECALC_37(Y8) \
+ CALC_F2_POST(BX,DI,AX,CX)
+
+#define CALC_102 \
+ CALC_F2_PRE(0xb8,CX,BX,DX) \
+ CALC_F2_POST(CX,SI,DI,DX)
+
+#define CALC_103 \
+ CALC_F2_PRE(0xbc,DX,CX,AX) \
+ PRECALC_39(Y8,0x40,0x180) \
+ CALC_F2_POST(DX,BX,SI,AX)
+
+#define CALC_104 \
+ CALC_F2_PRE(0xd0,AX,DX,DI) \
+ PRECALC_32(Y12,Y8) \
+ CALC_F2_POST(AX,CX,BX,DI)
+
+#define CALC_105 \
+ CALC_F2_PRE(0xd4,DI,AX,SI) \
+ PRECALC_33(Y5,Y7) \
+ CALC_F2_POST(DI,DX,CX,SI)
+
+#define CALC_106 \
+ CALC_F2_PRE(0xd8,SI,DI,BX) \
+ PRECALC_34(Y14) \
+ CALC_F2_POST(SI,AX,DX,BX)
+
+#define CALC_107 \
+ CALC_F2_PRE(0xdc,BX,SI,CX) \
+ PRECALC_35(Y7) \
+ CALC_F2_POST(BX,DI,AX,CX)
+
+#define CALC_108 \
+ CALC_F2_PRE(0xf0,CX,BX,DX) \
+ PRECALC_36(Y7) \
+ CALC_F2_POST(CX,SI,DI,DX)
+
+#define CALC_109 \
+ CALC_F2_PRE(0xf4,DX,CX,AX) \
+ PRECALC_37(Y7) \
+ CALC_F2_POST(DX,BX,SI,AX)
+
+#define CALC_110 \
+ CALC_F2_PRE(0xf8,AX,DX,DI) \
+ CALC_F2_POST(AX,CX,BX,DI)
+
+#define CALC_111 \
+ CALC_F2_PRE(0xfc,DI,AX,SI) \
+ PRECALC_39(Y7,0x40,0x1a0) \
+ CALC_F2_POST(DI,DX,CX,SI)
+
+#define CALC_112 \
+ CALC_F2_PRE(0x110,SI,DI,BX) \
+ PRECALC_32(Y8,Y7) \
+ CALC_F2_POST(SI,AX,DX,BX)
+
+#define CALC_113 \
+ CALC_F2_PRE(0x114,BX,SI,CX) \
+ PRECALC_33(Y3,Y5) \
+ CALC_F2_POST(BX,DI,AX,CX)
+
+#define CALC_114 \
+ CALC_F2_PRE(0x118,CX,BX,DX) \
+ PRECALC_34(Y13) \
+ CALC_F2_POST(CX,SI,DI,DX)
+
+#define CALC_115 \
+ CALC_F2_PRE(0x11c,DX,CX,AX) \
+ PRECALC_35(Y5) \
+ CALC_F2_POST(DX,BX,SI,AX)
+
+#define CALC_116 \
+ CALC_F2_PRE(0x130,AX,DX,DI) \
+ PRECALC_36(Y5) \
+ CALC_F2_POST(AX,CX,BX,DI)
+
+#define CALC_117 \
+ CALC_F2_PRE(0x134,DI,AX,SI) \
+ PRECALC_37(Y5) \
+ CALC_F2_POST(DI,DX,CX,SI)
+
+#define CALC_118 \
+ CALC_F2_PRE(0x138,SI,DI,BX) \
+ CALC_F2_POST(SI,AX,DX,BX)
+
+#define CALC_119 \
+ CALC_F3_PRE(0x13c,CX) \
+ PRECALC_39(Y5,0x40,0x1c0) \
+ CALC_F3_POST(BX,DI,AX,CX,SI)
+
+#define CALC_120 \
+ CALC_F3_PRE(0x150,DX) \
+ PRECALC_32(Y7,Y5) \
+ CALC_F3_POST(CX,SI,DI,DX,BX)
+
+#define CALC_121 \
+ CALC_F3_PRE(0x154,AX) \
+ PRECALC_33(Y15,Y3) \
+ CALC_F3_POST(DX,BX,SI,AX,CX)
+
+#define CALC_122 \
+ CALC_F3_PRE(0x158,DI) \
+ PRECALC_34(Y12) \
+ CALC_F3_POST(AX,CX,BX,DI,DX)
+
+#define CALC_123 \
+ CALC_F3_PRE(0x15c,SI) \
+ PRECALC_35(Y3) \
+ CALC_F3_POST(DI,DX,CX,SI,AX)
+
+#define CALC_124 \
+ CALC_F3_PRE(0x170,BX) \
+ PRECALC_36(Y3) \
+ CALC_F3_POST(SI,AX,DX,BX,DI)
+
+#define CALC_125 \
+ CALC_F3_PRE(0x174,CX) \
+ PRECALC_37(Y3) \
+ CALC_F3_POST(BX,DI,AX,CX,SI)
+
+#define CALC_126 \
+ CALC_F3_PRE(0x178,DX) \
+ CALC_F3_POST(CX,SI,DI,DX,BX)
+
+#define CALC_127 \
+ CALC_F3_PRE(0x17c,AX) \
+ PRECALC_39(Y3,0x60,0x1e0) \
+ CALC_F3_POST(DX,BX,SI,AX,CX)
+
+#define CALC_128 \
+ CALC_F3_PRE(0x190,DI) \
+ PRECALC_32(Y5,Y3) \
+ CALC_F3_POST(AX,CX,BX,DI,DX)
+
+#define CALC_129 \
+ CALC_F3_PRE(0x194,SI) \
+ PRECALC_33(Y14,Y15) \
+ CALC_F3_POST(DI,DX,CX,SI,AX)
+
+#define CALC_130 \
+ CALC_F3_PRE(0x198,BX) \
+ PRECALC_34(Y8) \
+ CALC_F3_POST(SI,AX,DX,BX,DI)
+
+#define CALC_131 \
+ CALC_F3_PRE(0x19c,CX) \
+ PRECALC_35(Y15) \
+ CALC_F3_POST(BX,DI,AX,CX,SI)
+
+#define CALC_132 \
+ CALC_F3_PRE(0x1b0,DX) \
+ PRECALC_36(Y15) \
+ CALC_F3_POST(CX,SI,DI,DX,BX)
+
+#define CALC_133 \
+ CALC_F3_PRE(0x1b4,AX) \
+ PRECALC_37(Y15) \
+ CALC_F3_POST(DX,BX,SI,AX,CX)
+
+#define CALC_134 \
+ CALC_F3_PRE(0x1b8,DI) \
+ CALC_F3_POST(AX,CX,BX,DI,DX)
+
+#define CALC_135 \
+ CALC_F3_PRE(0x1bc,SI) \
+ PRECALC_39(Y15,0x60,0x200) \
+ CALC_F3_POST(DI,DX,CX,SI,AX)
+
+#define CALC_136 \
+ CALC_F3_PRE(0x1d0,BX) \
+ PRECALC_32(Y3,Y15) \
+ CALC_F3_POST(SI,AX,DX,BX,DI)
+
+#define CALC_137 \
+ CALC_F3_PRE(0x1d4,CX) \
+ PRECALC_33(Y13,Y14) \
+ CALC_F3_POST(BX,DI,AX,CX,SI)
+
+#define CALC_138 \
+ CALC_F3_PRE(0x1d8,DX) \
+ PRECALC_34(Y7) \
+ CALC_F3_POST(CX,SI,DI,DX,BX)
+
+#define CALC_139 \
+ CALC_F2_PRE(0x1dc,DX,CX,AX) \
+ PRECALC_35(Y14) \
+ CALC_F2_POST(DX,BX,SI,AX)
+
+#define CALC_140 \
+ CALC_F2_PRE(0x1f0,AX,DX,DI) \
+ PRECALC_36(Y14) \
+ CALC_F2_POST(AX,CX,BX,DI)
+
+#define CALC_141 \
+ CALC_F2_PRE(0x1f4,DI,AX,SI) \
+ PRECALC_37(Y14) \
+ CALC_F2_POST(DI,DX,CX,SI)
+
+#define CALC_142 \
+ CALC_F2_PRE(0x1f8,SI,DI,BX) \
+ CALC_F2_POST(SI,AX,DX,BX)
+
+#define CALC_143 \
+ CALC_F2_PRE(0x1fc,BX,SI,CX) \
+ PRECALC_39(Y14,0x60,0x220) \
+ CALC_F2_POST(BX,DI,AX,CX)
+
+#define CALC_144 \
+ CALC_F2_PRE(0x210,CX,BX,DX) \
+ PRECALC_32(Y15,Y14) \
+ CALC_F2_POST(CX,SI,DI,DX)
+
+#define CALC_145 \
+ CALC_F2_PRE(0x214,DX,CX,AX) \
+ PRECALC_33(Y12,Y13) \
+ CALC_F2_POST(DX,BX,SI,AX)
+
+#define CALC_146 \
+ CALC_F2_PRE(0x218,AX,DX,DI) \
+ PRECALC_34(Y5) \
+ CALC_F2_POST(AX,CX,BX,DI)
+
+#define CALC_147 \
+ CALC_F2_PRE(0x21c,DI,AX,SI) \
+ PRECALC_35(Y13) \
+ CALC_F2_POST(DI,DX,CX,SI)
+
+#define CALC_148 \
+ CALC_F2_PRE(0x230,SI,DI,BX) \
+ PRECALC_36(Y13) \
+ CALC_F2_POST(SI,AX,DX,BX)
+
+#define CALC_149 \
+ CALC_F2_PRE(0x234,BX,SI,CX) \
+ PRECALC_37(Y13) \
+ CALC_F2_POST(BX,DI,AX,CX)
+
+#define CALC_150 \
+ CALC_F2_PRE(0x238,CX,BX,DX) \
+ CALC_F2_POST(CX,SI,DI,DX)
+
+#define CALC_151 \
+ CALC_F2_PRE(0x23c,DX,CX,AX) \
+ PRECALC_39(Y13,0x60,0x240) \
+ CALC_F2_POST(DX,BX,SI,AX)
+
+#define CALC_152 \
+ CALC_F2_PRE(0x250,AX,DX,DI) \
+ PRECALC_32(Y14,Y13) \
+ CALC_F2_POST(AX,CX,BX,DI)
+
+#define CALC_153 \
+ CALC_F2_PRE(0x254,DI,AX,SI) \
+ PRECALC_33(Y8,Y12) \
+ CALC_F2_POST(DI,DX,CX,SI)
+
+#define CALC_154 \
+ CALC_F2_PRE(0x258,SI,DI,BX) \
+ PRECALC_34(Y3) \
+ CALC_F2_POST(SI,AX,DX,BX)
+
+#define CALC_155 \
+ CALC_F2_PRE(0x25c,BX,SI,CX) \
+ PRECALC_35(Y12) \
+ CALC_F2_POST(BX,DI,AX,CX)
+
+#define CALC_156 \
+ CALC_F2_PRE(0x270,CX,BX,DX) \
+ PRECALC_36(Y12) \
+ CALC_F2_POST(CX,SI,DI,DX)
+
+#define CALC_157 \
+ CALC_F2_PRE(0x274,DX,CX,AX) \
+ PRECALC_37(Y12) \
+ CALC_F2_POST(DX,BX,SI,AX)
+
+#define CALC_158 \
+ CALC_F2_PRE(0x278,AX,DX,DI) \
+ CALC_F2_POST(AX,CX,BX,DI)
+
+#define CALC_159 \
+ ADDL 0x27c(R15),SI \
+ LEAL (SI)(AX*1), SI \
+ RORXL $0x1b, DI, R12 \
+ PRECALC_39(Y12,0x60,0x260) \
+ ADDL R12, SI
+
+
+
+#define CALC \
+ MOVL (R9), CX \
+ MOVL 4(R9), SI \
+ MOVL 8(R9), DI \
+ MOVL 12(R9), AX \
+ MOVL 16(R9), DX \
+ MOVQ SP, R14 \
+ LEAQ (2*4*80+32)(SP), R15 \
+ PRECALC \ // Precalc WK for first 2 blocks
+ XCHGQ R15, R14 \
+loop: \ // this loops is unrolled
+ CMPQ R10, R8 \ // we use R8 value (set below) as a signal of a last block
+ JNE begin \
+ VZEROUPPER \
+ RET \
+begin: \
+ CALC_0 \
+ CALC_1 \
+ CALC_2 \
+ CALC_3 \
+ CALC_4 \
+ CALC_5 \
+ CALC_6 \
+ CALC_7 \
+ CALC_8 \
+ CALC_9 \
+ CALC_10 \
+ CALC_11 \
+ CALC_12 \
+ CALC_13 \
+ CALC_14 \
+ CALC_15 \
+ CALC_16 \
+ CALC_17 \
+ CALC_18 \
+ CALC_19 \
+ CALC_20 \
+ CALC_21 \
+ CALC_22 \
+ CALC_23 \
+ CALC_24 \
+ CALC_25 \
+ CALC_26 \
+ CALC_27 \
+ CALC_28 \
+ CALC_29 \
+ CALC_30 \
+ CALC_31 \
+ CALC_32 \
+ CALC_33 \
+ CALC_34 \
+ CALC_35 \
+ CALC_36 \
+ CALC_37 \
+ CALC_38 \
+ CALC_39 \
+ CALC_40 \
+ CALC_41 \
+ CALC_42 \
+ CALC_43 \
+ CALC_44 \
+ CALC_45 \
+ CALC_46 \
+ CALC_47 \
+ CALC_48 \
+ CALC_49 \
+ CALC_50 \
+ CALC_51 \
+ CALC_52 \
+ CALC_53 \
+ CALC_54 \
+ CALC_55 \
+ CALC_56 \
+ CALC_57 \
+ CALC_58 \
+ CALC_59 \
+ ADDQ $128, R10 \ // move to next even-64-byte block
+ CMPQ R10, R11 \ // is current block the last one?
+ CMOVQCC R8, R10 \ // signal the last iteration smartly
+ CALC_60 \
+ CALC_61 \
+ CALC_62 \
+ CALC_63 \
+ CALC_64 \
+ CALC_65 \
+ CALC_66 \
+ CALC_67 \
+ CALC_68 \
+ CALC_69 \
+ CALC_70 \
+ CALC_71 \
+ CALC_72 \
+ CALC_73 \
+ CALC_74 \
+ CALC_75 \
+ CALC_76 \
+ CALC_77 \
+ CALC_78 \
+ CALC_79 \
+ UPDATE_HASH(AX,DX,BX,SI,DI) \
+ CMPQ R10, R8 \ // is current block the last one?
+ JE loop\
+ MOVL DX, CX \
+ CALC_80 \
+ CALC_81 \
+ CALC_82 \
+ CALC_83 \
+ CALC_84 \
+ CALC_85 \
+ CALC_86 \
+ CALC_87 \
+ CALC_88 \
+ CALC_89 \
+ CALC_90 \
+ CALC_91 \
+ CALC_92 \
+ CALC_93 \
+ CALC_94 \
+ CALC_95 \
+ CALC_96 \
+ CALC_97 \
+ CALC_98 \
+ CALC_99 \
+ CALC_100 \
+ CALC_101 \
+ CALC_102 \
+ CALC_103 \
+ CALC_104 \
+ CALC_105 \
+ CALC_106 \
+ CALC_107 \
+ CALC_108 \
+ CALC_109 \
+ CALC_110 \
+ CALC_111 \
+ CALC_112 \
+ CALC_113 \
+ CALC_114 \
+ CALC_115 \
+ CALC_116 \
+ CALC_117 \
+ CALC_118 \
+ CALC_119 \
+ CALC_120 \
+ CALC_121 \
+ CALC_122 \
+ CALC_123 \
+ CALC_124 \
+ CALC_125 \
+ CALC_126 \
+ CALC_127 \
+ CALC_128 \
+ CALC_129 \
+ CALC_130 \
+ CALC_131 \
+ CALC_132 \
+ CALC_133 \
+ CALC_134 \
+ CALC_135 \
+ CALC_136 \
+ CALC_137 \
+ CALC_138 \
+ CALC_139 \
+ ADDQ $128, R13 \ //move to next even-64-byte block
+ CMPQ R13, R11 \ //is current block the last one?
+ CMOVQCC R8, R10 \
+ CALC_140 \
+ CALC_141 \
+ CALC_142 \
+ CALC_143 \
+ CALC_144 \
+ CALC_145 \
+ CALC_146 \
+ CALC_147 \
+ CALC_148 \
+ CALC_149 \
+ CALC_150 \
+ CALC_151 \
+ CALC_152 \
+ CALC_153 \
+ CALC_154 \
+ CALC_155 \
+ CALC_156 \
+ CALC_157 \
+ CALC_158 \
+ CALC_159 \
+ UPDATE_HASH(SI,DI,DX,CX,BX) \
+ MOVL SI, R12 \ //Reset state for AVX2 reg permutation
+ MOVL DI, SI \
+ MOVL DX, DI \
+ MOVL BX, DX \
+ MOVL CX, AX \
+ MOVL R12, CX \
+ XCHGQ R15, R14 \
+ JMP loop
+
+
+
+TEXT ·blockAVX2(SB),$1408-32
+
+ MOVQ dig+0(FP), DI
+ MOVQ p_base+8(FP), SI
+ MOVQ p_len+16(FP), DX
+ SHRQ $6, DX
+ SHLQ $6, DX
+
+ MOVQ $K_XMM_AR<>(SB), R8
+
+ MOVQ DI, R9
+ MOVQ SI, R10
+ LEAQ 64(SI), R13
+
+ ADDQ SI, DX
+ ADDQ $64, DX
+ MOVQ DX, R11
+
+ CMPQ R13, R11
+ CMOVQCC R8, R13
+
+ VMOVDQU BSWAP_SHUFB_CTL<>(SB), Y10
+
+ CALC // RET is inside macros
+
+DATA K_XMM_AR<>+0x00(SB)/4,$0x5a827999
+DATA K_XMM_AR<>+0x04(SB)/4,$0x5a827999
+DATA K_XMM_AR<>+0x08(SB)/4,$0x5a827999
+DATA K_XMM_AR<>+0x0c(SB)/4,$0x5a827999
+DATA K_XMM_AR<>+0x10(SB)/4,$0x5a827999
+DATA K_XMM_AR<>+0x14(SB)/4,$0x5a827999
+DATA K_XMM_AR<>+0x18(SB)/4,$0x5a827999
+DATA K_XMM_AR<>+0x1c(SB)/4,$0x5a827999
+DATA K_XMM_AR<>+0x20(SB)/4,$0x6ed9eba1
+DATA K_XMM_AR<>+0x24(SB)/4,$0x6ed9eba1
+DATA K_XMM_AR<>+0x28(SB)/4,$0x6ed9eba1
+DATA K_XMM_AR<>+0x2c(SB)/4,$0x6ed9eba1
+DATA K_XMM_AR<>+0x30(SB)/4,$0x6ed9eba1
+DATA K_XMM_AR<>+0x34(SB)/4,$0x6ed9eba1
+DATA K_XMM_AR<>+0x38(SB)/4,$0x6ed9eba1
+DATA K_XMM_AR<>+0x3c(SB)/4,$0x6ed9eba1
+DATA K_XMM_AR<>+0x40(SB)/4,$0x8f1bbcdc
+DATA K_XMM_AR<>+0x44(SB)/4,$0x8f1bbcdc
+DATA K_XMM_AR<>+0x48(SB)/4,$0x8f1bbcdc
+DATA K_XMM_AR<>+0x4c(SB)/4,$0x8f1bbcdc
+DATA K_XMM_AR<>+0x50(SB)/4,$0x8f1bbcdc
+DATA K_XMM_AR<>+0x54(SB)/4,$0x8f1bbcdc
+DATA K_XMM_AR<>+0x58(SB)/4,$0x8f1bbcdc
+DATA K_XMM_AR<>+0x5c(SB)/4,$0x8f1bbcdc
+DATA K_XMM_AR<>+0x60(SB)/4,$0xca62c1d6
+DATA K_XMM_AR<>+0x64(SB)/4,$0xca62c1d6
+DATA K_XMM_AR<>+0x68(SB)/4,$0xca62c1d6
+DATA K_XMM_AR<>+0x6c(SB)/4,$0xca62c1d6
+DATA K_XMM_AR<>+0x70(SB)/4,$0xca62c1d6
+DATA K_XMM_AR<>+0x74(SB)/4,$0xca62c1d6
+DATA K_XMM_AR<>+0x78(SB)/4,$0xca62c1d6
+DATA K_XMM_AR<>+0x7c(SB)/4,$0xca62c1d6
+GLOBL K_XMM_AR<>(SB),RODATA,$128
+
+DATA BSWAP_SHUFB_CTL<>+0x00(SB)/4,$0x00010203
+DATA BSWAP_SHUFB_CTL<>+0x04(SB)/4,$0x04050607
+DATA BSWAP_SHUFB_CTL<>+0x08(SB)/4,$0x08090a0b
+DATA BSWAP_SHUFB_CTL<>+0x0c(SB)/4,$0x0c0d0e0f
+DATA BSWAP_SHUFB_CTL<>+0x10(SB)/4,$0x00010203
+DATA BSWAP_SHUFB_CTL<>+0x14(SB)/4,$0x04050607
+DATA BSWAP_SHUFB_CTL<>+0x18(SB)/4,$0x08090a0b
+DATA BSWAP_SHUFB_CTL<>+0x1c(SB)/4,$0x0c0d0e0f
+GLOBL BSWAP_SHUFB_CTL<>(SB),RODATA,$32
diff --git a/src/crypto/sha1/sha1block_arm.s b/src/crypto/sha1/sha1block_arm.s
new file mode 100644
index 0000000..2236533
--- /dev/null
+++ b/src/crypto/sha1/sha1block_arm.s
@@ -0,0 +1,217 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+// ARM version of md5block.go
+
+#include "textflag.h"
+
+// SHA-1 block routine. See sha1block.go for Go equivalent.
+//
+// There are 80 rounds of 4 types:
+// - rounds 0-15 are type 1 and load data (ROUND1 macro).
+// - rounds 16-19 are type 1 and do not load data (ROUND1x macro).
+// - rounds 20-39 are type 2 and do not load data (ROUND2 macro).
+// - rounds 40-59 are type 3 and do not load data (ROUND3 macro).
+// - rounds 60-79 are type 4 and do not load data (ROUND4 macro).
+//
+// Each round loads or shuffles the data, then computes a per-round
+// function of b, c, d, and then mixes the result into and rotates the
+// five registers a, b, c, d, e holding the intermediate results.
+//
+// The register rotation is implemented by rotating the arguments to
+// the round macros instead of by explicit move instructions.
+
+// Register definitions
+#define Rdata R0 // Pointer to incoming data
+#define Rconst R1 // Current constant for SHA round
+#define Ra R2 // SHA-1 accumulator
+#define Rb R3 // SHA-1 accumulator
+#define Rc R4 // SHA-1 accumulator
+#define Rd R5 // SHA-1 accumulator
+#define Re R6 // SHA-1 accumulator
+#define Rt0 R7 // Temporary
+#define Rt1 R8 // Temporary
+// r9, r10 are forbidden
+// r11 is OK provided you check the assembler that no synthetic instructions use it
+#define Rt2 R11 // Temporary
+#define Rctr R12 // loop counter
+#define Rw R14 // point to w buffer
+
+// func block(dig *digest, p []byte)
+// 0(FP) is *digest
+// 4(FP) is p.array (struct Slice)
+// 8(FP) is p.len
+//12(FP) is p.cap
+//
+// Stack frame
+#define p_end end-4(SP) // pointer to the end of data
+#define p_data data-8(SP) // current data pointer (unused?)
+#define w_buf buf-(8+4*80)(SP) //80 words temporary buffer w uint32[80]
+#define saved abcde-(8+4*80+4*5)(SP) // saved sha1 registers a,b,c,d,e - these must be last (unused?)
+// Total size +4 for saved LR is 352
+
+ // w[i] = p[j]<<24 | p[j+1]<<16 | p[j+2]<<8 | p[j+3]
+ // e += w[i]
+#define LOAD(Re) \
+ MOVBU 2(Rdata), Rt0 ; \
+ MOVBU 3(Rdata), Rt1 ; \
+ MOVBU 1(Rdata), Rt2 ; \
+ ORR Rt0<<8, Rt1, Rt0 ; \
+ MOVBU.P 4(Rdata), Rt1 ; \
+ ORR Rt2<<16, Rt0, Rt0 ; \
+ ORR Rt1<<24, Rt0, Rt0 ; \
+ MOVW.P Rt0, 4(Rw) ; \
+ ADD Rt0, Re, Re
+
+ // tmp := w[(i-3)&0xf] ^ w[(i-8)&0xf] ^ w[(i-14)&0xf] ^ w[(i)&0xf]
+ // w[i&0xf] = tmp<<1 | tmp>>(32-1)
+ // e += w[i&0xf]
+#define SHUFFLE(Re) \
+ MOVW (-16*4)(Rw), Rt0 ; \
+ MOVW (-14*4)(Rw), Rt1 ; \
+ MOVW (-8*4)(Rw), Rt2 ; \
+ EOR Rt0, Rt1, Rt0 ; \
+ MOVW (-3*4)(Rw), Rt1 ; \
+ EOR Rt2, Rt0, Rt0 ; \
+ EOR Rt0, Rt1, Rt0 ; \
+ MOVW Rt0@>(32-1), Rt0 ; \
+ MOVW.P Rt0, 4(Rw) ; \
+ ADD Rt0, Re, Re
+
+ // t1 = (b & c) | ((~b) & d)
+#define FUNC1(Ra, Rb, Rc, Rd, Re) \
+ MVN Rb, Rt1 ; \
+ AND Rb, Rc, Rt0 ; \
+ AND Rd, Rt1, Rt1 ; \
+ ORR Rt0, Rt1, Rt1
+
+ // t1 = b ^ c ^ d
+#define FUNC2(Ra, Rb, Rc, Rd, Re) \
+ EOR Rb, Rc, Rt1 ; \
+ EOR Rd, Rt1, Rt1
+
+ // t1 = (b & c) | (b & d) | (c & d) =
+ // t1 = (b & c) | ((b | c) & d)
+#define FUNC3(Ra, Rb, Rc, Rd, Re) \
+ ORR Rb, Rc, Rt0 ; \
+ AND Rb, Rc, Rt1 ; \
+ AND Rd, Rt0, Rt0 ; \
+ ORR Rt0, Rt1, Rt1
+
+#define FUNC4 FUNC2
+
+ // a5 := a<<5 | a>>(32-5)
+ // b = b<<30 | b>>(32-30)
+ // e = a5 + t1 + e + const
+#define MIX(Ra, Rb, Rc, Rd, Re) \
+ ADD Rt1, Re, Re ; \
+ MOVW Rb@>(32-30), Rb ; \
+ ADD Ra@>(32-5), Re, Re ; \
+ ADD Rconst, Re, Re
+
+#define ROUND1(Ra, Rb, Rc, Rd, Re) \
+ LOAD(Re) ; \
+ FUNC1(Ra, Rb, Rc, Rd, Re) ; \
+ MIX(Ra, Rb, Rc, Rd, Re)
+
+#define ROUND1x(Ra, Rb, Rc, Rd, Re) \
+ SHUFFLE(Re) ; \
+ FUNC1(Ra, Rb, Rc, Rd, Re) ; \
+ MIX(Ra, Rb, Rc, Rd, Re)
+
+#define ROUND2(Ra, Rb, Rc, Rd, Re) \
+ SHUFFLE(Re) ; \
+ FUNC2(Ra, Rb, Rc, Rd, Re) ; \
+ MIX(Ra, Rb, Rc, Rd, Re)
+
+#define ROUND3(Ra, Rb, Rc, Rd, Re) \
+ SHUFFLE(Re) ; \
+ FUNC3(Ra, Rb, Rc, Rd, Re) ; \
+ MIX(Ra, Rb, Rc, Rd, Re)
+
+#define ROUND4(Ra, Rb, Rc, Rd, Re) \
+ SHUFFLE(Re) ; \
+ FUNC4(Ra, Rb, Rc, Rd, Re) ; \
+ MIX(Ra, Rb, Rc, Rd, Re)
+
+
+// func block(dig *digest, p []byte)
+TEXT ·block(SB), 0, $352-16
+ MOVW p+4(FP), Rdata // pointer to the data
+ MOVW p_len+8(FP), Rt0 // number of bytes
+ ADD Rdata, Rt0
+ MOVW Rt0, p_end // pointer to end of data
+
+ // Load up initial SHA-1 accumulator
+ MOVW dig+0(FP), Rt0
+ MOVM.IA (Rt0), [Ra,Rb,Rc,Rd,Re]
+
+loop:
+ // Save registers at SP+4 onwards
+ MOVM.IB [Ra,Rb,Rc,Rd,Re], (R13)
+
+ MOVW $w_buf, Rw
+ MOVW $0x5A827999, Rconst
+ MOVW $3, Rctr
+loop1: ROUND1(Ra, Rb, Rc, Rd, Re)
+ ROUND1(Re, Ra, Rb, Rc, Rd)
+ ROUND1(Rd, Re, Ra, Rb, Rc)
+ ROUND1(Rc, Rd, Re, Ra, Rb)
+ ROUND1(Rb, Rc, Rd, Re, Ra)
+ SUB.S $1, Rctr
+ BNE loop1
+
+ ROUND1(Ra, Rb, Rc, Rd, Re)
+ ROUND1x(Re, Ra, Rb, Rc, Rd)
+ ROUND1x(Rd, Re, Ra, Rb, Rc)
+ ROUND1x(Rc, Rd, Re, Ra, Rb)
+ ROUND1x(Rb, Rc, Rd, Re, Ra)
+
+ MOVW $0x6ED9EBA1, Rconst
+ MOVW $4, Rctr
+loop2: ROUND2(Ra, Rb, Rc, Rd, Re)
+ ROUND2(Re, Ra, Rb, Rc, Rd)
+ ROUND2(Rd, Re, Ra, Rb, Rc)
+ ROUND2(Rc, Rd, Re, Ra, Rb)
+ ROUND2(Rb, Rc, Rd, Re, Ra)
+ SUB.S $1, Rctr
+ BNE loop2
+
+ MOVW $0x8F1BBCDC, Rconst
+ MOVW $4, Rctr
+loop3: ROUND3(Ra, Rb, Rc, Rd, Re)
+ ROUND3(Re, Ra, Rb, Rc, Rd)
+ ROUND3(Rd, Re, Ra, Rb, Rc)
+ ROUND3(Rc, Rd, Re, Ra, Rb)
+ ROUND3(Rb, Rc, Rd, Re, Ra)
+ SUB.S $1, Rctr
+ BNE loop3
+
+ MOVW $0xCA62C1D6, Rconst
+ MOVW $4, Rctr
+loop4: ROUND4(Ra, Rb, Rc, Rd, Re)
+ ROUND4(Re, Ra, Rb, Rc, Rd)
+ ROUND4(Rd, Re, Ra, Rb, Rc)
+ ROUND4(Rc, Rd, Re, Ra, Rb)
+ ROUND4(Rb, Rc, Rd, Re, Ra)
+ SUB.S $1, Rctr
+ BNE loop4
+
+ // Accumulate - restoring registers from SP+4
+ MOVM.IB (R13), [Rt0,Rt1,Rt2,Rctr,Rw]
+ ADD Rt0, Ra
+ ADD Rt1, Rb
+ ADD Rt2, Rc
+ ADD Rctr, Rd
+ ADD Rw, Re
+
+ MOVW p_end, Rt0
+ CMP Rt0, Rdata
+ BLO loop
+
+ // Save final SHA-1 accumulator
+ MOVW dig+0(FP), Rt0
+ MOVM.IA [Ra,Rb,Rc,Rd,Re], (Rt0)
+
+ RET
diff --git a/src/crypto/sha1/sha1block_arm64.go b/src/crypto/sha1/sha1block_arm64.go
new file mode 100644
index 0000000..08d3df0
--- /dev/null
+++ b/src/crypto/sha1/sha1block_arm64.go
@@ -0,0 +1,26 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sha1
+
+import "internal/cpu"
+
+var k = []uint32{
+ 0x5A827999,
+ 0x6ED9EBA1,
+ 0x8F1BBCDC,
+ 0xCA62C1D6,
+}
+
+//go:noescape
+func sha1block(h []uint32, p []byte, k []uint32)
+
+func block(dig *digest, p []byte) {
+ if !cpu.ARM64.HasSHA1 {
+ blockGeneric(dig, p)
+ } else {
+ h := dig.h[:]
+ sha1block(h, p, k)
+ }
+}
diff --git a/src/crypto/sha1/sha1block_arm64.s b/src/crypto/sha1/sha1block_arm64.s
new file mode 100644
index 0000000..d568384
--- /dev/null
+++ b/src/crypto/sha1/sha1block_arm64.s
@@ -0,0 +1,152 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+#define HASHUPDATECHOOSE \
+ SHA1C V16.S4, V1, V2 \
+ SHA1H V3, V1 \
+ VMOV V2.B16, V3.B16
+
+#define HASHUPDATEPARITY \
+ SHA1P V16.S4, V1, V2 \
+ SHA1H V3, V1 \
+ VMOV V2.B16, V3.B16
+
+#define HASHUPDATEMAJ \
+ SHA1M V16.S4, V1, V2 \
+ SHA1H V3, V1 \
+ VMOV V2.B16, V3.B16
+
+// func sha1block(h []uint32, p []byte, k []uint32)
+TEXT ·sha1block(SB),NOSPLIT,$0
+ MOVD h_base+0(FP), R0 // hash value first address
+ MOVD p_base+24(FP), R1 // message first address
+ MOVD k_base+48(FP), R2 // k constants first address
+ MOVD p_len+32(FP), R3 // message length
+ VLD1.P 16(R0), [V0.S4]
+ FMOVS (R0), F20
+ SUB $16, R0, R0
+
+blockloop:
+
+ VLD1.P 16(R1), [V4.B16] // load message
+ VLD1.P 16(R1), [V5.B16]
+ VLD1.P 16(R1), [V6.B16]
+ VLD1.P 16(R1), [V7.B16]
+ VLD1 (R2), [V19.S4] // load constant k0-k79
+ VMOV V0.B16, V2.B16
+ VMOV V20.S[0], V1
+ VMOV V2.B16, V3.B16
+ VDUP V19.S[0], V17.S4
+ VREV32 V4.B16, V4.B16 // prepare for using message in Byte format
+ VREV32 V5.B16, V5.B16
+ VREV32 V6.B16, V6.B16
+ VREV32 V7.B16, V7.B16
+
+
+ VDUP V19.S[1], V18.S4
+ VADD V17.S4, V4.S4, V16.S4
+ SHA1SU0 V6.S4, V5.S4, V4.S4
+ HASHUPDATECHOOSE
+ SHA1SU1 V7.S4, V4.S4
+
+ VADD V17.S4, V5.S4, V16.S4
+ SHA1SU0 V7.S4, V6.S4, V5.S4
+ HASHUPDATECHOOSE
+ SHA1SU1 V4.S4, V5.S4
+ VADD V17.S4, V6.S4, V16.S4
+ SHA1SU0 V4.S4, V7.S4, V6.S4
+ HASHUPDATECHOOSE
+ SHA1SU1 V5.S4, V6.S4
+
+ VADD V17.S4, V7.S4, V16.S4
+ SHA1SU0 V5.S4, V4.S4, V7.S4
+ HASHUPDATECHOOSE
+ SHA1SU1 V6.S4, V7.S4
+
+ VADD V17.S4, V4.S4, V16.S4
+ SHA1SU0 V6.S4, V5.S4, V4.S4
+ HASHUPDATECHOOSE
+ SHA1SU1 V7.S4, V4.S4
+
+ VDUP V19.S[2], V17.S4
+ VADD V18.S4, V5.S4, V16.S4
+ SHA1SU0 V7.S4, V6.S4, V5.S4
+ HASHUPDATEPARITY
+ SHA1SU1 V4.S4, V5.S4
+
+ VADD V18.S4, V6.S4, V16.S4
+ SHA1SU0 V4.S4, V7.S4, V6.S4
+ HASHUPDATEPARITY
+ SHA1SU1 V5.S4, V6.S4
+
+ VADD V18.S4, V7.S4, V16.S4
+ SHA1SU0 V5.S4, V4.S4, V7.S4
+ HASHUPDATEPARITY
+ SHA1SU1 V6.S4, V7.S4
+
+ VADD V18.S4, V4.S4, V16.S4
+ SHA1SU0 V6.S4, V5.S4, V4.S4
+ HASHUPDATEPARITY
+ SHA1SU1 V7.S4, V4.S4
+
+ VADD V18.S4, V5.S4, V16.S4
+ SHA1SU0 V7.S4, V6.S4, V5.S4
+ HASHUPDATEPARITY
+ SHA1SU1 V4.S4, V5.S4
+
+ VDUP V19.S[3], V18.S4
+ VADD V17.S4, V6.S4, V16.S4
+ SHA1SU0 V4.S4, V7.S4, V6.S4
+ HASHUPDATEMAJ
+ SHA1SU1 V5.S4, V6.S4
+
+ VADD V17.S4, V7.S4, V16.S4
+ SHA1SU0 V5.S4, V4.S4, V7.S4
+ HASHUPDATEMAJ
+ SHA1SU1 V6.S4, V7.S4
+
+ VADD V17.S4, V4.S4, V16.S4
+ SHA1SU0 V6.S4, V5.S4, V4.S4
+ HASHUPDATEMAJ
+ SHA1SU1 V7.S4, V4.S4
+
+ VADD V17.S4, V5.S4, V16.S4
+ SHA1SU0 V7.S4, V6.S4, V5.S4
+ HASHUPDATEMAJ
+ SHA1SU1 V4.S4, V5.S4
+
+ VADD V17.S4, V6.S4, V16.S4
+ SHA1SU0 V4.S4, V7.S4, V6.S4
+ HASHUPDATEMAJ
+ SHA1SU1 V5.S4, V6.S4
+
+ VADD V18.S4, V7.S4, V16.S4
+ SHA1SU0 V5.S4, V4.S4, V7.S4
+ HASHUPDATEPARITY
+ SHA1SU1 V6.S4, V7.S4
+
+ VADD V18.S4, V4.S4, V16.S4
+ HASHUPDATEPARITY
+
+ VADD V18.S4, V5.S4, V16.S4
+ HASHUPDATEPARITY
+
+ VADD V18.S4, V6.S4, V16.S4
+ HASHUPDATEPARITY
+
+ VADD V18.S4, V7.S4, V16.S4
+ HASHUPDATEPARITY
+
+ SUB $64, R3, R3 // message length - 64bytes, then compare with 64bytes
+ VADD V2.S4, V0.S4, V0.S4
+ VADD V1.S4, V20.S4, V20.S4
+ CBNZ R3, blockloop
+
+sha1ret:
+
+ VST1.P [V0.S4], 16(R0) // store hash value H(dcba)
+ FMOVS F20, (R0) // store hash value H(e)
+ RET
diff --git a/src/crypto/sha1/sha1block_decl.go b/src/crypto/sha1/sha1block_decl.go
new file mode 100644
index 0000000..93054ef
--- /dev/null
+++ b/src/crypto/sha1/sha1block_decl.go
@@ -0,0 +1,12 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build arm || 386 || s390x
+// +build arm 386 s390x
+
+package sha1
+
+//go:noescape
+
+func block(dig *digest, p []byte)
diff --git a/src/crypto/sha1/sha1block_generic.go b/src/crypto/sha1/sha1block_generic.go
new file mode 100644
index 0000000..feaba5a
--- /dev/null
+++ b/src/crypto/sha1/sha1block_generic.go
@@ -0,0 +1,10 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !amd64 && !386 && !arm && !s390x && !arm64
+// +build !amd64,!386,!arm,!s390x,!arm64
+
+package sha1
+
+var block = blockGeneric
diff --git a/src/crypto/sha1/sha1block_s390x.go b/src/crypto/sha1/sha1block_s390x.go
new file mode 100644
index 0000000..446bf5d
--- /dev/null
+++ b/src/crypto/sha1/sha1block_s390x.go
@@ -0,0 +1,9 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sha1
+
+import "internal/cpu"
+
+var useAsm = cpu.S390X.HasSHA1
diff --git a/src/crypto/sha1/sha1block_s390x.s b/src/crypto/sha1/sha1block_s390x.s
new file mode 100644
index 0000000..6ba6883
--- /dev/null
+++ b/src/crypto/sha1/sha1block_s390x.s
@@ -0,0 +1,20 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// func block(dig *digest, p []byte)
+TEXT ·block(SB), NOSPLIT|NOFRAME, $0-32
+ MOVBZ ·useAsm(SB), R4
+ LMG dig+0(FP), R1, R3 // R2 = &p[0], R3 = len(p)
+ MOVBZ $1, R0 // SHA-1 function code
+ CMPBEQ R4, $0, generic
+
+loop:
+ WORD $0xB93E0002 // KIMD R2
+ BVS loop // continue if interrupted
+ RET
+
+generic:
+ BR ·blockGeneric(SB)