From a0b8f38ab54ac451646aa00cd5e91b6c76f22a84 Mon Sep 17 00:00:00 2001
From: Daniel Baumann <daniel.baumann@progress-linux.org>
Date: Thu, 30 May 2024 05:57:19 +0200
Subject: Merging upstream version 1.72.1+dfsg1.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
---
 vendor/sha2/.cargo-checksum.json  |   2 +-
 vendor/sha2/CHANGELOG.md          |   6 +
 vendor/sha2/Cargo.toml            |  41 +++++--
 vendor/sha2/src/lib.rs            |   3 +-
 vendor/sha2/src/sha256/aarch64.rs | 146 ++++++++++++++++++++++-
 vendor/sha2/src/sha512.rs         |   4 +
 vendor/sha2/src/sha512/aarch64.rs | 235 ++++++++++++++++++++++++++++++++++++++
 7 files changed, 422 insertions(+), 15 deletions(-)
 create mode 100644 vendor/sha2/src/sha512/aarch64.rs

(limited to 'vendor/sha2')

diff --git a/vendor/sha2/.cargo-checksum.json b/vendor/sha2/.cargo-checksum.json
index d30274379..2716fafc7 100644
--- a/vendor/sha2/.cargo-checksum.json
+++ b/vendor/sha2/.cargo-checksum.json
@@ -1 +1 @@
-{"files":{"CHANGELOG.md":"604cc546b683e035e1f759479e41401c9035e7da6e07808f2cbd7702a07a5d26","Cargo.toml":"a6172879ad5aa1b7a6593e57db16c3d99c00563076239bf094b0d0f5ed6b30f8","LICENSE-APACHE":"a9040321c3712d8fd0b09cf52b17445de04a23a10165049ae187cd39e5c86be5","LICENSE-MIT":"b4eb00df6e2a4d22518fcaa6a2b4646f249b3a3c9814509b22bd2091f1392ff1","README.md":"b7af562922e4a631657acf264772d2af2b72a08d9bbc5fbcf56d9324f9027708","benches/mod.rs":"c32d9f91a541821ea988c14eee710963e623ef1edf69b02b41a29bc44e04ba95","src/consts.rs":"2f820349fa7cbf9fecc1d4aabbd1a721bb1badc3f32ef9e903826960b6f42523","src/core_api.rs":"73b160d98bfa6737688875ad73da5e3c2c93582604dc313d208200e12fdab676","src/lib.rs":"a286546dab99a51bdb3a5dc4edbd08fb9a57028cb422151f3a97441d113d7425","src/sha256.rs":"cfc2b62a412112e471781a770793f0ba0466594b2e37001334562f3d95f340ce","src/sha256/aarch64.rs":"02dbac483409a853126fec642f964a464e4372f53da2fa4120b29bed204f72b7","src/sha256/soft.rs":"98e765a8e8dfa0af31f2b76570f212e6b3099522bf300e1554cbbd9fd5d02960","src/sha256/x86.rs":"70f1597f2029522b35bfd026df0a8908f086523ab2a80ba3ef35e6231b56353c","src/sha512.rs":"92c4210a627b78505a195722b2f24bac5e6cfdece6292bf184ba8d42e7e2c35f","src/sha512/soft.rs":"0183ad89418b886859d2afa9bf061bc92759ae337c1d26147b4300042e63ef42","src/sha512/x86.rs":"c7dd8bdf3212e1e8c4cc9cc6b380dc0468f79dcfd0f61a445d0d38cead45a03a","tests/data/sha224.blb":"59b185972521af418fd49a079de3d5f5bed74cd76d80473da51cab3faee6c7d0","tests/data/sha256.blb":"bb096934bb7e43e41ce143d211397afca6fcdfe243a39811688ea31aae6f800a","tests/data/sha384.blb":"e8fe66c07ba336fae2c0aa4c87cb768f41bd4ed318ee1a36fbde0a68581946ec","tests/data/sha512.blb":"1cc0e86571f2f4e3bc81438ce7b6c25c118d2d7437355240113f59cbb782c8d6","tests/data/sha512_224.blb":"b02dd46741db1034112e0888d0cdb233a21b9a82c319456f806bbaae49acf440","tests/data/sha512_256.blb":"95195b758e362d92ff0cebebac4cca696512ea5811b635243bc70e29164e5786","tests/mod.rs":"61be596fd9b45a8db345950ff2ed6f87eaf4d239ac156885f36e819da0597644"},"package":"82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0"}
\ No newline at end of file
+{"files":{"CHANGELOG.md":"61b6ae2d035d98d2b5612a5c1c40238f4fa7e7bb3062daacf677abb4f7c60eab","Cargo.toml":"7d0b0694d8fd9d50c060a871ae98f80f392473086ec0723aa2017081c8654dce","LICENSE-APACHE":"a9040321c3712d8fd0b09cf52b17445de04a23a10165049ae187cd39e5c86be5","LICENSE-MIT":"b4eb00df6e2a4d22518fcaa6a2b4646f249b3a3c9814509b22bd2091f1392ff1","README.md":"b7af562922e4a631657acf264772d2af2b72a08d9bbc5fbcf56d9324f9027708","benches/mod.rs":"c32d9f91a541821ea988c14eee710963e623ef1edf69b02b41a29bc44e04ba95","src/consts.rs":"2f820349fa7cbf9fecc1d4aabbd1a721bb1badc3f32ef9e903826960b6f42523","src/core_api.rs":"73b160d98bfa6737688875ad73da5e3c2c93582604dc313d208200e12fdab676","src/lib.rs":"9d0ec0ba86a801bd9b2024f0b84ee322a26c7376a623dd61210e0eb9d6355aa1","src/sha256.rs":"cfc2b62a412112e471781a770793f0ba0466594b2e37001334562f3d95f340ce","src/sha256/aarch64.rs":"18121a25867a575fec8ef64da763693ece4e3e3e84da095254b8471234c6f1f8","src/sha256/soft.rs":"98e765a8e8dfa0af31f2b76570f212e6b3099522bf300e1554cbbd9fd5d02960","src/sha256/x86.rs":"70f1597f2029522b35bfd026df0a8908f086523ab2a80ba3ef35e6231b56353c","src/sha512.rs":"b0c94cf6e1a4a8efb8ccc494da96fdf805b1745fef7614875df5b4db9ee186da","src/sha512/aarch64.rs":"2ed929329a0fa66180e4726d028713a49f99cc223e635078fc1f3252a44981e0","src/sha512/soft.rs":"0183ad89418b886859d2afa9bf061bc92759ae337c1d26147b4300042e63ef42","src/sha512/x86.rs":"c7dd8bdf3212e1e8c4cc9cc6b380dc0468f79dcfd0f61a445d0d38cead45a03a","tests/data/sha224.blb":"59b185972521af418fd49a079de3d5f5bed74cd76d80473da51cab3faee6c7d0","tests/data/sha256.blb":"bb096934bb7e43e41ce143d211397afca6fcdfe243a39811688ea31aae6f800a","tests/data/sha384.blb":"e8fe66c07ba336fae2c0aa4c87cb768f41bd4ed318ee1a36fbde0a68581946ec","tests/data/sha512.blb":"1cc0e86571f2f4e3bc81438ce7b6c25c118d2d7437355240113f59cbb782c8d6","tests/data/sha512_224.blb":"b02dd46741db1034112e0888d0cdb233a21b9a82c319456f806bbaae49acf440","tests/data/sha512_256.blb":"95195b758e362d92ff0cebebac4cca696512ea5811b635243bc70e29164e5786","tests/mod.rs":"61be596fd9b45a8db345950ff2ed6f87eaf4d239ac156885f36e819da0597644"},"package":"479fb9d862239e610720565ca91403019f2f00410f1864c5aa7479b950a76ed8"}
\ No newline at end of file
diff --git a/vendor/sha2/CHANGELOG.md b/vendor/sha2/CHANGELOG.md
index 127804644..a552266e5 100644
--- a/vendor/sha2/CHANGELOG.md
+++ b/vendor/sha2/CHANGELOG.md
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## 0.10.7 (2023-06-15)
+### Added
+- AArch64 Neon-based backend ([#490])
+
+[#490]: https://github.com/RustCrypto/hashes/pull/490
+
 ## 0.10.6 (2022-09-16)
 ### Added
 - Feature-gated OID support ([#405])
diff --git a/vendor/sha2/Cargo.toml b/vendor/sha2/Cargo.toml
index ff23bdc32..c19f3241d 100644
--- a/vendor/sha2/Cargo.toml
+++ b/vendor/sha2/Cargo.toml
@@ -3,35 +3,51 @@
 # When uploading crates to the registry Cargo will automatically
 # "normalize" Cargo.toml files for maximal compatibility
 # with all versions of Cargo and also rewrite `path` dependencies
-# to registry (e.g., crates.io) dependencies
+# to registry (e.g., crates.io) dependencies.
 #
-# If you believe there's an error in this file please file an
-# issue against the rust-lang/cargo repository. If you're
-# editing this file be aware that the upstream Cargo.toml
-# will likely look very different (and much more reasonable)
+# If you are reading this file be aware that the original Cargo.toml
+# will likely look very different (and much more reasonable).
+# See Cargo.toml.orig for the original contents.
 
 [package]
 edition = "2018"
 name = "sha2"
-version = "0.10.6"
+version = "0.10.7"
 authors = ["RustCrypto Developers"]
-description = "Pure Rust implementation of the SHA-2 hash function family\nincluding SHA-224, SHA-256, SHA-384, and SHA-512.\n"
+description = """
+Pure Rust implementation of the SHA-2 hash function family
+including SHA-224, SHA-256, SHA-384, and SHA-512.
+"""
 documentation = "https://docs.rs/sha2"
 readme = "README.md"
-keywords = ["crypto", "sha2", "hash", "digest"]
-categories = ["cryptography", "no-std"]
+keywords = [
+    "crypto",
+    "sha2",
+    "hash",
+    "digest",
+]
+categories = [
+    "cryptography",
+    "no-std",
+]
 license = "MIT OR Apache-2.0"
 repository = "https://github.com/RustCrypto/hashes"
+
 [package.metadata.docs.rs]
 all-features = true
-rustdoc-args = ["--cfg", "docsrs"]
+rustdoc-args = [
+    "--cfg",
+    "docsrs",
+]
+
 [dependencies.cfg-if]
 version = "1.0"
 
 [dependencies.digest]
-version = "0.10.4"
+version = "0.10.7"
+
 [dev-dependencies.digest]
-version = "0.10.4"
+version = "0.10.7"
 features = ["dev"]
 
 [dev-dependencies.hex-literal]
@@ -45,6 +61,7 @@ default = ["std"]
 force-soft = []
 oid = ["digest/oid"]
 std = ["digest/std"]
+
 [target."cfg(any(target_arch = \"aarch64\", target_arch = \"x86_64\", target_arch = \"x86\"))".dependencies.cpufeatures]
 version = "0.2"
 
diff --git a/vendor/sha2/src/lib.rs b/vendor/sha2/src/lib.rs
index 9082fc5b8..a3482e84a 100644
--- a/vendor/sha2/src/lib.rs
+++ b/vendor/sha2/src/lib.rs
@@ -6,7 +6,8 @@
 //! Algorithmically, there are only 2 core algorithms: SHA-256 and SHA-512.
 //! All other algorithms are just applications of these with different initial
 //! hash values, and truncated to different digest bit lengths. The first two
-//! algorithms in the list are based on SHA-256, while the last three on SHA-512.
+//! algorithms in the list are based on SHA-256, while the last four are based
+//! on SHA-512.
 //!
 //! # Usage
 //!
diff --git a/vendor/sha2/src/sha256/aarch64.rs b/vendor/sha2/src/sha256/aarch64.rs
index 7eaa2de73..9d220a311 100644
--- a/vendor/sha2/src/sha256/aarch64.rs
+++ b/vendor/sha2/src/sha256/aarch64.rs
@@ -1,15 +1,159 @@
 //! SHA-256 `aarch64` backend.
 
+// Implementation adapted from mbedtls.
+
 // TODO: stdarch intrinsics: RustCrypto/hashes#257
 
+use core::arch::{aarch64::*, asm};
+
+use crate::consts::K32;
+
 cpufeatures::new!(sha2_hwcap, "sha2");
 
 pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
     // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725
     // after stabilization
     if sha2_hwcap::get() {
-        sha2_asm::compress256(state, blocks);
+        unsafe { sha256_compress(state, blocks) }
     } else {
         super::soft::compress(state, blocks);
     }
 }
+
+#[target_feature(enable = "sha2")]
+unsafe fn sha256_compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
+    // SAFETY: Requires the sha2 feature.
+
+    // Load state into vectors.
+    let mut abcd = vld1q_u32(state[0..4].as_ptr());
+    let mut efgh = vld1q_u32(state[4..8].as_ptr());
+
+    // Iterate through the message blocks.
+    for block in blocks {
+        // Keep original state values.
+        let abcd_orig = abcd;
+        let efgh_orig = efgh;
+
+        // Load the message block into vectors, assuming little endianness.
+        let mut s0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block[0..16].as_ptr())));
+        let mut s1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block[16..32].as_ptr())));
+        let mut s2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block[32..48].as_ptr())));
+        let mut s3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block[48..64].as_ptr())));
+
+        // Rounds 0 to 3
+        let mut tmp = vaddq_u32(s0, vld1q_u32(&K32[0]));
+        let mut abcd_prev = abcd;
+        abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+        efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+        // Rounds 4 to 7
+        tmp = vaddq_u32(s1, vld1q_u32(&K32[4]));
+        abcd_prev = abcd;
+        abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+        efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+        // Rounds 8 to 11
+        tmp = vaddq_u32(s2, vld1q_u32(&K32[8]));
+        abcd_prev = abcd;
+        abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+        efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+        // Rounds 12 to 15
+        tmp = vaddq_u32(s3, vld1q_u32(&K32[12]));
+        abcd_prev = abcd;
+        abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+        efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+        for t in (16..64).step_by(16) {
+            // Rounds t to t + 3
+            s0 = vsha256su1q_u32(vsha256su0q_u32(s0, s1), s2, s3);
+            tmp = vaddq_u32(s0, vld1q_u32(&K32[t]));
+            abcd_prev = abcd;
+            abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+            efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+            // Rounds t + 4 to t + 7
+            s1 = vsha256su1q_u32(vsha256su0q_u32(s1, s2), s3, s0);
+            tmp = vaddq_u32(s1, vld1q_u32(&K32[t + 4]));
+            abcd_prev = abcd;
+            abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+            efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+            // Rounds t + 8 to t + 11
+            s2 = vsha256su1q_u32(vsha256su0q_u32(s2, s3), s0, s1);
+            tmp = vaddq_u32(s2, vld1q_u32(&K32[t + 8]));
+            abcd_prev = abcd;
+            abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+            efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+
+            // Rounds t + 12 to t + 15
+            s3 = vsha256su1q_u32(vsha256su0q_u32(s3, s0), s1, s2);
+            tmp = vaddq_u32(s3, vld1q_u32(&K32[t + 12]));
+            abcd_prev = abcd;
+            abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
+            efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
+        }
+
+        // Add the block-specific state to the original state.
+        abcd = vaddq_u32(abcd, abcd_orig);
+        efgh = vaddq_u32(efgh, efgh_orig);
+    }
+
+    // Store vectors into state.
+    vst1q_u32(state[0..4].as_mut_ptr(), abcd);
+    vst1q_u32(state[4..8].as_mut_ptr(), efgh);
+}
+
+// TODO remove these polyfills once SHA2 intrinsics land
+
+#[inline(always)]
+unsafe fn vsha256hq_u32(
+    mut hash_efgh: uint32x4_t,
+    hash_abcd: uint32x4_t,
+    wk: uint32x4_t,
+) -> uint32x4_t {
+    asm!(
+        "SHA256H {:q}, {:q}, {:v}.4S",
+        inout(vreg) hash_efgh, in(vreg) hash_abcd, in(vreg) wk,
+        options(pure, nomem, nostack, preserves_flags)
+    );
+    hash_efgh
+}
+
+#[inline(always)]
+unsafe fn vsha256h2q_u32(
+    mut hash_efgh: uint32x4_t,
+    hash_abcd: uint32x4_t,
+    wk: uint32x4_t,
+) -> uint32x4_t {
+    asm!(
+        "SHA256H2 {:q}, {:q}, {:v}.4S",
+        inout(vreg) hash_efgh, in(vreg) hash_abcd, in(vreg) wk,
+        options(pure, nomem, nostack, preserves_flags)
+    );
+    hash_efgh
+}
+
+#[inline(always)]
+unsafe fn vsha256su0q_u32(mut w0_3: uint32x4_t, w4_7: uint32x4_t) -> uint32x4_t {
+    asm!(
+        "SHA256SU0 {:v}.4S, {:v}.4S",
+        inout(vreg) w0_3, in(vreg) w4_7,
+        options(pure, nomem, nostack, preserves_flags)
+    );
+    w0_3
+}
+
+#[inline(always)]
+unsafe fn vsha256su1q_u32(
+    mut tw0_3: uint32x4_t,
+    w8_11: uint32x4_t,
+    w12_15: uint32x4_t,
+) -> uint32x4_t {
+    asm!(
+        "SHA256SU1 {:v}.4S, {:v}.4S, {:v}.4S",
+        inout(vreg) tw0_3, in(vreg) w8_11, in(vreg) w12_15,
+        options(pure, nomem, nostack, preserves_flags)
+    );
+    tw0_3
+}
diff --git a/vendor/sha2/src/sha512.rs b/vendor/sha2/src/sha512.rs
index e71fdfa43..af4178c0b 100644
--- a/vendor/sha2/src/sha512.rs
+++ b/vendor/sha2/src/sha512.rs
@@ -15,6 +15,10 @@ cfg_if::cfg_if! {
         }
         mod x86;
         use x86::compress;
+    } else if #[cfg(all(feature = "asm", target_arch = "aarch64"))] {
+        mod soft;
+        mod aarch64;
+        use aarch64::compress;
     } else {
         mod soft;
         use soft::compress;
diff --git a/vendor/sha2/src/sha512/aarch64.rs b/vendor/sha2/src/sha512/aarch64.rs
new file mode 100644
index 000000000..fbf441c21
--- /dev/null
+++ b/vendor/sha2/src/sha512/aarch64.rs
@@ -0,0 +1,235 @@
+// Implementation adapted from mbedtls.
+
+use core::arch::{aarch64::*, asm};
+
+use crate::consts::K64;
+
+cpufeatures::new!(sha3_hwcap, "sha3");
+
+pub fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
+    // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725
+    // after stabilization
+    if sha3_hwcap::get() {
+        unsafe { sha512_compress(state, blocks) }
+    } else {
+        super::soft::compress(state, blocks);
+    }
+}
+
+#[target_feature(enable = "sha3")]
+unsafe fn sha512_compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
+    // SAFETY: Requires the sha3 feature.
+
+    // Load state into vectors.
+    let mut ab = vld1q_u64(state[0..2].as_ptr());
+    let mut cd = vld1q_u64(state[2..4].as_ptr());
+    let mut ef = vld1q_u64(state[4..6].as_ptr());
+    let mut gh = vld1q_u64(state[6..8].as_ptr());
+
+    // Iterate through the message blocks.
+    for block in blocks {
+        // Keep original state values.
+        let ab_orig = ab;
+        let cd_orig = cd;
+        let ef_orig = ef;
+        let gh_orig = gh;
+
+        // Load the message block into vectors, assuming little endianness.
+        let mut s0 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(block[0..16].as_ptr())));
+        let mut s1 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(block[16..32].as_ptr())));
+        let mut s2 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(block[32..48].as_ptr())));
+        let mut s3 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(block[48..64].as_ptr())));
+        let mut s4 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(block[64..80].as_ptr())));
+        let mut s5 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(block[80..96].as_ptr())));
+        let mut s6 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(block[96..112].as_ptr())));
+        let mut s7 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(block[112..128].as_ptr())));
+
+        // Rounds 0 and 1
+        let mut initial_sum = vaddq_u64(s0, vld1q_u64(&K64[0]));
+        let mut sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), gh);
+        let mut intermed = vsha512hq_u64(sum, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1));
+        gh = vsha512h2q_u64(intermed, cd, ab);
+        cd = vaddq_u64(cd, intermed);
+
+        // Rounds 2 and 3
+        initial_sum = vaddq_u64(s1, vld1q_u64(&K64[2]));
+        sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), ef);
+        intermed = vsha512hq_u64(sum, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1));
+        ef = vsha512h2q_u64(intermed, ab, gh);
+        ab = vaddq_u64(ab, intermed);
+
+        // Rounds 4 and 5
+        initial_sum = vaddq_u64(s2, vld1q_u64(&K64[4]));
+        sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), cd);
+        intermed = vsha512hq_u64(sum, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1));
+        cd = vsha512h2q_u64(intermed, gh, ef);
+        gh = vaddq_u64(gh, intermed);
+
+        // Rounds 6 and 7
+        initial_sum = vaddq_u64(s3, vld1q_u64(&K64[6]));
+        sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), ab);
+        intermed = vsha512hq_u64(sum, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1));
+        ab = vsha512h2q_u64(intermed, ef, cd);
+        ef = vaddq_u64(ef, intermed);
+
+        // Rounds 8 and 9
+        initial_sum = vaddq_u64(s4, vld1q_u64(&K64[8]));
+        sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), gh);
+        intermed = vsha512hq_u64(sum, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1));
+        gh = vsha512h2q_u64(intermed, cd, ab);
+        cd = vaddq_u64(cd, intermed);
+
+        // Rounds 10 and 11
+        initial_sum = vaddq_u64(s5, vld1q_u64(&K64[10]));
+        sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), ef);
+        intermed = vsha512hq_u64(sum, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1));
+        ef = vsha512h2q_u64(intermed, ab, gh);
+        ab = vaddq_u64(ab, intermed);
+
+        // Rounds 12 and 13
+        initial_sum = vaddq_u64(s6, vld1q_u64(&K64[12]));
+        sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), cd);
+        intermed = vsha512hq_u64(sum, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1));
+        cd = vsha512h2q_u64(intermed, gh, ef);
+        gh = vaddq_u64(gh, intermed);
+
+        // Rounds 14 and 15
+        initial_sum = vaddq_u64(s7, vld1q_u64(&K64[14]));
+        sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), ab);
+        intermed = vsha512hq_u64(sum, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1));
+        ab = vsha512h2q_u64(intermed, ef, cd);
+        ef = vaddq_u64(ef, intermed);
+
+        for t in (16..80).step_by(16) {
+            // Rounds t and t + 1
+            s0 = vsha512su1q_u64(vsha512su0q_u64(s0, s1), s7, vextq_u64(s4, s5, 1));
+            initial_sum = vaddq_u64(s0, vld1q_u64(&K64[t]));
+            sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), gh);
+            intermed = vsha512hq_u64(sum, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1));
+            gh = vsha512h2q_u64(intermed, cd, ab);
+            cd = vaddq_u64(cd, intermed);
+
+            // Rounds t + 2 and t + 3
+            s1 = vsha512su1q_u64(vsha512su0q_u64(s1, s2), s0, vextq_u64(s5, s6, 1));
+            initial_sum = vaddq_u64(s1, vld1q_u64(&K64[t + 2]));
+            sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), ef);
+            intermed = vsha512hq_u64(sum, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1));
+            ef = vsha512h2q_u64(intermed, ab, gh);
+            ab = vaddq_u64(ab, intermed);
+
+            // Rounds t + 4 and t + 5
+            s2 = vsha512su1q_u64(vsha512su0q_u64(s2, s3), s1, vextq_u64(s6, s7, 1));
+            initial_sum = vaddq_u64(s2, vld1q_u64(&K64[t + 4]));
+            sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), cd);
+            intermed = vsha512hq_u64(sum, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1));
+            cd = vsha512h2q_u64(intermed, gh, ef);
+            gh = vaddq_u64(gh, intermed);
+
+            // Rounds t + 6 and t + 7
+            s3 = vsha512su1q_u64(vsha512su0q_u64(s3, s4), s2, vextq_u64(s7, s0, 1));
+            initial_sum = vaddq_u64(s3, vld1q_u64(&K64[t + 6]));
+            sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), ab);
+            intermed = vsha512hq_u64(sum, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1));
+            ab = vsha512h2q_u64(intermed, ef, cd);
+            ef = vaddq_u64(ef, intermed);
+
+            // Rounds t + 8 and t + 9
+            s4 = vsha512su1q_u64(vsha512su0q_u64(s4, s5), s3, vextq_u64(s0, s1, 1));
+            initial_sum = vaddq_u64(s4, vld1q_u64(&K64[t + 8]));
+            sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), gh);
+            intermed = vsha512hq_u64(sum, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1));
+            gh = vsha512h2q_u64(intermed, cd, ab);
+            cd = vaddq_u64(cd, intermed);
+
+            // Rounds t + 10 and t + 11
+            s5 = vsha512su1q_u64(vsha512su0q_u64(s5, s6), s4, vextq_u64(s1, s2, 1));
+            initial_sum = vaddq_u64(s5, vld1q_u64(&K64[t + 10]));
+            sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), ef);
+            intermed = vsha512hq_u64(sum, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1));
+            ef = vsha512h2q_u64(intermed, ab, gh);
+            ab = vaddq_u64(ab, intermed);
+
+            // Rounds t + 12 and t + 13
+            s6 = vsha512su1q_u64(vsha512su0q_u64(s6, s7), s5, vextq_u64(s2, s3, 1));
+            initial_sum = vaddq_u64(s6, vld1q_u64(&K64[t + 12]));
+            sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), cd);
+            intermed = vsha512hq_u64(sum, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1));
+            cd = vsha512h2q_u64(intermed, gh, ef);
+            gh = vaddq_u64(gh, intermed);
+
+            // Rounds t + 14 and t + 15
+            s7 = vsha512su1q_u64(vsha512su0q_u64(s7, s0), s6, vextq_u64(s3, s4, 1));
+            initial_sum = vaddq_u64(s7, vld1q_u64(&K64[t + 14]));
+            sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), ab);
+            intermed = vsha512hq_u64(sum, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1));
+            ab = vsha512h2q_u64(intermed, ef, cd);
+            ef = vaddq_u64(ef, intermed);
+        }
+
+        // Add the block-specific state to the original state.
+        ab = vaddq_u64(ab, ab_orig);
+        cd = vaddq_u64(cd, cd_orig);
+        ef = vaddq_u64(ef, ef_orig);
+        gh = vaddq_u64(gh, gh_orig);
+    }
+
+    // Store vectors into state.
+    vst1q_u64(state[0..2].as_mut_ptr(), ab);
+    vst1q_u64(state[2..4].as_mut_ptr(), cd);
+    vst1q_u64(state[4..6].as_mut_ptr(), ef);
+    vst1q_u64(state[6..8].as_mut_ptr(), gh);
+}
+
+// TODO remove these polyfills once SHA3 intrinsics land
+
+#[inline(always)]
+unsafe fn vsha512hq_u64(
+    mut hash_ed: uint64x2_t,
+    hash_gf: uint64x2_t,
+    kwh_kwh2: uint64x2_t,
+) -> uint64x2_t {
+    asm!(
+        "SHA512H {:q}, {:q}, {:v}.2D",
+        inout(vreg) hash_ed, in(vreg) hash_gf, in(vreg) kwh_kwh2,
+        options(pure, nomem, nostack, preserves_flags)
+    );
+    hash_ed
+}
+
+#[inline(always)]
+unsafe fn vsha512h2q_u64(
+    mut sum_ab: uint64x2_t,
+    hash_c_: uint64x2_t,
+    hash_ab: uint64x2_t,
+) -> uint64x2_t {
+    asm!(
+        "SHA512H2 {:q}, {:q}, {:v}.2D",
+        inout(vreg) sum_ab, in(vreg) hash_c_, in(vreg) hash_ab,
+        options(pure, nomem, nostack, preserves_flags)
+    );
+    sum_ab
+}
+
+#[inline(always)]
+unsafe fn vsha512su0q_u64(mut w0_1: uint64x2_t, w2_: uint64x2_t) -> uint64x2_t {
+    asm!(
+        "SHA512SU0 {:v}.2D, {:v}.2D",
+        inout(vreg) w0_1, in(vreg) w2_,
+        options(pure, nomem, nostack, preserves_flags)
+    );
+    w0_1
+}
+
+#[inline(always)]
+unsafe fn vsha512su1q_u64(
+    mut s01_s02: uint64x2_t,
+    w14_15: uint64x2_t,
+    w9_10: uint64x2_t,
+) -> uint64x2_t {
+    asm!(
+        "SHA512SU1 {:v}.2D, {:v}.2D, {:v}.2D",
+        inout(vreg) s01_s02, in(vreg) w14_15, in(vreg) w9_10,
+        options(pure, nomem, nostack, preserves_flags)
+    );
+    s01_s02
+}
-- 
cgit v1.2.3